# Modeling:
    We begin with our imports 

In [1]:
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd 
import wrangle as w

In [2]:
df = pd.read_csv('modeling_edu_data.csv')

df.head()

Unnamed: 0,has_college_degree,free_reduced_lunch,parents_married,is_first_child,nr_siblings,rides_bus,risk_cat
0,1,1,1,1,3.0,1,1
1,1,1,1,1,0.0,1,0
2,1,1,0,1,4.0,1,0
3,1,1,1,0,1.0,1,1
4,1,1,1,1,0.0,1,0


In [3]:
train, validate, test = w.split(df)

In [4]:
X_train = train.drop(columns='risk_cat')
y_train = train.risk_cat
X_validate= validate.drop(columns='risk_cat')
y_validate = validate.risk_cat
X_test = test.drop(columns='risk_cat')
y_test = test.risk_cat

In [5]:
df.risk_cat.value_counts()

risk_cat
1    20782
0     9859
Name: count, dtype: int64

# Create dataframes to house our predictions 

In [6]:
train_pred_df = pd.DataFrame()
validate_pred_df = pd.DataFrame()
test_pred_df = pd.DataFrame()

In [7]:
train_pred_df['actual'] = train.risk_cat
validate_pred_df['actual'] = validate.risk_cat
test_pred_df['actual'] = test.risk_cat

# Well kick off modeling with a baseline

In [8]:
train_pred_df['baseline'] = 1
validate_pred_df['baseline'] = 1
test_pred_df['baseline'] = 1

In [9]:
from sklearn.metrics import classification_report

In [10]:
print(classification_report(train_pred_df.actual, train_pred_df.baseline))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      5522
           1       0.68      1.00      0.81     11636

    accuracy                           0.68     17158
   macro avg       0.34      0.50      0.40     17158
weighted avg       0.46      0.68      0.55     17158



# Decision Tree

In [11]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(max_depth=5, random_state=666)
clf.fit(X_train, y_train)
train_pred_df['clf'] = clf.predict(X_train)

In [12]:
print(classification_report(train_pred_df.actual, train_pred_df.clf))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      5522
           1       0.68      1.00      0.81     11636

    accuracy                           0.68     17158
   macro avg       0.34      0.50      0.40     17158
weighted avg       0.46      0.68      0.55     17158



# Validating our Decision Tree Model

In [13]:
validate_pred_df['clf'] = clf.predict(X_validate)

In [14]:
print(classification_report(validate_pred_df.actual, validate_pred_df.clf))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      2393
           1       0.67      1.00      0.81      4961

    accuracy                           0.67      7354
   macro avg       0.34      0.50      0.40      7354
weighted avg       0.46      0.67      0.54      7354



In [15]:
for cols in train_pred_df.columns:
    print(train_pred_df[cols].value_counts())

actual
1    11636
0     5522
Name: count, dtype: int64
baseline
1    17158
Name: count, dtype: int64
clf
1    17158
Name: count, dtype: int64


# KNN 

In [16]:
from sklearn.neighbors import KNeighborsClassifier

In [17]:
knn = KNeighborsClassifier(n_neighbors=7, weights='uniform')

In [18]:
knn.fit(X_train, y_train)

In [19]:
train_pred_df['knn'] = knn.predict(X_train)

In [20]:
print(classification_report(train_pred_df.actual, train_pred_df.knn))

              precision    recall  f1-score   support

           0       0.36      0.31      0.33      5522
           1       0.69      0.74      0.71     11636

    accuracy                           0.60     17158
   macro avg       0.52      0.52      0.52     17158
weighted avg       0.58      0.60      0.59     17158



# Validating our KNN Model 

In [21]:
validate_pred_df['knn'] = knn.predict(X_validate)

In [22]:
print(classification_report(validate_pred_df.actual, validate_pred_df.knn))

              precision    recall  f1-score   support

           0       0.38      0.32      0.35      2393
           1       0.70      0.75      0.72      4961

    accuracy                           0.61      7354
   macro avg       0.54      0.53      0.53      7354
weighted avg       0.59      0.61      0.60      7354



# Logistic Regression 

In [23]:
from sklearn.linear_model import LogisticRegression

In [24]:
# from sklearn.linear_model import LogisticRegression
logit = LogisticRegression(C=1, random_state=666, intercept_scaling=1, solver='lbfgs')

In [25]:
logit.fit(X_train,y_train)

In [26]:
train_pred_df['logistic'] = logit.predict(X_train)

In [27]:
print(classification_report(train_pred_df.actual, train_pred_df.logistic))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      5522
           1       0.68      1.00      0.81     11636

    accuracy                           0.68     17158
   macro avg       0.34      0.50      0.40     17158
weighted avg       0.46      0.68      0.55     17158



# Random Forest Classifier

In [28]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=666)

In [29]:
rf.fit(X_train,y_train)

In [30]:
train_pred_df['random_forest'] = rf.predict(X_train)

In [31]:
print(classification_report(train_pred_df.actual, train_pred_df.random_forest))

              precision    recall  f1-score   support

           0       0.74      0.00      0.01      5522
           1       0.68      1.00      0.81     11636

    accuracy                           0.68     17158
   macro avg       0.71      0.50      0.41     17158
weighted avg       0.70      0.68      0.55     17158

