In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('STUDENT.csv')

In [3]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1044 entries, 0 to 1043
Data columns (total 35 columns):
id             1044 non-null int64
InitialName    1044 non-null object
school         992 non-null object
sex            1044 non-null object
age            971 non-null float64
address        1044 non-null object
famsize        1044 non-null object
Pstatus        1044 non-null object
Medu           1044 non-null int64
Fedu           1044 non-null int64
Mjob           1044 non-null object
Fjob           1044 non-null object
reason         428 non-null object
guardian       1044 non-null object
traveltime     1044 non-null int64
studytime      1044 non-null int64
failures       1044 non-null int64
schoolsup      1044 non-null object
famsup         1044 non-null object
paid           1044 non-null object
activities     1044 non-null object
nursery        1044 non-null object
higher         1044 non-null object
internet       1044 non-null object
romantic       1044 non-null object
f

In [4]:
# Task 1.1
df['G3'].value_counts()

PASS    661
FAIL    383
Name: G3, dtype: int64

In [5]:
# Task 1.2
def preprocess():
    import pandas as pd
    #Preprocess data
    df_raw = pd.read_csv('STUDENT.csv', index_col=0)
    df = df_raw.drop(['InitialName', 'guardian'], axis=1)
    
    #Map binaries
    df['address'] = df['address'].map({ 'U':0, 'R':1 })
    df['sex'] = df['sex'].map({ 'M':0, 'F':1 })
    df['famsize'] = df['famsize'].map({ 'LE3':0, 'GT3':1 })
    df['Pstatus'] = df['Pstatus'].map({ 'A':0, 'T':1 })
    df['schoolsup'] = df['schoolsup'].map({ 'no':0, 'yes':1 })
    df['famsup'] = df['famsup'].map({ 'no':0, 'yes':1 })
    df['paid'] = df['paid'].map({ 'no':0, 'yes':1 })
    df['activities'] = df['activities'].map({ 'no':0, 'yes':1 })
    df['nursery'] = df['nursery'].map({ 'no':0, 'yes':1 })
    df['higher'] = df['higher'].map({ 'no':0, 'yes':1 })
    df['internet'] = df['internet'].map({ 'no':0, 'yes':1 })
    df['romantic'] = df['romantic'].map({ 'no':0, 'yes':1 })
    df['G3'] = df['G3'].map({ 'FAIL': 0, 'PASS': 1 })

    #Fill via median for age
    df['age'].fillna(df['age'].median(), inplace=True)
    
    #Fill via 'none' since hot encode will flag
    df['school'].fillna('none', inplace=True)
    df['reason'].fillna('none', inplace=True)

    #Drop g1 & 2 empty rows
    cols_miss_drop =['G1', 'G2']
    mask = pd.isnull(df['G1'])
    for col in cols_miss_drop:
     mask = mask | pd.isnull(df[col])
    df = df[~mask]

    #Hot encode
    df = pd.get_dummies(df)
    return df

In [6]:
def analyse_feature_importance(dm_model, feature_names, n_to_display=20):
    # grab feature importances from the model
    importances = dm_model.feature_importances_

    # sort them out in descending order
    indices = np.argsort(importances)
    indices = np.flip(indices, axis=0)
    # limit to 20 features, you can leave this out to print out everything
    indices = indices[:n_to_display]
    for i in indices:
        print(feature_names[i], ':', importances[i])
        
def visualize_decision_tree(dm_model, feature_names, save_name):
    dotfile = StringIO()
    export_graphviz(dm_model, out_file=dotfile, feature_names=feature_names)
    graph = pydot.graph_from_dot_data(dotfile.getvalue())
    graph.write_png(save_name) # saved in the following file

In [7]:
df = preprocess()
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 982 entries, 0 to 1043
Data columns (total 46 columns):
sex                  982 non-null int64
age                  982 non-null float64
address              982 non-null int64
famsize              982 non-null int64
Pstatus              982 non-null int64
Medu                 982 non-null int64
Fedu                 982 non-null int64
traveltime           982 non-null int64
studytime            982 non-null int64
failures             982 non-null int64
schoolsup            982 non-null int64
famsup               982 non-null int64
paid                 982 non-null int64
activities           982 non-null int64
nursery              982 non-null int64
higher               982 non-null int64
internet             982 non-null int64
romantic             982 non-null int64
famrel               982 non-null int64
freetime             982 non-null int64
goout                982 non-null int64
Dalc                 982 non-null int64
Walc        

In [8]:
df['address'].value_counts()

0    714
1    268
Name: address, dtype: int64

In [9]:
df['school_none'].value_counts()

0    932
1     50
Name: school_none, dtype: int64

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

y = df['G3']
x = df.drop(['G3'], axis=1)

#set seed for randomisation
rs = 10

#Convert x into numpy matrix for sklearn consumption
x_mat = x.as_matrix()
#Setup training and test datasets on a 70/30 split
x_train, x_test, y_train, y_test = train_test_split(x_mat, y, test_size=0.3, stratify=y, random_state=rs)

#simple decision tree training
model = DecisionTreeClassifier(random_state=rs)
model.fit(x_train, y_train)

#Check accuracy on the training sets
print("Train accuracy:", model.score(x_train, y_train))
#Check accuracy on the test sets
print("Test accuracy:", model.score(x_test, y_test))

y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))

#Check which features have the largest impact on the decision tree?
import numpy as np

#grab feature importances from the model and feature name from the original x
importances = model.feature_importances_
feature_names = x.columns

#sort in descending order
indices = np.argsort(importances)
indices = np.flip(indices, axis=0)

#limit to 20 features
indices = indices[:20]

for i in indices:
    print(feature_names[i], ':', importances[i])

Train accuracy: 1.0
Test accuracy: 0.8677966101694915
             precision    recall  f1-score   support

          0       0.87      0.77      0.81       111
          1       0.87      0.93      0.90       184

avg / total       0.87      0.87      0.87       295

G2 : 0.728947131127325
G1 : 0.06554284489147653
age : 0.040021614204437325
goout : 0.01403683100070638
famsize : 0.01359354805634928
studytime : 0.013138089300880002
health : 0.01294153520897707
activities : 0.012387072640772425
Medu : 0.0117368185021462
famrel : 0.011285402405909811
school_THS : 0.010862685836463647
Mjob_at_home : 0.010640522268429247
traveltime : 0.009384349500628572
nursery : 0.008518794428043128
Dalc : 0.008162879290833343
Fjob_other : 0.008069062720225511
absences : 0.0058190356155472436
freetime : 0.005734348997002898
sex : 0.004137980882166929
school_DCHS : 0.003103485661625197


  if sys.path[0] == '':


In [11]:
import pydot
from io import StringIO
from sklearn.tree import export_graphviz
# visualize
dotfile = StringIO()
export_graphviz(model, out_file=dotfile, feature_names=x.columns)
graph = pydot.graph_from_dot_data(dotfile.getvalue())
graph.write_png("brendan-default_decision.png") # saved in the following file - will return True if successful

True

In [12]:
from sklearn.model_selection import GridSearchCV

In [13]:
# grid search CV
params = {'criterion': ['gini', 'entropy'],
 'max_depth': range(2, 7),
 'min_samples_leaf': range(20, 60, 10)}
cv = GridSearchCV(param_grid=params, estimator=DecisionTreeClassifier(random_state=rs),cv=10)
cv.fit(x_train, y_train)
print("Train accuracy:", cv.score(x_train, y_train))
print("Test accuracy:", cv.score(x_test, y_test))
# test the best model
y_pred = cv.predict(x_test)
print(classification_report(y_test, y_pred))
# print parameters of the best model
print(cv.best_params_)

Train accuracy: 0.9199417758369723
Test accuracy: 0.9389830508474576
             precision    recall  f1-score   support

          0       0.95      0.88      0.92       111
          1       0.93      0.97      0.95       184

avg / total       0.94      0.94      0.94       295

{'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 20}


In [14]:
# grid search CV
params = {'criterion': ['gini', 'entropy'],
 'max_depth': range(1, 15),
 'min_samples_leaf': range(15, 25)}
cv = GridSearchCV(param_grid=params, estimator=DecisionTreeClassifier(random_state=rs),cv=10)
cv.fit(x_train, y_train)
print("Train accuracy:", cv.score(x_train, y_train))
print("Test accuracy:", cv.score(x_test, y_test))
# test the best model
y_pred = cv.predict(x_test)
print(classification_report(y_test, y_pred))
# print parameters of the best model
print(cv.best_params_)

Train accuracy: 0.9199417758369723
Test accuracy: 0.9389830508474576
             precision    recall  f1-score   support

          0       0.95      0.88      0.92       111
          1       0.93      0.97      0.95       184

avg / total       0.94      0.94      0.94       295

{'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 19}


In [15]:
analyse_feature_importance(cv.best_estimator_, x.columns, 20)

G2 : 0.9290414017229142
G1 : 0.0560628086217979
school_THS : 0.01459114197314507
schoolsup : 0.0003046476821429053
reason_reputation : 0.0
paid : 0.0
famrel : 0.0
romantic : 0.0
internet : 0.0
higher : 0.0
nursery : 0.0
activities : 0.0
famsup : 0.0
goout : 0.0
failures : 0.0
studytime : 0.0
traveltime : 0.0
Fedu : 0.0
Medu : 0.0
Pstatus : 0.0


In [16]:
visualize_decision_tree(cv.best_estimator_, x.columns, "brendan-optimal_decision.png")

In [17]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [18]:
x_train_scaler = scaler.fit_transform(x_train, y_train)

In [20]:
from sklearn.linear_model import LogisticRegression

In [21]:
model = LogisticRegression(random_state=rs)
# fit it to training data
model.fit(x_train_scaler, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=10, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [22]:
# training and test accuracy
print("Train accuracy:", model.score(x_train_scaler, y_train))
print("Test accuracy:", model.score(x_test, y_test))
# classification report on test data
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))

Train accuracy: 0.9344978165938864
Test accuracy: 0.6237288135593221
             precision    recall  f1-score   support

          0       0.00      0.00      0.00       111
          1       0.62      1.00      0.77       184

avg / total       0.39      0.62      0.48       295



  'precision', 'predicted', average, warn_for)


In [24]:
feature_names = x.columns
coef = model.coef_[0]
# limit to 20 features, you can comment the following line to print out everything
coef = coef[:20]
for i in range(len(coef)):
    print(feature_names[i], ':', coef[i])

sex : 0.043979939300232834
age : -0.15336111499974964
address : 0.19969698512933917
famsize : 0.3666946995418554
Pstatus : -0.0695351252895923
Medu : -0.14180700422728906
Fedu : 0.079462013890026
traveltime : 0.34692001733683736
studytime : -0.14484653993929872
failures : -0.7496606393577496
schoolsup : -0.15017905521512137
famsup : 0.04355493331545878
paid : -0.13504295957502302
activities : -0.38709218427760594
nursery : -0.39336900504019934
higher : 0.12180631570219788
internet : 0.028764980449805407
romantic : 0.19972102181283796
famrel : -0.074284659093366
freetime : -0.24108707495649778


In [25]:
# grab feature importances from the model and feature name from the original X
coef = model.coef_[0]
feature_names = x.columns
# sort them out in descending order
indices = np.argsort(np.absolute(coef))
indices = np.flip(indices, axis=0)
# limit to 20 features, you can leave this out to print out everything
indices = indices[:20]
for i in indices:
    print(feature_names[i], ':', coef[i])

G2 : 3.768695032609562
G1 : 1.6624997633451108
failures : -0.7496606393577496
nursery : -0.39336900504019934
activities : -0.38709218427760594
school_THS : -0.37667841190589385
famsize : 0.3666946995418554
traveltime : 0.34692001733683736
school_DCHS : 0.3019935875583392
absences : -0.27929203006895265
Mjob_at_home : -0.2558044963150998
freetime : -0.24108707495649778
Fjob_health : -0.22423860232526643
romantic : 0.19972102181283796
address : 0.19969698512933917
Fjob_at_home : 0.1951435340402709
Mjob_teacher : 0.17412997397304394
Fjob_teacher : 0.1675800258704016
age : -0.15336111499974964
schoolsup : -0.15017905521512137


In [30]:
# grid search CV
params = {'C': [pow(10, x) for x in range(-8, 6)]}
# use all cores to tune logistic regression with C parameter
cv = GridSearchCV(param_grid=params, estimator=LogisticRegression(random_state=rs),
cv=10, n_jobs=-1)
cv.fit(x_train, y_train)
# test the best model
print("Train accuracy:", cv.score(x_train_scaler, y_train))
print("Test accuracy:", cv.score(x_test, y_test))
y_pred = cv.predict(x_test)
print(classification_report(y_test, y_pred))
# print parameters of the best model
print(cv.best_params_)

Train accuracy: 0.6637554585152838
Test accuracy: 0.8847457627118644
             precision    recall  f1-score   support

          0       0.90      0.78      0.84       111
          1       0.88      0.95      0.91       184

avg / total       0.89      0.88      0.88       295

{'C': 1}
