In [31]:
# Preprocess data: load the raw student records, encode two-valued text
# columns as 0/1, handle missing values, then one-hot encode what is left.
import pandas as pd

df_raw = pd.read_csv('STUDENT.csv', index_col=0)
# Work on a copy without 'InitialName' and 'reason'; df_raw stays untouched.
df = df_raw.drop(['InitialName', 'reason'], axis=1)

# Map binaries.
# NOTE: Series.map turns any value that is absent from its mapping into NaN,
# so an unexpected category silently becomes a missing value.
address_map = {'U': 0, 'R': 1}
sex_map = {'M': 0, 'F': 1}
famsize_map = {'LE3': 0, 'GT3': 1}
Pstatus_map = {'A': 0, 'T': 1}
g3_map = {'FAIL': 0, 'PASS': 1}

# Every yes/no column shares the same encoding. The per-column names are kept
# as aliases in case other cells refer to them.
yes_no_map = {'no': 0, 'yes': 1}
schoolsup_map = famsup_map = paid_map = activities_map = yes_no_map
nursery_map = higher_map = internet_map = romantic_map = yes_no_map

# Column name -> encoding, applied in one pass instead of one stanza per column.
binary_maps = {
    'address': address_map,
    'sex': sex_map,
    'famsize': famsize_map,
    'Pstatus': Pstatus_map,
    'schoolsup': schoolsup_map,
    'famsup': famsup_map,
    'paid': paid_map,
    'activities': activities_map,
    'nursery': nursery_map,
    'higher': higher_map,
    'internet': internet_map,
    'romantic': romantic_map,
    'G3': g3_map,
}
for column, mapping in binary_maps.items():
    df[column] = df[column].map(mapping)

# Missing 'school' values get no explicit flag column: with its default
# settings get_dummies (below) leaves a NaN entry as 0 in every dummy column.
# NOTE(review): this assumes 'school' is a text column that get_dummies encodes.

# Fill missing ages with the median age.
# The result is assigned back instead of calling fillna(inplace=True) on
# df['age']: the in-place form operates on a selected column, triggers a
# chained-assignment warning in pandas 2.x and leaves df unchanged when
# Copy-on-Write is enabled.
df['age'] = df['age'].fillna(df['age'].median())

# Drop rows where G1 or G2 is missing.
cols_miss_drop = ['G1', 'G2']
mask = df[cols_miss_drop].isnull().any(axis=1)
df = df[~mask]

# One-hot encode the remaining non-numeric columns.
df = pd.get_dummies(df)

In [33]:
# Train a decision tree on the preprocessed frame `df` (built by the
# preprocessing cell), report train/test accuracy, and list the most
# important features.
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

# Target is the G3 column; every other column is a feature.
y = df['G3']
x = df.drop(['G3'], axis=1)

# Seed shared by the split and the tree so results are repeatable.
rs = 10

# Convert x into a numpy matrix for sklearn consumption.
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is its
# replacement. dtype=float gives one numeric dtype even when get_dummies
# produced boolean columns (the default in pandas >= 2.0).
x_mat = x.to_numpy(dtype=float)

# 70/30 train/test split, stratified on y so both parts keep the class ratio.
x_train, x_test, y_train, y_test = train_test_split(
    x_mat, y, test_size=0.3, stratify=y, random_state=rs
)

# Simple decision tree with default hyper-parameters.
model = DecisionTreeClassifier(random_state=rs)
model.fit(x_train, y_train)

# Accuracy on the training set, then on the held-out test set. The test
# figure is the one that reflects generalisation.
print("Train accuracy:", model.score(x_train, y_train))
print("Test accuracy:", model.score(x_test, y_test))

y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))

# Which features have the largest impact on the decision tree?
# Importances come from the fitted model; names come from the columns of x,
# which are in the same order as the columns of x_mat.
importances = model.feature_importances_
feature_names = x.columns

# Number of top features to print.
top_n = 20
# argsort is ascending, so reverse it and keep the first top_n indices.
indices = np.argsort(importances)[::-1][:top_n]

for i in indices:
    print(feature_names[i], ':', importances[i])

Train accuracy: 1.0
Test accuracy: 0.8949152542372881
             precision    recall  f1-score   support

          0       0.91      0.80      0.85       111
          1       0.89      0.95      0.92       184

avg / total       0.90      0.89      0.89       295

G2 : 0.72916656950744
G1 : 0.06573907540770767
age : 0.02261518411558561
health : 0.017079516091144
traveltime : 0.01409528158186477
absences : 0.013736902880462809
famsize : 0.01359354805634928
Mjob_at_home : 0.01270951270951271
activities : 0.012387072640772425
goout : 0.011761664938745722
school_THS : 0.010862685836463647
guardian_other : 0.008944711487272973
school_DCHS : 0.008867101890357705
nursery : 0.008518794428043128
schoolsup : 0.006901544518654539
freetime : 0.006187389806575855
Medu : 0.006094117299191295
failures : 0.005689723712979528
Dalc : 0.005328362386548996
Fedu : 0.0051724761027086615


  if sys.path[0] == '':
