In [1]:
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import sklearn
import numpy as np
import pandas as pd
import scipy
from inspect import getmembers

In [2]:
missing_data = '-'

In [3]:
db = pd.read_csv('ODI-2018_clean.csv')

In [4]:
# create the discretized version of the money feature
db['moneyBins'] = pd.qcut(db['money'], 3, np.arange(3))

In [5]:
# try different feature sets
features1 = ['programme','DB','y_birth']
features2 = ['programme','DB','stat','gender']
features3 = ['programme','DB','IR','moneyBins']
target = 'ML'

features = features3

# drop rows where selected features are missing
columns = np.append(features, target)
classification_data = pd.DataFrame(db[columns].replace(missing_data, np.nan).dropna(), columns=columns)

# transform features to numerical categories
le = preprocessing.LabelEncoder()
for c in columns:
    classification_data[c] = le.fit_transform(classification_data[c])
    print(list(le.classes_))
    
# encode categorial features as multiple binary features
enc = preprocessing.OneHotEncoder()
binary_features = enc.fit_transform(classification_data[features])
binary_features.shape

['AI', 'BA', 'Bioinformatics', 'CLS', 'CS', 'Drug Discovery and Safety', 'Duisenberg Honor Programme', 'EOR', 'Economics', 'Exchange', 'Finance', 'Human Movement Science', 'MPA', 'Mathematics', 'PhD student', 'Physics', 'QRM', 'SBI']
['no', 'yes']
['no', 'yes']
[0, 1, 2]
['no', 'yes']


(204, 25)

In [6]:
# perform 10-times repeated 10-fold cross-validation
#cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=np.arange(10))
dt_scores, rf_scores, rf200_scores, lr_scores = [], [], [], []

for run in range(10):
    cv = StratifiedKFold(n_splits=10, random_state=run)

    dt_scores.extend(cross_val_score(DecisionTreeClassifier(random_state=run), binary_features, classification_data[target], cv=cv))
    rf_scores.extend(cross_val_score(RandomForestClassifier(random_state=run), binary_features, classification_data[target], cv=cv))
    rf200_scores.extend(cross_val_score(RandomForestClassifier(random_state=run, n_estimators=200), binary_features, classification_data[target], cv=cv))
    lr_scores.extend(cross_val_score(LogisticRegression(), binary_features, classification_data[target], cv=cv))
    
print(np.mean(dt_scores), np.mean(rf_scores), np.mean(rf200_scores), np.mean(lr_scores))

0.7210776942355889 0.7179461152882206 0.7224849624060152 0.768032581453634


In [7]:
print('compare dt and rf: ', scipy.stats.wilcoxon(dt_scores,rf_scores))
print('compare dt and lr: ', scipy.stats.wilcoxon(dt_scores,lr_scores))
print('compare rf and lr: ', scipy.stats.wilcoxon(rf_scores,lr_scores))
print('compare rf and rf200: ', scipy.stats.wilcoxon(rf_scores,rf200_scores))
print('compare dt and rf200: ', scipy.stats.wilcoxon(dt_scores,rf200_scores))
print('compare lr and rf200: ', scipy.stats.wilcoxon(rf200_scores,lr_scores))

compare dt and rf:  WilcoxonResult(statistic=661.0, pvalue=0.1877210029546681)
compare dt and lr:  WilcoxonResult(statistic=510.0, pvalue=4.293578279619104e-07)
compare rf and lr:  WilcoxonResult(statistic=292.0, pvalue=3.3590963679710135e-09)
compare rf and rf200:  WilcoxonResult(statistic=731.5, pvalue=0.24536743657730342)
compare dt and rf200:  WilcoxonResult(statistic=920.0, pvalue=0.6911152048437503)
compare lr and rf200:  WilcoxonResult(statistic=195.0, pvalue=3.3639202733373324e-09)


In [8]:
dt_full_dataset = DecisionTreeClassifier(random_state=0).fit(binary_features, classification_data[target])
dt_prediction_train = dt_full_dataset.predict(binary_features)
print(dt_full_dataset.tree_.node_count, dt_full_dataset.tree_.max_depth)
accuracy_score(dt_prediction_train, classification_data[target])

93 11


0.8382352941176471

In [9]:
rf_full_dataset = RandomForestClassifier(random_state=0).fit(binary_features, classification_data[target])
rf_prediction_train = rf_full_dataset.predict(binary_features)
depths = [t.tree_.max_depth for t in rf_full_dataset.estimators_]
print(depths, np.max(depths), np.min(depths), np.mean(depths))
nodes = [t.tree_.node_count for t in rf_full_dataset.estimators_]
print(nodes, np.max(nodes), np.min(nodes), np.mean(nodes))
accuracy_score(rf_prediction_train, classification_data[target])

[10, 9, 11, 9, 11, 11, 17, 12, 14, 12] 17 9 11.6
[79, 77, 79, 85, 83, 77, 87, 77, 87, 73] 87 73 80.4


0.8382352941176471

In [10]:
rf200_full_dataset = RandomForestClassifier(random_state=0,n_estimators=200).fit(binary_features, classification_data[target])
rf200_prediction_train = rf200_full_dataset.predict(binary_features)
depths = [t.tree_.max_depth for t in rf200_full_dataset.estimators_]
print(np.max(depths), np.min(depths), np.mean(depths))
nodes = [t.tree_.node_count for t in rf200_full_dataset.estimators_]
print(np.max(nodes), np.min(nodes), np.mean(nodes))
accuracy_score(rf200_prediction_train, classification_data[target])

17 9 12.805
111 65 87.42


0.8382352941176471

In [11]:
lr_full_dataset = LogisticRegression().fit(binary_features, classification_data[target])
lr_prediction_train = lr_full_dataset.predict(binary_features)
accuracy_score(lr_prediction_train, classification_data[target])

0.7843137254901961