In [4]:
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd

In [2]:
# Placeholder string that marks a missing value in the dataset; it is
# replaced by NaN before rows with missing values are dropped.
missing_data = "-"

In [5]:
# Read the CSV file (path is relative to the notebook's working directory)
# into the DataFrame `db`.
data_path = 'ODI-2018_clean.csv'
db = pd.read_csv(data_path)

In [6]:
# Discretize the `money` feature into quantile-based bins labelled 0, 1, 2
# and store the result in the new `moneyBins` column.
n_money_bins = 3
money_bin_labels = np.arange(n_money_bins)
db['moneyBins'] = pd.qcut(db['money'], q=n_money_bins, labels=money_bin_labels)

In [7]:
# Candidate feature sets; point `features` at the one to evaluate.
features1 = ['programme','DB','y_birth']
features2 = ['programme','DB','stat','gender']
features3 = ['programme','DB','IR','moneyBins']
target = 'ML'

features = features3

# Selected feature columns followed by the target column.
columns = [*features, target]

# Turn the missing-value placeholder into NaN and drop every row that is
# missing any selected column. The explicit .copy() yields an independent
# frame, so the column assignments below never write into a slice of `db`.
classification_data = db[columns].replace(missing_data, np.nan).dropna().copy()

# Transform every column (features and target) to integer category codes.
# One fitted encoder is kept per column so the codes can be mapped back to
# the original values later, e.g. label_encoders['programme'].inverse_transform(...).
label_encoders = {}
for col in columns:
    encoder = preprocessing.LabelEncoder()
    classification_data[col] = encoder.fit_transform(classification_data[col])
    label_encoders[col] = encoder
    print(list(encoder.classes_))

# Kept for backward compatibility: `le` previously ended up fitted on the
# last encoded column, which is the target.
le = label_encoders[target]

# Encode the categorical features as multiple binary (one-hot) features.
enc = preprocessing.OneHotEncoder()
binary_features = enc.fit_transform(classification_data[features])

['AI', 'BA', 'Bioinformatics', 'CLS', 'CS', 'Drug Discovery and Safety', 'Duisenberg Honor Programme', 'EOR', 'Economics', 'Exchange', 'Finance', 'Human Movement Science', 'MPA', 'Mathematics', 'PhD student', 'Physics', 'QRM', 'SBI']
['no', 'yes']
['no', 'yes']
[0, 1, 2]
['no', 'yes']


In [8]:
# Perform 10-times repeated, stratified 10-fold cross-validation and report
# the mean accuracy of each classifier.
#
# The same seed is used for the fold splitter and for the randomized
# learners (decision tree and random forest). Without a fixed random_state
# those two models give different scores on every run, so the reported
# means would not be reproducible.
RANDOM_STATE = 0
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=RANDOM_STATE)
target_values = classification_data[target]

dt_scores = cross_val_score(DecisionTreeClassifier(random_state=RANDOM_STATE), binary_features, target_values, cv=cv)
rf_scores = cross_val_score(RandomForestClassifier(random_state=RANDOM_STATE), binary_features, target_values, cv=cv)
lr_scores = cross_val_score(LogisticRegression(), binary_features, target_values, cv=cv)

# Printed in the order: decision tree, random forest, logistic regression.
print(np.mean(dt_scores), np.mean(rf_scores), np.mean(lr_scores))

0.7185062656641604 0.7156177944862158 0.7600313283208021
