In [24]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPRegressor
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, BaggingRegressor
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyRegressor
import lightgbm
from sklearn import neighbors
from sklearn.model_selection import GridSearchCV

In [25]:
# Read the preprocessed, tab-separated train / test tables and remove the
# 'Opportunity(Default)' column from each of them.
train = pd.read_csv('data/train_preprocessed.csv', sep='\t')
test = pd.read_csv('data/test_preprocessed.csv', sep='\t')
train = train.drop(columns=['Opportunity(Default)'])
test = test.drop(columns=['Opportunity(Default)'])

# Separation of dataset: keep only rows without missing values, then split
# each table into an integer target vector (*_y) and a feature frame (*_x).
target_col = 'Correct First Attempt'

train_complete = train.dropna()
test_complete = test.dropna()

train_y = np.asarray(train_complete[target_col]).astype(int)
test_y = np.asarray(test_complete[target_col]).astype(int)

train_x = train_complete.drop(columns=[target_col])
test_x = test_complete.drop(columns=[target_col])

In [26]:
def loss_function(ypred, y):
    """Return the root-mean-square error between predictions and targets.

    Parameters
    ----------
    ypred : array-like
        Predicted values.
    y : array-like
        Reference values; compared with ``ypred`` position by position
        (NumPy broadcasting rules apply).

    Returns
    -------
    float
        ``sqrt(mean((ypred - y) ** 2))``.
    """
    # Do the arithmetic in floating point: NumPy refuses to subtract boolean
    # arrays, and unsigned / narrow integer dtypes silently wrap around on
    # subtraction and squaring, which would corrupt the error value.
    ypred = np.asarray(ypred, dtype=float)
    y = np.asarray(y, dtype=float)
    distance = np.square(np.subtract(ypred, y))
    avg = np.mean(distance)
    return np.sqrt(avg)

In [27]:
def normalize(x):
    """Scale every row of ``x`` to unit Euclidean (L2) length.

    Parameters
    ----------
    x : 2-D numeric array-like or DataFrame
        Rows are normalized independently of each other.

    Returns
    -------
    The result of dividing ``x`` by its per-row L2 norms (a DataFrame when
    ``x`` is a DataFrame, otherwise an ndarray). Rows that consist only of
    zeros are returned unchanged instead of becoming NaN.
    """
    norm = np.linalg.norm(x, ord=2, axis=1, keepdims=True)
    # An all-zero row has norm 0 and dividing by it would fill the row with
    # NaN; divide by 1 instead so such rows stay all-zero.
    norm = np.where(norm == 0, 1.0, norm)
    return x/norm

# Columns listed here are left out of the row-wise normalization and are
# copied back, unscaled, onto the normalized frames afterwards.
drop_cols = ['Anon Student Id', 'Problem Name', 'Problem Unit', 'Problem Section', 'Step Name']

# Frames holding only the columns that get normalized.
tmp_train, tmp_test = (frame.drop(columns=drop_cols) for frame in (train_x, test_x))
train_norm_x, test_norm_x = normalize(tmp_train), normalize(tmp_test)

# Re-attach the excluded columns with their original values.
for c in drop_cols:
    train_norm_x[c] = train_x[c]
    test_norm_x[c] = test_x[c]

In [29]:
# decision tree optimize
# Exhaustive grid search over decision-tree hyper-parameters (using
# GridSearchCV's default cross-validation and scoring), followed by the
# RMSE of the selected model's predictions on the test set.

# Fixed seed for the tree: the 'random' splitter draws random thresholds and
# the tree permutes candidate features at every split, so without a seed the
# selected parameters and the reported error can change between runs.
RANDOM_STATE = 0

max_depth = range(5, 20)
min_samples_leaf = range(1, 9, 2)
tree_para = {'criterion':['gini','entropy'], 'splitter':['best', 'random'], 'max_depth':max_depth, 'min_samples_leaf':min_samples_leaf}
model = GridSearchCV(tree.DecisionTreeClassifier(random_state=RANDOM_STATE), tree_para, n_jobs=-1)
model.fit(train_norm_x, train_y)
result = model.predict(test_norm_x)
print("optimze decision tree error is %f" % loss_function(result, test_y))
print("best parameters", model.best_params_ )



optimze decision tree error is 0.450225
best parameters {'criterion': 'entropy', 'max_depth': 14, 'min_samples_leaf': 7, 'splitter': 'best'}
