In [35]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPRegressor
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, BaggingRegressor
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyRegressor
import lightgbm
from sklearn import neighbors
from sklearn.model_selection import GridSearchCV

In [36]:
train = pd.read_csv('data/train_preprocessed.csv', sep='\t')
test = pd.read_csv('data/test_preprocessed.csv', sep='\t')
train = train.drop(['Opportunity(Default)'], axis=1)
test = test.drop(['Opportunity(Default)'], axis=1)

# Separation of dataset
train_x = train.dropna()
train_y = np.array(train_x['Correct First Attempt']).astype(int)
train_x = train_x.drop(['Correct First Attempt'],axis = 1)
test_x = test.dropna()
test_y = np.array(test_x['Correct First Attempt']).astype(int)
test_x = test_x.drop(['Correct First Attempt'],axis = 1)

print(train_x.shape[:])
print(test_x.shape[:])
print(train_x.head())

(232744, 12)
(666, 12)
   Problem View  KC Count  Opportunity Avg  Anon Student Id  Problem Name  \
0             1         0              0.0              170            69   
1             1         0              0.0              170            69   
2             1         1              1.0              170            69   
3             1         3              1.0              170            69   
4             1         1              1.0              170            69   

   Problem Unit  Problem Section  Step Name  Personal Rate  Problem Rate  \
0            15               33      34382       0.748749      0.710197   
1            15               33      14346       0.748749      0.710197   
2            15               33      58300       0.748749      0.710197   
3            15               33      60417       0.748749      0.710197   
4            15               33      30098       0.748749      0.710197   

   Step Rate   KC Rate  
0   0.840631  0.655650  
1   0.8

In [37]:
def loss_function(ypred, y):
    distance = np.square(np.subtract(ypred, y))
    avg = np.mean(distance)
    return np.sqrt(avg)

In [38]:
def normalize(x):
    norm = np.linalg.norm(x, ord=2, axis=1, keepdims=True)
    return x/norm

drop_cols = ['Anon Student Id', 'Problem Name', 'Problem Unit', 'Problem Section', 'Step Name']
tmp_train = train_x.drop(drop_cols, axis=1)
tmp_test = test_x.drop(drop_cols, axis=1)
train_norm_x = normalize(tmp_train)
test_norm_x = normalize(tmp_test)
for c in drop_cols:
    train_norm_x[c] = train_x[c]
    test_norm_x[c] = test_x[c]

In [39]:
tree = tree.DecisionTreeClassifier()
model = BaggingRegressor(base_estimator=tree, n_estimators=100, max_samples=1.0, bootstrap=True)
model.fit(train_norm_x, train_y)
result = model.predict(test_norm_x)
print("tree bagging error", loss_function(result, test_y))

tree bagging error 0.36569588934452557


In [40]:
model = MLPRegressor(hidden_layer_sizes=(100, 5, 100), activation='tanh', solver='adam')
model.fit(train_norm_x, train_y)
result = model.predict(test_norm_x)
print("MLPRegressor error", loss_function(result, test_y))

MLPRegressor error 0.38955283124050977


In [41]:
model = LogisticRegression(penalty='l2')
model.fit(train_norm_x, train_y)
result = model.predict(test_norm_x)
print("Logistic Regression error", loss_function(result, test_y))

Logistic Regression error 0.43495883620084


In [42]:
model = RandomForestRegressor()
model.fit(train_norm_x, train_y)
result = model.predict(test_norm_x)
print("RandomForest error", loss_function(result, test_y))

RandomForest error 0.3685936261810034


In [43]:
model = neighbors.KNeighborsRegressor()
model.fit(train_norm_x, train_y)
result = model.predict(test_norm_x)
print("KNN error", loss_function(result, test_y))

KNN error 0.3981941217026672


In [44]:
model = DummyRegressor()
model.fit(train_norm_x, train_y)
result = model.predict(test_norm_x)
print("Dummy Regression error", loss_function(result, test_y))

Dummy Regression error 0.3927966234985601


In [45]:
model = XGBClassifier()
model.fit(train_norm_x, train_y)
result = model.predict(test_norm_x)
print("XGBoost error", loss_function(result, test_y))

XGBoost error 0.427999045773683


In [46]:
model = AdaBoostRegressor()
model.fit(train_norm_x, train_y)
result = model.predict(test_norm_x)
print("Adaboost error", loss_function(result, test_y))

Adaboost error 0.38108937717196817


In [47]:
model = GradientBoostingClassifier(n_estimators=200)
model.fit(train_norm_x, train_y)
result = model.predict(test_norm_x)
print("Gradient Decision Tree error", loss_function(result, test_y))

Gradient Decision Tree error 0.4155390146215788


In [49]:
knn = neighbors.KNeighborsRegressor()
model = BaggingRegressor(base_estimator=knn, n_estimators=100, max_samples=1.0, bootstrap=True)
model.fit(train_norm_x, train_y)
result = model.predict(test_norm_x)
print("knn bagging error", loss_function(result, test_y))

knn bagging error 0.3860914329463872
