In [15]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPRegressor
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, BaggingRegressor
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyRegressor
import lightgbm
from sklearn import neighbors
from sklearn.model_selection import GridSearchCV

In [16]:
train = pd.read_csv('data/train_preprocessed.csv', sep='\t')
test = pd.read_csv('data/test_preprocessed.csv', sep='\t')
train = train.drop(['Opportunity(Default)'], axis=1)
test = test.drop(['Opportunity(Default)'], axis=1)

# Separation of dataset
train_x = train.dropna()
train_y = np.array(train_x['Correct First Attempt']).astype(int)
train_x = train_x.drop(['Correct First Attempt'],axis = 1)
test_x = test.dropna()
test_y = np.array(test_x['Correct First Attempt']).astype(int)
test_x = test_x.drop(['Correct First Attempt'],axis = 1)

print(train_x.shape[:])
print(test_x.shape[:])
print(train_x.head())

(232744, 12)
(666, 12)
   Problem View  KC Count  Opportunity Avg  Anon Student Id  Problem Name  \
0             1         0              0.0              170            69   
1             1         0              0.0              170            69   
2             1         1              1.0              170            69   
3             1         3              1.0              170            69   
4             1         1              1.0              170            69   

   Problem Unit  Problem Section  Step Name  Personal Rate  Problem Rate  \
0            15               33      34382       0.748749      0.710197   
1            15               33      14346       0.748749      0.710197   
2            15               33      58300       0.748749      0.710197   
3            15               33      60417       0.748749      0.710197   
4            15               33      30098       0.748749      0.710197   

   Step Rate   KC Rate  
0   0.840631  0.655650  
1   0.8

In [17]:
# RMSE
def loss_function(ypred, y):
    distance = np.square(np.subtract(ypred, y))
    avg = np.mean(distance)
    return np.sqrt(avg)

Decision Tree

In [18]:
model = tree.DecisionTreeClassifier()
model = model.fit(train_x, train_y)
ypred = model.predict(test_x)
ypred = ypred.astype(float)
print('Decision Tree', loss_function(ypred, test_y))

Decision Tree 0.5927489783638191


KNN

In [20]:
# KNN
model = neighbors.KNeighborsRegressor()
model.fit(train_x, train_y)
result = model.predict(test_x)
print("KNN error", loss_function(result, test_y))

KNN error 0.39992491787826867


DummyRegressor

In [21]:
model = DummyRegressor()
model.fit(train_x, train_y)
result = model.predict(test_x)
print("Dummy Regression error", loss_function(result, test_y))

Dummy Regression error 0.3927966234985601


Random Forest

In [22]:
model = RandomForestRegressor()
model.fit(train_x, train_y)
result = model.predict(test_x)
print("RandomForest error", loss_function(result, test_y))

RandomForest error 0.5164275819971735


XGBoost

In [26]:
model = XGBClassifier()
model.fit(train_x, train_y)
result = model.predict(test_x)
print("XGBoost error", loss_function(result, test_y))

XGBoost error 0.5466249160062124


AdaBoost

In [25]:
model = AdaBoostRegressor()
model.fit(train_x, train_y)
result = model.predict(test_x)
print("Adaboost error", loss_function(result, test_y))

Adaboost error 0.5382797121241062


Gradient Boosting

In [29]:
model = GradientBoostingClassifier(n_estimators=200)
model.fit(train_x, train_y)
result = model.predict(test_x)
print("Gradient Decision Tree error", loss_function(result, test_y))

Gradient Decision Tree error 0.5507297915523577


Logistic Regression

In [30]:
model = LogisticRegression(penalty='l2')
model.fit(train_x, train_y)
result = model.predict(test_x)
print("Logistic Regression error", loss_function(result, test_y))

Logistic Regression error 0.43495883620084


MLP

In [32]:
model = MLPRegressor(hidden_layer_sizes=(100, 5, 100), activation='tanh', solver='adam')
model.fit(train_x, train_y)
result = model.predict(test_x)
print("MLPRegressor error", loss_function(result, test_y))

MLPRegressor error 0.387724036278484


### Bagging

Bagging avoids overfitting of data and is used for both regression and classification models, specifically for decision tree algorithms.

Decision Tree Bagging

In [42]:
tree = tree.DecisionTreeClassifier()
model = BaggingRegressor(base_estimator=tree, n_estimators=100, max_samples=1.0, bootstrap=True)
model.fit(train_x, train_y)
result = model.predict(test_x)
print("tree bagging error", loss_function(result, test_y))

tree bagging error 0.5173741160668709


KNN Bagging

In [44]:
knn = neighbors.KNeighborsRegressor()
model = BaggingRegressor(base_estimator=knn, n_estimators=100, max_samples=1.0, bootstrap=True)
model.fit(train_x, train_y)
result = model.predict(test_x)
print("knn bagging error", loss_function(result, test_y))

knn bagging error 0.3894057380532917


Normalization

In [37]:
def normalize(x):
    norm = np.linalg.norm(x, ord=2, axis=1, keepdims=True)
    return x/norm

drop_cols = ['Anon Student Id', 'Problem Name', 'Problem Unit', 'Problem Section', 'Step Name']
tmp_train = train_x.drop(drop_cols, axis=1)
tmp_test = test_x.drop(drop_cols, axis=1)
train_norm_x = normalize(tmp_train)
test_norm_x = normalize(tmp_test)
for c in drop_cols:
    train_norm_x[c] = train_x[c]
    test_norm_x[c] = test_x[c]

In [38]:
model = MLPRegressor(hidden_layer_sizes=(100, 5, 100), activation='tanh', solver='adam')
model.fit(train_norm_x, train_y)
result = model.predict(test_norm_x)
print("MLPRegressor error", loss_function(result, test_y))

MLPRegressor error 0.39197824823498756


In [39]:
model = LogisticRegression(penalty='l2')
model.fit(train_norm_x, train_y)
result = model.predict(test_norm_x)
print("Logistic Regression error", loss_function(result, test_y))

Logistic Regression error 0.43495883620084


In [40]:
model = RandomForestRegressor()
model.fit(train_norm_x, train_y)
result = model.predict(test_norm_x)
print("RandomForest error", loss_function(result, test_y))

RandomForest error 0.3691960261239356
