# Prediction Demo

In [1]:
%reload_ext autoreload
%autoreload 2
import pandas as pd
import time
import numpy as np
from preprocess import Preprocessor
# Scikit Learn Libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn import metrics
# XGBoost Library
from xgboost import XGBClassifier


# Build one Preprocessor per data split, train first and then test.
# Each construction prints that split's "... loaded!" summary lines.
p_train, p_test = (Preprocessor(split) for split in ('train', 'test'))

data/date.csv loaded! Number of courses: 39
data/object.csv loaded! Number of modules: 26750
data/train/log_train.csv loaded! Size of log data: 4677908
data/train/enrollment_train.csv loaded! Number of enrollments: 72395
data/train/truth_train.csv loaded! Number of labels: 72395

data/date.csv loaded! Number of courses: 39
data/object.csv loaded! Number of modules: 26750
data/test/log_test.csv loaded! Size of log data: 1548480
data/test/enrollment_test.csv loaded! Number of enrollments: 24013
data/test/truth_test.csv loaded! Number of labels: 24013



In [2]:
# Extract the full feature matrix and label vector for the training split.
# The call prints per-feature-group extraction timings and the final X / y shapes.
X_train, y_train = p_train.get_values_all()

Event_count extracted! Number of features: 7; time: 1.364033 seconds
Weekly_session_count extracted! Number of features: 6; time: 54.613707 seconds
Problem_video_ratio extracted! Number of features: 2; time: 6.584003 seconds
Shape of the features dataframe: (72395, 16)
Finish training data preprocessing! Time: 62.925739 seconds
The shape of X: (72395, 15); shape of y: (72395,)



In [3]:
# Extract the full feature matrix and label vector for the test split, using the
# same get_values_all() call as for training so both X matrices report 15 columns.
X_test, y_test = p_test.get_values_all()

Event_count extracted! Number of features: 7; time: 0.384003 seconds
Weekly_session_count extracted! Number of features: 6; time: 25.412285 seconds
Problem_video_ratio extracted! Number of features: 2; time: 1.312000 seconds
Shape of the features dataframe: (24013, 16)
Finish testing data preprocessing! Time: 27.264274 seconds
The shape of X: (24013, 15); shape of y: (24013,)



## Random Forest

In [4]:
# Fit a random forest on the full training features and evaluate it on the test split.
clf = RandomForestClassifier(n_estimators=200, max_depth=12, random_state=0, min_samples_split=2, n_jobs=-1)
clf = clf.fit(X_train, y_train)
expected = y_test
predicted = clf.predict(X_test)

print('Classifier: %s\n' % (clf,))
print('Classification report: \n %s \n' % (metrics.classification_report(expected, predicted),))
print('Confusion matrix:\n%s\n' % (metrics.confusion_matrix(expected, predicted),))
# Reuse the predictions computed above: clf.score(X_test, y_test) would run a
# second full prediction pass over X_test only to produce this same accuracy.
print('Testing Score: %f' % metrics.accuracy_score(expected, predicted))

Classifier: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=12, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

Classification report: 
              precision    recall  f1-score   support

          0       0.77      0.56      0.65      4902
          1       0.89      0.96      0.92     19111

avg / total       0.87      0.88      0.87     24013
 

Confusion matrix:
[[ 2730  2172]
 [  800 18311]]

Testing Score: 0.876234


## Logistic Regression

In [5]:
# Fit an L2-regularised logistic regression (library defaults apart from tol and C)
# on the full training features and evaluate it on the test split.
clf = LogisticRegression(tol=1e-3, C=1.5, random_state=0)
clf = clf.fit(X_train, y_train)
expected = y_test
predicted = clf.predict(X_test)

print('Classifier: %s\n' % (clf,))
print('Classification report: \n %s \n' % (metrics.classification_report(expected, predicted),))
print('Confusion matrix:\n%s\n' % (metrics.confusion_matrix(expected, predicted),))
# Reuse the predictions computed above: clf.score(X_test, y_test) would predict
# on X_test a second time only to produce this same accuracy.
print('Testing Score: %f' % metrics.accuracy_score(expected, predicted))

Classifier: LogisticRegression(C=1.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.001,
          verbose=0, warm_start=False)

Classification report: 
              precision    recall  f1-score   support

          0       0.78      0.54      0.64      4902
          1       0.89      0.96      0.92     19111

avg / total       0.87      0.87      0.87     24013
 

Confusion matrix:
[[ 2624  2278]
 [  736 18375]]

Testing Score: 0.874485


## XGBoost

In [6]:
# Fit a gradient-boosted tree classifier on the full training features and
# evaluate it on the test split. random_state is passed explicitly so the seed
# is visible here, as it is for the other models in this notebook.
clf = XGBClassifier(n_estimators=200, max_depth=6, learning_rate=0.05, min_child_weight=2,
                    n_jobs=-1, max_delta_step=1, objective='binary:logistic', gamma=3,
                    subsample=1, random_state=0)
clf = clf.fit(X_train, y_train)
expected = y_test
predicted = clf.predict(X_test)

print('Classifier: %s\n' % (clf,))
print('Classification report: \n %s \n' % (metrics.classification_report(expected, predicted),))
print('Confusion matrix:\n%s\n' % (metrics.confusion_matrix(expected, predicted),))
# Reuse the predictions computed above: clf.score(X_test, y_test) would predict
# on X_test a second time only to produce this same accuracy.
print('Testing Score: %f' % metrics.accuracy_score(expected, predicted))

Classifier: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=3, learning_rate=0.05, max_delta_step=1,
       max_depth=6, min_child_weight=2, missing=None, n_estimators=200,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)

Classification report: 
              precision    recall  f1-score   support

          0       0.77      0.57      0.66      4902
          1       0.90      0.96      0.93     19111

avg / total       0.87      0.88      0.87     24013
 

Confusion matrix:
[[ 2781  2121]
 [  808 18303]]

Testing Score: 0.878024


  if diff:
  if diff:


### Partial fit: Random Forest on a rebalanced training subset

In [7]:
# Build a reduced training set via get_values_partial(0.5). Per the printed output
# it keeps 43712 of the 72395 rows, with 65.62% positive labels and 13 feature columns.
# NOTE(review): the exact meaning of the 0.5 argument is defined in preprocess.py — confirm there.
X_train_balance, y_train_balance = p_train.get_values_partial(0.5)

Event_count extracted! Number of features: 7; time: 1.660004 seconds
Weekly_session_count extracted! Number of features: 6; time: 51.527321 seconds
Problem_video_ratio extracted! Number of features: 2; time: 3.099999 seconds
Shape of the features dataframe: (72395, 16)
The ratio of 1 in labels:  65.62%
The shape of X: (43712, 13); shape of y: (43712,)



In [8]:
# Re-extract the test split through get_values_partial(1) so its 13 columns match
# the partial training set; all 24013 rows are kept.
# NOTE(review): this rebinds X_test / y_test, replacing the 15-column arrays from
# get_values_all(). The earlier model cells cannot be re-run against X_test after
# this point without re-running the get_values_all() cell first.
X_test, y_test = p_test.get_values_partial(1)

Event_count extracted! Number of features: 7; time: 0.376340 seconds
Weekly_session_count extracted! Number of features: 6; time: 16.183331 seconds
Problem_video_ratio extracted! Number of features: 2; time: 1.031994 seconds
Shape of the features dataframe: (24013, 16)
The ratio of 1 in labels:  79.59%
The shape of X: (24013, 13); shape of y: (24013,)



In [10]:
# Fit the same random forest configuration as before, but on the reduced
# (rebalanced) training set, then evaluate it on the matching test features.
clf_partial = RandomForestClassifier(n_estimators=200, max_depth=12, random_state=0, min_samples_split=2, n_jobs=-1)
clf_partial.fit(X_train_balance, y_train_balance)
expected = y_test
predicted = clf_partial.predict(X_test)

print('Classifier: %s\n' % (clf_partial,))
print('Classification report: \n %s \n' % (metrics.classification_report(expected, predicted),))
print('Confusion matrix:\n%s\n' % (metrics.confusion_matrix(expected, predicted),))
# Reuse the predictions computed above: clf_partial.score(X_test, y_test) would
# run a second full prediction pass only to produce this same accuracy.
print('Testing Score: %f' % metrics.accuracy_score(expected, predicted))

Classifier: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=12, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

Classification report: 
              precision    recall  f1-score   support

          0       0.67      0.66      0.67      4902
          1       0.91      0.92      0.91     19111

avg / total       0.86      0.86      0.86     24013
 

Confusion matrix:
[[ 3252  1650]
 [ 1618 17493]]

Testing Score: 0.863907
