main.py

In [4]:
import pandas as pd
import time
import numpy as np
from preprocess import Preprocessor
# Scikit Learn Libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn import metrics
# XGBoost Library
from xgboost import XGBClassifier

def random_forest(X_train, y_train, X_test, y_test):
    """Fit a random forest on the training split and print its test report.

    Trains a ``RandomForestClassifier`` (200 trees, ``max_depth=12``,
    ``random_state=0``) on ``(X_train, y_train)``, predicts ``X_test`` and
    prints the classification report, the confusion matrix and the accuracy.

    Args:
        X_train: Training features, passed to ``fit``.
        y_train: Training labels, passed to ``fit``.
        X_test: Test features, passed to ``predict``.
        y_test: Ground-truth labels for ``X_test``.

    Returns:
        None. All results are printed.
    """
    clf = RandomForestClassifier(n_estimators=200, max_depth=12, random_state=0,
                                 min_samples_split=2, n_jobs=-1)
    clf = clf.fit(X_train, y_train)
    expected = y_test
    predicted = clf.predict(X_test)

    print("Classification report for classifier %s:\n%s\n"
          % (clf, metrics.classification_report(expected, predicted)))
    print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))
    print("Testing Score:")
    # Reuse the predictions made above: clf.score(X_test, y_test) reports the
    # same accuracy but would run predict over X_test a second time.
    print(metrics.accuracy_score(expected, predicted))
    print('')
    
def logistic_regression(X_train, y_train, X_test, y_test):
    """Fit a logistic regression on the training split and print its test report.

    Trains ``LogisticRegression(tol=1e-3, C=1.5, random_state=0)`` on
    ``(X_train, y_train)``, predicts ``X_test`` and prints the classification
    report, the confusion matrix and the accuracy.

    Args:
        X_train: Training features, passed to ``fit``.
        y_train: Training labels, passed to ``fit``.
        X_test: Test features, passed to ``predict``.
        y_test: Ground-truth labels for ``X_test``.

    Returns:
        None. All results are printed.
    """
    clf = LogisticRegression(tol=1e-3, C=1.5, random_state=0)
    clf = clf.fit(X_train, y_train)
    expected = y_test
    predicted = clf.predict(X_test)

    print("Classification report for classifier %s:\n%s\n"
          % (clf, metrics.classification_report(expected, predicted)))
    print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))
    print("Testing Score:")
    # Reuse the predictions made above: clf.score(X_test, y_test) reports the
    # same accuracy but would run predict over X_test a second time.
    print(metrics.accuracy_score(expected, predicted))
    print('')
    
def XGboost(X_train, y_train, X_test, y_test):
    """Fit an XGBoost classifier on the training split and print its test report.

    Trains an ``XGBClassifier`` (200 estimators, ``max_depth=6``,
    ``learning_rate=0.05``, ``objective='binary:logistic'``) on
    ``(X_train, y_train)``, predicts ``X_test`` and prints the classification
    report, the confusion matrix and the accuracy.

    Args:
        X_train: Training features, passed to ``fit``.
        y_train: Training labels, passed to ``fit``.
        X_test: Test features, passed to ``predict``.
        y_test: Ground-truth labels for ``X_test``.

    Returns:
        None. All results are printed.
    """
    clf = XGBClassifier(n_estimators=200, max_depth=6, learning_rate=0.05,
                        min_child_weight=2, n_jobs=-1, max_delta_step=1,
                        objective='binary:logistic', gamma=3, subsample=1)
    clf = clf.fit(X_train, y_train)
    expected = y_test
    predicted = clf.predict(X_test)

    print("Classification report for classifier %s:\n%s\n"
          % (clf, metrics.classification_report(expected, predicted)))
    print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))
    print("Testing Score:")
    # Reuse the predictions made above: clf.score(X_test, y_test) reports the
    # same accuracy but would run predict over X_test a second time.
    print(metrics.accuracy_score(expected, predicted))
    print('')
    
def voting(X_train, y_train, X_test, y_test):
    """Fit a hard-voting ensemble and print its test report.

    Combines a random forest, a logistic regression and an XGBoost classifier
    in a ``VotingClassifier(voting='hard')``, fits it on
    ``(X_train, y_train)``, predicts ``X_test`` and prints the classification
    report, the confusion matrix and the accuracy.

    Args:
        X_train: Training features, passed to ``fit``.
        y_train: Training labels, passed to ``fit``.
        X_test: Test features, passed to ``predict``.
        y_test: Ground-truth labels for ``X_test``.

    Returns:
        None. All results are printed.
    """
    # NOTE(review): min_samples_split is 3 here but 2 in random_forest();
    # confirm the difference is intentional.
    clf1 = RandomForestClassifier(n_estimators=200, max_depth=12, random_state=0,
                                  min_samples_split=3, n_jobs=-1)
    clf2 = LogisticRegression(tol=1e-3, C=1.5, random_state=0)
    clf3 = XGBClassifier(n_estimators=200, max_depth=6, learning_rate=0.05,
                         min_child_weight=2, n_jobs=-1, max_delta_step=1,
                         objective='binary:logistic', gamma=3, subsample=1)

    clf = VotingClassifier(estimators=[('rf', clf1), ('lr', clf2), ('xgb', clf3)],
                           voting='hard')
    clf = clf.fit(X_train, y_train)
    expected = y_test
    predicted = clf.predict(X_test)

    print("Classification report for classifier %s:\n%s\n"
          % (clf, metrics.classification_report(expected, predicted)))
    print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))
    print("Testing Score:")
    # Reuse the predictions made above: clf.score(X_test, y_test) reports the
    # same accuracy but would make all three sub-models predict X_test again.
    print(metrics.accuracy_score(expected, predicted))
    
def main():
    """Load both data splits, then evaluate every model on them.

    Builds ``Preprocessor('train')`` and ``Preprocessor('test')``, calls
    ``get_values_all()`` on each to obtain features and labels, runs the three
    individual classifiers in turn and finishes with the voting ensemble.
    """
    # Both preprocessors are constructed before any values are requested.
    train_prep = Preprocessor('train')
    test_prep = Preprocessor('test')

    X_train, y_train = train_prep.get_values_all()
    X_test, y_test = test_prep.get_values_all()

    print ('==========Classification==========')
    # Individual models, in reporting order.
    for run_model in (random_forest, logistic_regression, XGboost):
        run_model(X_train, y_train, X_test, y_test)

    print ('==========Voting==========')
    voting(X_train, y_train, X_test, y_test)

# Run the model comparison only when executed as a script, not on import.
if __name__ == "__main__":
    main()

('data/date.csv loaded! Number of courses:', 39)
('data/object.csv loaded! Number of moduels:', 26750)
('data/train/log_train.csv loaded! Size of log data:', 4677908)
('data/train/enrollment_train.csv loaded! Number of enrollments:', 72395)
('data/train/truth_train.csv loaded! Number of labels:', 72395)
()
('data/date.csv loaded! Number of courses:', 39)
('data/object.csv loaded! Number of moduels:', 26750)
('data/test/log_test.csv loaded! Size of log data:', 1548480)
('data/test/enrollment_test.csv loaded! Number of enrollments:', 24013)
('data/test/truth_test.csv loaded! Number of labels:', 24013)
()
Event_count extracted! Number of features: 7; time: 1.349625 seconds
Weekly_session_count extracted! Number of features: 6; time: 66.430914 seconds
Problem_video_ratio extracted! Number of features: 2; time: 3.251933 seconds
('Shape of the features dataframe: ', (72395, 16))
Finish training data preprocessing! Time: 71.225868 seconds
The shape of X: (72395, 15); shape of y: (72395,)
()
E



Classification report for classifier LogisticRegression(C=1.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='warn',
          tol=0.001, verbose=0, warm_start=False):
              precision    recall  f1-score   support

           0       0.78      0.54      0.64      4902
           1       0.89      0.96      0.92     19111

   micro avg       0.87      0.87      0.87     24013
   macro avg       0.84      0.75      0.78     24013
weighted avg       0.87      0.87      0.87     24013


Confusion matrix:
[[ 2624  2278]
 [  736 18375]]
Testing Score:
0.8744846541456711
()
Classification report for classifier XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=3, learning_rate=0.05, max_delta_step=1,
       max_depth=6, min_child_weight=2, missing=None, n_estimators=200,
       n_jobs=-1, nthread=None, obje

Import libraries

In [1]:
import pandas as pd
import time
import numpy as np
from preprocess import Preprocessor
# Scikit Learn Libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.ensemble import VotingClassifier
# XGBoost Library
from xgboost import XGBClassifier

Load data

In [2]:
# Build one Preprocessor per data split; the constructor argument names the
# split ('train' or 'test').
p_train = Preprocessor('train')
p_test = Preprocessor('test')

# Fetch the (features, labels) pair for each split. These four names are
# notebook globals that the model cells below read directly.
# NOTE(review): the exact types/shapes come from Preprocessor.get_values_all(),
# which is defined outside this file -- confirm there.
X_train, y_train = p_train.get_values_all()
X_test, y_test = p_test.get_values_all()

('data/date.csv loaded! Number of courses:', 39)
('data/object.csv loaded! Number of moduels:', 26750)
('data/train/log_train.csv loaded! Size of log data:', 4677908)
('data/train/enrollment_train.csv loaded! Number of enrollments:', 72395)
('data/train/truth_train.csv loaded! Number of labels:', 72395)
()
('data/date.csv loaded! Number of courses:', 39)
('data/object.csv loaded! Number of moduels:', 26750)
('data/test/log_test.csv loaded! Size of log data:', 1548480)
('data/test/enrollment_test.csv loaded! Number of enrollments:', 24013)
('data/test/truth_test.csv loaded! Number of labels:', 24013)
()
Event_count extracted! Number of features: 7; time: 1.221201 seconds
Weekly_session_count extracted! Number of features: 6; time: 62.330931 seconds
Problem_video_ratio extracted! Number of features: 2; time: 3.067496 seconds
('Shape of the features dataframe: ', (72395, 16))
Finish training data preprocessing! Time: 66.794579 seconds
The shape of X: (72395, 15); shape of y: (72395,)
()
E

random forest

In [8]:
# Random forest baseline: fit, report on the test split, then 5-fold CV on the
# training split.
# cross_val_score is not imported by the visible import cell, so bring it into
# scope here to keep this cell runnable on its own.
from sklearn.model_selection import cross_val_score

t0 = time.time()
clf = RandomForestClassifier(n_estimators=10, random_state=0)
clf = clf.fit(X_train, y_train)
expected = y_test
predicted = clf.predict(X_test)

# print() call form with a single argument behaves the same on Python 2 and 3;
# the original print statements are a SyntaxError on Python 3.
print("Classification report for classifier %s:\n%s\n"
      % (clf, metrics.classification_report(expected, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))
print('Score: ')
# Same accuracy as clf.score(X_test, y_test) without predicting X_test again.
print(metrics.accuracy_score(expected, predicted))
# t1 is taken before cross validation, so the elapsed time printed last covers
# fit + predict + reporting only.
t1 = time.time()
print(cross_val_score(clf, X_train, y_train, cv=5))
print(t1 - t0)

Classification report for classifier RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False):
              precision    recall  f1-score   support

           0       0.67      0.60      0.63      4902
           1       0.90      0.93      0.91     19111

   micro avg       0.86      0.86      0.86     24013
   macro avg       0.79      0.76      0.77     24013
weighted avg       0.85      0.86      0.86     24013


Confusion matrix:
[[ 2926  1976]
 [ 1411 17700]]
Score: 
0.8589514013242827
[0.85600829 0.86007321 0.86214518 0.85731059 0.86089239]
1.02198886871


SVM

In [13]:
# Support vector classifier: fit, report on the test split, print elapsed time.
# svm is not imported by the visible import cell, so bring it into scope here
# to keep this cell runnable on its own.
from sklearn import svm

t0 = time.time()
clf = svm.SVC(gamma='scale')
clf = clf.fit(X_train, y_train)
expected = y_test
predicted = clf.predict(X_test)

# print() call form with a single argument behaves the same on Python 2 and 3;
# the original print statements are a SyntaxError on Python 3.
print("Classification report for classifier %s:\n%s\n"
      % (clf, metrics.classification_report(expected, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))
print('Score: ')
# Same accuracy as clf.score(X_test, y_test) without predicting X_test again.
print(metrics.accuracy_score(expected, predicted))
t1 = time.time()
# The value printed here is wall-clock seconds for the cell. It used to be
# printed under the label 'Cross validation score: ', which made the elapsed
# time read like a CV result.
print('Elapsed time (s): ')
print(t1 - t0)
# Cross validation was left disabled for this model in the original cell.
# To enable it (cross_val_score must be imported first):
#   print('Cross validation score: ')
#   print(cross_val_score(clf, X_train, y_train, cv=3))

Classification report for classifier SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False):
              precision    recall  f1-score   support

           0       0.75      0.53      0.62      4902
           1       0.89      0.95      0.92     19111

   micro avg       0.87      0.87      0.87     24013
   macro avg       0.82      0.74      0.77     24013
weighted avg       0.86      0.87      0.86     24013


Confusion matrix:
[[ 2619  2283]
 [  876 18235]]
Score: 
0.8684462582767668
Cross validation score: 
552.128526926


Neural network

In [10]:
# Multi-layer perceptron with two hidden layers of 50 units: fit, report on
# the test split, print elapsed time.
t0 = time.time()
clf = MLPClassifier(hidden_layer_sizes=(50, 50))
clf = clf.fit(X_train, y_train)
expected = y_test
predicted = clf.predict(X_test)

# print() call form with a single argument behaves the same on Python 2 and 3;
# the original print statements are a SyntaxError on Python 3.
print("Classification report for classifier %s:\n%s\n"
      % (clf, metrics.classification_report(expected, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))
print('Score: ')
# Same accuracy as clf.score(X_test, y_test) without predicting X_test again.
print(metrics.accuracy_score(expected, predicted))
t1 = time.time()
# The value printed here is wall-clock seconds for the cell. It used to be
# printed under the label 'Cross validation score: ', which made the elapsed
# time read like a CV result.
print('Elapsed time (s): ')
print(t1 - t0)
# Cross validation was left disabled for this model in the original cell.
# To enable it (cross_val_score must be imported first):
#   print('Cross validation score: ')
#   print(cross_val_score(clf, X_train, y_train, cv=3))

Classification report for classifier MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(50, 50), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False):
              precision    recall  f1-score   support

           0       0.72      0.57      0.64      4902
           1       0.90      0.94      0.92     19111

   micro avg       0.87      0.87      0.87     24013
   macro avg       0.81      0.76      0.78     24013
weighted avg       0.86      0.87      0.86     24013


Confusion matrix:
[[ 2812  2090]
 [ 1079 18032]]
Score: 
0.8680298171823596
Cross validation score: 
76.4239008427


Logistic regression

In [11]:
# Logistic regression with default settings: fit, report on the test split,
# print elapsed time.
t0 = time.time()
clf = LogisticRegression()
clf = clf.fit(X_train, y_train)
expected = y_test
predicted = clf.predict(X_test)

# print() call form with a single argument behaves the same on Python 2 and 3;
# the original print statements are a SyntaxError on Python 3.
print("Classification report for classifier %s:\n%s\n"
      % (clf, metrics.classification_report(expected, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))
print('Score: ')
# Same accuracy as clf.score(X_test, y_test) without predicting X_test again.
print(metrics.accuracy_score(expected, predicted))
t1 = time.time()
# The value printed here is wall-clock seconds for the cell. It used to be
# printed under the label 'Cross validation score: ', which made the elapsed
# time read like a CV result.
print('Elapsed time (s): ')
print(t1 - t0)
# Cross validation was left disabled for this model in the original cell.
# To enable it (cross_val_score must be imported first):
#   print('Cross validation score: ')
#   print(cross_val_score(clf, X_train, y_train, cv=3))



Classification report for classifier LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False):
              precision    recall  f1-score   support

           0       0.80      0.50      0.62      4902
           1       0.88      0.97      0.92     19111

   micro avg       0.87      0.87      0.87     24013
   macro avg       0.84      0.74      0.77     24013
weighted avg       0.87      0.87      0.86     24013


Confusion matrix:
[[ 2465  2437]
 [  604 18507]]
Score: 
0.8733602631907716
Cross validation score: 
0.772509098053


Gradient boosting

In [12]:
# Gradient boosting with default settings: fit, report on the test split,
# print elapsed time, then 5-fold CV on the training split.
# Neither name below is imported by the visible import cell, so bring them
# into scope here to keep this cell runnable on its own.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

t0 = time.time()
clf = GradientBoostingClassifier()
clf = clf.fit(X_train, y_train)
expected = y_test
predicted = clf.predict(X_test)

# print() call form with a single argument behaves the same on Python 2 and 3;
# the original print statements are a SyntaxError on Python 3.
print("Classification report for classifier %s:\n%s\n"
      % (clf, metrics.classification_report(expected, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))
print('Score: ')
# Same accuracy as clf.score(X_test, y_test) without predicting X_test again.
print(metrics.accuracy_score(expected, predicted))
# t1 is taken before cross validation, so the elapsed time covers
# fit + predict + reporting only.
t1 = time.time()
# Print the elapsed time under its own label; it used to appear directly under
# 'Cross validation score: ', ahead of the actual CV scores.
print('Elapsed time (s): ')
print(t1 - t0)
print('Cross validation score: ')
print(cross_val_score(clf, X_train, y_train, cv=5))

Classification report for classifier GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False):
              precision    recall  f1-score   support

           0       0.76      0.57      0.65      4902
           1       0.90      0.95      0.92     19111

   micro avg       0.88      0.88      0.88     24013
   macro avg       0.83      0.76      0.79     24013
weighted avg       0.87      0.88      0.87     24013


Confusion matrix:
[[ 2811  2091]
 [  874 18237]]
Score: 
0.8765252155082663
Cross validat

AdaBoost

In [16]:
# AdaBoost with default settings: fit, report on the test split, then 5-fold
# CV on the training split.
# cross_val_score is not imported by the visible import cell, so bring it into
# scope here to keep this cell runnable on its own.
from sklearn.model_selection import cross_val_score

clf = AdaBoostClassifier()
clf = clf.fit(X_train, y_train)
expected = y_test
predicted = clf.predict(X_test)

# print() call form with a single argument behaves the same on Python 2 and 3;
# the original print statements are a SyntaxError on Python 3.
print("Classification report for classifier %s:\n%s\n"
      % (clf, metrics.classification_report(expected, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))
print('Score: ')
# Same accuracy as clf.score(X_test, y_test) without predicting X_test again.
print(metrics.accuracy_score(expected, predicted))
print('Cross validation score: ')
print(cross_val_score(clf, X_train, y_train, cv=5))

Classification report for classifier AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None):
              precision    recall  f1-score   support

           0       0.77      0.55      0.64      4902
           1       0.89      0.96      0.92     19111

   micro avg       0.88      0.88      0.88     24013
   macro avg       0.83      0.75      0.78     24013
weighted avg       0.87      0.88      0.87     24013


Confusion matrix:
[[ 2686  2216]
 [  784 18327]]
Score: 
0.8750676716778412
Cross validation score: 
[0.87216851 0.87457697 0.8751295  0.87540576 0.8743611 ]
