main.py

In [4]:
import pandas as pd
import time
import numpy as np
from preprocess import Preprocessor
# Scikit Learn Libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn import metrics
# XGBoost Library
from xgboost import XGBClassifier

def random_forest(X_train, y_train, X_test, y_test):

    t0 = time.time()
    clf = RandomForestClassifier(n_estimators=200, max_depth=12, random_state=0, min_samples_split=2, n_jobs=-1)
    clf = clf.fit(X_train, y_train)
    expected = y_test
    predicted = clf.predict(X_test)
    t1 = time.time()

    print("Classification report for classifier %s:\n%s\n"
          % (clf, metrics.classification_report(expected, predicted)))
    print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))
    print ("Testing Score:")
    print (clf.score(X_test,y_test))
    print ('Time')
    print (t1-t0)
    print ('')
    
def logistic_regression(X_train, y_train, X_test, y_test):
        
    t0 = time.time()
    clf = LogisticRegression(tol=1e-3, C=1.5, random_state=0)
    clf = clf.fit(X_train, y_train)
    expected = y_test
    predicted = clf.predict(X_test)

    print("Classification report for classifier %s:\n%s\n"
          % (clf, metrics.classification_report(expected, predicted)))
    print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))
    print ("Testing Score:")
    print (clf.score(X_test,y_test))
    print ('Time')
    print (t1-t0)
    print ('')
    
def XGboost(X_train, y_train, X_test, y_test):

    t0 = time.time()
    clf = XGBClassifier(n_estimators=200, max_depth=6, learning_rate=0.05, min_child_weight=2, 
                        n_jobs=-1, max_delta_step=1, objective='binary:logistic', gamma=3 ,subsample=1)
    clf = clf.fit(X_train, y_train)
    expected = y_test
    predicted = clf.predict(X_test)

    print("Classification report for classifier %s:\n%s\n"
          % (clf, metrics.classification_report(expected, predicted)))
    print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))
    print ("Testing Score:")
    print (clf.score(X_test,y_test))
    print ('Time')
    print (t1-t0)
    print ('')
    
def voting(X_train, y_train, X_test, y_test):
    
    t0 = time.time()
    clf1 = RandomForestClassifier(n_estimators=200, max_depth=12, random_state=0, min_samples_split=3, n_jobs=-1)
    clf2 = LogisticRegression(tol=1e-3, C=1.5, random_state=0)
    clf3 = XGBClassifier(n_estimators=200, max_depth=6, learning_rate=0.05, min_child_weight=2, 
                        n_jobs=-1, max_delta_step=1, objective='binary:logistic', gamma=3 ,subsample=1)
    
    clf = VotingClassifier(estimators=[('rf', clf1), ('lr', clf2), ('xgb', clf3)], voting='hard')
    clf = clf.fit(X_train, y_train)
    expected = y_test
    predicted = clf.predict(X_test)

    print("Classification report for classifier %s:\n%s\n"
          % (clf, metrics.classification_report(expected, predicted)))
    print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))
    print ("Testing Score:")
    print (clf.score(X_test,y_test))
    print ('Time')
    print (t1-t0)
    
def main():
    
    # Build preprocessor
    p_train = Preprocessor('train')
    p_test = Preprocessor('test')
    
    # Get value
    X_train, y_train = p_train.get_values_all()
    X_test, y_test = p_test.get_values_all()
    
    print ('==========Classification==========')
    # Random forest
    random_forest(X_train, y_train, X_test, y_test)
    
    # Logistic regression
    logistic_regression(X_train, y_train, X_test, y_test)
    
    # XGboost
    XGboost(X_train, y_train, X_test, y_test)
    
    print ('==========Voting==========')
    # Voting
    voting(X_train, y_train, X_test, y_test)

if __name__== "__main__":
    main()

('data/date.csv loaded! Number of courses:', 39)
('data/object.csv loaded! Number of moduels:', 26750)
('data/train/log_train.csv loaded! Size of log data:', 4677908)
('data/train/enrollment_train.csv loaded! Number of enrollments:', 72395)
('data/train/truth_train.csv loaded! Number of labels:', 72395)
()
('data/date.csv loaded! Number of courses:', 39)
('data/object.csv loaded! Number of moduels:', 26750)
('data/test/log_test.csv loaded! Size of log data:', 1548480)
('data/test/enrollment_test.csv loaded! Number of enrollments:', 24013)
('data/test/truth_test.csv loaded! Number of labels:', 24013)
()
Event_count extracted! Number of features: 7; time: 1.349625 seconds
Weekly_session_count extracted! Number of features: 6; time: 66.430914 seconds
Problem_video_ratio extracted! Number of features: 2; time: 3.251933 seconds
('Shape of the features dataframe: ', (72395, 16))
Finish training data preprocessing! Time: 71.225868 seconds
The shape of X: (72395, 15); shape of y: (72395,)
()
E



Classification report for classifier LogisticRegression(C=1.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='warn',
          tol=0.001, verbose=0, warm_start=False):
              precision    recall  f1-score   support

           0       0.78      0.54      0.64      4902
           1       0.89      0.96      0.92     19111

   micro avg       0.87      0.87      0.87     24013
   macro avg       0.84      0.75      0.78     24013
weighted avg       0.87      0.87      0.87     24013


Confusion matrix:
[[ 2624  2278]
 [  736 18375]]
Testing Score:
0.8744846541456711
()
Classification report for classifier XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=3, learning_rate=0.05, max_delta_step=1,
       max_depth=6, min_child_weight=2, missing=None, n_estimators=200,
       n_jobs=-1, nthread=None, obje

import library

In [24]:
import pandas as pd
import time
import numpy as np
from preprocess import Preprocessor
# Scikit Learn Libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm
# XGBoost Library
from xgboost import XGBClassifier

import data

In [7]:
# Build preprocessor
p_train = Preprocessor('train')
p_test = Preprocessor('test')

# Get value
X_train, y_train = p_train.get_values_all()
X_test, y_test = p_test.get_values_all()

('data/date.csv loaded! Number of courses:', 39)
('data/object.csv loaded! Number of moduels:', 26750)
('data/train/log_train.csv loaded! Size of log data:', 4677908)
('data/train/enrollment_train.csv loaded! Number of enrollments:', 72395)
('data/train/truth_train.csv loaded! Number of labels:', 72395)
()
('data/date.csv loaded! Number of courses:', 39)
('data/object.csv loaded! Number of moduels:', 26750)
('data/test/log_test.csv loaded! Size of log data:', 1548480)
('data/test/enrollment_test.csv loaded! Number of enrollments:', 24013)
('data/test/truth_test.csv loaded! Number of labels:', 24013)
()
Event_count extracted! Number of features: 7; time: 1.196238 seconds
Weekly_session_count extracted! Number of features: 6; time: 66.593662 seconds
Problem_video_ratio extracted! Number of features: 2; time: 3.366534 seconds
('Shape of the features dataframe: ', (72395, 16))
Finish training data preprocessing! Time: 71.351416 seconds
The shape of X: (72395, 15); shape of y: (72395,)
()
E

random forest

In [10]:
t0 = time.time()
clf = RandomForestClassifier(n_estimators=500,random_state=0)
clf = clf.fit(X_train, y_train)
print 'Cross Validation Score:'
print cross_val_score(clf, X_train, y_train, cv=5)
t1 = time.time()
print 'Cross Validation Time:'
print t1 - t0

Cross Validation Score:
[0.87327348 0.87126183 0.87395538 0.8691208  0.87256527]
Cross Validation Time:
194.506865978


SVM

In [None]:
t0 = time.time()
clf = svm.SVC(gamma='scale') 
clf = clf.fit(X_train, y_train)
print 'Cross Validation Score:'
print cross_val_score(clf, X_train, y_train, cv=5)
t1 = time.time()
print 'Cross Validation Time:'
print t1 - t0

Cross Validation Score:


Neural network

In [12]:
t0 = time.time()
clf = MLPClassifier(hidden_layer_sizes=(50,50))
clf = clf.fit(X_train, y_train)
print 'Cross Validation Score:'
print cross_val_score(clf, X_train, y_train, cv=5)
t1 = time.time()
print 'Cross Validation Time:'
print t1 - t0



Cross Validation Score:
[0.86754144 0.86732509 0.87126183 0.86097106 0.86800663]
Cross Validation Time:
472.91376996


Logistic regression

In [13]:
t0 = time.time()
clf = LogisticRegression()
clf = clf.fit(X_train, y_train)
print 'Cross Validation Score:'
print cross_val_score(clf, X_train, y_train, cv=5)
t1 = time.time()
print 'Cross Validation Time:'
print t1 - t0

Cross Validation Score:
[0.87569061 0.87326473 0.87699427 0.8711237  0.8736704 ]
Cross Validation Time:
1.92082500458


Gradient boosting

In [16]:
t0 = time.time()
clf = GradientBoostingClassifier()
clf = clf.fit(X_train, y_train)
print 'Cross Validation Score:'
print cross_val_score(clf, X_train, y_train, cv=5)
t1 = time.time()
print 'Cross Validation Time:'
print t1 - t0

Cross Validation Score:
[0.87672652 0.87416258 0.87885904 0.87215968 0.87394668]
Cross Validation Time:
29.6431109905


XGboosting

In [17]:
t0 = time.time()
clf = XGBClassifier()
clf = clf.fit(X_train, y_train)
print 'Cross Validation Score:'
print cross_val_score(clf, X_train, y_train, cv=5)
t1 = time.time()
print 'Cross Validation Time:'
print t1 - t0

Cross Validation Score:
[0.87810773 0.8747151  0.87858278 0.87367912 0.87449924]
Cross Validation Time:
27.1639420986


In [19]:
clf1 = RandomForestClassifier(n_estimators=500, random_state=0)
clf2 = LogisticRegression()
clf3 = GradientBoostingClassifier()
clf = VotingClassifier(estimators=[('rf', clf1), ('lr', clf2), ('xgb', clf3)], voting='hard')
clf = clf.fit(X_train, y_train)
print 'Cross Validation Score:'
print cross_val_score(clf, X_train, y_train, cv=5)
t1 = time.time()
print 'Cross Validation Time:'
print t1 - t0

Cross Validation Score:
[0.87845304 0.87409351 0.87885904 0.87278127 0.8750518 ]
Cross Validation Time:
511.561968088
