In [45]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

from sklearn.metrics import classification_report, roc_auc_score, accuracy_score

In [46]:
exam_info_df = pd.read_csv('data/raw/365_exam_info.csv')
quiz_info_df = pd.read_csv('data/raw/365_quiz_info.csv')
student_engagement_df = pd.read_csv('data/raw/365_student_engagement.csv')
student_hub_questions_df = pd.read_csv('data/raw/365_student_hub_questions.csv')
student_info_df = pd.read_csv('data/raw/365_student_info.csv')
student_learning_df = pd.read_csv('data/raw/365_student_learning.csv')
student_purchases_df = pd.read_csv('data/raw/365_student_purchases.csv')

In [47]:
student_engagement_df.date_engaged = pd.to_datetime(student_engagement_df.date_engaged)
student_hub_questions_df.date_question_asked = pd.to_datetime(student_hub_questions_df.date_question_asked)
student_info_df.date_registered = pd.to_datetime(student_info_df.date_registered)
student_learning_df.date_watched = pd.to_datetime(student_learning_df.date_watched)
student_purchases_df.date_purchased = pd.to_datetime(student_purchases_df.date_purchased)

In [48]:
def has_student_engaged_with_quizzes(student_id: str):
    return (student_engagement_df[student_engagement_df.student_id == student_id].engagement_quizzes == 1).any()

def has_student_engaged_with_exams(student_id: str):
    return (student_engagement_df[student_engagement_df.student_id == student_id].engagement_exams == 1).any()

In [6]:
df = student_info_df.copy()

sum_minutes_watched = pd.merge(student_info_df.student_id, student_learning_df[['student_id', 'minutes_watched']], on='student_id', how='left')
sum_minutes_watched = sum_minutes_watched.fillna(0)
sum_minutes_watched = sum_minutes_watched.groupby('student_id').sum().reset_index()

days_engaged = student_engagement_df[['student_id', 'date_engaged']].groupby('student_id').size().reset_index()
days_engaged = pd.merge(student_info_df.student_id, days_engaged, on='student_id', how='left').fillna(0).astype(np.int32)
days_engaged = days_engaged.rename(columns={0: 'days_engaged'})

df = pd.merge(df.student_id, sum_minutes_watched, on='student_id')
df = pd.merge(df, days_engaged, on='student_id', how='left')

df['engaged_with_quizzes'] = df.student_id.map(has_student_engaged_with_quizzes)
df['engaged_with_exams'] = df.student_id.map(has_student_engaged_with_exams)
df['engaged_with_qa'] = df.student_id.isin(student_hub_questions_df.student_id)
df['subscribed'] = df.student_id.isin(student_purchases_df.student_id)

df = df[~((df.subscribed == True) & (df.days_engaged == 0))]
df = df.drop('student_id', axis=1)

In [7]:
# df.to_csv('data/processed/ml_dataset.csv', index=False)

In [49]:
df = pd.read_csv('data/processed/ml_dataset.csv')
df

Unnamed: 0,minutes_watched,days_engaged,engaged_with_quizzes,engaged_with_exams,engaged_with_qa,subscribed
0,0.3,1,False,False,False,False
1,0.0,0,False,False,False,False
2,531.2,29,True,True,False,True
3,0.0,0,False,False,False,False
4,0.0,0,False,False,False,False
...,...,...,...,...,...,...
35118,7.8,1,True,False,False,False
35119,10.4,1,True,False,False,False
35120,0.1,1,False,False,False,False
35121,0.0,0,False,False,False,False


In [64]:
X = df.drop('subscribed', axis=1)
y = df.subscribed

ros = RandomOverSampler()
rus = RandomUnderSampler()

scaler = StandardScaler()

X_oversampled, y_oversampled = ros.fit_resample(X, y)
X_resampled, y_resampled = rus.fit_resample(X_oversampled, y_oversampled)

X_scaled = scaler.fit_transform(X_resampled)

In [65]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_resampled, test_size=0.2)

In [67]:
X_train.shape

(52952, 5)

In [11]:
grid_search = {
    'logistic_regression': {'classifier': LogisticRegression(), 'params': {'C': np.logspace(-4, 4, 20)}},
    'rfc': {'classifier': RandomForestClassifier(), 'params': {'n_estimators': [1, 5, 10], 'max_depth': [1, 5, 10]}},
    'xgb_classifier': {'classifier': XGBClassifier(), 'params': {'n_estimators': [1, 5, 10], 'max_depth': [1, 5, 10]}},
    'knn': {'classifier': KNeighborsClassifier(), 'params': {'n_neighbors': list(range(1, 31))}}
}

In [14]:
scores = []

for classifier_name, classifier_params in grid_search.items():
    gs = GridSearchCV(classifier_params['classifier'], classifier_params['params'], cv=5, scoring='f1_micro')
    gs.fit(X_train, y_train)
    scores.append(
        {'classifier': classifier_name,
         'best_score': gs.best_score_,
         'best_params': gs.best_params_,
         'best_estimator': gs.best_estimator_}
    )

In [17]:
scores_df = pd.DataFrame(scores).sort_values(by='best_score', ascending=False)
scores_df

Unnamed: 0,classifier,best_score,best_params,best_estimator
3,knn,0.962287,{'n_neighbors': 2},KNeighborsClassifier(n_neighbors=2)
2,xgb_classifier,0.94546,"{'max_depth': 10, 'n_estimators': 10}","XGBClassifier(base_score=None, booster=None, c..."
1,rfc,0.941891,"{'max_depth': 10, 'n_estimators': 10}","(DecisionTreeClassifier(max_depth=10, max_feat..."
0,logistic_regression,0.90684,{'C': 1.623776739188721},LogisticRegression(C=1.623776739188721)


In [39]:
classifier = scores_df.iloc[1].best_estimator
classifier.fit(X_train, y_train)

In [40]:
report = classification_report(y_test, classifier.predict(X_test), output_dict=True)
pd.DataFrame(report).transpose()

Unnamed: 0,precision,recall,f1-score,support
False,0.954927,0.94468,0.949776,6616.0
True,0.945316,0.955452,0.950357,6622.0
accuracy,0.950068,0.950068,0.950068,0.950068
macro avg,0.950122,0.950066,0.950066,13238.0
weighted avg,0.95012,0.950068,0.950066,13238.0


In [41]:
roc_auc_score(y_test, classifier.predict(X_test))

0.9500655449553118