In [1]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt

We load and take a look at the data.

In [2]:
df = pd.read_csv('ouladstr.csv')

In [3]:
df.sample(3)

Unnamed: 0,code_module,code_presentation,id_student,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,...,page,questionnaire,quiz,repeatactivity,resource,sharedsubpage,subpage,url,score,final_result
972529,4.0,1.0,328844,0.0,3.0,2.0,2.0,0.0,0,70,...,0,0,0,0,0,0,0,1,,2.0
551770,2.0,3.0,690365,0.0,1.0,1.0,1.0,1.0,0,90,...,0,0,0,0,0,0,0,0,,2.0
530959,2.0,3.0,646173,0.0,10.0,3.0,3.0,1.0,0,30,...,0,0,31,0,0,0,0,0,,2.0


In [4]:
df.loc[(df.id_student==390029) & (df.code_module==3.0)]

Unnamed: 0,code_module,code_presentation,id_student,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,...,page,questionnaire,quiz,repeatactivity,resource,sharedsubpage,subpage,url,score,final_result
697300,3.0,1.0,390029,0.0,5.0,2.0,3.0,0.0,0,120,...,0,0,0,0,0,0,0,0,,2.0
697301,3.0,1.0,390029,0.0,5.0,2.0,3.0,0.0,0,120,...,0,0,0,0,0,0,0,3,,2.0
697302,3.0,1.0,390029,0.0,5.0,2.0,3.0,0.0,0,120,...,0,0,0,0,0,0,0,0,,2.0
697303,3.0,1.0,390029,0.0,5.0,2.0,3.0,0.0,0,120,...,0,0,0,0,1,0,4,0,,2.0
697304,3.0,1.0,390029,0.0,5.0,2.0,3.0,0.0,0,120,...,0,0,0,0,0,0,2,0,,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
697383,3.0,1.0,390029,0.0,5.0,2.0,3.0,0.0,0,120,...,0,0,0,0,0,0,1,0,,2.0
697384,3.0,1.0,390029,0.0,5.0,2.0,3.0,0.0,0,120,...,0,0,0,0,0,0,0,0,,2.0
697385,3.0,1.0,390029,0.0,5.0,2.0,3.0,0.0,0,120,...,0,0,0,0,1,0,2,0,,2.0
697386,3.0,1.0,390029,0.0,5.0,2.0,3.0,0.0,0,120,...,0,0,0,0,0,0,0,0,,2.0


Each ('id_student','code_module','code_presentation') tuple corresponds to a particular student in a particular session ("presentation") of a particular class ("module"). As seen above, there can be multiple lines of data (in the above example, 88) corresponding to a single such tuple. These correspond to different activities / learning analytic events associated to the same student over the course of the module. 

The temporal dimension is a novel feature of the Stream OULAD which we will attempt to further exploit in our project. For now (for baseline modelling), we shall ignore this temporal aspect, and combine all data lines associated to the same student.

Specifically, we ignore the time fields (and the intermediate score field, which in any case is often missing), merge the common identifying fields (which should be equal across all data lines corresponding to the same student in the same module presentation), and sum up the remaining fields (and so obtain a "total activity" snapshot for the student in the module presentation.)

In [5]:
dfs = df.drop(['dataplus','date','score'], axis=1)
student_feats = ['gender','region','highest_education','imd_band','age_band',
                 'num_of_prev_attempts','disability','studied_credits','final_result']
tag = ['id_student','code_module','code_presentation']
X1 = dfs.drop(student_feats,axis=1).groupby(tag).sum()
X2y = dfs[tag+student_feats].groupby(tag).mean()
dfc = pd.concat([X1,X2y],axis=1)

In [6]:
dfc

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,dualpane,externalquiz,folder,forumng,glossary,homepage,htmlactivity,oucollaborate,oucontent,ouelluminate,...,url,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,disability,studied_credits,final_result
id_student,code_module,code_presentation,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
6516,0.0,3.0,0,0,0,451,0,497,0,0,1505,0,...,143,0.0,1.0,3.0,8.0,2.0,0.0,0.0,60.0,2.0
8462,3.0,1.0,0,12,0,36,0,196,0,27,64,0,...,23,0.0,10.0,3.0,3.0,2.0,0.0,0.0,90.0,0.0
8462,3.0,3.0,0,0,0,2,0,7,0,1,0,0,...,0,0.0,10.0,3.0,3.0,2.0,1.0,0.0,60.0,0.0
11391,0.0,1.0,0,0,0,193,0,138,0,0,553,0,...,5,0.0,0.0,3.0,9.0,2.0,0.0,0.0,240.0,2.0
23629,1.0,0.0,0,0,0,87,0,36,0,0,0,0,...,0,1.0,0.0,1.0,2.0,0.0,2.0,0.0,60.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2698257,0.0,1.0,0,0,0,201,0,165,0,0,331,0,...,5,0.0,8.0,1.0,6.0,0.0,0.0,0.0,120.0,2.0
2698535,2.0,2.0,0,0,0,309,0,98,0,0,6,0,...,6,0.0,5.0,1.0,5.0,0.0,0.0,0.0,60.0,0.0
2698535,4.0,1.0,2,0,0,1953,0,391,0,0,773,0,...,90,0.0,5.0,1.0,5.0,0.0,0.0,0.0,60.0,2.0
2698577,1.0,3.0,0,0,0,103,0,204,0,5,253,0,...,4,1.0,5.0,1.0,5.0,1.0,0.0,0.0,60.0,1.0


We see that we have 28174 distinct (student, module, presentation) tuples. We now do baseline model testing on this combined dataset: we run a dummy classifier that predicts the most common outcome for "final_result" (2=Pass, rather than 0=Withdrawn, 1=Fail, or 3=Distinction) and compare this to a logistic regressor and a random forest (both run on all of the features on the combined dataset above).

In [7]:
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier 
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.base import clone

In [8]:
model_names = ['baseline','log','rf']

# base estimators (cloned per fit to avoid state carryover)
base_models = {
    'baseline': DummyClassifier(strategy="most_frequent"),
    'log': Pipeline([('scale', StandardScaler()), ('log',LogisticRegression(max_iter=1000))]),
    'rf': Pipeline([('scale', StandardScaler()), ('log',RandomForestClassifier())]),
}

model_accs = {name: None for name in model_names}

for name in model_names:
    model = clone(base_models[name])
    
    X_tt, X_val, y_tt, y_val = train_test_split(dfc.drop(['final_result'],axis=1), 
                                                dfc['final_result'], 
                                                test_size=0.2)
    model.fit(X_tt, y_tt)
    pred = model.predict(X_val)

    model_accs[name] = accuracy_score(y_val, pred)


In [9]:
model_accs

{'baseline': 0.41490683229813663,
 'log': 0.5730257320319432,
 'rf': 0.613664596273292}

It appears the non-dummy classifiers do significantly better than the baseline (but also do not do that great), so there is hope for improvement?

In [10]:
from sklearn.model_selection import KFold, cross_val_score

model_cv_accs = {name: None for name in model_names}

for name in model_names:
    model = clone(base_models[name])
    
    model_cv_accs[name] = cross_val_score(model,dfc.drop(['final_result'],axis=1),
                                          dfc['final_result'], cv=5)

In [11]:
model_cv_accs

{'baseline': array([0.41987578, 0.41987578, 0.41969831, 0.41969831, 0.41977281]),
 'log': array([0.55598935, 0.55847382, 0.56947649, 0.5703638 , 0.56780263]),
 'rf': array([0.60053239, 0.60372671, 0.62377995, 0.61685892, 0.61341853])}

Cross-validation results show similar / consistent trends.