In [133]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display

# Load Data

In [134]:
student_vle = pd.read_csv('content/studentVle.csv')
vle_info = pd.read_csv('content/vle.csv')
clean_reg = pd.read_csv('content/cleaned_registrations.csv')
display(student_vle.head())
display(vle_info.head())

Unnamed: 0,code_module,code_presentation,id_student,id_site,date,sum_click
0,AAA,2013J,28400,546652,-10,4
1,AAA,2013J,28400,546652,-10,1
2,AAA,2013J,28400,546652,-10,1
3,AAA,2013J,28400,546614,-10,11
4,AAA,2013J,28400,546714,-10,1


Unnamed: 0,id_site,code_module,code_presentation,activity_type,week_from,week_to
0,546943,AAA,2013J,resource,,
1,546712,AAA,2013J,oucontent,,
2,546998,AAA,2013J,resource,,
3,546888,AAA,2013J,url,,
4,547035,AAA,2013J,resource,,


In [135]:
clean_reg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25371 entries, 0 to 25370
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   code_module                 25371 non-null  object 
 1   code_presentation           25371 non-null  object 
 2   id_student                  25371 non-null  int64  
 3   gender                      25371 non-null  object 
 4   highest_education           25371 non-null  object 
 5   age_band                    25371 non-null  object 
 6   num_of_prev_attempts        25371 non-null  int64  
 7   disability                  25371 non-null  object 
 8   final_result                25371 non-null  object 
 9   date_registration           25371 non-null  float64
 10  module_presentation_length  25371 non-null  int64  
dtypes: float64(1), int64(3), object(7)
memory usage: 2.1+ MB


In [136]:
student_vle.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10655280 entries, 0 to 10655279
Data columns (total 6 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   code_module        object
 1   code_presentation  object
 2   id_student         int64 
 3   id_site            int64 
 4   date               int64 
 5   sum_click          int64 
dtypes: int64(4), object(2)
memory usage: 487.8+ MB


We are using the first 90 days of student activity to predict course outcomes, so we only want to consider interactions completed in the first 90 days of the course.  We will drop interactions occuring after that point.

In [137]:
student_vle = student_vle[student_vle.date < 91]
student_vle.shape

(5486878, 6)

# Merge Tables

In [138]:
vle = pd.merge(student_vle,vle_info, how = 'left', on =['id_site','code_module','code_presentation'], 
               validate = 'm:1').drop(columns = ['week_from','week_to'])
display(vle.info())
print('nulls')
print(vle.isna().sum())
vle.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5486878 entries, 0 to 5486877
Data columns (total 7 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   code_module        object
 1   code_presentation  object
 2   id_student         int64 
 3   id_site            int64 
 4   date               int64 
 5   sum_click          int64 
 6   activity_type      object
dtypes: int64(4), object(3)
memory usage: 334.9+ MB


None

nulls
code_module          0
code_presentation    0
id_student           0
id_site              0
date                 0
sum_click            0
activity_type        0
dtype: int64


Unnamed: 0,code_module,code_presentation,id_student,id_site,date,sum_click,activity_type
0,AAA,2013J,28400,546652,-10,4,forumng
1,AAA,2013J,28400,546652,-10,1,forumng
2,AAA,2013J,28400,546652,-10,1,forumng
3,AAA,2013J,28400,546614,-10,11,homepage
4,AAA,2013J,28400,546714,-10,1,oucontent


# Count activities per day

In [139]:
daily_counts = vle.groupby(by = ['code_module','code_presentation','id_student','date']).count()

In [140]:
daily_counts.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,id_site,sum_click,activity_type
code_module,code_presentation,id_student,date,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAA,2013J,11391,-5,11,11,11
AAA,2013J,11391,0,10,10,10
AAA,2013J,11391,1,16,16,16
AAA,2013J,11391,2,2,2,2
AAA,2013J,11391,6,2,2,2


In [141]:
daily_counts.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 889237 entries, ('AAA', '2013J', 11391, -5) to ('GGG', '2014J', 2684003, 72)
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype
---  ------         --------------   -----
 0   id_site        889237 non-null  int64
 1   sum_click      889237 non-null  int64
 2   activity_type  889237 non-null  int64
dtypes: int64(3)
memory usage: 24.8+ MB


## Determine average number of activities per day

In [142]:
daily_average = daily_counts.groupby(level = ['code_module','code_presentation','id_student']).mean()
daily_average = daily_average.reset_index().drop(columns = ['sum_click','activity_type'])
daily_average = daily_average.rename({'id_site':'avg_daily_activities'}, axis = 1)
daily_average.to_csv('student_vle_daily_average.csv', index = False)

In [143]:
daily_average.head(10)

Unnamed: 0,code_module,code_presentation,id_student,avg_daily_activities
0,AAA,2013J,11391,4.695652
1,AAA,2013J,28400,6.388889
2,AAA,2013J,30268,6.333333
3,AAA,2013J,31604,5.803922
4,AAA,2013J,32885,5.4
5,AAA,2013J,38053,5.206897
6,AAA,2013J,45462,4.470588
7,AAA,2013J,45642,6.333333
8,AAA,2013J,52130,4.836735
9,AAA,2013J,53025,6.690909


In [144]:
daily_average.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29158 entries, 0 to 29157
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   code_module           29158 non-null  object 
 1   code_presentation     29158 non-null  object 
 2   id_student            29158 non-null  int64  
 3   avg_daily_activities  29158 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 911.3+ KB


In [145]:
daily_average.describe()

Unnamed: 0,id_student,avg_daily_activities
count,29158.0,29158.0
mean,708430.3,5.708507
std,551769.9,2.19644
min,6516.0,1.0
25%,507813.2,4.270411
50%,589861.0,5.444444
75%,643757.0,6.903226
max,2698588.0,41.0


## Determine total number of days studied

In [146]:
days_studied = daily_counts.groupby(as_index = True, level = ['code_module','code_presentation','id_student']).count()
days_studied = days_studied.reset_index().drop(columns = ['sum_click','activity_type'])
days_studied = days_studied.rename({'id_site':'total_days_studied'}, axis = 1)
days_studied.to_csv('student_vle_days_studied.csv', index = False)

In [147]:
days_studied.head(10)

Unnamed: 0,code_module,code_presentation,id_student,total_days_studied
0,AAA,2013J,11391,23
1,AAA,2013J,28400,36
2,AAA,2013J,30268,12
3,AAA,2013J,31604,51
4,AAA,2013J,32885,35
5,AAA,2013J,38053,58
6,AAA,2013J,45462,34
7,AAA,2013J,45642,39
8,AAA,2013J,52130,49
9,AAA,2013J,53025,55


In [148]:
days_studied.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29158 entries, 0 to 29157
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   code_module         29158 non-null  object
 1   code_presentation   29158 non-null  object
 2   id_student          29158 non-null  int64 
 3   total_days_studied  29158 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 911.3+ KB


In [149]:
days_studied.describe()


Unnamed: 0,id_student,total_days_studied
count,29158.0,29158.0
mean,708430.3,30.497188
std,551769.9,23.106595
min,6516.0,1.0
25%,507813.2,12.0
50%,589861.0,26.0
75%,643757.0,44.0
max,2698588.0,115.0


# Clicks Per Day

In [150]:
vle.head()

Unnamed: 0,code_module,code_presentation,id_student,id_site,date,sum_click,activity_type
0,AAA,2013J,28400,546652,-10,4,forumng
1,AAA,2013J,28400,546652,-10,1,forumng
2,AAA,2013J,28400,546652,-10,1,forumng
3,AAA,2013J,28400,546614,-10,11,homepage
4,AAA,2013J,28400,546714,-10,1,oucontent


In [151]:
clicks_per_day = vle.groupby(by = ['code_module','code_presentation','id_student','date']).sum()
clicks_per_day = clicks_per_day.reset_index()
clicks_per_day.head()

Unnamed: 0,code_module,code_presentation,id_student,date,id_site,sum_click
0,AAA,2013J,11391,-5,6013476,98
1,AAA,2013J,11391,0,5467293,49
2,AAA,2013J,11391,1,8748125,127
3,AAA,2013J,11391,2,1093521,4
4,AAA,2013J,11391,6,1093276,3


In [152]:
avg_clicks_per_day = clicks_per_day.groupby(by = ['code_module','code_presentation','id_student']).mean()
avg_clicks_per_day = avg_clicks_per_day.reset_index().drop(columns = ['date','id_site'])
avg_clicks_per_day.rename({'sum_click':'mean_clicks_per_day'}, axis = 1, inplace = True)
avg_clicks_per_day.head()

Unnamed: 0,code_module,code_presentation,id_student,mean_clicks_per_day
0,AAA,2013J,11391,25.521739
1,AAA,2013J,28400,22.805556
2,AAA,2013J,30268,23.416667
3,AAA,2013J,31604,19.45098
4,AAA,2013J,32885,19.542857


## Combine module, presentation, student into one index for easier grouping

In [153]:
display(clean_reg.info())
clean_reg.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25371 entries, 0 to 25370
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   code_module                 25371 non-null  object 
 1   code_presentation           25371 non-null  object 
 2   id_student                  25371 non-null  int64  
 3   gender                      25371 non-null  object 
 4   highest_education           25371 non-null  object 
 5   age_band                    25371 non-null  object 
 6   num_of_prev_attempts        25371 non-null  int64  
 7   disability                  25371 non-null  object 
 8   final_result                25371 non-null  object 
 9   date_registration           25371 non-null  float64
 10  module_presentation_length  25371 non-null  int64  
dtypes: float64(1), int64(3), object(7)
memory usage: 2.1+ MB


None

Unnamed: 0,code_module,code_presentation,id_student,gender,highest_education,age_band,num_of_prev_attempts,disability,final_result,date_registration,module_presentation_length
0,AAA,2013J,11391,M,HE Qualification,55<=,0,N,Pass,-159.0,268
1,AAA,2013J,28400,F,HE Qualification,35-55,0,N,Pass,-53.0,268
2,AAA,2013J,31604,F,A Level or Equivalent,35-55,0,N,Pass,-52.0,268
3,AAA,2013J,32885,F,Lower Than A Level,0-35,0,N,Pass,-176.0,268
4,AAA,2013J,38053,M,A Level or Equivalent,35-55,0,N,Pass,-110.0,268


In [154]:
clean_reg = pd.merge(clean_reg, daily_average, 
                     how = 'inner', 
                     on = ['code_module','code_presentation', 'id_student'])


In [155]:
clean_reg = pd.merge(clean_reg, days_studied, 
                     how = 'inner', 
                     on = ['code_module','code_presentation', 'id_student'])

In [156]:
clean_reg = pd.merge(clean_reg, avg_clicks_per_day, 
                     how = 'inner', 
                     on = ['code_module','code_presentation', 'id_student'])

In [157]:
display(clean_reg.info())
display(clean_reg.describe())
clean_reg

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24882 entries, 0 to 24881
Data columns (total 14 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   code_module                 24882 non-null  object 
 1   code_presentation           24882 non-null  object 
 2   id_student                  24882 non-null  int64  
 3   gender                      24882 non-null  object 
 4   highest_education           24882 non-null  object 
 5   age_band                    24882 non-null  object 
 6   num_of_prev_attempts        24882 non-null  int64  
 7   disability                  24882 non-null  object 
 8   final_result                24882 non-null  object 
 9   date_registration           24882 non-null  float64
 10  module_presentation_length  24882 non-null  int64  
 11  avg_daily_activities        24882 non-null  float64
 12  total_days_studied          24882 non-null  int64  
 13  mean_clicks_per_day         248

None

Unnamed: 0,id_student,num_of_prev_attempts,date_registration,module_presentation_length,avg_daily_activities,total_days_studied,mean_clicks_per_day
count,24882.0,24882.0,24882.0,24882.0,24882.0,24882.0,24882.0
mean,707264.4,0.155012,-64.426332,256.174343,5.734353,33.440278,18.959878
std,552111.8,0.465312,44.736574,13.114598,2.050426,23.072836,11.279956
min,6516.0,0.0,-209.0,234.0,1.0,1.0,1.0
25%,506544.8,0.0,-93.0,241.0,4.341463,15.0,11.428571
50%,589451.5,0.0,-52.0,262.0,5.478261,29.0,16.42265
75%,641787.8,0.0,-29.0,268.0,6.894457,48.0,23.8
max,2698588.0,6.0,82.0,269.0,37.0,115.0,224.666667


Unnamed: 0,code_module,code_presentation,id_student,gender,highest_education,age_band,num_of_prev_attempts,disability,final_result,date_registration,module_presentation_length,avg_daily_activities,total_days_studied,mean_clicks_per_day
0,AAA,2013J,11391,M,HE Qualification,55<=,0,N,Pass,-159.0,268,4.695652,23,25.521739
1,AAA,2013J,28400,F,HE Qualification,35-55,0,N,Pass,-53.0,268,6.388889,36,22.805556
2,AAA,2013J,31604,F,A Level or Equivalent,35-55,0,N,Pass,-52.0,268,5.803922,51,19.450980
3,AAA,2013J,32885,F,Lower Than A Level,0-35,0,N,Pass,-176.0,268,5.400000,35,19.542857
4,AAA,2013J,38053,M,A Level or Equivalent,35-55,0,N,Pass,-110.0,268,5.206897,58,18.275862
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24877,GGG,2014J,2640965,F,Lower Than A Level,0-35,0,N,Fail,-4.0,269,3.800000,5,8.200000
24878,GGG,2014J,2645731,F,Lower Than A Level,35-55,0,N,Distinction,-23.0,269,4.750000,8,10.125000
24879,GGG,2014J,2648187,F,A Level or Equivalent,0-35,0,Y,Pass,-129.0,269,4.600000,5,7.000000
24880,GGG,2014J,2679821,F,Lower Than A Level,35-55,0,N,Withdrawn,-49.0,269,5.090909,11,24.272727


In [158]:
clean_reg['pct_days_studied'] = clean_reg['total_days_studied'] / 90
clean_reg

Unnamed: 0,code_module,code_presentation,id_student,gender,highest_education,age_band,num_of_prev_attempts,disability,final_result,date_registration,module_presentation_length,avg_daily_activities,total_days_studied,mean_clicks_per_day,pct_days_studied
0,AAA,2013J,11391,M,HE Qualification,55<=,0,N,Pass,-159.0,268,4.695652,23,25.521739,0.255556
1,AAA,2013J,28400,F,HE Qualification,35-55,0,N,Pass,-53.0,268,6.388889,36,22.805556,0.400000
2,AAA,2013J,31604,F,A Level or Equivalent,35-55,0,N,Pass,-52.0,268,5.803922,51,19.450980,0.566667
3,AAA,2013J,32885,F,Lower Than A Level,0-35,0,N,Pass,-176.0,268,5.400000,35,19.542857,0.388889
4,AAA,2013J,38053,M,A Level or Equivalent,35-55,0,N,Pass,-110.0,268,5.206897,58,18.275862,0.644444
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24877,GGG,2014J,2640965,F,Lower Than A Level,0-35,0,N,Fail,-4.0,269,3.800000,5,8.200000,0.055556
24878,GGG,2014J,2645731,F,Lower Than A Level,35-55,0,N,Distinction,-23.0,269,4.750000,8,10.125000,0.088889
24879,GGG,2014J,2648187,F,A Level or Equivalent,0-35,0,Y,Pass,-129.0,269,4.600000,5,7.000000,0.055556
24880,GGG,2014J,2679821,F,Lower Than A Level,35-55,0,N,Withdrawn,-49.0,269,5.090909,11,24.272727,0.122222


In [159]:
clean_reg.to_csv('content/regs_with_activity_stats.csv', index = False)