In [328]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Merge registrations, student info, and course info

In [329]:
registrations = pd.read_csv('content/studentRegistration.csv')
courseInfo = pd.read_csv('content/courses.csv')
students = pd.read_csv('content/studentInfo.csv')
vle = pd.read_csv('content/studentVle.csv')


In [330]:
registrations.head()

Unnamed: 0,code_module,code_presentation,id_student,date_registration,date_unregistration
0,AAA,2013J,11391,-159.0,
1,AAA,2013J,28400,-53.0,
2,AAA,2013J,30268,-92.0,12.0
3,AAA,2013J,31604,-52.0,
4,AAA,2013J,32885,-176.0,


In [331]:
students.head()

Unnamed: 0,code_module,code_presentation,id_student,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result
0,AAA,2013J,11391,M,East Anglian Region,HE Qualification,90-100%,55<=,0,240,N,Pass
1,AAA,2013J,28400,F,Scotland,HE Qualification,20-30%,35-55,0,60,N,Pass
2,AAA,2013J,30268,F,North Western Region,A Level or Equivalent,30-40%,35-55,0,60,Y,Withdrawn
3,AAA,2013J,31604,F,South East Region,A Level or Equivalent,50-60%,35-55,0,60,N,Pass
4,AAA,2013J,32885,F,West Midlands Region,Lower Than A Level,50-60%,0-35,0,60,N,Pass


Check that studentInfo and StudentRegistration are 1:1 and merge tables

In [332]:
len(registrations) == len(students)

True

In [333]:
full_registrations = pd.merge(students, registrations, \
                         on = ['code_module','code_presentation','id_student'], \
                         validate = 'one_to_one')

In [334]:
full_registrations.head()

Unnamed: 0,code_module,code_presentation,id_student,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result,date_registration,date_unregistration
0,AAA,2013J,11391,M,East Anglian Region,HE Qualification,90-100%,55<=,0,240,N,Pass,-159.0,
1,AAA,2013J,28400,F,Scotland,HE Qualification,20-30%,35-55,0,60,N,Pass,-53.0,
2,AAA,2013J,30268,F,North Western Region,A Level or Equivalent,30-40%,35-55,0,60,Y,Withdrawn,-92.0,12.0
3,AAA,2013J,31604,F,South East Region,A Level or Equivalent,50-60%,35-55,0,60,N,Pass,-52.0,
4,AAA,2013J,32885,F,West Midlands Region,Lower Than A Level,50-60%,0-35,0,60,N,Pass,-176.0,


# Add course length to registration table

In [335]:
courseInfo.head()

Unnamed: 0,code_module,code_presentation,module_presentation_length
0,AAA,2013J,268
1,AAA,2014J,269
2,BBB,2013J,268
3,BBB,2014J,262
4,BBB,2013B,240


In [336]:
full_registrations = pd.merge(full_registrations, courseInfo, \
                         on=['code_module','code_presentation'], \
                         validate='many_to_one')

In [337]:
full_registrations.head()

Unnamed: 0,code_module,code_presentation,id_student,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result,date_registration,date_unregistration,module_presentation_length
0,AAA,2013J,11391,M,East Anglian Region,HE Qualification,90-100%,55<=,0,240,N,Pass,-159.0,,268
1,AAA,2013J,28400,F,Scotland,HE Qualification,20-30%,35-55,0,60,N,Pass,-53.0,,268
2,AAA,2013J,30268,F,North Western Region,A Level or Equivalent,30-40%,35-55,0,60,Y,Withdrawn,-92.0,12.0,268
3,AAA,2013J,31604,F,South East Region,A Level or Equivalent,50-60%,35-55,0,60,N,Pass,-52.0,,268
4,AAA,2013J,32885,F,West Midlands Region,Lower Than A Level,50-60%,0-35,0,60,N,Pass,-176.0,,268


# Dealing with missing data

In [338]:
full_registrations.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32593 entries, 0 to 32592
Data columns (total 15 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   code_module                 32593 non-null  object 
 1   code_presentation           32593 non-null  object 
 2   id_student                  32593 non-null  int64  
 3   gender                      32593 non-null  object 
 4   region                      32593 non-null  object 
 5   highest_education           32593 non-null  object 
 6   imd_band                    31482 non-null  object 
 7   age_band                    32593 non-null  object 
 8   num_of_prev_attempts        32593 non-null  int64  
 9   studied_credits             32593 non-null  int64  
 10  disability                  32593 non-null  object 
 11  final_result                32593 non-null  object 
 12  date_registration           32548 non-null  float64
 13  date_unregistration         100

Drop entries missing registration date information.  We need registration information later to get statistics on activities.

In [339]:
full_registrations.dropna(subset=['date_registration'], inplace=True)

We want this model to be generalizable to countries outside of the UK, so we have to drop information that is not translatable to other countries.  This includes IMD band and regions, since those are very nation specific.

In [340]:
full_registrations.drop(columns=['imd_band','region', 'studied_credits'], inplace=True)

Our next big source of missing data is `date_unregistration`.  This is actually meaningful because those NaNs represent students who completed the course.  However, those are going to cause us problems in the future and we need to get some kind of data there.  It makes sense to me to say that the date of unregistration for students completing the course is the date of the end of the course.  We aren't losing any information because our assumption is that all of the students with a value in the `date_unregistration` also have 'withdrawn' in the `final_result` column.  Let's check these assumptions and fill in the NaNs.

In [341]:
full_registrations[full_registrations['date_unregistration'].isna() != True]['final_result'].value_counts()

Withdrawn    10024
Fail             9
Name: final_result, dtype: int64

That's unexpected, we have 9 results that are not withdrawn, but are 'Fail' instead.  Let's inspect those.

In [342]:
full_registrations[(full_registrations['date_unregistration'].isna() != True) \
                   & (full_registrations['final_result'] == 'Fail')]

Unnamed: 0,code_module,code_presentation,id_student,gender,highest_education,age_band,num_of_prev_attempts,disability,final_result,date_registration,date_unregistration,module_presentation_length
2777,BBB,2013J,362907,F,Lower Than A Level,35-55,2,N,Fail,-37.0,0.0,268
2786,BBB,2013J,365288,F,A Level or Equivalent,0-35,0,N,Fail,-75.0,0.0,268
3340,BBB,2013J,554243,F,Lower Than A Level,0-35,0,N,Fail,-35.0,166.0,268
14625,DDD,2013J,315082,M,A Level or Equivalent,0-35,0,N,Fail,-73.0,0.0,261
14768,DDD,2013J,403052,F,HE Qualification,0-35,0,N,Fail,-22.0,0.0,261
15577,DDD,2013J,582954,M,Lower Than A Level,0-35,0,N,Fail,-81.0,-7.0,261
24057,FFF,2013J,234004,M,A Level or Equivalent,0-35,0,N,Fail,-33.0,0.0,268
24537,FFF,2013J,523777,M,A Level or Equivalent,35-55,1,N,Fail,-60.0,0.0,268
25685,FFF,2013J,601640,M,HE Qualification,0-35,0,N,Fail,-58.0,-4.0,268


In [343]:
full_registrations.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32548 entries, 0 to 32592
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   code_module                 32548 non-null  object 
 1   code_presentation           32548 non-null  object 
 2   id_student                  32548 non-null  int64  
 3   gender                      32548 non-null  object 
 4   highest_education           32548 non-null  object 
 5   age_band                    32548 non-null  object 
 6   num_of_prev_attempts        32548 non-null  int64  
 7   disability                  32548 non-null  object 
 8   final_result                32548 non-null  object 
 9   date_registration           32548 non-null  float64
 10  date_unregistration         10033 non-null  float64
 11  module_presentation_length  32548 non-null  int64  
dtypes: float64(2), int64(3), object(7)
memory usage: 3.2+ MB


These look strange, and there are only 9 of them.  I'll drop them.  It's possible that the 0.0s are supposed to be NaNs, but I'm not sure.  We don't lose much data this way.

In [344]:
full_registrations = full_registrations[(full_registrations['date_unregistration'].isna()) \
                                        | (full_registrations['final_result'] == 'Withdrawn')]

In [345]:
full_registrations.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32539 entries, 0 to 32592
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   code_module                 32539 non-null  object 
 1   code_presentation           32539 non-null  object 
 2   id_student                  32539 non-null  int64  
 3   gender                      32539 non-null  object 
 4   highest_education           32539 non-null  object 
 5   age_band                    32539 non-null  object 
 6   num_of_prev_attempts        32539 non-null  int64  
 7   disability                  32539 non-null  object 
 8   final_result                32539 non-null  object 
 9   date_registration           32539 non-null  float64
 10  date_unregistration         10024 non-null  float64
 11  module_presentation_length  32539 non-null  int64  
dtypes: float64(2), int64(3), object(7)
memory usage: 3.2+ MB


Okay, now we have one-to-one correspondence with unregistration dates and 'Withdrawn' status.  We can fill in the rest of NaNs in the unregistration column with the course ending date.

In [346]:
full_registrations['date_unregistration'].fillna(full_registrations['module_presentation_length'], \
                                                inplace = True)

In [347]:
full_registrations.isna().sum()

code_module                   0
code_presentation             0
id_student                    0
gender                        0
highest_education             0
age_band                      0
num_of_prev_attempts          0
disability                    0
final_result                  0
date_registration             0
date_unregistration           0
module_presentation_length    0
dtype: int64

# Data Validation

In [348]:
full_registrations.describe()

Unnamed: 0,id_student,num_of_prev_attempts,date_registration,date_unregistration,module_presentation_length
count,32539.0,32539.0,32539.0,32539.0,32539.0
mean,706247.6,0.163097,-69.415932,192.701681,256.015428
std,548440.3,0.479381,49.265399,105.929137,13.178156
min,3733.0,0.0,-322.0,-317.0,234.0
25%,508583.5,0.0,-101.0,136.0,241.0
50%,590251.0,0.0,-57.0,241.0,262.0
75%,644423.0,0.0,-29.0,268.0,268.0
max,2698591.0,6.0,167.0,444.0,269.0


Registrations seem to begin up to 322 days before the start of presentations.  However, when I contacted Online University, they suggested that registrations for a presentation began about 6 months before the presentation began.  Let's see how many registrations we have before that mark.  We will give a little.

In [349]:
early_reg = full_registrations[full_registrations['date_registration'] < -180]

In [350]:
early_reg.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 795 entries, 41 to 31210
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   code_module                 795 non-null    object 
 1   code_presentation           795 non-null    object 
 2   id_student                  795 non-null    int64  
 3   gender                      795 non-null    object 
 4   highest_education           795 non-null    object 
 5   age_band                    795 non-null    object 
 6   num_of_prev_attempts        795 non-null    int64  
 7   disability                  795 non-null    object 
 8   final_result                795 non-null    object 
 9   date_registration           795 non-null    float64
 10  date_unregistration         795 non-null    float64
 11  module_presentation_length  795 non-null    int64  
dtypes: float64(2), int64(3), object(7)
memory usage: 80.7+ KB


In [351]:
early_reg.describe()

Unnamed: 0,id_student,num_of_prev_attempts,date_registration,date_unregistration,module_presentation_length
count,795.0,795.0,795.0,795.0,795.0
mean,719310.1,0.210063,-215.651572,131.596226,249.979874
std,631481.9,0.572149,33.8052,149.327912,12.520697
min,23632.0,0.0,-322.0,-317.0,234.0
25%,435652.5,0.0,-233.5,0.0,241.0
50%,563230.0,0.0,-200.0,234.0,241.0
75%,587062.5,0.0,-191.0,241.0,261.0
max,2697181.0,4.0,-181.0,268.0,268.0


There are only about 800 who registered more than 6 months before the presentation start, and most of those registered less than 7 months before the start.  We will drop those registered more than 7 months before the course start, since those earlier registrations seem fishy.

In [352]:
full_registrations = full_registrations[full_registrations['date_registration'] > -210]
full_registrations.describe()

Unnamed: 0,id_student,num_of_prev_attempts,date_registration,date_unregistration,module_presentation_length
count,32226.0,32226.0,32226.0,32226.0,32226.0
mean,706009.5,0.162819,-67.658723,193.562031,256.170825
std,547399.4,0.478512,46.060061,104.965323,13.145075
min,3733.0,0.0,-209.0,-206.0,234.0
25%,509109.0,0.0,-99.0,138.0,241.0
50%,590826.0,0.0,-56.0,241.0,262.0
75%,644843.8,0.0,-29.0,268.0,268.0
max,2698591.0,6.0,167.0,444.0,269.0


We have not lost too much data.  We also have course withdrawals AFTER the end of the course.  That does not seem to make sense.  We will drop those.

In [353]:
full_registrations = full_registrations[full_registrations['date_unregistration'] 
                                        < full_registrations['module_presentation_length']+1]

In [354]:
full_registrations.describe()

Unnamed: 0,id_student,num_of_prev_attempts,date_registration,date_unregistration,module_presentation_length
count,32225.0,32225.0,32225.0,32225.0,32225.0
mean,706013.2,0.162824,-67.66014,193.554259,256.170458
std,547407.5,0.478519,46.060073,104.95768,13.145114
min,3733.0,0.0,-209.0,-206.0,234.0
25%,509102.0,0.0,-99.0,138.0,241.0
50%,590828.0,0.0,-56.0,241.0,262.0
75%,644847.0,0.0,-29.0,268.0,268.0
max,2698591.0,6.0,167.0,269.0,269.0


In [355]:
full_registrations.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32225 entries, 0 to 32592
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   code_module                 32225 non-null  object 
 1   code_presentation           32225 non-null  object 
 2   id_student                  32225 non-null  int64  
 3   gender                      32225 non-null  object 
 4   highest_education           32225 non-null  object 
 5   age_band                    32225 non-null  object 
 6   num_of_prev_attempts        32225 non-null  int64  
 7   disability                  32225 non-null  object 
 8   final_result                32225 non-null  object 
 9   date_registration           32225 non-null  float64
 10  date_unregistration         32225 non-null  float64
 11  module_presentation_length  32225 non-null  int64  
dtypes: float64(2), int64(3), object(7)
memory usage: 3.2+ MB


We are trying to look at student interaction activity as a predictor of course outcome, so students that unregister before the class begins won't help us.  We will drop them, too.

In [356]:
full_registrations = full_registrations[full_registrations['date_unregistration'] > 0]
full_registrations.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29265 entries, 0 to 32592
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   code_module                 29265 non-null  object 
 1   code_presentation           29265 non-null  object 
 2   id_student                  29265 non-null  int64  
 3   gender                      29265 non-null  object 
 4   highest_education           29265 non-null  object 
 5   age_band                    29265 non-null  object 
 6   num_of_prev_attempts        29265 non-null  int64  
 7   disability                  29265 non-null  object 
 8   final_result                29265 non-null  object 
 9   date_registration           29265 non-null  float64
 10  date_unregistration         29265 non-null  float64
 11  module_presentation_length  29265 non-null  int64  
dtypes: float64(2), int64(3), object(7)
memory usage: 2.9+ MB


We lost a lot there, about 10% of our database.  That hurts, but it was necessary.
Now we need to figure out how soon we want to try to predict student outcomes.  A big part of this is preventing withdrawals, so lets let's look at the data for when people withdraw.

# When do we start our predictions?
We won't be able to intervene for students that withdraw during our data gathering phase.  We might as well drop them from the dataset as well.  But what period do we draw data from?

In [357]:
print('Student withdrawals')
full_registrations[full_registrations['final_result'] == 'Withdrawn']['date_unregistration'].describe()

Student withdrawals


count    6986.000000
mean       89.120813
std        69.913514
min         1.000000
25%        26.000000
50%        73.000000
75%       144.000000
max       269.000000
Name: date_unregistration, dtype: float64

For students who withdraw after the start of the course and before the end, the average withdraws about three months in, half withdraw before in the 11th week, and a quarter withdraw before the end of the first month.  How many do we already lose if take data for the first 2 months in order to predict student outcomes

In [358]:
display(full_registrations[full_registrations['final_result'] == 'Withdrawn'].shape)
display(full_registrations[(full_registrations['date_unregistration'] > 30) 
                 & (full_registrations['date_unregistration'] < full_registrations['module_presentation_length'])].shape)
display(vle.shape)
vle[vle.date < 30].shape

(6986, 12)

(4877, 12)

(10655280, 6)

(2907355, 6)

If we use the data from the first 30 days to predict outcomes, we can potentially keep 5/7ths of the withdrawing students but only use 3/10s of our interaction data.

In [359]:
display(full_registrations[full_registrations['final_result'] == 'Withdrawn'].shape)
display(full_registrations[(full_registrations['date_unregistration'] > 60)
                  & (full_registrations['date_unregistration'] < full_registrations['module_presentation_length'])].shape)
display(vle.shape)
vle[vle.date < 60].shape

(6986, 12)

(3785, 12)

(10655280, 6)

(4455870, 6)

If we use data from the first 60 days of instruction, we can keep a little more than 1/2 of the withdrawing students and use a little less than 1/2 of our data. 

In [360]:
display(full_registrations[full_registrations['final_result'] == 'Withdrawn'].shape)
display(full_registrations[(full_registrations['date_unregistration'] > 90)
                 & (full_registrations['date_unregistration'] < full_registrations['module_presentation_length'])].shape)
display(vle.shape)
vle[vle.date < 90].shape

(6986, 12)

(2999, 12)

(10655280, 6)

(5454846, 6)

If we wait until 90 days, we can intervene for less than half of the students and use a little more than half of the vle data.  We want the greatest chance of success for our first model, so let's use the first 90 days of data to predict, even though it means we lose more than half of the withdrawing students.

In [361]:
full_registrations = full_registrations[full_registrations['date_unregistration'] > 90]
full_registrations = full_registrations.drop(columns = ['date_unregistration'])

In [362]:
full_registrations.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25371 entries, 0 to 32592
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   code_module                 25371 non-null  object 
 1   code_presentation           25371 non-null  object 
 2   id_student                  25371 non-null  int64  
 3   gender                      25371 non-null  object 
 4   highest_education           25371 non-null  object 
 5   age_band                    25371 non-null  object 
 6   num_of_prev_attempts        25371 non-null  int64  
 7   disability                  25371 non-null  object 
 8   final_result                25371 non-null  object 
 9   date_registration           25371 non-null  float64
 10  module_presentation_length  25371 non-null  int64  
dtypes: float64(1), int64(3), object(7)
memory usage: 2.3+ MB


# Looks clean, let's save it

In [363]:
full_registrations.to_csv('content/cleaned_registrations.csv', index = False)

In [364]:
full_registrations.describe()

Unnamed: 0,id_student,num_of_prev_attempts,date_registration,module_presentation_length
count,25371.0,25371.0,25371.0,25371.0
mean,706675.0,0.158606,-64.434867,256.149856
std,551819.0,0.469981,44.875864,13.120316
min,6516.0,0.0,-209.0,234.0
25%,506212.5,0.0,-93.0,241.0
50%,589441.0,0.0,-52.0,262.0
75%,641322.0,0.0,-29.0,268.0
max,2698588.0,6.0,167.0,269.0
