In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Merge registrations, student info, and course info

In [10]:
# Columns that together identify one student's enrolment in one presentation of
# a module; used as the join/group key for the per-student tables.
index_columns = ['code_module','code_presentation','id_student']

# Cut-off used when filtering the dated rows (submissions, unregistrations,
# VLE interactions).
# NOTE(review): presumably measured in days relative to the start of the
# presentation -- confirm against the dataset documentation.
prediction_window = 120

# Raw tables, one CSV per table.
students = pd.read_csv('../content/studentInfo.csv')
registrations = pd.read_csv('../content/studentRegistration.csv')
courseInfo = pd.read_csv('../content/courses.csv')
student_vle = pd.read_csv('../content/studentVle.csv')
vle_info = pd.read_csv('../content/vle.csv')
assessments_info = pd.read_csv('../content/assessments.csv')
# Two specific lines of this file are skipped on load (skiprows takes 0-indexed
# line numbers). NOTE(review): the reason is not recorded here -- presumably
# malformed records; confirm and document.
student_assessments = pd.read_csv('../content/studentAssessment.csv', skiprows=[128223,64073])

# Student info + registration (exactly one row each per key), then the course
# description for the presentation the student is enrolled in. `validate` makes
# pandas raise if either cardinality assumption is violated.
full_registrations = (
    students
    .merge(registrations, on=index_columns, validate='1:1')
    .merge(courseInfo, on=['code_module','code_presentation'], validate='many_to_one')
)

# Every submitted assessment, annotated with the assessment's own metadata.
# The left join keeps submissions even when no matching metadata row exists.
assessments = student_assessments.merge(assessments_info, how='left', on='id_assessment')


# Data Validation
We will drop: 

* Rows in assessments with no assessment score
* The `is_banked` and `weight` columns in assessments, because we won't use those.
* Rows in assessments occurring after the prediction window.
* Rows in registrations without registration dates or IMD band information.
* Rows in registrations with unregistration dates that are not labeled as 'Withdrawn' 
* Rows in registrations representing students who unregistered before the end of the prediction window.
* The `date_unregistration` column from registrations since we are not predicting when they withdraw, just whether.
* Rows in the studentVLE table occurring after the prediction window.

In [11]:
# Assessments: keep scored submissions only, discard the two unused columns,
# then keep what was submitted on or before the end of the prediction window.
assessments = (
    assessments
    .dropna(subset=['score'])
    .drop(columns=['is_banked', 'weight'])
)
assessments = assessments.loc[assessments['date_submitted'] <= prediction_window]

# Registrations: rows need both a registration date and an IMD band.
full_registrations = full_registrations.dropna(subset=['date_registration','imd_band'])

# A row survives if the student never unregistered, or if the student is
# labelled 'Withdrawn' and unregistered only after the prediction window.
# Anything else (unregistered inside the window, or an unregistration date
# without the 'Withdrawn' label) is filtered out.
not_withdrawn = full_registrations['date_unregistration'].isna()
withdrawn_after_predict = (
    (full_registrations['final_result'] == 'Withdrawn')
    & (full_registrations['date_unregistration'] > prediction_window)
)
# The unregistration date is only needed for the filter above, so it is
# removed once the filter has been applied.
full_registrations = (
    full_registrations
    .loc[not_withdrawn | withdrawn_after_predict]
    .drop(columns=['date_unregistration'])
)

# VLE: keep interactions dated on or before the end of the prediction window.
student_vle = student_vle.loc[student_vle['date'] <= prediction_window]

# Virtual Learning Environment Activity Statistics 
Next, let's isolate activity statistics from the virtual learning environment interactions and add them to our dataset.

We will collect:
* Total number of days worked during the prediction window
* Total number of activities interacted with during the prediction window
* Total number of clicks during the prediction window

In [12]:
# Annotate every interaction row with the metadata of the site it refers to.
# validate='m:1' makes pandas raise if a site appears more than once in
# vle_info for the same module/presentation. The week_from/week_to columns
# are not used afterwards.
vle = pd.merge(student_vle, vle_info,
               how='left',
               on=['id_site','code_module', 'code_presentation'],
               validate='m:1').drop(columns=['week_from','week_to'])

# Each statistic selects its source column explicitly instead of reducing the
# whole frame and dropping the leftovers. A frame-wide .sum() relies on pandas
# silently discarding the string column `activity_type`; since pandas 2.0
# (numeric_only defaults to False) that column is concatenated instead and
# would leak into full_registrations.
vle_by_student = vle.groupby(by=index_columns)

# Number of interaction rows per student (non-null `id_site` values). This
# counts rows, not distinct sites. The result column is still named `id_site`.
total_activities = vle_by_student['id_site'].count().reset_index()

#intermediate grouped dataframe: one row per student per date with activity
date_grouped = vle.groupby(by=index_columns + ['date']).count().reset_index()

# Number of distinct dates with at least one interaction; column `date`.
days_studied = date_grouped.groupby(by=index_columns)['date'].count().reset_index()

# Total clicks per student; column `sum_click`.
clicks = vle_by_student['sum_click'].sum().reset_index()

# pd.merge defaults to an inner join, so registrations with no interaction
# rows left in `student_vle` are dropped here.
full_registrations = pd.merge(full_registrations, days_studied, on=index_columns)
full_registrations = pd.merge(full_registrations, total_activities, on=index_columns)
full_registrations = pd.merge(full_registrations, clicks, on=index_columns)

# Assessment Data
We will collect the number of assessments completed and the average scores.

In [13]:
# Aggregate the submissions per student. The source column of each statistic
# is selected explicitly: a frame-wide .mean() also tries to average the
# string column `assessment_type`, which raises TypeError on pandas >= 2.0
# (numeric_only defaults to False there) and was only silently skipped by
# older releases.
assessments_by_student = assessments.groupby(by = index_columns)

# Number of submissions per student; the column is still named `id_assessment`.
num_assessments = assessments_by_student['id_assessment'].count().reset_index()

# Mean score per student; the column is still named `score`.
avg_score = assessments_by_student['score'].mean().reset_index()

# pd.merge defaults to an inner join, so registrations with no rows left in
# `assessments` are dropped here.
full_registrations = pd.merge(full_registrations, num_assessments, on=index_columns)
full_registrations = pd.merge(full_registrations, avg_score, on=index_columns)

# Give the aggregated columns descriptive names. `date`, `id_site` and
# `sum_click` are the columns produced by the VLE statistics step.
new_cols = {'id_assessment':'assessments_completed',
            'score':'average_score','date':'days_studied',
            'id_site':'activities_completed','sum_click':'total_clicks'}
# Replace the module code strings with integer labels.
full_registrations['code_module'] = LabelEncoder().fit_transform(full_registrations['code_module'])
full_registrations = full_registrations.rename(columns = new_cols)

In [15]:
# Persist the cleaned, feature-augmented registrations table.
# index=False leaves the DataFrame index out of the file.
full_registrations.to_csv(
    path_or_buf='../content/cleaned_registrations.csv',
    index=False,
)