<img src="https://www.dbs.ie/images/default-source/logos/dbs-logo-2019-small.png" align="left"/>

#  Open University Learning Analytics Dataset Cleaning

Capstone Project

Claire Connaughton (10266499)

In [None]:
import os
import pickle
import pydotplus
import numpy as np
import pandas as pd
from functools import reduce
from plotnine import *
import plotnine
plotnine.options.figure_size = (5.2,3.2)
import seaborn as sns
sns.set()
sns.set_style("white")
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.image as mpimg
from scipy import interp
from scipy.stats import skew, norm, probplot, boxcox, f_oneway
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler, RobustScaler
from sklearn.compose import make_column_transformer
from IPython.display import Image  
from sklearn.tree import export_graphviz
from six import StringIO
from collections import Counter
from pandas_profiling import ProfileReport
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
%%HTML
<style type="text/css">
table.dataframe td, table.dataframe th {
    border: 1px  black solid !important;
  color: black !important;
}
</style>

# Import dataset

In [None]:
# Load the OULA dataset from the working directory.
# Only load/parse failures are handled here; anything else (e.g. a
# KeyboardInterrupt) is allowed to propagate instead of being swallowed.

try:
    data = pd.read_csv('oulad_final.csv')
except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as load_error:
    print("Dataset could not be loaded. Is the dataset missing?")
    # Surface the underlying reason so the failure can be diagnosed.
    print(f"Reason: {load_error}")
else:
    print("The 'OULA' dataset has {} samples with {} features each.".format(*data.shape))

# Describe Data 

In [None]:
# Print each column's dtype and non-null count.
data.info()

In [None]:
# Change the datatype of 'id_student' from int to object so that it is
# treated as an identifier rather than a numeric feature in the
# dtype-based column selections later in the notebook.

data['id_student'] = data['id_student'].astype(object)

In [None]:
# date_registration and date_unregistration duplicate information that is
# already captured in the monthly format, so remove both columns in place.
data.drop(['date_registration', 'date_unregistration'], axis=1, inplace=True)

In [None]:
# Summary statistics for the numeric columns.
data.describe()

In [None]:
# Preview the first five rows.
data.head(5)

In [None]:
# Build the pandas_profiling report for the whole dataframe.
ProfileReport(data)

The initial inspection of the data has revealed some data quality issues including missing values, highly correlated columns and categorical variables which need to be treated.

# Verify Data Quality

Check for Missing Values

In [None]:
def missing_values_table(data):
    """Summarise the columns of *data* that contain missing values.

    Prints the total number of columns and how many of them have at least
    one null, then returns a DataFrame indexed by column name with the
    columns 'Missing Values' (null count) and '% of Total Values'
    (null percentage), sorted by percentage in descending order and
    rounded to one decimal place. Columns without nulls are left out.
    """
    null_counts = data.isnull().sum()
    null_percent = 100 * null_counts / len(data)

    summary = pd.concat(
        [null_counts, null_percent],
        axis=1,
        keys=['Missing Values', '% of Total Values'],
    )

    # Keep only the columns that actually have missing entries.
    has_missing = summary['% of Total Values'] != 0
    affected = summary[has_missing]
    affected = affected.sort_values('% of Total Values', ascending=False)
    affected = affected.round(1)

    n_columns = data.shape[1]
    n_affected = affected.shape[0]
    print ("The OULA dataset has " + str(n_columns) + " columns.\n"
           "There are " + str(n_affected) +
           " columns that have missing values.")
    return affected

In [None]:
# List the columns that contain nulls, with counts and percentages.
missing_values_table(data)

In [None]:
# Treat weighted_score column by replacing null values with 0 because those students did not make any submissions.
# fillna is used on its own: Series.replace(np.nan) without a replacement value
# forward-fills from the previous row on older pandas and is rejected on newer ones.

data['weighted_score'] = data['weighted_score'].fillna(0)

In [None]:
# Fill null values in the total_late_submission column with 1.0; the original
# rationale is that students with no recorded submissions are treated as late.
# NOTE(review): confirm that 1.0 is the intended fill value for this column.
# fillna is used on its own: Series.replace(np.nan) without a replacement value
# forward-fills from the previous row on older pandas and is rejected on newer ones.

data['total_late_submission'] = data['total_late_submission'].fillna(1.0)

In [None]:
# Get the median of total_assessments (the value used to impute its nulls)
data.total_assessments.median()

In [None]:
# Impute missing total_assessments values with the column median
assessments_median = data['total_assessments'].median()
data['total_assessments'] = data['total_assessments'].fillna(assessments_median)

In [None]:
# Treat null values in the late_rate column as 100% late because those students did not make any submission.
# fillna is used on its own: Series.replace(np.nan) without a replacement value
# forward-fills from the previous row on older pandas and is rejected on newer ones.

data['late_rate'] = data['late_rate'].fillna(1.0)

In [None]:
# Treat null values in the fail_rate column as 100% failed
# because they did not make any submission.
# fillna is used on its own: Series.replace(np.nan) without a replacement value
# forward-fills from the previous row on older pandas and is rejected on newer ones.

data['fail_rate'] = data['fail_rate'].fillna(1.0)

In [None]:
# Treat the total_click column by replacing null values with 0 because those students did not interact with the VLE.
# fillna is used on its own: Series.replace(np.nan) without a replacement value
# forward-fills from the previous row on older pandas and is rejected on newer ones.

data['total_click'] = data['total_click'].fillna(0)

In [None]:
# Treat the AVG_click column by replacing null values with 0 because those students did not interact with the VLE.
# fillna is used on its own: Series.replace(np.nan) without a replacement value
# forward-fills from the previous row on older pandas and is rejected on newer ones.

data['AVG_click'] = data['AVG_click'].fillna(0)

In [None]:
# Treat the sum_click column by replacing null values with 0 because those students did not interact with the VLE.
# fillna is used on its own: Series.replace(np.nan) without a replacement value
# forward-fills from the previous row on older pandas and is rejected on newer ones.

data['sum_click'] = data['sum_click'].fillna(0)

In [None]:
# Before imputing imd_band, show the most frequent band for every region
# that has at least one student with a missing band.
band_is_missing = data['imd_band'].isnull()
regions_list = list(data.loc[band_is_missing, 'region'].unique())

for i in regions_list:
    in_region = data['region'] == i
    result = data.loc[in_region, 'imd_band'].mode()
    print(f'{i} IMD band : \n', result)

In [None]:
# Replace each null imd_band with the most frequent band of the student's region
regions_list = list(data.loc[data['imd_band'].isnull(), 'region'].unique())

for i in regions_list:
    in_region = data['region'] == i
    band_modes = data.loc[in_region, 'imd_band'].mode()

    # mode() returns a Series: it is empty when the region has no known band
    # and has several entries on ties. Skip the first case (the nulls remain
    # visible in the missing-values check) and take the first mode otherwise.
    if band_modes.empty:
        continue

    needs_fill = in_region & data['imd_band'].isnull()
    data.loc[needs_fill, 'imd_band'] = band_modes.iloc[0]

In [None]:
# Double check that there are no null values left after the imputation steps

missing_values_table(data)

All null values have been successfully treated

**********************************

Check for duplicate values and treat if required

In [None]:
# Share of fully duplicated rows, expressed as a percentage of all rows.
print("Percentage of duplicated values in OULA is ", data.duplicated().sum() * 100 / len(data))

There are no duplicated values in the dataset.

***************************

# Numeric Variable Cleaning

Check the distribution for the numeric variables

In [None]:
# Create a dataframe holding only the numeric columns.
# NOTE(review): no column is dropped here; if the binary procrastination
# column should be excluded, it needs an explicit drop.

df_num = data.select_dtypes(include='number')

In [None]:
# Print skewness, mean and median for every numeric column.
# The locals are prefixed with `col_` so that they do not shadow the
# `skew` function imported from scipy.stats at the top of the notebook.

for col in df_num.columns:

    col_skew = df_num[col].skew()
    col_mean = df_num[col].mean()
    col_median = df_num[col].median()

    print(f'\tSummary for {col.upper()}')
    print(f'Skewness of {col}\t: {col_skew}')
    print(f'Mean {col} :\t {col_mean}')
    print(f'Median {col} :\t {col_median} \n')

Every variable has some amount of skewness. Inspect this further using distplots.

In [None]:
# Draw one distribution plot per numeric column on a 2 x 6 grid of axes
fig, ax = plt.subplots(nrows=2, ncols=6, figsize=(20, 10))
ax = ax.flatten()
index = 0

for col in df_num.columns:
    if col == "Type":
        continue
    sns.distplot(df_num[col], ax=ax[index])
    index += 1

In [None]:
# Remove the numeric columns that behave more like categories than
# continuous measurements.

discrete_columns = ['num_of_prev_attempts', 'total_assessments', 'total_late_submission']
df_num.drop(discrete_columns, axis=1, inplace=True)

In [None]:
# Look for outliers: one box plot per remaining numeric column on a 2 x 4 grid
fig, ax = plt.subplots(nrows=2, ncols=4, figsize=(20, 10))
ax = ax.flatten()
index = 0

for col in df_num.columns:
    if col == "Type":
        continue
    sns.boxplot(data=df_num, y=col, ax=ax[index])
    index += 1

There is evidence of outliers in most of the boxplots.

In [None]:
# Complete a log transformation on the skewed target variable, weighted_score.
# Missing scores were imputed with 0 earlier in the notebook, and log(0) is
# -inf, which turns the skewness into NaN. log1p computes log(1 + x), which
# is defined at 0.

weighted_score_log = np.log1p(data["weighted_score"])
print ("Log normalised skweness for weighted_score is" , weighted_score_log.skew())

In [None]:
# Look for multicollinearity: annotated heatmap of the pairwise
# correlations between the numeric columns

corr_matrix = df_num.corr()
plt.figure(figsize=(6, 4))
sns.heatmap(corr_matrix, cmap="coolwarm", annot=True);

Total_click and Average_click are the most correlated (0.56). Total_click and weighted_score are the next most correlated (0.36).

Weighted_score is negatively correlated with fail_rate(-0.37) and late_rate(-0.32)

In [None]:
# Bar chart of each numeric feature's correlation with weighted_score

feature_columns = df_num.drop(columns=['weighted_score'])
score_correlation = feature_columns.corrwith(df_num['weighted_score'])
score_correlation.plot.bar(
    figsize=(6, 4),
    title="Correlation with Weighted Score",
    fontsize=12,
    rot=90,
    grid=True,
);

In [None]:
# Correlation of every numeric column with weighted_score, strongest first.
# The frame is restricted to numeric columns because id_student was cast to
# object and the categorical columns hold strings, which cannot be correlated.
data.select_dtypes(include=np.number).corrwith(data['weighted_score']).sort_values(ascending=False)

Weighted_score is most strongly positively correlated with total_assessments and total_click and weakly correlated with fail_rate and late_rate.

The procrastination variable does not seem to add more value as it is less correlated than late_rate. Therefore, it may need to be dropped. There is no correlation between module_presentation_length or sum_click with weighted_score so they should be dropped. 

In [None]:
# Remove the two columns that show no correlation with weighted_score.
uncorrelated_columns = ['sum_click', 'module_presentation_length']
data.drop(uncorrelated_columns, axis=1, inplace=True)

In [None]:
# Double check the shape now that sum_click and module_presentation_length
# (two columns) have been dropped

print("The 'OULA' dataset has {} samples with {} features each.".format(*data.shape))

# Categorical Variable Cleaning

In [None]:
# Keep only the non-numeric columns, leaving out id_student because it
# will not add to the analysis.

non_numeric = data.select_dtypes(exclude=np.number)
categoricals = non_numeric.drop(columns=['id_student'])

categoricals.head()

In [None]:
# Print dtype and non-null count for each categorical column.
categoricals.info()

In [None]:
# Visualise the categorical variables which need to be collapsed.
# The column is passed by keyword (x=..., data=...): recent seaborn releases
# treat the first positional argument of countplot as `data`, so passing a
# Series positionally no longer plots that column's counts.

fig, ax = plt.subplots(1, 3, figsize=(15, 5))

for axis, column in zip(ax, ['code_module', 'code_presentation', 'age_band']):
    sns.countplot(x=column, data=categoricals, ax=axis, palette="Blues_d")

print("Count plots for code_module, code_presentation, age_band")

In [None]:
# Horizontal count plot of the original region categories.
f, ax = plt.subplots(figsize=(7, 3))
sns.countplot(data=categoricals, y="region", palette="Blues_d", ax=ax)
print("Count plot for region")

In [None]:
# Horizontal count plot of the original highest_education categories.
f, ax = plt.subplots(figsize=(7, 3))
sns.countplot(data=categoricals, y="highest_education", palette="Blues_d", ax=ax)
print("Count plot for highest_education")

In [None]:
# Horizontal count plot of the original imd_band categories.
f, ax = plt.subplots(figsize=(7, 3))
sns.countplot(data=categoricals, y="imd_band", palette="Blues_d", ax=ax)
print("Count plot for imd_band")

In [None]:
# Collapse the highest_education category:
#   'No Formal quals'             -> 'Lower Than A Level'
#   'Post Graduate Qualification' -> 'HE Qualification'
# Every other level is left unchanged.

education_lookup = {
    'No Formal quals': 'Lower Than A Level',
    'Post Graduate Qualification': 'HE Qualification',
}
data['highest_education'] = data['highest_education'].replace(education_lookup)


In [None]:
# Visualise the cleaned highest_education category, most frequent level first

fig, ax = plt.subplots(1, 1, figsize=(15, 5))
education_order = data.highest_education.value_counts().index
g_1 = sns.countplot(y='highest_education',
                    data=data,
                    color='grey',
                    order=education_order);

# The bars are horizontal (y=...), so the count is the bar's width; its
# height is only the bar thickness, which is the same for every bar.
patch_h = [patch.get_width() for patch in g_1.patches]

idx_tallest = np.argmax(patch_h)

# Highlight the most frequent level.
g_1.patches[idx_tallest].set_facecolor('#a834a8')
g_1.set_title('Highest Education Level Attained', fontsize = 18)

In [None]:
# Merge the '55<=' and '35-55' age bands into a single '35+' band
age_lookup = {'55<=': '35+', '35-55': '35+'}
data['age_band'] = data['age_band'].replace(age_lookup)

In [None]:
# Inspect the age_band category, most frequent band first
fig, ax = plt.subplots(1, 1, figsize=(15, 5))

age_order = data['age_band'].value_counts().index
g_1 = sns.countplot(data=data, x='age_band', color='grey', order=age_order);

# Collect every bar height and recolour the tallest bar.
patch_h = [bar.get_height() for bar in g_1.patches]
idx_tallest = np.argmax(patch_h)

g_1.patches[idx_tallest].set_facecolor('#a834a8')
g_1.set_title('Age Band of Students', fontsize = 18)

In [None]:
# Frequency of each region in the categoricals frame (built before any region collapsing).
categoricals['region'].value_counts()

In [None]:
# Collapse the regions into four groups. Each key is the new label and
# the list holds the original region names assigned to it.
#   North UK: Yorkshire, the North Region, Scotland and Ireland
#             (assuming Ireland is N.I.)
#   South UK: London and the South Region
#   East UK : South East, East Anglia and the East Midlands
#   West UK : South West, North West, West Midlands and Wales

region_groups = {
    'North UK': ['Yorkshire Region', 'North Region', 'Scotland', 'Ireland'],
    'South UK': ['London Region', 'South Region'],
    'East UK': ['South East Region', 'East Anglian Region', 'East Midlands Region'],
    'West UK': ['South West Region', 'North Western Region',
                'West Midlands Region', 'Wales'],
}

# Invert the grouping into an {original name: new label} lookup.
region_lookup = {
    original: group
    for group, originals in region_groups.items()
    for original in originals
}

data['region'] = data['region'].replace(region_lookup)

In [None]:
# Frequency of each region label in `data`.
data['region'].value_counts()

In [None]:
# Inspect the region category, most frequent region first
fig, ax = plt.subplots(1, 1, figsize=(15, 5))

region_order = data['region'].value_counts().index
g_1 = sns.countplot(data=data, x='region', color='grey', order=region_order);

# Collect every bar height and recolour the tallest bar.
patch_h = [bar.get_height() for bar in g_1.patches]
idx_tallest = np.argmax(patch_h)

g_1.patches[idx_tallest].set_facecolor('#a834a8')
g_1.set_title('Regions', fontsize = 18)

In [None]:
# Inspect the imd bands: frequency of each band label

data['imd_band'].value_counts()

In [None]:
# Create three categories: disadvantaged (0-30%), middle class (30-80%), privileged (80-100%).
# Each key is the new label and the list holds the original bands assigned to it.
# The 10-20 band is listed both without and with a '%' sign: the original code
# matched only '10-20', so a '10-20%' spelling would have stayed uncategorised.

imd_groups = {
    'Disadvantaged': ['0-10%', '10-20', '10-20%', '20-30%'],
    'Middle Class': ['30-40%', '40-50%', '50-60%', '60-70%', '70-80%'],
    'Privileged': ['80-90%', '90-100%'],
}

# Invert the grouping into an {original band: new label} lookup.
imd_lookup = {
    band: group
    for group, bands in imd_groups.items()
    for band in bands
}

data['imd_band'] = data['imd_band'].replace(imd_lookup)

In [None]:
# Frequency of each imd_band label in `data`.
data['imd_band'].value_counts()

In [None]:
# Inspect the imd_band category, most frequent band first
fig, ax = plt.subplots(1, 1, figsize=(15, 5))

imd_order = data['imd_band'].value_counts().index
g_1 = sns.countplot(data=data, x='imd_band', color='grey', order=imd_order);

# Collect every bar height and recolour the tallest bar.
patch_h = [bar.get_height() for bar in g_1.patches]
idx_tallest = np.argmax(patch_h)

g_1.patches[idx_tallest].set_facecolor('#a834a8')
g_1.set_title('Socio-Economic Status of the Students', fontsize = 18)

# Target Variable

In [None]:
# Inspect the frequency counts of the final_result column, which the
# 'dropout' and 'student_failed' columns are later derived from

data['final_result'].value_counts()

In [None]:
# Inspect the 'final_result' category, most frequent outcome first

fig, ax = plt.subplots(1, 1, figsize=(15, 5))

result_order = data['final_result'].value_counts().index
g_1 = sns.countplot(data=data, x='final_result', color='grey', order=result_order);

# Collect every bar height and recolour the tallest bar.
patch_h = [bar.get_height() for bar in g_1.patches]
idx_tallest = np.argmax(patch_h)

g_1.patches[idx_tallest].set_facecolor('#a834a8')
g_1.set_title('Final Outcome of the Students', fontsize = 18)

In [None]:
# Add a 'dropout' flag derived from final_result:
# 0 for 'Distinction', 'Pass' or 'Fail'; 1 for any other value (i.e. withdrawn)

completed_outcomes = ['Distinction', 'Pass', 'Fail']
is_dropout = ~data['final_result'].isin(completed_outcomes)
data['dropout'] = is_dropout.astype('int64')

In [None]:
# Visualise student dropouts, most frequent class first
fig, ax = plt.subplots(1, 1, figsize=(15, 5))

dropout_order = data.dropout.value_counts().index
g_1 = sns.countplot(x='dropout',
                    data=data,
                    color='grey',
                    order=dropout_order);

# Collect every bar height and recolour the tallest bar.
patch_h = [patch.get_height() for patch in g_1.patches]
idx_tallest = np.argmax(patch_h)
g_1.patches[idx_tallest].set_facecolor('#a834a8')

# The bars follow the frequency order, not 0/1 order, so the tick labels
# are derived from that order instead of being hard-coded.
dropout_labels = {0: 'Completed Course', 1: 'Dropout'}
g_1.set_xticklabels([dropout_labels[value] for value in dropout_order])
g_1.set_title('Student Dropouts', fontsize = 18)

In [None]:
# Number of students in each dropout class (0 / 1).
data['dropout'].value_counts()

Create a student_failed column to store students who failed. 

In [None]:
# Add a 'student_failed' flag derived from final_result:
# 0 for 'Distinction', 'Pass' or 'Withdrawn'; 1 for any other value (i.e. failed)

not_failed_outcomes = ['Distinction', 'Pass', 'Withdrawn']
has_failed = ~data['final_result'].isin(not_failed_outcomes)
data['student_failed'] = has_failed.astype('int64')

In [None]:
# Number of students in each student_failed class (0 / 1).
data['student_failed'].value_counts()

In [None]:
# Visualise student fails, most frequent class first
fig, ax = plt.subplots(1, 1, figsize=(15, 5))

failed_order = data.student_failed.value_counts().index
g_1 = sns.countplot(x='student_failed',
                    data=data,
                    color='grey',
                    order=failed_order);

# Collect every bar height and recolour the tallest bar.
patch_h = [patch.get_height() for patch in g_1.patches]
idx_tallest = np.argmax(patch_h)
g_1.patches[idx_tallest].set_facecolor('#a834a8')

# The bars follow the frequency order, not 0/1 order, so the tick labels
# are derived from that order instead of being hard-coded.
failed_labels = {0: 'Did not Fail', 1: 'Failed'}
g_1.set_xticklabels([failed_labels[value] for value in failed_order])
g_1.set_title('Student Fails', fontsize = 18)

In [None]:
# final_result has been collapsed into the 'dropout' and 'student_failed'
# columns, so remove it from the dataframe in place.

data.drop(columns=['final_result'], inplace=True)

In [None]:
# Report the current number of rows and columns.
print("The 'OULA' dataset has {} samples with {} features each.".format(*data.shape))

In [None]:
# Print each column's dtype and non-null count before export.
data.info()

In [None]:
# Create new csv file containing the cleaned OULA dataset 
# (index=False keeps the dataframe index out of the file)
data.to_csv('oulad_cleaned.csv', index=False)

# END