<img src="https://www.dbs.ie/images/default-source/logos/dbs-logo-2019-small.png" align="left" />

#  Open University Learning Analytics - Student Dropout and Failure for Modules CCC and DDD
Capstone Project

Claire Connaughton (10266499)

*********************************

 Only two modules have records of the exams: 'DDD' and 'CCC'. Module CCC has two exams. Isolate these modules to extract a result for assignments, exams and overall weighted grade. 

# Import Relevant Libraries

In [None]:
import os
import pickle
import pydotplus
import numpy as np
import pandas as pd
from functools import reduce
from plotnine import *
import plotnine
plotnine.options.figure_size = (5.2,3.2)
import seaborn as sns
sns.set_style("white")
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.image as mpimg
from scipy import interp
from scipy.stats import skew, norm, probplot, boxcox, f_oneway
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler, RobustScaler
from sklearn.compose import make_column_transformer
from IPython.display import Image  
from sklearn.tree import export_graphviz
from six import StringIO
from collections import Counter
from sklearn.decomposition import PCA as sklearnPCA
import xgboost as xgb
from sklearn.datasets import dump_svmlight_file
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
%%HTML
<style type="text/css">
/* Give every cell of a rendered DataFrame table a solid black border and black text. */
table.dataframe td, table.dataframe th {
    border: 1px  black solid !important;
  color: black !important;
}
</style>

# Import Datasets

In [None]:
# Load the seven OULAD tables from CSV files in the working directory
(studentInfo, assessments, courses, studentAssessment,
 studentRegistration, studentVle, vle) = [
    pd.read_csv(csv_name)
    for csv_name in (
        'studentInfo.csv',
        'assessments.csv',
        'courses.csv',
        'studentAssessment.csv',
        'studentRegistration.csv',
        'studentVle.csv',
        'vle.csv',
    )
]

# Merge Datasets and extract the CCC and DDD modules only

In [None]:
# Attach each student submission to its assessment definition (left join keeps every assessment)
merge_student_ass = assessments.merge(studentAssessment, on=['id_assessment'], how='left')

In [None]:
# Pull out the assessment records of modules CCC and DDD for a closer look
CCC = merge_student_ass.loc[merge_student_ass['code_module'].eq('CCC')]
DDD = merge_student_ass.loc[merge_student_ass['code_module'].eq('DDD')]

In [None]:
# Count DDD records per presentation and assessment type to see which types are present
pd.crosstab(index=DDD.code_presentation, columns=DDD.assessment_type).plot.barh(stacked=True);

In [None]:
# Count CCC records per presentation and assessment type to see which types are present
pd.crosstab(index=CCC.code_presentation, columns=CCC.assessment_type).plot.barh(stacked=True);

All assessment types are present for module CCC but the data is only available for 2014. 

Module DDD has no CMA except in 2013.

Merge all of the datasets and then filter it to include just the data for module CCC & DDD from 2014.

In [None]:
# Inner-join registrations with the course table on module and presentation
regCourses = studentRegistration.merge(courses, how='inner', on=['code_module', 'code_presentation'])


In [None]:
# Inner-join the student demographics onto the registration/course rows
regCoursesInfo = regCourses.merge(studentInfo, how='inner', on=['code_module', 'code_presentation', 'id_student'])

In [None]:
# Create the total_click per student column:
# sum every VLE click record per student within each module presentation.
# The aggregation is named by string ('sum') instead of passing the Python
# builtin `sum`, which pandas only accepts through a deprecated alias.
total_click_per_student = (
    studentVle
    .groupby(['code_module', 'code_presentation', 'id_student'])
    .agg(total_click=('sum_click', 'sum'))
    .reset_index()
)

In [None]:
# Add the click totals to the registration/demographics table (students without clicks are dropped by the inner join)
merged = regCoursesInfo.merge(
    total_click_per_student,
    how='inner',
    on=['id_student', 'code_module', 'code_presentation'],
)

In [None]:
# Inspect the columns, dtypes and non-null counts after adding the click totals
merged.info()

In [None]:
# Create a late_rate column

# Days between the actual submission and the assessment deadline (positive = late)
lateSubmission = merge_student_ass.assign(submission_days=merge_student_ass['date_submitted']-merge_student_ass['date'])
# Flag late submissions; rows where either date is missing compare as not late
lateSubmission = lateSubmission.assign(late_submission=lateSubmission['submission_days'] > 0)

# Number of late submissions per student per module presentation.
# 'sum' is given by name rather than as the Python builtin, which pandas only
# accepts through a deprecated alias.
total_late_per_student = (
    lateSubmission
    .groupby(['id_student', 'code_module', 'code_presentation'])
    .agg(total_late_submission=('late_submission', 'sum'))
    .reset_index()
)

# Number of assessment rows per student per module presentation
total_count_assessments = (
    lateSubmission[['id_student', 'code_module', 'code_presentation', 'id_assessment']]
    .groupby(['id_student', 'code_module', 'code_presentation'])
    .size()
    .reset_index(name='total_assessments')
)

# Combine the late count with the assessment count
late_rate_per_student = pd.merge(total_late_per_student, total_count_assessments, on=['id_student', 'code_module', 'code_presentation'], how='left')
# Share of a student's assessment rows that were submitted late
late_rate_per_student['late_rate'] = late_rate_per_student['total_late_submission'] / late_rate_per_student['total_assessments']

late_rate_per_student

In [None]:
# Attach the late-submission statistics to each student/module/presentation row
merged = merged.merge(
    late_rate_per_student,
    how='inner',
    on=['id_student', 'code_module', 'code_presentation'],
)

In [None]:
# Inspect the table again now that the late-submission columns have been added
merged.info()

In [None]:
# Remove the date and banking columns from merge_student_ass; the late-rate
# features have already been derived from them

merge_student_ass = merge_student_ass.drop(columns=['date_submitted', 'is_banked', 'date' ])

In [None]:
# Merge the merge_student_ass table onto the merged table
# (one row per student per assessment after this join)

merged = merged.merge(
    merge_student_ass,
    how='inner',
    on=['id_student', 'code_module', 'code_presentation'],
)

In [None]:
# Keep only the rows of the fully merged table that belong to module CCC or DDD

CCC = merged.loc[merged['code_module'].eq('CCC')]
DDD = merged.loc[merged['code_module'].eq('DDD')]

In [None]:
# Stack the CCC rows on top of the DDD rows into a single working frame
frames = [CCC, DDD]

CCC_DDD = pd.concat(frames)

In [None]:
# Row counts per module (rows are student-assessment records at this point)
CCC_DDD.code_module.value_counts()

There are more students in the DDD group than the CCC group.

Create a new dataframe with just the data from CCC and DDD for 2014 only.

In [None]:
# Discard every row of the 2013B presentation
CCC_DDD = CCC_DDD[CCC_DDD['code_presentation'] != '2013B']

In [None]:
# Discard every row of the 2013J presentation
CCC_DDD = CCC_DDD[CCC_DDD['code_presentation'] != '2013J']

# Calculate Assessment Scores for Module DDD

Module DDD has only an exam and a TMA assignment, whereas module CCC has an exam, a TMA and a CMA, so the weighted grades have to be calculated separately for each module.

In [None]:
# Calculate assignment grade for DDD

# Select the non-exam assessment rows of module DDD. Take an explicit copy:
# the following cells rename and drop columns on this frame in place, which on
# a bare slice of CCC_DDD triggers pandas' SettingWithCopyWarning.
DDD_assignment = CCC_DDD[(CCC_DDD['code_module'] == 'DDD') & (CCC_DDD['assessment_type'] != 'Exam')].copy()

In [None]:
# Preview the first rows of the DDD assignment records
DDD_assignment.head()

In [None]:
# Rename 'score' to 'assignment_score'. The 'date_submitted' entry has no effect
# at this point because that column was dropped from merge_student_ass before
# it was merged in; rename silently ignores missing keys.

DDD_assignment = DDD_assignment.rename(
    columns={'score' : 'assignment_score', 'date_submitted': 'assignment_sub_date'}
)

In [None]:
# Remove the per-assessment identifier columns that are no longer needed

DDD_assignment = DDD_assignment.drop(columns=['id_assessment', 'assessment_type', 'weight'])

In [None]:
# Confirm the rename and the column drop
DDD_assignment.head(3)

Create the weighted score for the DDD module exams

In [None]:
# Create a dataframe to just hold exam results for DDD

# Explicit copy: the following cells rename and drop columns on this frame in
# place, which on a bare slice of CCC_DDD triggers SettingWithCopyWarning.
DDD_exams = CCC_DDD[(CCC_DDD['code_module'] == 'DDD') & (CCC_DDD['assessment_type'] == 'Exam')].copy()

In [None]:
# Inspect the DDD exam rows (columns, dtypes, non-null counts)
DDD_exams.info()

In [None]:
# Rename 'score' to 'exam_score'. The 'date_submitted' entry has no effect here
# because that column was dropped before the merge; rename ignores missing keys.

DDD_exams = DDD_exams.rename(
    columns={'score' : 'exam_score', 'date_submitted': 'exam_sub_date'}
)

In [None]:
# Remove the per-assessment identifier columns that are no longer needed

DDD_exams = DDD_exams.drop(columns=['id_assessment', 'assessment_type', 'weight'])

In [None]:
# Confirm the rename and the column drop on the DDD exam rows
DDD_exams.info()

In [None]:
# Merge DDD_assignments and DDD_exams
# Left join on every student-level column so each assignment row gains the
# student's exam score (missing when no exam row matches).

DDD_grades = DDD_assignment.merge(
    DDD_exams,
    how='left',
    on=[
        'id_student', 'code_module', 'code_presentation',
        'date_registration', 'date_unregistration', 'module_presentation_length',
        'gender', 'region', 'highest_education', 'imd_band', 'age_band',
        'num_of_prev_attempts', 'studied_credits', 'disability', 'final_result',
        'total_click','total_late_submission', 'total_assessments', 'late_rate',
    ],
)



In [None]:
# Inspect the merged DDD assignment/exam table, including how many exam scores are missing
DDD_grades.info()

Create an overall score for module DDD

In [None]:
# Treat null values in assignment_score and exam_score
# Change all nulls to 0 because that student did not complete the assignment/exam.
# fillna(0) is applied directly: the previous `.replace(np.nan)` call had no
# replacement value, which pandas treats as a forward-fill (older versions) or
# rejects (newer versions), so missing scores inherited the previous row's
# score instead of becoming 0.

DDD_grades['assignment_score'] = DDD_grades['assignment_score'].fillna(0)
DDD_grades['exam_score'] = DDD_grades['exam_score'].fillna(0)
# Create an overall score which weights the assignment and exam scores equally

DDD_grades['overall_score']= (DDD_grades['assignment_score'] * 0.5) + (DDD_grades['exam_score'] * 0.5)

In [None]:
# Confirm that the score columns no longer contain nulls and overall_score exists
DDD_grades.info()

# Calculate Assessment Scores for Module CCC

Since module DDD does not have a CMA and the CMA is only weighted as 25%, treat the TMA as 100% weighting.

Create a weighted assignment score and exam score for Module CCC

In [None]:
# Select the non-exam assessment rows of module CCC

CCC_assignment = CCC_DDD.loc[
    CCC_DDD['code_module'].eq('CCC') & CCC_DDD['assessment_type'].ne('Exam')
]

In [None]:
# Drop the CMA assignments so that only TMA rows remain

CCC_assignment = CCC_assignment[CCC_assignment['assessment_type'] != 'CMA']

In [None]:
# Rename 'score' to 'assignment_score'. The 'date_submitted' entry has no effect
# because that column was dropped before the merge; rename ignores missing keys.

CCC_assignment = CCC_assignment.rename(
    columns={'score' : 'assignment_score', 'date_submitted': 'assignment_sub_date'}
)

In [None]:
# Remove the per-assessment identifier columns that are no longer needed

CCC_assignment = CCC_assignment.drop(columns=['id_assessment', 'assessment_type', 'weight'])

Create a score for the CCC module exams

In [None]:
# Create a dataframe to just hold exam results for CCC

# Filter on module 'CCC'. The previous filter selected 'DDD' (copy-paste slip),
# so the later merge with CCC_assignment on code_module could never match and
# every CCC exam score ended up missing.
# Explicit copy because the next cells rename and drop columns in place.
CCC_exams = CCC_DDD[(CCC_DDD['code_module'] == 'CCC') & (CCC_DDD['assessment_type'] == 'Exam')].copy()

In [None]:
# Rename 'score' to 'exam_score'. The 'date_submitted' entry has no effect here
# because that column was dropped before the merge; rename ignores missing keys.

CCC_exams = CCC_exams.rename(
    columns={'score' : 'exam_score', 'date_submitted': 'exam_sub_date'}
)

In [None]:
# Remove the per-assessment identifier columns that are no longer needed

CCC_exams = CCC_exams.drop(columns=['id_assessment', 'assessment_type', 'weight' ])

In [None]:
# Merge CCC_assignments and CCC_exams
# Left join on every student-level column so each assignment row gains the
# student's exam score (missing when no exam row matches).

CCC_grades = CCC_assignment.merge(
    CCC_exams,
    how='left',
    on=[
        'id_student', 'code_module', 'code_presentation',
        'date_registration', 'date_unregistration', 'module_presentation_length',
        'gender', 'region', 'highest_education', 'imd_band', 'age_band',
        'num_of_prev_attempts', 'studied_credits', 'disability', 'final_result',
        'total_click','total_late_submission', 'total_assessments', 'late_rate',
    ],
)



In [None]:
# Create an overall score

# Treat null values in assignment_score and exam_score
# Change all nulls to 0 because that student did not complete the assignment/exam.
# fillna(0) is applied directly: the previous `.replace(np.nan)` call had no
# replacement value, which pandas treats as a forward-fill (older versions) or
# rejects (newer versions), so missing scores inherited the previous row's
# score instead of becoming 0.

CCC_grades['assignment_score'] = CCC_grades['assignment_score'].fillna(0)
CCC_grades['exam_score'] = CCC_grades['exam_score'].fillna(0)
# Create an overall score which combines the assignment (33%) and exam (66%) scores
CCC_grades['overall_score']= (CCC_grades['assignment_score'] * 0.33) + (CCC_grades['exam_score'] * 0.66)

In [None]:
# Confirm that the score columns no longer contain nulls and overall_score exists
CCC_grades.info()

In [None]:
# Create an overall score which combines the assignment and exam scores 
# NOTE: this repeats the formula already applied at the end of the earlier
# "Create an overall score" cell, so re-running it leaves the column unchanged.

CCC_grades['overall_score']= (CCC_grades['assignment_score'] * 0.33) + (CCC_grades['exam_score'] * 0.66)

In [None]:
# Merge DDD_grades and CCC_grades
# The two frames are stacked row-wise (DDD rows first). No ignore_index is
# passed, so each frame keeps its own index labels in the result.

frames= [DDD_grades, CCC_grades]
grades = pd.concat(frames)

In [None]:
# Inspect the combined CCC + DDD grades table
grades.info()

# Data Cleaning

In [None]:
# Rename the dataframe from 'grades' to 'data'
# NOTE: this binds a second name to the same object (no copy is made), so
# in-place changes to `data` are also visible through `grades` until `data`
# is reassigned.

data=grades

In [None]:
# Change the datatype of 'id_student' from int to object
# (the id is an identifier, not a quantity)

data['id_student'] = data['id_student'].astype(object)

In [None]:
# Move id_student to the front; every other column keeps its relative order

col_list = ['id_student'] + [name for name in data.columns if name != 'id_student']
data = data[col_list]

In [None]:
# The registration and unregistration dates are not used beyond this point
data = data.drop(columns=['date_registration', 'date_unregistration'])

In [None]:
# Check for missing values

def missing_values_table(data):
    """Summarise the columns of ``data`` that contain missing values.

    Prints the total number of columns and how many of them have at least one
    missing value, then returns a DataFrame indexed by column name with the
    columns 'Missing Values' (count) and '% of Total Values' (share of rows,
    rounded to one decimal), sorted by percentage in descending order.
    Columns without missing values are left out.
    """
    null_counts = data.isnull().sum()
    summary = pd.DataFrame({
        'Missing Values': null_counts,
        '% of Total Values': 100 * null_counts / len(data),
    })

    # Keep only the columns with missing values, worst first
    has_missing = summary['% of Total Values'] != 0
    summary = summary[has_missing]
    summary = summary.sort_values('% of Total Values', ascending=False)
    summary = summary.round(1)

    n_columns = str(data.shape[1])
    n_with_missing = str(summary.shape[0])
    print("The OULA dataset has " + n_columns + " columns.\n"
          "There are " + n_with_missing + " columns that have missing values.")
    return summary

In [None]:
# List the columns that still contain missing values before imputation
missing_values_table(data)

In [None]:
# To treat the missing imd_band values, first find the most frequent band in
# each region that has at least one missing imd_band
regions_list = list(data.loc[data['imd_band'].isnull(), 'region'].unique())

for i in regions_list:
    in_region = data['region'] == i
    result = data[in_region].imd_band.mode()
    print(f'{i} IMD band : \n', result)

In [None]:
# Replace all null imd_band values with the most frequent imd_band of the same region
regions_list = list(data.loc[data['imd_band'].isnull(), 'region'].unique())

for i in regions_list:
    in_region = (data['region'] == i).to_numpy()
    region_modes = data.loc[in_region, 'imd_band'].mode()
    if region_modes.empty:
        # No known band in this region (or the region itself is missing):
        # there is nothing to impute from, so leave the nulls in place.
        continue
    # mode() returns a Series and may hold several tied values; take the first
    # one explicitly. Passing the whole Series to np.where only worked when it
    # happened to contain exactly one value.
    fill_mask = data['imd_band'].isnull().to_numpy() & in_region
    data.loc[fill_mask, 'imd_band'] = region_modes.iloc[0]

In [None]:
# Re-check the missing values after the imd_band imputation
missing_values_table(data)

In [None]:
# Check for duplicates
# data.duplicated() flags rows that repeat an earlier row in every column

print("Percentage of duplicated values in OULA is ", data.duplicated().sum() * 100 / len(data))

In [None]:
# Keep only the first row per student.
# NOTE: the de-duplication key is id_student alone, so this removes every later
# row of the same student (e.g. further assessment rows with different scores),
# not just rows that are exact duplicates.

data= data.drop_duplicates(subset='id_student', keep= 'first')

In [None]:
# Re-check the share of fully duplicated rows after keeping one row per student
print("Percentage of duplicated values in OULA is ", data.duplicated().sum() * 100 / len(data))

# Feature Engineering

In [None]:
# Transform the Highest Education category:
# fold 'No Formal quals' into 'Lower Than A Level' and post-graduate
# qualifications into 'HE Qualification'; all other values are kept as they are
data['highest_education'] = data['highest_education'].replace({
    'No Formal quals': 'Lower Than A Level',
    'Post Graduate Qualification': 'HE Qualification',
})



In [None]:
# Merge the '55<=' and '35-55' groups into a single '35+' group
data['age_band'] = data['age_band'].replace({
    '55<=': '35+',
    '35-55': '35+',
})

In [None]:
# Collapse the regions into four broad areas with a single mapping;
# any value not listed is kept unchanged.
data['region'] = data['region'].replace({
    # Northern UK: Northern England, Scotland and Ireland (assuming Ireland is N.I.)
    'Yorkshire Region': 'North UK',
    'North Region': 'North UK',
    'Scotland': 'North UK',
    'Ireland': 'North UK',
    # Southern UK: London and the South Region
    'London Region': 'South UK',
    'South Region': 'South UK',
    # Eastern UK: the South East plus Eastern England
    'South East Region': 'East UK',
    'East Anglian Region': 'East UK',
    'East Midlands Region': 'East UK',
    # Western UK: the South West, Western England and Wales
    'South West Region': 'West UK',
    'North Western Region': 'West UK',
    'West Midlands Region': 'West UK',
    'Wales': 'West UK',
})

In [None]:
# Create three categories: disadvantaged (0-30%), middle class (30-80%), privileged (80-100%)
# The '10-20' key is written without a percent sign, exactly as in the original
# comparison; any value not listed (including nulls) is kept unchanged.
data['imd_band'] = data['imd_band'].replace({
    '0-10%': 'Disadvantaged',
    '10-20': 'Disadvantaged',
    '20-30%': 'Disadvantaged',
    '30-40%': 'Middle Class',
    '40-50%': 'Middle Class',
    '50-60%': 'Middle Class',
    '60-70%': 'Middle Class',
    '70-80%': 'Middle Class',
    '80-90%': 'Privileged',
    '90-100%': 'Privileged',
})

In [None]:
# Create a new column, 'dropout', which treats students who withdrew as dropouts
# '0' : Not Withdrawn (Distinction, Pass or Fail), '1': anything else

data['dropout'] = (
    ~data['final_result'].isin(['Distinction', 'Pass', 'Fail'])
).astype('int64').to_numpy()

In [None]:
# Create a 'student_failed' column which indicates whether the student failed the course.
# '0' : Did not fail (Distinction, Pass or Withdrawn), '1': anything else

data['student_failed'] = (
    ~data['final_result'].isin(['Distinction', 'Pass', 'Withdrawn'])
).astype('int64').to_numpy()

In [None]:
# final_result is now fully represented by 'dropout' and 'student_failed', so remove it

data = data.drop(columns='final_result')

In [None]:
# Show boxplots for student results of dropouts
# One panel per score column (assignment, exam, overall), split by the dropout flag
fig, ax = plt.subplots(1, 3, figsize=(15, 5)) 
sns.boxplot(x="dropout", y='assignment_score', ax=ax[0], data=data)
sns.boxplot(x="dropout", y='exam_score', ax=ax[1], data=data)
sns.boxplot(x="dropout", y='overall_score', ax=ax[2], data=data)

In [None]:
# Show boxplots for student results of fails 
# One panel per score column (assignment, exam, overall), split by the student_failed flag
fig, ax = plt.subplots(1, 3, figsize=(15, 5)) 
sns.boxplot(x='student_failed', y='assignment_score', ax=ax[0], data=data)
sns.boxplot(x='student_failed', y='exam_score', ax=ax[1], data=data)
sns.boxplot(x='student_failed', y='overall_score', ax=ax[2], data=data)

# Model Preparation for Student Failure

In [None]:
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Build the design matrix with 'student_failed' as the response variable and the
# candidate predictors as terms of the formula
y, X = dmatrices('student_failed ~ code_module+code_presentation+gender+region+highest_education+imd_band+age_band+num_of_prev_attempts+disability+studied_credits+total_click+late_rate', data=data, return_type='dataframe')

# Calculate the variance inflation factor for each column of the design matrix
vif = pd.DataFrame()
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['variable'] = X.columns

# View the VIF of each design-matrix column
vif

No variable exceeds the threshold of a VIF score of 5 so all the variables will be retained.

In [None]:
# The student identifier carries no predictive information, so remove it before modelling

data = data.drop(columns='id_student')

In [None]:
# Features: everything except the two target flags
# Target: whether the student failed

x = data.drop(columns=['student_failed', 'dropout'])
y = data['student_failed']

In [None]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler, RobustScaler
from sklearn.compose import make_column_transformer

# Set encoding and scaling instructions. Columns of x that are not listed in
# any of the three groups are not passed to an encoder or scaler.
# NOTE(review): 'code_module' is not one-hot encoded here although the dropout
# model later in the notebook includes it - confirm this is intentional.
column_transform = make_column_transformer(
    (
        OneHotEncoder(),
        ['code_presentation', 'gender', 'region', 'age_band', 'disability'],
    ),
    (
        OrdinalEncoder(),
        ['highest_education', 'imd_band'],
    ),
    (
        RobustScaler(),
        ['num_of_prev_attempts', 'studied_credits', 'total_click', 'late_rate'],
    ),
)

# Fit the transformer on the features and encode them
X_encoded = column_transform.fit_transform(x)

In [None]:
#Splitting data into train and test dataset
# NOTE: the model functions defined below perform their own split on the data
# they are given, so this split is not what they train and test on.

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X_encoded, y , test_size = 0.2, random_state  = 365)

In [None]:
# Seeded 5-fold splitter with shuffling.
# NOTE(review): `kf` is not referenced by any cell visible in this part of the
# notebook - confirm whether it is still needed.
from sklearn.model_selection import KFold 
kf = KFold(n_splits=5, shuffle=True, random_state=40) 

In [None]:
from sklearn.utils import resample 
from imblearn.over_sampling import SMOTE 
# Seed the oversampler so the synthetic minority samples (and every model score
# computed from them) are reproducible across Restart & Run All.
smote = SMOTE(random_state=40)

In [None]:
# Oversample the minority class so that both classes are equally represented.
# NOTE(review): the whole encoded set is resampled here, before the train/test
# split that happens inside the model functions, so synthetic samples can be
# interpolated from rows that later land in the test fold. Consider resampling
# the training fold only.
x_sm, y_sm = smote.fit_resample(X_encoded, y)

print(X_encoded.shape, y.shape)
print(x_sm.shape, y_sm.shape)
# Pass the labels by keyword: recent seaborn versions treat the first
# positional argument of countplot as `data`, not `x`.
sns.countplot(x=y_sm);

# Modelling for Student Failure

******************************

In [None]:
from sklearn.metrics import confusion_matrix 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, cross_val_predict, train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, auc, accuracy_score, mean_absolute_error
from sklearn import metrics, tree

def logistic_regression(x, y):
    """Fit a logistic regression on a stratified 80/20 split and report its quality.

    Parameters
    ----------
    x : feature matrix accepted by scikit-learn (rows are samples).
    y : class labels aligned with the rows of ``x``.

    Returns
    -------
    None. Shows a heatmap of the test confusion matrix and prints the test
    accuracy, the mean absolute error of the test predictions and the
    classification reports for the train and the test split.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.20, random_state=40, stratify=y)
    logreg_model = LogisticRegression()
    logreg_model.fit(x_train, y_train)

    pred_train = logreg_model.predict(x_train)
    pred_test = logreg_model.predict(x_test)

    # Accuracy and confusion matrix are computed once from the stored
    # predictions; the estimator's score() would predict the test split again.
    test_accuracy = accuracy_score(y_test, pred_test)
    score = round(test_accuracy, 3)
    cm_test = confusion_matrix(y_test, pred_test)

    sns.heatmap(cm_test, annot=True, fmt=".0f")
    plt.xlabel('Predicted Values')
    plt.ylabel('Actual Values')
    plt.title('Accuracy Score: {0}'.format(score), size = 15)
    plt.show()

    print("Accuracy of Test Model : ", test_accuracy)
    print("Mean Absolute Error : ", mean_absolute_error(y_test, pred_test))
    print("Train Data Set")
    print(metrics.classification_report(y_train, pred_train))
    print("Test Data Set ")
    print(metrics.classification_report(y_test, pred_test))

In [None]:
# Fit and evaluate logistic regression on the SMOTE-balanced failure data
logistic_regression(x_sm, y_sm) 
# NOTE: this filter is installed after the call above has returned, so it only
# silences warnings raised by code that runs later
warnings.filterwarnings('ignore')

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

def decision_tree(x, y):
    """Fit a decision tree on a stratified 80/20 split and report its quality.

    Parameters
    ----------
    x : feature matrix accepted by scikit-learn (rows are samples).
    y : class labels aligned with the rows of ``x``.

    Returns
    -------
    None. Shows a heatmap of the test confusion matrix and prints the test
    accuracy, the mean absolute error of the test predictions and the
    classification reports for the train and the test split.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.20, random_state=40, stratify=y)
    # Seed the tree so ties between equally good splits are broken the same
    # way on every run and the reported scores are reproducible.
    dt = DecisionTreeClassifier(random_state=40)
    dt.fit(x_train, y_train)

    pred_train = dt.predict(x_train)
    pred_test = dt.predict(x_test)

    # Accuracy and confusion matrix are computed once from the stored
    # predictions; the estimator's score() would predict the test split again.
    test_accuracy = accuracy_score(y_test, pred_test)
    score = round(test_accuracy, 3)
    cm_test = confusion_matrix(y_test, pred_test)

    sns.heatmap(cm_test, annot=True, fmt=".0f")
    plt.xlabel('Predicted Values')
    plt.ylabel('Actual Values')
    plt.title('Accuracy Score: {0}'.format(score), size = 15)
    plt.show()

    print("Accuracy of Test Model : ", test_accuracy)
    print("Mean Absolute Error : ", mean_absolute_error(y_test, pred_test))
    print("Train Data Set")
    print(metrics.classification_report(y_train, pred_train))
    print("Test Data Set ")
    print(metrics.classification_report(y_test, pred_test))

In [None]:
# Fit and evaluate the decision tree on the SMOTE-balanced failure data
decision_tree(x_sm, y_sm) 
# NOTE: installed after the call above, so it only affects code that runs later
warnings.filterwarnings('ignore')

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

def random_forest(x, y):
    """Fit a random forest on a stratified 80/20 split and report its quality.

    Parameters
    ----------
    x : feature matrix accepted by scikit-learn (rows are samples).
    y : class labels aligned with the rows of ``x``.

    Returns
    -------
    None. Shows a heatmap of the test confusion matrix and prints the test
    accuracy, the mean absolute error of the test predictions and the
    classification reports for the train and the test split.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.20, random_state=40, stratify=y)
    # Seed the forest: bootstrap samples and feature subsets are random, so an
    # unseeded forest reports different scores on every run.
    rf = RandomForestClassifier(random_state=40)
    rf.fit(x_train, y_train)

    pred_train = rf.predict(x_train)
    pred_test = rf.predict(x_test)

    # Accuracy and confusion matrix are computed once from the stored
    # predictions; the estimator's score() would predict the test split again.
    test_accuracy = accuracy_score(y_test, pred_test)
    score = round(test_accuracy, 3)
    cm_test = confusion_matrix(y_test, pred_test)

    sns.heatmap(cm_test, annot=True, fmt=".0f")
    plt.xlabel('Predicted Values')
    plt.ylabel('Actual Values')
    plt.title('Accuracy Score: {0}'.format(score), size = 15)
    plt.show()

    print("Accuracy of Test Model : ", test_accuracy)
    print("Mean Absolute Error : ", mean_absolute_error(y_test, pred_test))
    print("Train Data Set")
    print(metrics.classification_report(y_train, pred_train))
    print("Test Data Set ")
    print(metrics.classification_report(y_test, pred_test))

In [None]:
# Fit and evaluate the random forest on the SMOTE-balanced failure data
random_forest(x_sm, y_sm) 
# NOTE: installed after the call above, so it only affects code that runs later
warnings.filterwarnings('ignore')

# Support Vector Machine

In [None]:
from sklearn.svm import SVC 

def support_vector_machine(x, y):
    """Fit a linear-kernel SVM (C=1) on a stratified 75/25 split and report its quality.

    Parameters
    ----------
    x : feature matrix accepted by scikit-learn (rows are samples).
    y : class labels aligned with the rows of ``x``.

    Returns
    -------
    None. Shows a heatmap of the test confusion matrix and prints the test
    accuracy, the mean absolute error of the test predictions and the
    classification reports for the train and the test split.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.25, random_state=40, stratify=y)
    svm_model_linear = SVC(kernel='linear', C=1).fit(x_train, y_train)

    # Each split is predicted exactly once; the earlier version predicted the
    # test split three times (an unused variable, pred_test and score()).
    pred_train = svm_model_linear.predict(x_train)
    pred_test = svm_model_linear.predict(x_test)

    test_accuracy = accuracy_score(y_test, pred_test)
    score = round(test_accuracy, 3)
    cm_test = confusion_matrix(y_test, pred_test)

    sns.heatmap(cm_test, annot=True, fmt=".0f")
    plt.xlabel('Predicted Values')
    plt.ylabel('Actual Values')
    plt.title('Accuracy Score: {0}'.format(score), size = 15)
    plt.show()

    print("Accuracy of Test Model : ", test_accuracy)
    print("Mean Absolute Error : ", mean_absolute_error(y_test, pred_test))
    print("Train Data Set")
    print(metrics.classification_report(y_train, pred_train))
    print("Test Data Set ")
    print(metrics.classification_report(y_test, pred_test))

In [None]:
# Fit and evaluate the linear SVM on the SMOTE-balanced failure data
support_vector_machine(x_sm, y_sm) 
# NOTE: installed after the call above, so it only affects code that runs later
warnings.filterwarnings('ignore')

# XGBoost

In [None]:
# Seeded, stratified 75/25 split so the XGBoost scores are reproducible and the
# class ratio is preserved in both parts (matching the splits used by the
# other model functions in this notebook).
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.25, random_state=40, stratify=y)

In [None]:
# Wrap the train and test splits in XGBoost's DMatrix container, labels included
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [None]:
# Also write both splits to svmlight text files in the working directory and
# load them back as DMatrix objects.
# NOTE(review): dtrain_svm / dtest_svm are not used by the training cell below,
# which trains on dtrain / dtest - confirm whether this round trip is needed.
dump_svmlight_file(X_train, y_train, 'dtrain.svm', zero_based=True)
dump_svmlight_file(X_test, y_test, 'dtest.svm', zero_based=True)
dtrain_svm = xgb.DMatrix('dtrain.svm')
dtest_svm = xgb.DMatrix('dtest.svm')

In [None]:
param = {
    'max_depth': 3,  # the maximum depth of each tree
    'eta': 0.3,  # the training step for each iteration
    'silent': 1,  # logging mode - quiet
    'objective': 'multi:softprob',  # per-class probabilities; the next cell takes the argmax
    # The target is a 0/1 flag, so there are two classes. The previous value
    # of 3 made the model carry a class that never occurs in the labels.
    'num_class': 2}
num_round = 50  # the number of training iterations

In [None]:
# Train the booster on the in-memory DMatrix and predict the test split
from sklearn.metrics import precision_score
bst = xgb.train(param, dtrain, num_round)
preds = bst.predict(dtest)
# For each test row keep the index of the class with the highest predicted probability
best_preds = np.asarray(preds).argmax(axis=1)
print ("Numpy array precision:", precision_score(y_test, best_preds, average='macro'))

In [None]:
# Printed in this order: accuracy, macro-averaged recall, macro-averaged F1, macro-averaged precision
print(accuracy_score(y_test, best_preds))
print(recall_score(y_test, best_preds, average='macro'))
print(f1_score(y_test, best_preds, average='macro'))
print(precision_score(y_test, best_preds, average='macro'))

In [None]:
# Confusion-matrix heatmap and classification report for the XGBoost test predictions
score = round(accuracy_score(y_test, best_preds), 3)
cm1 = confusion_matrix(y_test, best_preds)
sns.heatmap(cm1, annot=True, fmt=".0f")
plt.xlabel('Predicted Values')
plt.ylabel('Actual Values')
plt.title('Accuracy Score: {0}'.format(score), size = 15)
plt.show()
print(metrics.classification_report(y_test,best_preds) )

# Adding PCA

In [None]:
# Fit a PCA with all components on the resampled features and plot the
# cumulative share of feature variance explained by the first k components
pca = sklearnPCA().fit(x_sm)
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')

About 12 components explain 95% of the variance in the encoded feature set used to predict student failure.

In [None]:
# Project the resampled features onto the first 12 principal components
sklearn_pca = sklearnPCA(n_components=12)

print("===========Data Summary===========")
pca_train_x = sklearn_pca.fit_transform(x_sm)
print("PCA Training Data :", pca_train_x.shape)

# Reuse the projection fitted above with transform() instead of fitting the
# PCA a second time.
# NOTE(review): both arrays are projections of the same resampled set x_sm;
# there is no separate held-out set at this point.
pca_test_x = sklearn_pca.transform(x_sm)
print("PCA Testing Data :", pca_test_x.shape)

In [None]:
# Re-run logistic regression with PCA
# (input: the 12-component projection of the resampled features)

logistic_regression(pca_train_x, y_sm) 
# NOTE: installed after the call above, so it only affects code that runs later
warnings.filterwarnings('ignore')

In [None]:
# Re-run the decision tree on the PCA projection
decision_tree(pca_train_x, y_sm) 
# NOTE: installed after the call above, so it only affects code that runs later
warnings.filterwarnings('ignore')

In [None]:
# Re-run the random forest on the PCA projection
random_forest(pca_train_x, y_sm) 
# NOTE: installed after the call above, so it only affects code that runs later
warnings.filterwarnings('ignore')

In [None]:
# Re-run the linear SVM on the PCA projection
support_vector_machine(pca_train_x, y_sm) 
# NOTE: installed after the call above, so it only affects code that runs later
warnings.filterwarnings('ignore')

In [None]:
# XGBoost on the PCA-reduced features

# Project the encoded features with the PCA fitted earlier in this section.
# The previous version split X_encoded directly, so this "with PCA" run never
# used the PCA and simply repeated the earlier XGBoost experiment.
X_pca = sklearn_pca.transform(X_encoded)
# Seeded, stratified split for reproducible scores
X_train, X_test, y_train, y_test = train_test_split(
    X_pca, y, test_size=0.25, random_state=40, stratify=y)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
# Also write both splits to svmlight text files and load them back as DMatrix objects
dump_svmlight_file(X_train, y_train, 'dtrain.svm', zero_based=True)
dump_svmlight_file(X_test, y_test, 'dtest.svm', zero_based=True)
dtrain_svm = xgb.DMatrix('dtrain.svm')
dtest_svm = xgb.DMatrix('dtest.svm')

In [None]:
param = {
    'max_depth': 3,  # the maximum depth of each tree
    'eta': 0.3,  # the training step for each iteration
    'silent': 1,  # logging mode - quiet
    'objective': 'multi:softprob',  # per-class probabilities; the next cell takes the argmax
    # The target is a 0/1 flag, so there are two classes. The previous value
    # of 3 made the model carry a class that never occurs in the labels.
    'num_class': 2}
num_round = 50  # the number of training iterations

In [None]:
# Train the booster on the in-memory DMatrix and predict the test split
from sklearn.metrics import precision_score
bst = xgb.train(param, dtrain, num_round)
preds = bst.predict(dtest)
# For each test row keep the index of the class with the highest predicted probability
best_preds = np.asarray(preds).argmax(axis=1)
print ("Numpy array precision:", precision_score(y_test, best_preds, average='macro'))

In [None]:
# Accuracy first, then macro-averaged recall, F1 and precision (one value per line)
print(accuracy_score(y_test, best_preds))
for macro_metric in (recall_score, f1_score, precision_score):
    print(macro_metric(y_test, best_preds, average='macro'))

In [None]:
# Confusion-matrix heatmap titled with the rounded accuracy, then the text report
cm1 = confusion_matrix(y_test, best_preds)
score = round(accuracy_score(y_test, best_preds), 3)
heatmap_ax = sns.heatmap(cm1, annot=True, fmt=".0f")
heatmap_ax.set_xlabel('Predicted Values')
heatmap_ax.set_ylabel('Actual Values')
heatmap_ax.set_title('Accuracy Score: {0}'.format(score), size = 15)
plt.show()
print(metrics.classification_report(y_test,best_preds) )

# Predicting Student Dropouts

In [None]:
# Target variable (y): the dropout column
# Independent variables (x): every remaining column
y = data['dropout']
x = data.drop(columns=['dropout'])

In [None]:
# Column groups, one per preprocessing step
one_hot_cols = ['code_module', 'code_presentation', 'gender', 'region', 'age_band', 'disability']
ordinal_cols = ['highest_education', 'imd_band']
robust_scaled_cols = ['num_of_prev_attempts', 'studied_credits', 'total_click', 'late_rate']

# Encoding and scaling instructions
column_transform = make_column_transformer(
    (OneHotEncoder(), one_hot_cols),
    (OrdinalEncoder(), ordinal_cols),
    (RobustScaler(), robust_scaled_cols),
)

# Fit the transformer on the feature frame and encode it
X_encoded = column_transform.fit_transform(x)

In [None]:
# Hold out 20% of the encoded rows as a test set (fixed seed for a repeatable split)
x_train, x_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=365,
)

In [None]:
# Seed the oversampler so the synthetic rows (and every score derived from
# them) are the same on each run; 365 matches the seed used for the split.
smote = SMOTE(random_state=365)

In [None]:
# Resample with SMOTE, then compare shapes and class counts before and after.
# NOTE(review): the whole encoded set is resampled here, before any train/test
# split, so synthetic rows can be built from rows that later land in a test
# split - confirm this is intended.
x_sm, y_sm = smote.fit_resample(X_encoded, y)

print(X_encoded.shape, y.shape)
print(x_sm.shape, y_sm.shape)
# Pass the labels by keyword: newer seaborn releases treat the first positional
# argument as `data`, not `x`.
sns.countplot(x=y_sm)

In [None]:
# Logistic regression on the resampled dropout data.
# Install the filter first so it also covers warnings raised by the call below
warnings.filterwarnings('ignore')
logistic_regression(x_sm, y_sm)

In [None]:
# Decision tree on the resampled dropout data.
# Install the filter first so it also covers warnings raised by the call below
warnings.filterwarnings('ignore')
decision_tree(x_sm, y_sm)

In [None]:
# Random forest on the resampled dropout data.
# Install the filter first so it also covers warnings raised by the call below
warnings.filterwarnings('ignore')
random_forest(x_sm, y_sm)

In [None]:
# Support vector machine on the resampled dropout data.
# Install the filter first so it also covers warnings raised by the call below
warnings.filterwarnings('ignore')
support_vector_machine(x_sm, y_sm)

In [None]:
# XGBoost

# Fixed seed so the 75/25 split (and the scores reported below) are repeatable
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.25, random_state=365)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
# Also write both splits to disk in svmlight format and load them back as DMatrix objects
dump_svmlight_file(X_train, y_train, 'dtrain.svm', zero_based=True)
dump_svmlight_file(X_test, y_test, 'dtest.svm', zero_based=True)
dtrain_svm = xgb.DMatrix('dtrain.svm')
dtest_svm = xgb.DMatrix('dtest.svm')

# Booster configuration
# NOTE(review): num_class is 3 here as in the failure model - confirm it matches the dropout labels.
param = {
    'max_depth': 3,  # the maximum depth of each tree
    'eta': 0.3,  # the training step for each iteration
    'verbosity': 0,  # logging mode - quiet (replaces the deprecated 'silent' flag)
    'objective': 'multi:softprob',  # error evaluation for multiclass training
    'num_class': 3}  # the number of classes that exist in this dataset
num_round = 50  # the number of training iterations

In [None]:
# Train the booster, score the held-out matrix, and reduce each row of class
# probabilities to the index of its largest entry.
bst = xgb.train(param, dtrain, num_round)
preds = bst.predict(dtest)
best_preds = np.argmax(preds, axis=1)
print ("Numpy array precision:", precision_score(y_test, best_preds, average='macro'))

In [None]:
# Accuracy first, then macro-averaged recall, F1 and precision (one value per line)
print(accuracy_score(y_test, best_preds))
for macro_metric in (recall_score, f1_score, precision_score):
    print(macro_metric(y_test, best_preds, average='macro'))

In [None]:
# Confusion-matrix heatmap titled with the rounded accuracy, then the text report
cm1 = confusion_matrix(y_test, best_preds)
score = round(accuracy_score(y_test, best_preds), 3)
heatmap_ax = sns.heatmap(cm1, annot=True, fmt=".0f")
heatmap_ax.set_xlabel('Predicted Values')
heatmap_ax.set_ylabel('Actual Values')
heatmap_ax.set_title('Accuracy Score: {0}'.format(score), size = 15)
plt.show()
print(metrics.classification_report(y_test,best_preds) )

# Adding PCA

In [None]:
# Fit a full PCA on the resampled features and plot how much variance is
# retained as components are added.
pca = sklearnPCA().fit(x_sm)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# Plot against 1..n so the x-axis is a true component count; the default
# 0-based index would place "1 component" at x = 0.
plt.plot(np.arange(1, len(cumulative_variance) + 1), cumulative_variance)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')

In [None]:
# Use 13 principal components
sklearn_pca = sklearnPCA(n_components=13)

print("===========Data Summary===========")
# Fit the projection on the resampled training features
pca_train_x = sklearn_pca.fit_transform(x_sm)
print("PCA Training Data :", pca_train_x.shape)

# Apply the already-fitted projection to the held-out rows from the earlier
# train/test split. Calling fit_transform(x_sm) here would refit on the
# training data and return the training projection again.
pca_test_x = sklearn_pca.transform(x_test)
print("PCA Testing Data :", pca_test_x.shape)

In [None]:
# Re-run logistic regression with PCA

# Install the filter first so it also covers warnings raised by the call below
warnings.filterwarnings('ignore')
logistic_regression(pca_train_x, y_sm)

In [None]:
# Re-run the decision tree on the PCA features.
# Install the filter first so it also covers warnings raised by the call below
warnings.filterwarnings('ignore')
decision_tree(pca_train_x, y_sm)

In [None]:
# Re-run the random forest on the PCA features.
# Install the filter first so it also covers warnings raised by the call below
warnings.filterwarnings('ignore')
random_forest(pca_train_x, y_sm)

In [None]:
# Re-run the support vector machine on the PCA features.
# Install the filter first so it also covers warnings raised by the call below
warnings.filterwarnings('ignore')
support_vector_machine(pca_train_x, y_sm)

In [None]:
# XGBoost on the PCA-projected, resampled features

# Fixed seed so the 75/25 split (and the scores reported below) are repeatable
X_train, X_test, y_train, y_test = train_test_split(pca_train_x, y_sm, test_size=0.25, random_state=365)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
# Also write both splits to disk in svmlight format and load them back as DMatrix objects
dump_svmlight_file(X_train, y_train, 'dtrain.svm', zero_based=True)
dump_svmlight_file(X_test, y_test, 'dtest.svm', zero_based=True)
dtrain_svm = xgb.DMatrix('dtrain.svm')
dtest_svm = xgb.DMatrix('dtest.svm')

# Booster configuration
param = {
    'max_depth': 3,  # the maximum depth of each tree
    'eta': 0.3,  # the training step for each iteration
    'verbosity': 0,  # logging mode - quiet (replaces the deprecated 'silent' flag)
    'objective': 'multi:softprob',  # error evaluation for multiclass training
    'num_class': 3}  # the number of classes that exist in this dataset
num_round = 50  # the number of training iterations

# training and testing - numpy matrices
# (the sklearn.metrics scorers used below are already imported in the notebook's import cell)
bst = xgb.train(param, dtrain, num_round)
preds = bst.predict(dtest)
# extracting most confident predictions: column index of the highest
# class probability in each row
best_preds = np.argmax(preds, axis=1)
print ("Numpy array precision:", precision_score(y_test, best_preds, average='macro'))

# Accuracy, then macro-averaged recall, F1 and precision
print(accuracy_score(y_test, best_preds))
print(recall_score(y_test, best_preds, average='macro'))
print(f1_score(y_test, best_preds, average='macro'))
print(precision_score(y_test, best_preds, average='macro'))

# Confusion-matrix heatmap titled with the rounded accuracy, then the text report
score = round(accuracy_score(y_test, best_preds), 3)
cm1 = confusion_matrix(y_test, best_preds)
sns.heatmap(cm1, annot=True, fmt=".0f")
plt.xlabel('Predicted Values')
plt.ylabel('Actual Values')
plt.title('Accuracy Score: {0}'.format(score), size = 15)
plt.show()
print(metrics.classification_report(y_test,best_preds) )

In [None]:
# Write the final dataset to a new csv file (without the index column)
# for the purpose of ANN

data.to_csv(path_or_buf='oulad_modelling.csv', index=False)

# Compare Model Performance

Models to predict student failure

In [None]:
# Plotting Bar Chart for Accuracy of different classifiers

plt.figure(figsize=(12, 3))
# Hard-coded accuracy values, listed in the same order as model_names
model_accuracies = [0.84, 0.79, 0.79, 0.74, 0.65, 0.64]
model_names = ['RandomForest','XGBoost','ANN - MLP wth PCA', 'Decision Tree', 'Logistic Regression', 'Support Vector Machines' ]
g= sns.barplot(x=model_accuracies, y=model_names, color='grey');

# Accuracy is on the x-axis, so the bars are horizontal and a bar's length is
# its width. Heights are identical for every bar and cannot rank the models.
bar_lengths = [patch.get_width() for patch in g.patches]
idx_best = np.argmax(bar_lengths)

# Highlight the most accurate model in green
g.patches[idx_best].set_facecolor('#2ecc71')
g.set_title('Model Accuracy for Student Failure', fontsize = 18)

Models to Predict Dropout

In [None]:
# Plotting Bar Chart for Accuracy of different classifiers

plt.figure(figsize=(12, 3))
# Hard-coded accuracy values, listed in the same order as model_names
model_accuracies = [0.86, 0.81, 0.80, 0.77, 0.74, 0.74]
model_names = ['RandomForest','XGBoost','Decision Tree', 'ANN - MLP wth PCA','Logistic Regression', 'Support Vector Machines' ]
g= sns.barplot(x=model_accuracies, y=model_names, color='grey');

# Accuracy is on the x-axis, so the bars are horizontal and a bar's length is
# its width. Heights are identical for every bar and cannot rank the models.
bar_lengths = [patch.get_width() for patch in g.patches]
idx_best = np.argmax(bar_lengths)

# Highlight the most accurate model in green
g.patches[idx_best].set_facecolor('#2ecc71')
g.set_title('Model Accuracy for Student Dropout', fontsize = 18)

# END