<img src="https://www.dbs.ie/images/default-source/logos/dbs-logo-2019-small.png" align="left" />

                            

# Predicting Student Dropout Using A.N.N.
Capstone Project

Claire Connaughton (10266499)

# Import Relevant Libraries 

In [None]:
import tensorflow as tf
import tensorflow.compat.v1 as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, Normalizer
from sklearn.decomposition import PCA as sklearnPCA
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler, RobustScaler
from sklearn.compose import make_column_transformer

# Suppress unnecessary warnings so that the presentation looks clean
import warnings
warnings.filterwarnings("ignore")

# Prepare dataset for Modelling

In [None]:
# Load the OULAD modelling dataset from CSV into a DataFrame

dataset = pd.read_csv(filepath_or_buffer='oulad_modelling.csv')

In [None]:
# Drop the 'student_failed' column, which was discarded in the EDA stage
# (done in place, so `dataset` itself is modified)

dataset.drop(labels=['student_failed'], axis=1, inplace=True)

In [None]:
# Print the dataset shape as a (rows, columns) tuple

print("This dataset is consisted of",dataset.shape)

In [None]:
# Summarise the columns: names, non-null counts and dtypes
dataset.info()

In [None]:
# Hold out 30% of the rows as the test set; the fixed random_state keeps
# the split reproducible between runs

train, test = train_test_split(dataset, random_state=42, test_size=0.3)

print("Training Data :", train.shape)
print("Testing Data :", test.shape)

In [None]:
# Give the train and test sets more descriptive names
# (these are aliases of the same DataFrames, not copies)
train_data, test_data = train, test

In [None]:
# Treat categorical variables and normalise continuous variables

# Set encoding and scaling instructions:
#   - nominal features are one-hot encoded,
#   - ordered features are integer-coded,
#   - numeric features are scaled with RobustScaler.
# Columns that are not listed are dropped (make_column_transformer's default
# `remainder='drop'`), and the output columns follow the order given here.
# NOTE(review): OrdinalEncoder without explicit `categories` ranks the levels
# in sorted order — confirm that matches the intended ranking of
# 'highest_education' and 'imd_band'.
column_transform = make_column_transformer(
    (OneHotEncoder(), ['code_module', 'code_presentation', 'gender', 'region', 'age_band', 'disability']),
    (OrdinalEncoder(), ['highest_education', 'imd_band']),
    (RobustScaler(), ['num_of_prev_attempts', 'studied_credits', 'total_click', 'late_rate'])
)

# Fit the encoders/scaler on the training split only, then apply those fitted
# parameters to the test split. Re-fitting on the test split would leak test
# statistics and could produce a different column layout or scale than the
# one the model is trained on.
train_encoded = column_transform.fit_transform(train_data)
test_encoded = column_transform.transform(test_data)

In [None]:
# Wrap the transformed arrays in DataFrames (columns get default integer labels)
# NOTE(review): assumes the column transformer returned dense arrays — confirm
train_x, test_x = (pd.DataFrame(encoded) for encoded in (train_encoded, test_encoded))

In [None]:
# Keep the first 18 encoded columns as the model inputs
# NOTE(review): the encoded frames come from the column transformer, which
# only emits the listed feature columns, so there is no target column to
# remove here; this slice simply truncates to the leading 18 columns.
# Confirm that 18 is the full encoded width, otherwise features are lost.
train_x, test_x = train_x.iloc[:, :18], test_x.iloc[:, :18]

# Training Data
print("Training Data :", train_x.shape)

# Testing Data
print("Testing Data :", test_x.shape)

In [None]:
# Build the label frames from the un-encoded splits: every column from
# position 18 onwards
# NOTE(review): the model's Y placeholder has one column, so this presumes
# the target (dropout) is the only column at or after position 18 — confirm
train_y, test_y = train_data.iloc[:, 18:], test_data.iloc[:, 18:]

# Training Data
print("Training Data :", train_y.shape)

# Testing Data
print("Testing Data :", test_y.shape)

# Modelling

In [None]:
# Switch TensorFlow to TF1-style behaviour; the cells below rely on
# placeholders and sessions
tf.disable_v2_behavior()

In [None]:
# Placeholders receive their values through feed_dict when the session runs.
# X: any number of rows by 18 columns, one per input feature.
# Y: any number of rows by 1 column, the target.

X = tf.placeholder(dtype=tf.float32, shape=[None, 18])
Y = tf.placeholder(dtype=tf.float32, shape=[None, 1])

In [None]:
# Initialise the trainable parameters with seeded random-normal values
# W (weight) is [18, 1]: 18 input features mapped onto 1 target
# b (bias) is [1]: a single offset for the single output

# weight
W = tf.Variable(initial_value=tf.random_normal(shape=[18, 1], seed=0), name='weight')

# bias
b = tf.Variable(initial_value=tf.random_normal(shape=[1], seed=0), name='bias')

In [None]:
# Linear model output before the sigmoid: X @ W + b
logits = tf.matmul(X,W) + b

In [None]:
# Squash the logits into (0, 1) to obtain predicted probabilities
hypothesis = tf.sigmoid(logits)

# Per-row sigmoid cross-entropy computed directly from the logits, then averaged.
# Written out in terms of the probabilities this is:
#   -mean(Y * log(hypothesis) + (1 - Y) * log(1 - hypothesis))
cost_i = tf.nn.sigmoid_cross_entropy_with_logits(labels=Y, logits=logits)
cost = tf.reduce_mean(cost_i)

In [None]:
# Gradient-descent update op (learning rate 0.1) that minimises `cost`.
# NOTE: this rebinds the name `train`, which previously held the training
# split; that DataFrame remains reachable as `train_data`.
train = tf.train.GradientDescentOptimizer(learning_rate=0.1).minimize(cost)

In [None]:
# Compare the labels with the thresholded predictions

# 1.0 where the predicted probability exceeds 0.5, else 0.0
prediction = tf.cast(tf.greater(hypothesis, 0.5), dtype=tf.float32)
# Element-wise match against the labels, averaged into an accuracy score
correct_prediction = tf.equal(prediction, Y)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, dtype=tf.float32))

In [None]:
# Train the single-layer sigmoid model and report its accuracy
train_feed = {X: train_x, Y: train_y}
test_feed = {X: test_x, Y: test_y}

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    # 10001 full-batch gradient steps, logging every 1000th step
    for step in range(10001):
        sess.run(train, feed_dict=train_feed)
        if step % 1000:
            continue
        loss, acc = sess.run([cost, accuracy], feed_dict=train_feed)
        print("Step: {:5}\tLoss: {:.3f}\tAcc: {:.2%}".format(step, loss, acc))

    # Final accuracy on the training data, then on the held-out test data
    train_acc = sess.run(accuracy, feed_dict=train_feed)
    test_acc, test_predict, test_correct = sess.run(
        [accuracy, prediction, correct_prediction], feed_dict=test_feed
    )
    print("Model Prediction =", train_acc)
    print("Test Prediction =", test_acc)

In [None]:
# Apply the ANN - MLP Model

def ann_mlp():
    """Train an 18-34-34-54-1 sigmoid MLP and report its accuracy.

    Reads the module-level ``train_x``, ``train_y``, ``test_x`` and
    ``test_y``, adds the network to the default TensorFlow graph, runs 10001
    full-batch gradient-descent steps (learning rate 0.001) and prints the
    training loss and accuracy every 1000 steps.

    Returns:
        tuple: ``(train_acc, test_acc)`` accuracies measured after training.
    """
    print("===========Data Summary===========")
    print("Training Data :", train_x.shape)
    print("Testing Data :", test_x.shape)

    X = tf.placeholder(tf.float32, [None, 18])
    Y = tf.placeholder(tf.float32, [None, 1])

    def dense(inputs, n_in, n_out, weight_name, bias_name):
        # Affine map with seeded random-normal weights and bias
        weights = tf.Variable(tf.random_normal([n_in, n_out], seed=0), name=weight_name)
        bias = tf.Variable(tf.random_normal([n_out], seed=0), name=bias_name)
        return tf.matmul(inputs, weights) + bias

    # input, hidden1 and hidden2 layers all use a sigmoid activation
    layer1 = tf.nn.sigmoid(dense(X, 18, 34, 'weight1', 'bias1'))
    layer2 = tf.nn.sigmoid(dense(layer1, 34, 34, 'weight2', 'bias2'))
    layer3 = tf.nn.sigmoid(dense(layer2, 34, 54, 'weight3', 'bias3'))

    # output layer: raw logits plus their sigmoid probabilities
    logits = dense(layer3, 54, 1, 'weight4', 'bias4')
    hypothesis = tf.nn.sigmoid(logits)

    # Mean sigmoid cross-entropy, computed from the logits
    cost_i = tf.nn.sigmoid_cross_entropy_with_logits(labels=Y, logits=logits)
    cost = tf.reduce_mean(cost_i)

    train = tf.train.GradientDescentOptimizer(learning_rate=0.001).minimize(cost)

    # Threshold at 0.5 and score against the labels
    prediction = tf.cast(hypothesis > 0.5, dtype=tf.float32)
    correct_prediction = tf.equal(prediction, Y)
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, dtype=tf.float32))

    train_feed = {X: train_x, Y: train_y}
    test_feed = {X: test_x, Y: test_y}

    print("\n============Processing============")
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        for step in range(10001):
            sess.run(train, feed_dict=train_feed)
            if step % 1000:
                continue
            loss, acc = sess.run([cost, accuracy], feed_dict=train_feed)
            print("Step: {:5}\tLoss: {:.3f}\tAcc: {:.2%}".format(step, loss, acc))

        train_acc = sess.run(accuracy, feed_dict=train_feed)
        test_acc, test_predict, test_correct = sess.run(
            [accuracy, prediction, correct_prediction], feed_dict=test_feed
        )

        print("\n============Results============")
        print("Model Prediction =", train_acc)
        print("Test Prediction =", test_acc)

        return train_acc, test_acc
    
# Build, train and evaluate the MLP, keeping the returned train and test accuracies
ann_mlp_train_acc, ann_mlp_test_acc = ann_mlp()

In [None]:
# Determine the optimal number of principal components

# Fit a PCA that keeps every component, then plot how the explained
# variance accumulates as components are added
pca = sklearnPCA()
pca.fit(train_x)
plt.plot(pca.explained_variance_ratio_.cumsum())
plt.ylabel('cumulative explained variance')
plt.xlabel('number of components')

11 Principal Components explain 95% of the variance.

In [None]:
# Apply ANN - MLP - PCA Model

def ann_mlp_pca():
    """Train and evaluate a sigmoid MLP on an 11-component PCA projection.

    Reads the module-level ``train_x``, ``train_y``, ``test_x`` and
    ``test_y``. PCA is fitted on the training features only and the same
    fitted projection is applied to the test features, so both sets live in
    the same component space. The 11-33-66-66-1 network is added to the
    default TensorFlow graph and trained with 10001 full-batch
    gradient-descent steps (learning rate 0.01); the training loss and
    accuracy are printed every 1000 steps.

    Returns:
        tuple: ``(train_acc, test_acc)`` accuracies measured after training.
    """
    n_components = 11
    sklearn_pca = sklearnPCA(n_components=n_components)

    print("===========Data Summary===========")
    pca_train_x = sklearn_pca.fit_transform(train_x)
    print("PCA Training Data :", pca_train_x.shape)

    # Project the test set with the components learned from the training set.
    # Re-fitting on the test set would produce a different basis from the one
    # the network is trained on, making the test accuracy meaningless.
    pca_test_x = sklearn_pca.transform(test_x)
    print("PCA Testing Data :", pca_test_x.shape)

    X = tf.placeholder(tf.float32, [None, n_components])
    Y = tf.placeholder(tf.float32, [None, 1])

    # input
    W1 = tf.Variable(tf.random_normal([n_components, 33], seed=0), name='weight1')
    b1 = tf.Variable(tf.random_normal([33], seed=0), name='bias1')
    layer1 = tf.nn.sigmoid(tf.matmul(X, W1) + b1)

    # hidden1
    W2 = tf.Variable(tf.random_normal([33, 66], seed=0), name='weight2')
    b2 = tf.Variable(tf.random_normal([66], seed=0), name='bias2')
    layer2 = tf.nn.sigmoid(tf.matmul(layer1, W2) + b2)

    # hidden2
    W3 = tf.Variable(tf.random_normal([66, 66], seed=0), name='weight3')
    b3 = tf.Variable(tf.random_normal([66], seed=0), name='bias3')
    layer3 = tf.nn.sigmoid(tf.matmul(layer2, W3) + b3)

    # output
    W4 = tf.Variable(tf.random_normal([66, 1], seed=0), name='weight4')
    b4 = tf.Variable(tf.random_normal([1], seed=0), name='bias4')
    logits = tf.matmul(layer3, W4) + b4
    hypothesis = tf.nn.sigmoid(logits)

    # Mean sigmoid cross-entropy, computed from the logits
    cost_i = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=Y)
    cost = tf.reduce_mean(cost_i)

    train = tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize(cost)

    # Threshold at 0.5 and score against the labels
    prediction = tf.cast(hypothesis > 0.5, dtype=tf.float32)
    correct_prediction = tf.equal(prediction, Y)
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, dtype=tf.float32))

    train_feed = {X: pca_train_x, Y: train_y}
    test_feed = {X: pca_test_x, Y: test_y}

    print("\n============Processing============")
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for step in range(10001):
            sess.run(train, feed_dict=train_feed)
            if step % 1000 == 0:
                loss, acc = sess.run([cost, accuracy], feed_dict=train_feed)
                print("Step: {:5}\tLoss: {:.3f}\tAcc: {:.2%}".format(step, loss, acc))

        # Only the accuracies are reported, so only `accuracy` is fetched
        train_acc = sess.run(accuracy, feed_dict=train_feed)
        test_acc = sess.run(accuracy, feed_dict=test_feed)

        print("\n============Results============")
        print("PCA Model Prediction =", train_acc)
        print("PCA Test Prediction =", test_acc)

        return train_acc, test_acc
        
# Build, train and evaluate the PCA-based MLP, keeping the returned train and test accuracies
ann_mlp_pca_train_acc, ann_mlp_pca_test_acc = ann_mlp_pca()

# END