# Demo

We will use the attributes gender, region, highest_education, age_band, num_of_prev_attempts, studied_credits, *imd_band, and disability to predict the class label final_result.

*imd_band is the Index of Multiple Deprivation band, a measure of poverty based on area in the UK.


studied_credits will be converted to the categorical bins listed below, and num_of_prev_attempts will be converted to a bool: True if there was a previous attempt, False if not.

|number|credit ranges|
|------|-------------|
|1|$\leq$ 59|
|2|60-119|
|3|120-179|
|4|180-239|
|5|$\geq$ 240 |



In [2]:
import importlib

from tabulate import tabulate
import mysklearn.myutils
importlib.reload(mysklearn.myutils)
import mysklearn.myutils as myutils

import mysklearn.mypytable
importlib.reload(mysklearn.mypytable)
from mysklearn.mypytable import MyPyTable 

import mysklearn.myclassifiers
importlib.reload(mysklearn.myclassifiers)
from mysklearn.myclassifiers import MyKNeighborsClassifier, MySimpleLinearRegressor, MyNaiveBayesClassifier, MyDecisionTreeClassifier

import mysklearn.myevaluation
importlib.reload(mysklearn.myevaluation)
import mysklearn.myevaluation as myevaluation

# Load the student records and discard every row that has a missing value.
student_data = MyPyTable().load_from_file("input_data/studentInfo.csv")
student_data.remove_rows_with_missing_values()

# Pull the attribute columns out of the table by name.
gender = student_data.get_column("gender")
region = student_data.get_column("region")
highest_education = student_data.get_column("highest_education")
age_band = student_data.get_column("age_band")
num_of_prev_attempts = student_data.get_column("num_of_prev_attempts")
studied_credits = student_data.get_column("studied_credits")

# Discretise the two numeric columns. The return value is not captured, so
# the helper presumably rewrites the lists in place (confirm in myutils).
myutils.convert_vals_into_cutoffs(num_of_prev_attempts, [0, 1], [False, True])
myutils.convert_vals_into_cutoffs(studied_credits, [59, 60, 120, 180, 240], [1, 2, 3, 4, 5])

# Copy the converted values back into the table rows: position 8 receives the
# previous-attempt value and position 9 receives the credit bin.
for record, attempted, credit_bin in zip(student_data.data, num_of_prev_attempts, studied_credits):
    record[8] = attempted
    record[9] = credit_bin

# Remaining columns are read after the write-back, as in the original order.
imd_band = student_data.get_column("imd_band")
disability = student_data.get_column("disability")
final_result = student_data.get_column("final_result")

# Column window of each table row that is handed to the classifiers, and the
# position of the class label within a row.
# NOTE(review): slice 3:10 yields seven columns, while the notebook intro
# lists eight attributes; confirm whether column 10 (presumably disability)
# was meant to be included.
FEATURE_COLUMNS = slice(3, 10)
LABEL_COLUMN = -1


def _materialize_folds(table, folds):
    """Turn folds of row indexes into parallel feature and label datasets.

    Args:
        table: list of rows (each row a list of values).
        folds: list of folds, each fold a list of row indexes into ``table``.

    Returns:
        A ``(feature_folds, label_folds)`` pair. ``feature_folds[k]`` holds
        the ``FEATURE_COLUMNS`` slice of every row in fold ``k`` and
        ``label_folds[k]`` holds the matching ``LABEL_COLUMN`` values, in the
        same order as the indexes appear in the fold.
    """
    feature_folds = []
    label_folds = []
    for fold in folds:
        feature_folds.append([table[index][FEATURE_COLUMNS] for index in fold])
        label_folds.append([table[index][LABEL_COLUMN] for index in fold])
    return feature_folds, label_folds


# Split the row indexes into 10 train/test folds, passing final_result as the
# labels to stratify on.
student_train_folds, student_test_folds = myevaluation.stratified_kfold_cross_validation(student_data.data, final_result, 10)

# Turn the index folds into data sets (one shared helper for both splits).
student_train, final_results_train = _materialize_folds(student_data.data, student_train_folds)
student_test, final_results_test = _materialize_folds(student_data.data, student_test_folds)

# Naive Bayes model: fit a fresh classifier on each training fold, predict the
# matching test fold, and pool predictions and true labels across all folds.
total_Naive = []
total_expected = []
# Iterate over the folds themselves rather than a hard-coded range(10), so
# this loop always follows the number of folds that were actually produced.
for train_X, train_y, test_X, test_y in zip(
    student_train, final_results_train, student_test, final_results_test
):
    student_Naive = MyNaiveBayesClassifier()
    student_Naive.fit(train_X, train_y)
    Naive_predictions = student_Naive.predict(test_X)
    total_Naive.extend(Naive_predictions)
    total_expected.extend(test_y)
print("===========================================")
print("Predictive Accuracy")
print("===========================================")
print("Stratified 10-Fold Cross Validation")
# Score the pooled predictions against the pooled true labels.
accuracy, errorrate = myutils.accuracy_errorrate(total_Naive, total_expected)
print("Naive Bayes: accuracy = ", accuracy, "error rate = ", errorrate)

# Decision tree model: fit a fresh tree on each training fold, predict the
# matching test fold, and pool predictions and true labels across all folds.
total_tree = []
total_expected = []  # reset so it lines up with total_tree only
# Iterate over the folds themselves rather than a hard-coded range(10), so
# this loop always follows the number of folds that were actually produced.
for train_X, train_y, test_X, test_y in zip(
    student_train, final_results_train, student_test, final_results_test
):
    student_tree = MyDecisionTreeClassifier()
    student_tree.fit(train_X, train_y)
    tree_predictions = student_tree.predict(test_X)
    total_tree.extend(tree_predictions)
    total_expected.extend(test_y)

# Score the pooled predictions against the pooled true labels.
accuracy, errorrate = myutils.accuracy_errorrate(total_tree, total_expected)
print("Tree: accuracy = ", accuracy, "error rate = ", errorrate)


# Class labels, in the order used for the matrix, the row captions and the
# table header.
result_labels = ("Pass", "Withdrawn", "Fail", "Distinction")

# Build the confusion matrix from the pooled decision tree results.
tree_matrix = myevaluation.confusion_matrix(total_expected, total_tree, list(result_labels))

# Append to each row its total and the percentage found on the diagonal;
# the percentage stays 0 when the row is empty.
for class_index, counts in enumerate(tree_matrix):
    total = sum(counts)
    rec = (counts[class_index] / total) * 100 if total != 0 else 0
    counts.append(total)
    counts.append(rec)

# Prefix each row with its class label so tabulate prints it as a caption.
for label, counts in zip(result_labels, tree_matrix):
    counts.insert(0, label)

print()
print("Decision Tree (Stratified 10 Fold Cross Validation Results)")
print(tabulate(tree_matrix, ["Final Result", *result_labels, "total", "Recognition %"]))






Predictive Accuracy
Stratified 10-Fold Cross Validation
Naive Bayes: accuracy =  0.4282447112635792 error rate =  0.5717552887364208
Tree: accuracy =  0.3956864239883108 error rate =  0.6043135760116892

Decision Tree (Stratified 10 Fold Cross Validation Results)
Final Result      Pass    Withdrawn    Fail    Distinction    total    Recognition %
--------------  ------  -----------  ------  -------------  -------  ---------------
Pass              7496         3066    1088            180    11830         63.3643
Withdrawn         4861         4014     936            109     9920         40.4637
Fail              3721         2253     872             61     6907         12.6249
Distinction       2017          544     189             75     2825          2.65487
