# Demo

We will be using the attributes gender, region, highest_education, age_band, num_of_prev_attempts, studied_credits, *imd_band, disability to predict the classifier final_result

*imd_band is a measure of poverty based on area in the UK.

In [10]:
import importlib

from tabulate import tabulate
import mysklearn.myutils
importlib.reload(mysklearn.myutils)
import mysklearn.myutils as myutils

import mysklearn.mypytable
importlib.reload(mysklearn.mypytable)
from mysklearn.mypytable import MyPyTable 

import mysklearn.myclassifiers
importlib.reload(mysklearn.myclassifiers)
from mysklearn.myclassifiers import MyKNeighborsClassifier, MySimpleLinearRegressor, MyNaiveBayesClassifier, MyDecisionTreeClassifier

import mysklearn.myevaluation
importlib.reload(mysklearn.myevaluation)
import mysklearn.myevaluation as myevaluation

student_data = MyPyTable().load_from_file("input_data/studentInfo.csv")
# remove missing values
student_data.remove_rows_with_missing_values()

gender = student_data.get_column("gender")
region = student_data.get_column("region")
highest_education = student_data.get_column("highest_education") 
age_band = student_data.get_column("age_band")
num_of_prev_attempts = student_data.get_column("num_of_prev_attempts") 
studied_credits = student_data.get_column("studied_credits")
imd_band = student_data.get_column("imd_band")
disability = student_data.get_column("disability")
final_result = student_data.get_column("final_result")

student_train_folds, student_test_folds = myevaluation.stratified_kfold_cross_validation(student_data.data, final_result, 10) 

student_test = []
student_train = []

final_results_test = []
final_results_train = []

# turn indexes into data sets
for row in student_train_folds:
    student_set = []
    final_results_set = []
    for item in row:
        student_set.append(student_data.data[item][3:10])
        final_results_set.append(student_data.data[item][-1])
    student_train.append(student_set)
    final_results_train.append(final_results_set)

# turn indexes into data sets
for row in student_test_folds:
    student_set = []
    final_results_set = []
    for item in row:
        student_set.append(student_data.data[item][3:10])
        final_results_set.append(student_data.data[item][-1])
    student_test.append(student_set)
    final_results_test.append(final_results_set)

#Tree model
total_tree = []
total_expected = []
for i in range(10):
    student_tree = MyDecisionTreeClassifier()    
    student_tree.fit(student_train[i],final_results_train[i])
    tree_predictions = student_tree.predict(student_test[i])
    total_tree.extend(tree_predictions)
    total_expected.extend(final_results_test[i])

print("===========================================")
print("Predictive Accuracy")
print("===========================================")
print("Stratified 10-Fold Cross Validation")
accuracy, errorrate = myutils.accuracy_errorrate(total_tree, total_expected)
print("Tree: accuracy = ", accuracy, "error rate = ", errorrate)

tree_matrix = myevaluation.confusion_matrix(total_expected, total_tree, ["Pass","Withdrawn", "Fail"])

for i in range(len(tree_matrix)):
    total = 0
    rec = 0
    for item in tree_matrix[i]:
        total += item
    true_pos = tree_matrix[i][i]
    if total != 0:
        rec = (true_pos/total)*100
    tree_matrix[i].append(total)
    tree_matrix[i].append(rec)

tree_matrix[0].insert(0, "Pass")
tree_matrix[1].insert(0, "Withdrawn")
tree_matrix[2].insert(0, "Fail")

print("Decision Tree (Stratified 10 Fold Cross Validation Results)")
print(tabulate(tree_matrix, ["Final Result","Pass","Withdrawn","Fail","total", "Recognition %"]))






Predictive Accuracy
Stratified 10-Fold Cross Validation
Tree: accuracy =  0.39956165427863544 error rate =  0.6004383457213646
