In [55]:
import csv
import math
import numpy as np
import pandas as pd
import os
import random

## Methods to Define:

1. Make Prediction
2. Get the Accuracy
3. Calculate Mean and Variance
4. Data Split

In [56]:
def makePrediction(pPredict, nPredict):
    prediction = [] 
    
    for i in range(len(pPredict)):
        if pPredict[i]>nPredict[i]:
            prediction.append(1)
        else:
            prediction.append(0)
            
    return prediction

In [57]:
def getAccuracy(prediction, testC):
    
    trueP, trueN, falseP, falseN = 0, 0, 0, 0
    
    for i in range(len(prediction)):
        if prediction[i] == testC[i]:
            if prediction[i] == 1:
                trueP +=1
            else:
                trueN += 1
        else:
            if prediction[i] == 1:
                falseP += 1
            else:
                falseN += 1
                
    
    accuracy = ((trueP + trueN)/(len(prediction)))*100
    precision = trueP /float(trueP + falseP)
    recall = trueP/float(trueP + falseN)
    

    return  [accuracy, precision, recall]

In [58]:
def Calculate(meanAndVariance, dataset, prior):
    
    # Convert dataframe into a list
    dataList  = dataset.values.tolist()
    # To store predicts of all rows
    predict = []
    
    for i in range(len(dataList)):
        p = 1
        for j in range(len(dataList[i])):
            exponent = math.exp(-(math.pow(dataList[i][j]-meanAndVariance[j][0],2)/(2*math.pow(meanAndVariance[j][1],2))))
            p = p * (1/(math.sqrt(2*math.pi) *meanAndVariance[j][1])) * exponent
        predict.append((p*prior))
            
    return predict

In [59]:
def variance(dataset, MV ):
    
    #Convert dataframe into a list to calculate the variance
    DataList  = dataset.values.tolist()
    
    #Calculate variance for features of positive class
    for i in range(8):
        # To calculate deviations of each data point
        d = 0
        
        for j in range(len(DataList)):
            d += pow((DataList[j][i] - MV[i][0]),2)
        
        d = (d/float((len(dataset['passing_grade']))))
        MV[i].append(math.sqrt(d))
        
    
    return MV

In [60]:
def mean(datasetP, datasetN):
    
    #Get mean and variance for both positive and negative class
    positiveMV = []
    negativeMV = []
    
    #Traverse columns
    for column in datasetP:
        
        # break the loop since I don't need to calculate the mean of the class
        if column == 'passing_grade':
            break
            
        meanVictorP = [sum(datasetP[column])/(len(datasetP[column]))]
        positiveMV.append(meanVictorP)
        
        meanVictorN = [sum(datasetN[column])/(len(datasetN[column]))]
        negativeMV.append(meanVictorN)
        
    return [positiveMV, negativeMV]

In [61]:
def splitByPassingGrade(dataset):
    
    #Split by passing
    positiveT = dataset[dataset['passing_grade'] == 1]
    
    #Split by not passing
    negativeT = dataset[dataset['passing_grade'] == 0]
    
    return [positiveT, negativeT]

In [62]:
def splitData(dataset):
    
    #The dataset split for the test is 20% and for the training is 80%
     
    #Defining number of rows for the testing set
    testingRows = math.floor((len(dataset)*20)/100)
    
    #Splitting the data into testing set. The rest will be the training set.
    test = dataset.iloc[:testingRows,:]
    train = dataset.iloc[testingRows:,:] 
    
    return [test, train]

## Model from Scratch:

1. Read in Data
2. Shuffle
3. Train/Test Split
4. Calculate Mean and Variance
5. Split the feature and class of Testing Set

In [63]:
dataset = pd.read_csv(open('data/student_performance.csv'))
dataset.head()

Unnamed: 0.1,Unnamed: 0,school,sex,age,address,family_size,parents_status,mother_education,father_education,mother_job,...,free_time,go_out,weekday_alcohol_use,weekend_alcohol_use,health,absences,period1_score,period2_score,final_score,passing_grade
0,0,GP,F,18,U,GT3,A,4,4,at_home,...,3,4,1,1,3,6,5,6,6,0
1,1,GP,F,17,U,GT3,T,1,1,at_home,...,3,3,1,1,3,4,5,5,6,0
2,2,GP,F,15,U,LE3,T,1,1,at_home,...,3,2,2,3,3,10,7,8,10,0
3,3,GP,F,15,U,GT3,T,4,2,health,...,2,2,1,1,5,2,15,14,15,1
4,4,GP,F,16,U,GT3,T,3,3,other,...,3,2,1,2,5,4,6,10,10,0


In [64]:
# not enough time to encode by scratch, so they will be dropped from dataset

dataset = dataset.drop(columns=['Unnamed: 0', 'school', 'sex', 'address', 'family_size', 'parents_status', 'mother_job', 'father_job', 'reason', 'guardian',
                                'school_support', 'family_support', 'paid_classes', 'activities', 'nursery', 'desire_higher_edu', 'internet', 'romantic'])

## Passing Grade is a final score >= 14 (70%)

In [65]:
dataset = dataset.iloc[np.random.permutation(len(dataset))]
test, train = splitData(dataset)
print("No. of testing rows: ", len(test['passing_grade']))
print("No. of training rows: ", len(train['passing_grade']))

No. of testing rows:  208
No. of training rows:  836


In [66]:
# caclculate initial probability 

positiveT, negativeT = splitByPassingGrade(train)

priorP = len(positiveT['passing_grade'])/len(train['passing_grade'])   # priorP for the positive class
priorN = len(negativeT['passing_grade'])/len(train['passing_grade'])   # priorN for the negative one

print("Prior probability of 'not_passing: ", priorN)
print("Prior probability of 'passing: ", priorP)

Prior probability of 'not_passing:  0.7117224880382775
Prior probability of 'passing:  0.28827751196172247


In [67]:
# get the mean & variance

positiveMV , negativeMV = mean(positiveT, negativeT)

positiveMV = variance(positiveT,positiveMV)
negativeMV = variance(negativeT, negativeMV)

testF = test.iloc[:, 0:8]
testC = test.iloc[:,-1]

In [68]:
# find predicts of both classes

pPredict = Calculate(positiveMV, testF, priorP)
nPredict = Calculate(negativeMV, testF, priorN)

# calculate prediction

prediction = makePrediction(pPredict, nPredict)

print("Prediction result:\n\n", prediction)

Prediction result:

 [1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1]


In [69]:
accuracy, precision, recall = getAccuracy(prediction, testC.values.tolist())
print("accuracy: ",accuracy,"%\nprecision: ",precision,"\nrecall: ",recall)

accuracy:  56.25 %
precision:  0.3442622950819672 
recall:  0.7924528301886793


### 14.92 points below baseline. 