In [43]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import pickle

In [25]:
# Load the raw student dataset; the file uses ';' as its field separator.
raw_csv_path = "../../data/GPA/student-mat.csv"
df = pd.read_csv(raw_csv_path, sep=";")
df.head(n=5)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [26]:
# Features To Preserve: Sex, Age, Absences, Failures, Activities, Internet
# (plus the three grade columns G1, G2 and G3).
# Take an explicit copy of the column subset so that the column assignments
# performed afterwards act on an independent frame and do not trigger
# pandas' SettingWithCopyWarning.
df = df[["sex", "age", "absences", "failures", "activities", "internet", "G1", "G2", "G3"]].copy()
df.head(5)

Unnamed: 0,sex,age,absences,failures,activities,internet,G1,G2,G3
0,F,18,6,0,no,no,5,6,6
1,F,17,4,0,no,yes,5,5,6
2,F,15,10,3,no,yes,7,8,10
3,F,15,2,0,yes,yes,15,14,15
4,F,16,4,0,no,no,6,10,10


In [27]:
# Encode the two-valued text columns as integers.
# Series.map with an explicit dictionary replaces the pairs of masked .loc
# assignments, and astype(int) gives each column a numeric dtype (masked
# assignment into a string column leaves it as object dtype).
# Any value missing from a mapping becomes NaN, so astype(int) raises instead
# of letting unexpected data through. That also means re-running this cell on
# already-encoded data fails here rather than silently rescaling the grades
# a second time.
binary_encodings = {
    "sex": {"F": 1, "M": 0},             # F = 1, M = 0
    "activities": {"yes": 1, "no": 0},   # yes = 1, no = 0
    "internet": {"yes": 1, "no": 0},     # yes = 1, no = 0
}
for column, encoding in binary_encodings.items():
    df[column] = df[column].map(encoding).astype(int)

# Rescale the grades by a factor of 5 (20-point scale -> 100-point scale)
for grade_column in ("G1", "G2", "G3"):
    df[grade_column] = df[grade_column] * 5

df.head(5)

Unnamed: 0,sex,age,absences,failures,activities,internet,G1,G2,G3
0,1,18,6,0,0,0,25,30,30
1,1,17,4,0,0,1,25,25,30
2,1,15,10,3,0,1,35,40,50
3,1,15,2,0,1,1,75,70,75
4,1,16,4,0,0,0,30,50,50


In [29]:
# Persist the processed frame to disk. With to_csv's default settings the
# row index is written as well, as a leading column with an empty header.
processed_csv_path = '../../data/GPA/student-processed.csv'
df.to_csv(processed_csv_path)

In [32]:
# Get X and y vectors
# X holds every column except the target column; y holds the target itself.
predict_column = "G3"
# Use the `columns=` keyword: passing the axis positionally to DataFrame.drop
# was deprecated in pandas 1.3 and raises a TypeError on pandas 2.x.
X = np.array(df.drop(columns=[predict_column]))
y = np.array(df[predict_column])

In [33]:
# Train Test Split
# Hold out 10% of the rows for evaluation. Fixing random_state makes the
# shuffle, and therefore every metric computed from this split, reproducible
# from one run of the notebook to the next.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [45]:
# Build a linear regression estimator and fit it on the training split.
# fit() returns the estimator itself, so construction and fitting are chained;
# the bare name on the last line keeps the estimator as the cell's output.
model = linear_model.LinearRegression().fit(X_train, y_train)
model

LinearRegression()

In [46]:
# Evaluate the fitted model on the held-out split.
# For a LinearRegression, score() reports the coefficient of determination
# (R^2) of the predictions, not a classification accuracy.
accuracy = model.score(X=X_test, y=y_test)
accuracy

0.7294172792434963

In [47]:
def mse(actual, predicted):
    """Return the mean squared error between ``actual`` and ``predicted``.

    Parameters
    ----------
    actual, predicted : scalar or array-like
        Observed and predicted values. They may be plain numbers, lists or
        NumPy arrays, and must have matching (or broadcastable) shapes.

    Returns
    -------
    numpy floating scalar
        The average of the squared differences. For a single pair of scalars
        this equals the squared error ``(actual - predicted) ** 2``.
    """
    # Coerce to float arrays so lists and scalars are handled uniformly.
    squared_errors = (np.asarray(actual, dtype=float) - np.asarray(predicted, dtype=float)) ** 2
    # Averaging is what makes this a *mean* squared error; for a single value
    # the mean is the value itself, so scalar calls give the same number.
    return np.mean(squared_errors)

In [48]:
# Predict the grade for one hand-written sample.
# The values follow the column order of X:
# sex, age, absences, failures, activities, internet, G1, G2
custom_input = np.array([[0, 17, 2, 0, 1, 1, 55, 55]])
actual_grade = 50
prediction = model.predict(custom_input)
predicted_grade = prediction[0]

# Report the prediction, the reference grade and the error between them
report_rows = (
    ("Prediction: ", predicted_grade),
    ("Actual: ", actual_grade),
    ("MSE: ", mse(actual_grade, predicted_grade)),
)
for label, value in report_rows:
    print(label, value)

Prediction:  52.68771222844219
Actual:  50
MSE:  7.223797022917668


In [49]:
# Serialize the fitted model with pickle into a binary file
with open("../GPA.bin", mode="wb") as model_file:
    pickle.dump(model, model_file)