# Predicting Stat Student Grades

## Import Statements

In [1]:
import pandas as pd
import time
import numpy as np

# Linear regression import
from sklearn.linear_model import LinearRegression
# MLP import
from sklearn.neural_network import MLPRegressor
# Decision Tree import
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## Loading in the data

In [2]:
# Load the combined data table from CSV, discard every row that contains a
# missing value, and render the resulting frame in the notebook.
df = pd.read_csv("Combined-Data-Table.csv").dropna()
display(df)

Unnamed: 0,S1 Overall,S1 Assignments,S1 Exit Tickets,S1 MCQ Tests,S1 FRQ Tests,S1 Final Exam,S2 Overall,S2 Assignments,S2 Exit Tickets,S1 MCQ Tests.1,S2 FRQ Tests,S2 Final Exam,AP Exam
0,87,82.52,98.75,84.0,92.50,77.5,85.0,82.05,97.33,83.64,87.31,77.5,4.0
1,94,100.00,98.75,89.0,89.17,97.5,91.0,100.00,100.00,85.91,87.69,82.5,4.0
2,97,100.00,100.00,95.0,97.50,90.0,98.0,100.00,100.00,95.45,94.62,100.0,5.0
3,96,100.00,100.00,93.0,97.50,90.0,94.0,99.49,100.00,91.36,93.46,87.5,5.0
4,96,100.00,100.00,89.0,95.00,97.5,97.0,100.00,100.00,90.00,98.85,97.5,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
218,92,98.25,97.33,91.0,90.00,85.0,92.0,95.83,100.00,88.18,93.18,85.0,5.0
219,89,87.63,98.67,82.0,88.50,95.0,91.0,86.55,100.00,92.73,93.64,82.5,5.0
220,94,94.39,98.67,91.0,96.00,90.0,84.0,68.33,97.50,89.09,89.09,77.5,4.0
221,83,98.25,92.00,71.0,83.50,75.0,77.0,89.41,81.25,70.91,83.64,57.5,3.0


In [3]:
# Every column except the last is a covariate; the last column is the label.
feature_columns = df.columns[:-1]
label_column = df.columns[-1]
X = df[feature_columns]
y = df[label_column]

## Defining the various models

In [4]:
def linearReg(X_train, y_train):
    """Fit an ordinary linear regression to the training data.

    Args:
        X_train: Training covariates.
        y_train: Training labels.

    Returns:
        The fitted ``LinearRegression`` estimator.
    """
    model = LinearRegression()
    model.fit(X_train, y_train)
    return model

In [5]:
def mlp(X_train, y_train):
    """Fit a multi-layer perceptron regressor to the training data.

    The network has three hidden layers of 9, 6 and 3 units, a fixed
    ``random_state`` of 673 and an iteration cap of 10000.

    Args:
        X_train: Training covariates.
        y_train: Training labels.

    Returns:
        The fitted ``MLPRegressor`` estimator.
    """
    network = MLPRegressor(
        hidden_layer_sizes=(9, 6, 3),
        random_state=673,
        max_iter=10000,
    )
    network.fit(X_train, y_train)
    return network

In [6]:
def tree(X_train, y_train):
    """Fit a decision tree classifier to the training data.

    All hyperparameters are left at their defaults except ``random_state``,
    which is pinned so that fitting twice on the same split yields the same
    tree. The seed value matches the one used by ``mlp``.

    Args:
        X_train: Training covariates.
        y_train: Training labels, used as class labels.

    Returns:
        The fitted ``DecisionTreeClassifier`` estimator.
    """
    # Without a fixed seed the classifier may break ties between equally
    # good splits differently from run to run, so scores are not repeatable.
    clf = DecisionTreeClassifier(random_state=673)
    clf.fit(X_train, y_train)
    return clf

## Evaluating the Models

In [7]:
def testAcc(model, X_test, y_test):
    """Score a fitted model by exact-match accuracy on the test set.

    Predictions are rounded to the nearest integer before comparison so that
    continuous regression outputs can be matched against the labels.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test covariates.
        y_test: True test labels.

    Returns:
        The value of ``accuracy_score`` for the rounded predictions, i.e. the
        fraction (not a percentage) of test rows predicted correctly.
    """
    rounded_predictions = np.round(model.predict(X_test))
    return accuracy_score(y_test, rounded_predictions)

In [8]:
# Draws 600 integer seeds in [0, 1000000) that drive the train/test splits.
# The seeds are not guaranteed to be distinct. The generator itself is seeded
# (with the same value the MLP uses) so that re-running the notebook evaluates
# every model on the same sequence of splits and the averages are repeatable.
random_states = np.random.default_rng(673).integers(1000000, size=600)

In [9]:
def eval_model(model_func, random_states):
    """Average a model's test accuracy over many random train/test splits.

    For each seed, the module-level ``X`` and ``y`` are split 80/20 into
    train and test sets, a fresh model is fit on the training part with
    ``model_func`` and scored on the test part with ``testAcc``.

    Args:
        model_func: Callable taking ``(X_train, y_train)`` and returning a
            fitted model.
        random_states: Iterable of seeds, one per train/test split.

    Returns:
        The average of the per-split accuracies.
    """

    def _score_split(seed):
        # One full fit-and-score cycle for a single split seed.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.20, random_state=seed
        )
        fitted = model_func(X_train, y_train)
        return testAcc(fitted, X_test, y_test)

    split_scores = [_score_split(seed) for seed in random_states]
    return np.average(split_scores)

In [10]:
# Results for Linear Regression
eval_model(linearReg, random_states)

0.6961788617886178

In [11]:
# Results for the MLP
eval_model(mlp, random_states)

0.7281300813008129

In [12]:
# Results for the decision tree
eval_model(tree, random_states)

0.5995934959349594