In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

#### Load the Dataset

In [2]:
from google.colab import drive

# Expose Google Drive inside the Colab runtime so the dataset can be read.
drive.mount('/content/drive')

# Location of the pickled dataset on Drive; point this at your own copy.
file_path = '/content/drive/MyDrive/Deep_Dive_Project/uiuc-gpa.pk'

# NOTE: unpickling can execute arbitrary code, so only load files you trust.
with open(file_path, mode='rb') as pickle_file:
    df = pickle.load(pickle_file)

# Quick sanity check: overall dimensions, then the first few rows.
print(df.shape)
df.head(10)

Mounted at /content/drive
(58864, 512)


Unnamed: 0,Year,Number,Course Title,Avg GPA,A+,A,A-,B+,B,B-,...,"Primary Instructor_Wright, Margaret","Primary Instructor_Wu, Martin G","Primary Instructor_Yang, Xi","Primary Instructor_Yu, Albert","Primary Instructor_Zhang, Jianhua","Primary Instructor_Zhang, Li","Primary Instructor_Zhang, Zheng","Primary Instructor_Zhao, Kai","Primary Instructor_Zhao, Rui","Primary Instructor_Zilles, Craig"
0,2023,100,Intro Asian American Studies,3.530909,0,11,5,1,4,0,...,False,False,False,False,False,False,False,False,False,False
1,2023,100,Intro Asian American Studies,3.797391,0,17,2,1,2,1,...,False,False,False,False,False,False,False,False,False,False
2,2023,100,Intro Asian American Studies,3.476667,0,13,2,2,0,2,...,False,False,False,False,False,False,False,False,False,False
3,2023,200,U.S. Race and Empire,3.606364,6,15,5,2,3,0,...,False,False,False,False,False,False,False,False,False,False
4,2023,215,US Citizenship Comparatively,3.889091,16,12,2,1,1,1,...,False,False,False,False,False,False,False,False,False,False
5,2023,141,ABE Principles: Biological,3.098889,0,5,4,6,5,2,...,False,False,False,False,False,False,False,False,False,False
6,2023,152,Water in Global Environment,3.779427,55,115,0,4,9,0,...,False,False,False,False,False,False,False,False,False,False
7,2023,232,Context Intl Interventions,3.777778,3,13,5,5,1,0,...,False,False,False,False,False,False,False,False,False,False
8,2023,425,Engrg Measurement Systems,3.944167,0,22,0,2,0,0,...,False,False,False,False,False,False,False,False,False,False
9,2023,457,NPS Pollution Processes,3.758182,1,14,4,0,2,0,...,False,False,False,False,False,False,False,False,False,False


#### Linear Regression

In [7]:
class GPALinearRegression():
    """Ordinary-least-squares baseline that predicts a section's average GPA.

    Typical use: ``preprocess_data`` -> ``split_data_by_year`` -> ``train``
    -> ``test``.

    NOTE(review): the input frame is presumably already one-hot encoded
    upstream (it contains columns such as ``Primary Instructor_<name>``);
    this class performs no categorical encoding itself.
    """

    # Regression target.
    TARGET_COLUMN = 'Avg GPA'

    # Columns that must never be used as features: the free-text title, the
    # target itself, and the per-letter grade counts.
    NON_FEATURE_COLUMNS = (
        'Course Title', 'Avg GPA',
        'A+', 'A', 'A-', 'B+', 'B', 'B-', 'C+', 'C', 'C-',
        'D+', 'D', 'D-', 'F', 'W',
    )

    # Inclusive upper GPA bound of every letter bucket except the top one,
    # in ascending order, and the letters of all buckets (one more entry).
    _GRADE_UPPER_BOUNDS = (0.5, 0.835, 1.165, 1.50, 1.835, 2.165,
                           2.50, 2.835, 3.165, 3.50, 3.835)
    _GRADE_LETTERS = ('F', 'D-', 'D', 'D+', 'C-', 'C',
                      'C+', 'B-', 'B', 'B+', 'A-', 'A')

    def __init__(self):
        # Initialize the linear regression model
        self.model = LinearRegression()
        # Feature matrix produced by the most recent preprocess_data() call.
        self.features = None
        # Column names of that matrix, in the same order as its columns.
        self.feature_names = None

    def preprocess_data(self, df):
        """
        Turn the dataframe into a numeric feature matrix and target vector.

        - ``Year`` is shifted so the earliest year becomes 0.
        - ``Number`` (course number) is divided by 100.
        - The columns in ``NON_FEATURE_COLUMNS`` are removed from the features.

        The caller's dataframe is left untouched, so this method can be
        called repeatedly on the same frame with identical results.

        Args:
            df: DataFrame containing ``Year``, ``Number``, every column in
                ``NON_FEATURE_COLUMNS`` and otherwise only columns that can
                be cast to float64.

        Returns:
            (X, y): float64 arrays of shape (n_rows, n_features) and
            (n_rows, 1).

        Raises:
            KeyError: if an expected column is missing.
        """
        # drop() and assign() both return new frames, so `df` is never edited.
        feature_frame = (
            df.drop(columns=list(self.NON_FEATURE_COLUMNS))
            .assign(
                Year=lambda frame: frame['Year'] - frame['Year'].min(),
                Number=lambda frame: frame['Number'] / 100,
            )
        )

        X = feature_frame.to_numpy(np.float64)
        y = df[self.TARGET_COLUMN].to_numpy(np.float64).reshape(-1, 1)
        print(f"X shape {X.shape}")
        print(f"y shape {y.shape}")

        self.feature_names = list(feature_frame.columns)
        self.features = X

        return X, y

    def split_data_by_year(self, X, y):
        """
        Hold out the most recent year as the test set.

        - Training: every row whose year is below the maximum year in X
        - Testing:  every row whose year equals that maximum

        The year column is located by name when X matches the layout
        recorded by ``preprocess_data``; otherwise column 0 is used.

        Returns:
            X_train, y_train, X_test, y_test
        """
        names = getattr(self, 'feature_names', None)
        if names and len(names) == X.shape[1] and 'Year' in names:
            year_col = names.index('Year')
        else:
            year_col = 0

        years = X[:, year_col]
        max_yr = years.max()

        # Create boolean masks for train and test sets
        train_mask = years < max_yr
        test_mask = years == max_yr

        X_train, y_train = X[train_mask], y[train_mask]
        X_test, y_test = X[test_mask], y[test_mask]

        print(f"X_train shape: {X_train.shape}")
        print(f"y_train shape: {y_train.shape}")
        print(f"X_test shape:  {X_test.shape}")
        print(f"y_test shape:  {y_test.shape}")

        return X_train, y_train, X_test, y_test

    def train(self, X, y):
        """
        Fit the linear regression model on the given features and targets.
        """
        self.model.fit(X, y)
        print("Model training completed.")

    def test(self, X, y, thres=None):
        """
        Evaluate the fitted model and print/return its metrics.

        Args:
            X: feature matrix to predict on.
            y: true average GPAs for the rows of X.
            thres: currently unused; accepted so existing calls keep working.

        Returns:
            (acc, mse, r2) where ``acc`` is the percentage of rows whose
            predicted GPA falls in the same letter-grade bucket as the true
            GPA, and ``mse`` / ``r2`` are computed on the raw GPA values.
        """
        y_pred = self.model.predict(X)

        mse = mean_squared_error(y, y_pred)
        r2 = r2_score(y, y_pred)

        # Both lists hold exactly one letter per row, so zip() pairs each
        # true grade with the prediction for the same row.
        true_letters = self._convert_to_letter_grade(y)
        pred_letters = self._convert_to_letter_grade(y_pred)
        matches = sum(t == p for t, p in zip(true_letters, pred_letters))

        # Calculate accuracy
        acc = matches / len(true_letters) * 100

        print(f"Accuracy:  {acc:.4f}%")
        print(f"MSE:       {mse:.4f}")
        print(f"R^2 Score: {r2:.4f}")

        return acc, mse, r2

    def _convert_to_letter_grade(self, gpas):
        """
        Map GPA values to letter grades, one letter per input value.

        Buckets are half-open on the left, e.g. (3.50, 3.835] -> 'A-'.
        Values above the scale (a regression can predict > 4.0) map to 'A'
        and values at or below 0.5 map to 'F', so the output always has the
        same length and order as the input.

        Args:
            gpas: array-like of GPAs; any shape, it is flattened.

        Returns:
            list[str] of letter grades.

        Raises:
            ValueError: if any value is NaN.
        """
        values = np.asarray(gpas, dtype=np.float64).ravel()
        if np.isnan(values).any():
            raise ValueError("GPA values must not be NaN")

        # side='left' returns i with bounds[i-1] < value <= bounds[i], which
        # is exactly the (lower, upper] bucket convention.
        bucket_idx = np.searchsorted(self._GRADE_UPPER_BOUNDS, values, side='left')
        return [self._GRADE_LETTERS[i] for i in bucket_idx]


    def predict(self, X):
        """
        Make predictions with the trained model.
        """
        return self.model.predict(X)

    def get_coefficients(self):
        """
        Get the coefficients of the trained model.
        """
        return self.model.coef_

In [8]:
# Initialize the model
regressor = GPALinearRegression()

# Preprocess the data
X, y = regressor.preprocess_data(df)

# Split data into training and testing sets
X_train, y_train, X_test, y_test = regressor.split_data_by_year(X, y)

# Train the model
regressor.train(X_train, y_train)

# Test the model
acc, mse, r2 = regressor.test(X_test, y_test, thres=[0.01, 0.10])

X shape (58864, 496)
X shape (58864, 1)
X_train shape: (56612, 496)
y_train shape: (56612, 1)
X_test shape:  (2252, 496)
y_test shape:  (2252, 1)
Model training completed.
Accuracy:  35.9680%
MSE:       0.093443
R^2 Score: 0.2286


In [None]:
# Get model coefficients
coefficients = regressor.get_coefficients()
print(coefficients.shape)
print(coefficients)

(1, 496)
[[ 1.07511475e-02  6.89189290e-02  5.73634170e+09  5.73634170e+09
   5.73634170e+09  5.73634170e+09 -9.79664750e+06 -9.79664761e+06
  -9.79664776e+06 -9.79664787e+06 -9.79664742e+06 -9.79664780e+06
  -9.79664786e+06 -9.79664772e+06 -9.79664763e+06 -9.79664760e+06
  -9.79664750e+06 -9.79664730e+06 -9.79664755e+06 -9.79664735e+06
  -9.79664764e+06 -9.79664770e+06 -9.79664763e+06 -9.79664788e+06
  -9.79664746e+06 -9.79664784e+06 -9.79664750e+06 -9.79664761e+06
  -9.79664781e+06 -9.79664768e+06 -9.79664764e+06 -9.79664785e+06
  -9.79664792e+06 -9.79664782e+06 -9.79664776e+06 -9.79664758e+06
  -9.79664781e+06 -9.79664753e+06 -9.79664756e+06 -9.79664777e+06
  -9.79664757e+06 -9.79664750e+06 -9.79664751e+06 -9.79664726e+06
  -9.79664734e+06 -9.79664785e+06 -9.79664804e+06 -9.79664782e+06
  -9.79664792e+06 -9.79664761e+06 -9.79664735e+06 -9.79664741e+06
  -9.79664777e+06 -9.79664758e+06 -9.79664780e+06 -9.79664774e+06
  -9.79664779e+06 -9.79664749e+06 -9.79664755e+06 -9.79664743e+06
 