In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# We start off by training on the debugging dataset, just to check that the model works, i.e. that it produces at least somewhat meaningful values

In [None]:
# Load the two pre-built datasets from their pickle files: the debugging
# dataset used for the first sanity check and the working dataset used for
# the later experiments.
# NOTE(review): unpickling can execute arbitrary code, so these files should
# only ever come from a trusted source.
debugging_dataset, working_dataset = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('debugging_dataset.pkl', 'working_dataset.pkl')
)

## Treat categorical and numerical features differently. Also make sure to transform the values properly

In [None]:
# Grade columns: these are later used as the regression targets.
grade_columns = ['A+', 'A', 'A-', 'B+', 'B', 'B-', 'C+', 'C', 'C-', 'D+', 'D', 'D-', 'F', 'W']

# Coerce every grade column to a numeric dtype. Entries that cannot be parsed
# become NaN and are then replaced by 0.
numeric_grades = debugging_dataset[grade_columns].apply(pd.to_numeric, errors='coerce')
debugging_dataset[grade_columns] = numeric_grades.fillna(0)

# Categorical columns: each one is cast to string and replaced by integer
# codes from its own LabelEncoder, fitted on the whole dataset.
# NOTE(review): the columns are overwritten in place, so re-running this cell
# encodes the integer codes again (as strings) instead of the original values.
categorical_columns = ['Year', 'Term', 'Subject', 'Sched Type', 'Number', 'Course Title']
for column in categorical_columns:
    le = LabelEncoder()
    column_as_text = debugging_dataset[column].astype(str)
    debugging_dataset[column] = le.fit_transform(column_as_text)

In [None]:
# Two-stage split: hold out 30% of the rows first, then cut that hold-out in
# half to obtain the validation and test sets. The fixed random_state keeps
# both splits reproducible.
train_data, temp_data = train_test_split(
    debugging_dataset,
    test_size=0.3,
    random_state=42,
)
val_data, test_data = train_test_split(
    temp_data,
    test_size=0.5,
    random_state=42,
)

# Report the size of each split.
for shape_label, split_frame in (
    ("Train set shape:", train_data),
    ("Validation set shape:", val_data),
    ("Test set shape:", test_data),
):
    print(shape_label, split_frame.shape)

Train set shape: (2065, 20)
Validation set shape: (443, 20)
Test set shape: (443, 20)


In [None]:
# Model inputs are the encoded categorical columns; model outputs are the
# grade columns. Both names refer to the same list objects defined earlier.
feature_columns, target_columns = categorical_columns, grade_columns

In [None]:
# Pull the feature matrix and the grade targets out of each split.
splits = (train_data, val_data, test_data)
X_train, X_val, X_test = (split[feature_columns] for split in splits)
y_train, y_val, y_test = (split[target_columns] for split in splits)

# Fit the scaler on the training features only, then apply the same fitted
# transformation to the validation and test features.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val, X_test = scaler.transform(X_val), scaler.transform(X_test)


## We train a simple linear regression model for each grade (A+, A, ...)

In [None]:
# One independent LinearRegression per grade column, keyed by grade.
# `fit` returns the fitted estimator, so it can be stored directly.
models = {
    grade: LinearRegression().fit(X_train, y_train[grade])
    for grade in target_columns
}

In [None]:
# Test-set predictions of every per-grade model, keyed by grade.
predictions = {
    grade: model.predict(X_test)
    for grade, model in models.items()
}

## Comparing the model predictions against the ground-truth grade distribution

In [None]:
# Gather the per-grade prediction arrays into one frame (one column per grade).
pred_df = pd.DataFrame(predictions, columns=target_columns)

# Column means over the test rows, scaled by 100 for display.
actual_grade_distribution = y_test.mean(axis=0) * 100
predicted_grade_distribution = pred_df.mean(axis=0) * 100

# Side-by-side table of the two averaged distributions, indexed by grade.
comparison_df = pd.DataFrame({
    'Actual': actual_grade_distribution,
    'Predicted': predicted_grade_distribution
})

print("\nComparison of Actual and Predicted Grade Distribution (Percentage):")
print(comparison_df)

# Per-grade error metrics computed row by row on the test set.
for grade in target_columns:
    y_test_actual, y_pred = y_test[grade], pred_df[grade]
    mse = mean_squared_error(y_test_actual, y_pred)
    r2 = r2_score(y_test_actual, y_pred)
    print(f"\nGrade: {grade}")
    print(f"Mean Squared Error (MSE): {mse}")
    print(f"R-squared (R2): {r2}")


Comparison of Actual and Predicted Grade Distribution (Percentage):
       Actual  Predicted
A+  11.644910  11.655937
A   32.437908  32.846352
A-  14.352393  14.127400
B+  10.782024  10.560539
B   12.253517  12.202918
B-   5.260435   5.188847
C+   3.176939   3.169433
C    3.831894   3.889185
C-   1.649604   1.656959
D+   0.777099   0.788391
D    1.142750   1.161973
D-   0.485665   0.496514
F    1.823044   1.887710
W    0.381817   0.367840

Grade: A+
Mean Squared Error (MSE): 0.026920251266652383
R-squared (R2): 0.03147160381927472

Grade: A
Mean Squared Error (MSE): 0.048375403801239975
R-squared (R2): 0.018721808232958104

Grade: A-
Mean Squared Error (MSE): 0.011783569947907985
R-squared (R2): 0.01078671115769081

Grade: B+
Mean Squared Error (MSE): 0.007679107339471414
R-squared (R2): 0.015203758020584268

Grade: B
Mean Squared Error (MSE): 0.009385058167175431
R-squared (R2): 0.008156800316001078

Grade: B-
Mean Squared Error (MSE): 0.0030987017343739568
R-squared (R2): 0.00866854

## The model prediction on average

Although the model's predictions are not very accurate, we now have good evidence that the pipeline works. Therefore, we move on to training the model on a dataset with more data points

In [None]:
# Gap between the averaged actual and predicted distributions, one entry per
# grade (both operands were already scaled by 100 when they were built).
distribution_gap = actual_grade_distribution - predicted_grade_distribution

mae = np.mean(np.abs(distribution_gap))
print(f"Mean Absolute Error (MAE) of Predicted Grade Distribution: {mae:.4f}%")

rmse = np.sqrt(np.mean(distribution_gap ** 2))
print(f"Root Mean Squared Error (RMSE) of Predicted Grade Distribution: {rmse:.4f}%")

Mean Absolute Error (MAE) of Predicted Grade Distribution: 0.0843%
Root Mean Squared Error (RMSE) of Predicted Grade Distribution: 0.1421%


# We will train two regularized linear regression models: Ridge and Lasso

In [None]:
# Candidate regularisation strengths searched for each model family.
ridge_alphas = [0.1, 1.0, 10.0, 100.0]
lasso_alphas = [0.001, 0.01, 0.1, 1.0]

# Keyed by the display name of the model; each value is a parameter grid in
# the {parameter name: list of candidate values} form.
param_grids = {
    'Ridge Regression': {'alpha': ridge_alphas},
    'Lasso Regression': {'alpha': lasso_alphas},
}

## Again, we split the data and convert them into proper values for model training

In [None]:
# Grade columns: these are used as the regression targets.
grade_columns = ['A+', 'A', 'A-', 'B+', 'B', 'B-', 'C+', 'C', 'C-', 'D+', 'D', 'D-', 'F', 'W']

# Coerce the grade columns to numbers; unparseable entries become NaN and are
# then filled with 0.
working_grades = working_dataset[grade_columns].apply(pd.to_numeric, errors='coerce')
working_dataset[grade_columns] = working_grades.fillna(0)

# Swap every categorical column for integer codes, using a fresh LabelEncoder
# per column fitted on the string form of the whole column.
# NOTE(review): the columns are overwritten in place, so re-running this cell
# encodes the integer codes again (as strings) instead of the original values.
categorical_columns = ['Year', 'Term', 'Subject', 'Sched Type', 'Number', 'Course Title']
for column in categorical_columns:
    le = LabelEncoder()
    text_values = working_dataset[column].astype(str)
    working_dataset[column] = le.fit_transform(text_values)

In [None]:
# Hold out 30% of the working dataset, then halve the hold-out into the
# validation and test sets. random_state pins both shuffles.
train_data, temp_data = train_test_split(
    working_dataset, test_size=0.3, random_state=42
)
val_data, test_data = train_test_split(
    temp_data, test_size=0.5, random_state=42
)

# Report the size of each split.
for shape_label, split_frame in (
    ("Train set shape:", train_data),
    ("Validation set shape:", val_data),
    ("Test set shape:", test_data),
):
    print(shape_label, split_frame.shape)

# Inputs are the encoded categorical columns, outputs the grade columns.
feature_columns, target_columns = categorical_columns, grade_columns

Train set shape: (41311, 20)
Validation set shape: (8852, 20)
Test set shape: (8853, 20)


In [None]:
# Features and grade targets for each of the three splits.
X_train, y_train = train_data[feature_columns], train_data[target_columns]
X_val, y_val = val_data[feature_columns], val_data[target_columns]
X_test, y_test = test_data[feature_columns], test_data[target_columns]

# Scale the features: the scaler is fitted on the training split only and the
# same fitted transformation is then applied to validation and test.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val, X_test = (scaler.transform(features) for features in (X_val, X_test))

## Now, we train the models, tuning the hyper-parameter (namely the regularization strength) with cross-validation on the training set, and evaluate the tuned models on the validation set

In [None]:
# Tune each regularised model family with 3-fold cross-validation on the
# training split, then score the selected configuration on the validation split.
#
# All grade columns are fitted jointly as one multi-output target, so exactly
# ONE alpha is selected per model family (the one with the best CV MAE averaged
# over all grades). Refitting the search once per grade would leave
# `grid_search.best_params_` describing only the last grade, while the averaged
# validation metrics would mix a different alpha for every grade.
val_results = {}

# Estimator class for every display name that may appear in `param_grids`.
model_classes = {
    'Ridge Regression': Ridge,
    'Lasso Regression': Lasso,
}

# Plain arrays, one column per grade, in `target_columns` order.
y_train_targets = y_train[target_columns].to_numpy()
y_val_targets = y_val[target_columns].to_numpy()

# Iterate over each model and its parameter grid
for name, param_grid in param_grids.items():
    print(f"\nTuning {name}...")

    # Fail loudly on an unknown name instead of silently reusing a stale model.
    if name not in model_classes:
        raise ValueError(f"No estimator registered for model name: {name!r}")
    base_model = model_classes[name]()

    grid_search = GridSearchCV(base_model, param_grid, cv=3, scoring='neg_mean_absolute_error', n_jobs=-1)
    grid_search.fit(X_train, y_train_targets)
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # Validation errors for every (row, grade) cell.
    y_val_pred = best_model.predict(X_val)
    val_errors = y_val_targets - y_val_pred

    # One score per grade, then averaged over grades as before.
    mae_scores = np.mean(np.abs(val_errors), axis=0)
    rmse_scores = np.sqrt(np.mean(val_errors ** 2, axis=0))

    avg_mae = np.mean(mae_scores)
    avg_rmse = np.mean(rmse_scores)
    val_results[name] = {
        'Best Params': best_params,
        'MAE': avg_mae,
        'RMSE': avg_rmse
    }

    print(f"{name} - Best Params: {best_params}, Validation MAE: {avg_mae:.4f}, Validation RMSE: {avg_rmse:.4f}")




Tuning Ridge Regression...
Ridge Regression - Best Params: {'alpha': 0.1}, Validation MAE: 0.0516, Validation RMSE: 0.0681

Tuning Lasso Regression...
Lasso Regression - Best Params: {'alpha': 0.001}, Validation MAE: 0.0516, Validation RMSE: 0.0681


## It turns out that the selected best hyper-parameters are small, and the best Ridge and Lasso models give very similar prediction results

In [None]:
# Summarise the validation outcome of every tuned model family.
print("\nHyperparameter Tuning Results:")
for model_name in val_results:
    result = val_results[model_name]
    print(f"{model_name}: Best Params = {result['Best Params']}, MAE = {result['MAE']:.4f}, RMSE = {result['RMSE']:.4f}")


Hyperparameter Tuning Results:
Ridge Regression: Best Params = {'alpha': 0.1}, MAE = 0.0516, RMSE = 0.0681
Lasso Regression: Best Params = {'alpha': 0.001}, MAE = 0.0516, RMSE = 0.0681


## We further compute the numbers on the test set

In [None]:
# Pick the model family with the lowest validation MAE (on a tie, the first
# one in `val_results` wins).
best_model_name = min(val_results, key=lambda candidate: val_results[candidate]['MAE'])

print(f'best model chosen is {best_model_name}')
best_model_params = val_results[best_model_name]['Best Params']

# Build a fresh, unfitted estimator of the winning family with its tuned
# parameters.
estimator_by_name = {'Ridge Regression': Ridge, 'Lasso Regression': Lasso}
if best_model_name in estimator_by_name:
    final_model = estimator_by_name[best_model_name](**best_model_params)

# Stack the (already scaled) training and validation rows so the final model
# can be refitted on both.
X_train_val = np.concatenate([X_train, X_val], axis=0)
y_train_val = pd.concat([y_train, y_val])

best model chosen is Ridge Regression


In [None]:
# Refit the selected model on train+validation for one grade at a time and
# collect its test-set predictions, keyed by grade.
predictions = {}
for grade in target_columns:
    fitted_model = final_model.fit(X_train_val, y_train_val[grade])
    predictions[grade] = fitted_model.predict(X_test)

pred_df = pd.DataFrame(predictions, columns=target_columns)

# Element-wise errors over every (test row, grade) cell, on the original scale.
test_errors = y_test.values - pred_df.values
distribution_mae = np.mean(np.abs(test_errors))
distribution_rmse = np.sqrt(np.mean(test_errors ** 2))

# Scale by 100 for the displayed comparison table.
actual_grade_distribution = y_test.values * 100
predicted_grade_distribution = pred_df.values * 100

# Per-grade column means of the actual and predicted values.
comparison_df = pd.DataFrame({
    'Grade': target_columns,
    'Actual Mean': actual_grade_distribution.mean(axis=0),
    'Predicted Mean': predicted_grade_distribution.mean(axis=0),
})

print("\nComparison of Actual and Predicted Grade Distribution:")
print(comparison_df)

print(f"\nOverall MAE of Grade Distribution: {distribution_mae:.4f} ({distribution_mae*100:.4f}%)")
print(f"Overall RMSE of Grade Distribution: {distribution_rmse:.4f} ({distribution_rmse*100:.4f}%)")


Comparison of Actual and Predicted Grade Distribution:
   Grade  Actual Mean  Predicted Mean
0     A+    11.644910       11.696920
1      A    32.437908       32.883158
2     A-    14.352393       14.128598
3     B+    10.782024       10.527335
4      B    12.253517       12.198097
5     B-     5.260435        5.171094
6     C+     3.176939        3.153001
7      C     3.831894        3.876271
8     C-     1.649604        1.654795
9     D+     0.777099        0.790311
10     D     1.142750        1.165648
11    D-     0.485665        0.501753
12     F     1.823044        1.886891
13     W     0.381817        0.366128

Overall MAE of Grade Distribution: 0.0518 (5.1803%)
Overall RMSE of Grade Distribution: 0.0911 (9.1062%)


## It shows that with the current linear regression model (Ridge Regression), our predictions are off from the ground truth by 5.1803 percentage points in MAE and 9.1062 percentage points in RMSE. These numbers will serve as the baseline for upcoming milestones