In [1]:
# ✅ Step 1: Mount Google Drive
# Makes files under /content/drive available; the dataset is read from there
# and the trained model is written back there later in the notebook.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# ✅ Step 2: Install XGBoost
# NOTE(review): the version is not pinned, so re-runs may pick up a different
# release — consider pinning for reproducibility.
!pip install xgboost --quiet



Mounted at /content/drive


In [2]:
# ✅ STEP 3: Import Libraries
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score



In [3]:
# ✅ Step 4: Read the dataset from the mounted Google Drive
file_path = '/content/drive/My Drive/Dataset_with_Updated_Mastery.csv'
df = pd.read_csv(file_path)

# A prerequisite grade of 0 is taken to mean "no prerequisite", so it is
# recoded as missing (NaN) in both prerequisite columns.
# NOTE(review): a genuine grade of 0 would be recoded too — confirm 0 never
# appears as a real grade in these columns.
prereq_cols = ["prereq_1_grade", "prereq_2_grade"]
df[prereq_cols] = df[prereq_cols].replace(0, np.nan)


In [4]:
# ✅ STEP 5: Student-level train/test split (no leakage)
# Student ids, not rows, are split 80/20, so every row of a given student
# lands on exactly one side.
features = ["current_mastery", "prereq_1_grade", "prereq_2_grade"]
target = "final_grade"

unique_students = df["student_id"].unique()
train_students, test_students = train_test_split(
    unique_students,
    test_size=0.2,
    random_state=42,
)

# Row masks selecting each side of the split
in_train = df["student_id"].isin(train_students)
in_test = df["student_id"].isin(test_students)
train_df = df[in_train]
test_df = df[in_test]

# Feature matrices and target vectors shared by the model cells
X_train, y_train = train_df[features], train_df[target]
X_test, y_test = test_df[features], test_df[target]


In [5]:
# ✅ STEP 6: Train baseline XGBoost model
# Wrap the split in XGBoost's DMatrix container; the labels travel with it.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {
    'objective': 'reg:squarederror',
    'max_depth': 6,
    'eta': 0.1,
    'seed': 42
}

model = xgb.train(params, dtrain, num_boost_round=100)

# Evaluate once on the held-out students (the previous version repeated this
# predict/print block twice, producing duplicate output).
y_pred = model.predict(dtest)
print("✅ Test MSE:", mean_squared_error(y_test, y_pred))
print("✅ R² Score:", r2_score(y_test, y_pred))

# ✅ Tolerance-based accuracy: share of predictions that fall within
# ±tolerance (relative) of the true grade.
tolerance = 0.05  # 5% tolerance

# Bounds for what is considered "acceptable"
lower = y_test * (1 - tolerance)
upper = y_test * (1 + tolerance)

within_tol = ((y_pred >= lower) & (y_pred <= upper)).sum()
total = len(y_test)

tolerance_accuracy = within_tol / total
# The label is derived from `tolerance` so it cannot drift from the value used.
print(f"✅ Accuracy within ±{tolerance:.0%} tolerance: {tolerance_accuracy:.2%}")



✅ Test MSE: 87.54106990334753
✅ R² Score: 0.3596803722125276
✅ Test MSE: 87.54106990334753
✅ R² Score: 0.3596803722125276
✅ Accuracy within ±5% tolerance: 28.88%


In [6]:
# Correlation matrix (pandas default method) between current mastery and the
# final grade, computed over the full dataset.
mastery_vs_grade = df[["current_mastery", "final_grade"]]
mastery_vs_grade.corr()


Unnamed: 0,current_mastery,final_grade
current_mastery,1.0,0.449295
final_grade,0.449295,1.0


In [7]:
# ✅ Spot-check: predict every course of one test student's term
# A fixed seed makes re-running this cell show the same student and term;
# set DEMO_SEED = None to draw a fresh random example instead.
DEMO_SEED = 42

# Sampling from the row-level column means students with more rows are more
# likely to be picked.
random_student = test_df["student_id"].sample(1, random_state=DEMO_SEED).values[0]
student_terms = test_df[test_df["student_id"] == random_student]["term"].unique()
random_term = np.random.default_rng(DEMO_SEED).choice(student_terms)

# Copy so that adding the prediction column never writes into test_df
term_data = test_df[
    (test_df["student_id"] == random_student) & (test_df["term"] == random_term)
].copy()
term_features = term_data[features]

dterm = xgb.DMatrix(term_features)
term_data["predicted_grade"] = model.predict(dterm)

# ✅ Show results
term_data[["course_code", "current_mastery", "prereq_1_grade", "prereq_2_grade", "final_grade", "predicted_grade"]]


Unnamed: 0,course_code,current_mastery,prereq_1_grade,prereq_2_grade,final_grade,predicted_grade
88488,CC410,0.622567,61.0,,69.0,71.742966
88489,CC414,0.617927,61.0,,71.0,71.635963
88490,CC419,0.603776,67.0,73.0,65.0,63.173836
88491,CC421,0.525326,56.0,,48.0,67.577339
88492,CC527,0.672553,75.0,56.0,83.0,68.79892
88493,EE418,0.587522,57.0,,63.0,69.783592


In [8]:


# ✅ Train a regularized XGBoost model and evaluate it on the test students
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {
    'objective': 'reg:squarederror',
    'max_depth': 5,                  # Slightly shallower trees help generalize
    'eta': 0.05,                     # Lower learning rate for smoother learning
    'subsample': 0.8,                # Use 80% of training data per tree
    'colsample_bytree': 0.8,         # Use 80% of features per tree
    'lambda': 1.0,                   # L2 regularization (default)
    'alpha': 0.0,                    # L1 regularization
    'min_child_weight': 10,          # Avoid overfitting on small leaves
    'seed': 42
}

# Rebinds `model`; the spot-check and save cells that follow use this booster.
model = xgb.train(params, dtrain, num_boost_round=100)

y_pred = model.predict(dtest)

print("✅ Test MSE:", mean_squared_error(y_test, y_pred))
print("✅ R² Score:", r2_score(y_test, y_pred))
print("✅ MAE (Mean Absolute Error):", mean_absolute_error(y_test, y_pred))

# ✅ Tolerance-based accuracy: share of predictions that fall within
# ±tolerance (relative) of the true grade.
tolerance = 0.15  # 15% tolerance

# Bounds for what is considered "acceptable"
lower = y_test * (1 - tolerance)
upper = y_test * (1 + tolerance)

within_tol = ((y_pred >= lower) & (y_pred <= upper)).sum()
total = len(y_test)

tolerance_accuracy = within_tol / total
# The label previously said "±5%" although the tolerance is 15%; it is now
# derived from `tolerance` so the printed figure is labelled correctly.
print(f"✅ Accuracy within ±{tolerance:.0%} tolerance: {tolerance_accuracy:.2%}")



✅ Test MSE: 87.82337545896743
✅ R² Score: 0.3576154467038898
✅ MAE (Mean Absolute Error): 7.4504524359320445
✅ Accuracy within ±5% tolerance: 73.06%


In [9]:
# ✅ Spot-check: predict every course of one test student's term
# A fixed seed makes re-running this cell show the same student and term;
# set DEMO_SEED = None to draw a fresh random example instead.
DEMO_SEED = 42

# Sampling from the row-level column means students with more rows are more
# likely to be picked.
random_student = test_df["student_id"].sample(1, random_state=DEMO_SEED).values[0]
student_terms = test_df[test_df["student_id"] == random_student]["term"].unique()
random_term = np.random.default_rng(DEMO_SEED).choice(student_terms)

# Copy so that adding the prediction column never writes into test_df
term_data = test_df[
    (test_df["student_id"] == random_student) & (test_df["term"] == random_term)
].copy()
term_features = term_data[features]

dterm = xgb.DMatrix(term_features)
term_data["predicted_grade"] = model.predict(dterm)

# ✅ Show results
term_data[["course_code", "current_mastery", "prereq_1_grade", "prereq_2_grade", "final_grade", "predicted_grade"]]


Unnamed: 0,course_code,current_mastery,prereq_1_grade,prereq_2_grade,final_grade,predicted_grade
65070,CC415,0.637856,63.0,,59.0,71.747787
65071,CC416,0.65779,51.0,,82.0,74.586388
65072,CC418,0.682084,51.0,,84.0,77.297272
65073,CC431,0.646773,73.0,,65.0,71.264313
65074,CC527,0.749952,61.0,70.0,83.0,80.113251
65075,IM423,0.707474,,,85.0,74.989235


In [10]:
# Persist the trained `model` to Google Drive.
# NOTE(review): the on-disk format is presumably chosen by XGBoost from the
# ".model" extension — confirm against the installed XGBoost version.
model_path = "/content/drive/My Drive/xgboost_grade_predictor.model"
model.save_model(model_path)




In [None]:
# Upgrade LightGBM to the latest available release before it is imported below.
# NOTE(review): unpinned upgrade — results may change between runs; consider
# pinning the version.
!pip install --upgrade lightgbm


Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl (3.6 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.6/3.6 MB 22.3 MB/s eta 0:00:00
Installing collected packages: lightgbm
  Attempting uninstall: lightgbm
    Found existing installation: lightgbm 4.5.0
    Uninstalling lightgbm-4.5.0:
      Successfully uninstalled lightgbm-4.5.0
Successfully installed lightgbm-4.6.0


In [None]:
# LGBM: LightGBM regressor trained on the same student-level split

# ✅ Imports used by the LightGBM section
import numpy as np
import pandas as pd
import lightgbm as lgb
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# ✅ Feature matrices and targets (train_df / test_df come from the split cell)
features = ["current_mastery", "prereq_1_grade", "prereq_2_grade"]
target = "final_grade"

X_train, y_train = train_df[features], train_df[target]
X_test, y_test = test_df[features], test_df[target]

# ✅ Native LightGBM datasets
# NOTE(review): neither object is passed to the LGBMRegressor fit below —
# confirm whether they are still needed.
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# ✅ Model configuration
lgb_params = dict(
    objective='regression',
    learning_rate=0.1,
    num_leaves=31,
    feature_fraction=0.3,
    bagging_fraction=0.2,
    bagging_freq=5,
    n_estimators=300,
    random_state=42,
)
lgb_model = LGBMRegressor(**lgb_params)

# ✅ Fit the model. The test split is supplied as eval_set, but no
# early-stopping callback or argument is passed in this call.
lgb_model.fit(X_train, y_train, eval_set=[(X_test, y_test)])




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003426 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 394
[LightGBM] [Info] Number of data points in the train set: 74237, number of used features: 3
[LightGBM] [Info] Start training from score 67.973470


In [None]:
# ✅ Predict & evaluate the LightGBM model on the held-out students
# mean_absolute_error is imported here so this cell no longer depends on an
# earlier training cell having imported it as a side effect.
from sklearn.metrics import mean_absolute_error

y_pred = lgb_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("✅ LGBM MSE:", mse)
print("✅ LGBM R² Score:", r2)
print("✅ MAE (Mean Absolute Error):", mean_absolute_error(y_test, y_pred))

# ✅ Tolerance-based accuracy: share of predictions that fall within
# ±tolerance (relative) of the true grade.
tolerance = 0.15
lower = y_test * (1 - tolerance)
upper = y_test * (1 + tolerance)

within_tol = ((y_pred >= lower) & (y_pred <= upper)).sum()
tolerance_accuracy = within_tol / len(y_test)

# Label derived from `tolerance` so it always matches the value used above.
print(f"✅ Accuracy within ±{tolerance:.0%} tolerance: {tolerance_accuracy:.2%}")


✅ LGBM MSE: 97.5371511952715
✅ LGBM R² Score: 0.2865639817086766
✅ MAE (Mean Absolute Error): 7.909629832374686
✅ Accuracy within ±15% tolerance: 70.13%


In [None]:
# AFTER CHECKING WITH DOCTORS
# ✅ Imports
!pip install catboost
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, median_absolute_error

# ✅ Prepare Pool (CatBoost can handle NaNs directly)
train_pool = Pool(X_train, y_train)
test_pool = Pool(X_test, y_test)

# ✅ Initialize and Train Model
model = CatBoostRegressor(
    iterations=500,
    learning_rate=0.1,
    depth=6,
    loss_function='RMSE',
    verbose=100,
    random_seed=42
)

model.fit(train_pool)

# ✅ Predict
y_pred = model.predict(test_pool)

# ✅ Evaluate
print("✅ Test MSE:", mean_squared_error(y_test, y_pred))
print("✅ R² Score:", r2_score(y_test, y_pred))
print("✅ MAE (Mean Absolute Error):", mean_absolute_error(y_test, y_pred))

# ✅ Optional: Accuracy within ±5% tolerance
tolerance = 0.15
lower = y_test * (1 - tolerance)
upper = y_test * (1 + tolerance)
within_tol = ((y_pred >= lower) & (y_pred <= upper)).sum()
tolerance_accuracy = within_tol / len(y_test)
print(f"✅ Accuracy within ±15%: {tolerance_accuracy:.2%}")


0:	learn: 11.3831213	total: 38.6ms	remaining: 19.3s
100:	learn: 9.3660956	total: 2.58s	remaining: 10.2s
200:	learn: 9.3404413	total: 5.51s	remaining: 8.2s
300:	learn: 9.3204727	total: 7.84s	remaining: 5.18s
400:	learn: 9.3035607	total: 8.94s	remaining: 2.21s
499:	learn: 9.2891580	total: 9.84s	remaining: 0us
✅ Test MSE: 87.35492604134357
✅ R² Score: 0.36104192249475386
✅ MAE (Mean Absolute Error): 7.418131663735804
✅ Accuracy within ±15%: 73.40%


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# ✅ Random forest on the same features and student-level split
rf_settings = {
    "n_estimators": 100,
    "max_depth": None,   # no depth limit; tune if needed
    "random_state": 42,
    "n_jobs": -1,        # use all cores for faster training
}
rf_model = RandomForestRegressor(**rf_settings).fit(X_train, y_train)

# ✅ Score the held-out students
y_pred = rf_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("✅ MAE:", mae)
print("✅ R² Score:", r2)

# ✅ Optional: share of predictions within ±15% (relative) of the true grade
tolerance = 0.15
lower, upper = y_test * (1 - tolerance), y_test * (1 + tolerance)
within_tol = (lower.le(y_pred) & upper.ge(y_pred)).sum()
tolerance_accuracy = within_tol / len(y_test)
print(f"✅ Accuracy within ±15%: {tolerance_accuracy:.2%}")

✅ MAE: 8.341975114528122
✅ R² Score: 0.17412041216401242
✅ Accuracy within ±15%: 67.68%
