In [1]:
pip install pandas numpy scikit-learn

Collecting pandas
  Using cached pandas-2.3.3-cp311-cp311-win_amd64.whl.metadata (19 kB)
Collecting numpy
  Using cached numpy-2.4.0-cp311-cp311-win_amd64.whl.metadata (6.6 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.8.0-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.3-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting scipy>=1.10.0 (from scikit-learn)
  Using cached scipy-1.16.3-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.3.0 (from scikit-learn)
  Using cached joblib-1.5.3-py3-none-any.whl.metadata (5.5 kB)
Collecting threadpoolctl>=3.2.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached pandas-2.3.3-cp311-cp311-win_amd64.whl (11.3 MB)
Using cached numpy-2.4.0-cp311-cp311-win_amd64.whl (12.6 MB)
Using cached scikit_learn-1.8.0-cp311-cp311


[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import numpy as np
import pandas as pd

np.random.seed(42)
n = 2000
data = []


def clamp(x, low=0, high=100):
    return max(low, min(high, x))

for i in range(n):
    historical_gpa = np.random.normal(2.8, 0.7)
    historical_gpa = np.clip(historical_gpa, 0, 4)

    eti_score = clamp(
        historical_gpa * 20 + np.random.normal(0, 15),
        0, 100
    )

    quiz_avg = clamp(
        historical_gpa * 18 + eti_score * 0.3 + np.random.normal(0, 12)
    )

    assignment_avg = clamp(
        historical_gpa * 20 + eti_score * 0.35 + np.random.normal(0, 10)
    )

    exam_score = clamp(
        historical_gpa * 25 + eti_score * 0.25 + np.random.normal(0, 15)
    )

    login_frequency = np.clip(
        np.random.normal(eti_score / 15, 1.5),
        0, 10
    )

    time_spent_hours = np.clip(
        np.random.normal(eti_score / 5, 3),
        1, 40
    )

    course_progress = clamp(
        eti_score + np.random.normal(0, 10)
    )

    # Final performance score (ground truth)
    final_score = (
        quiz_avg * 0.25 +
        assignment_avg * 0.30 +
        exam_score * 0.35 +
        eti_score * 0.10 +
        np.random.normal(0, 5)
    )

    final_score = clamp(final_score)

    data.append([
        i, quiz_avg, assignment_avg, exam_score,
        login_frequency, time_spent_hours,
        course_progress, historical_gpa,
        eti_score, final_score
    ])

columns = [
    "student_id",
    "quiz_avg",
    "assignment_avg",
    "exam_score",
    "login_frequency",
    "time_spent_hours",
    "course_progress",
    "historical_gpa",
    "eti_score",
    "final_performance_score"
]

df_perf = pd.DataFrame(data, columns=columns)


In [3]:
def grade(score):
    if score >= 85: return "A"
    elif score >= 70: return "B"
    elif score >= 55: return "C"
    elif score >= 40: return "D"
    else: return "F"

df_perf["final_grade"] = df_perf["final_performance_score"].apply(grade)


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = df_perf.drop(
    ["student_id", "final_performance_score", "final_grade"],
    axis=1
)
y = df_perf["final_performance_score"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


Linear Regression

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error

lr = LinearRegression()
lr.fit(X_train, y_train)

pred_lr = lr.predict(X_test)
r2_lr = r2_score(y_test, pred_lr)
mae_lr = mean_absolute_error(y_test, pred_lr)


Random Forest Regressor

In [6]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(
    n_estimators=300,
    random_state=42
)
rf.fit(X_train, y_train)

pred_rf = rf.predict(X_test)
r2_rf = r2_score(y_test, pred_rf)
mae_rf = mean_absolute_error(y_test, pred_rf)


Gradient Boosting Regressor

In [7]:
from sklearn.ensemble import GradientBoostingRegressor

gb = GradientBoostingRegressor()
gb.fit(X_train, y_train)

pred_gb = gb.predict(X_test)
r2_gb = r2_score(y_test, pred_gb)
mae_gb = mean_absolute_error(y_test, pred_gb)


In [8]:
pd.DataFrame({
    "Model": ["Linear Regression", "Random Forest", "Gradient Boosting"],
    "R2 Score": [r2_lr, r2_rf, r2_gb],
    "MAE (↓ better)": [mae_lr, mae_rf, mae_gb]
})

Unnamed: 0,Model,R2 Score,MAE (↓ better)
0,Linear Regression,0.927013,3.759637
1,Random Forest,0.916083,4.075988
2,Gradient Boosting,0.91676,4.095612


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

X = df_perf.drop(
    ["student_id", "final_performance_score", "final_grade"],
    axis=1
)

y = LabelEncoder().fit_transform(df_perf["final_grade"])

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    stratify=y,
    random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [11]:
from sklearn.metrics import accuracy_score, f1_score

In [12]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(max_iter=2000)
log_reg.fit(X_train, y_train)

pred_lr = log_reg.predict(X_test)
acc_lr = accuracy_score(y_test, pred_lr)
f1_lr = f1_score(y_test, pred_lr, average="macro")


In [13]:
from sklearn.svm import SVC

svm = SVC(kernel="rbf", probability=True)
svm.fit(X_train, y_train)

pred_svm = svm.predict(X_test)
acc_svm = accuracy_score(y_test, pred_svm)
f1_svm = f1_score(y_test, pred_svm, average="macro")

In [14]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)

pred_knn = knn.predict(X_test)
acc_knn = accuracy_score(y_test, pred_knn)
f1_knn = f1_score(y_test, pred_knn, average="macro")


In [15]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(
    max_depth=8,
    min_samples_leaf=20,
    random_state=42
)
dt.fit(X_train, y_train)

pred_dt = dt.predict(X_test)
acc_dt = accuracy_score(y_test, pred_dt)
f1_dt = f1_score(y_test, pred_dt, average="macro")


In [16]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=12,
    min_samples_leaf=10,
    random_state=42
)
rf.fit(X_train, y_train)

pred_rf = rf.predict(X_test)
acc_rf = accuracy_score(y_test, pred_rf)
f1_rf = f1_score(y_test, pred_rf, average="macro")


In [17]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)

pred_gb = gb.predict(X_test)
acc_gb = accuracy_score(y_test, pred_gb)
f1_gb = f1_score(y_test, pred_gb, average="macro")


In [18]:
results = pd.DataFrame({
    "Model": [
        "Logistic Regression",
        "SVM (RBF)",
        "KNN",
        "Decision Tree",
        "Random Forest",
        "Gradient Boosting"
    ],
    "Accuracy": [
        acc_lr, acc_svm, acc_knn,
        acc_dt, acc_rf, acc_gb
    ],
    "Macro F1": [
        f1_lr, f1_svm, f1_knn,
        f1_dt, f1_rf, f1_gb
    ]
})

results.sort_values("Macro F1", ascending=False)


Unnamed: 0,Model,Accuracy,Macro F1
1,SVM (RBF),0.776,0.765502
0,Logistic Regression,0.776,0.76301
4,Random Forest,0.766,0.74994
5,Gradient Boosting,0.758,0.747855
3,Decision Tree,0.726,0.723762
2,KNN,0.712,0.691377


In [19]:
import joblib

joblib.dump(lr, "performance_linear_regression.pkl")
joblib.dump(scaler, "performance_scaler.pkl")

['performance_scaler.pkl']

In [20]:
lr_model = joblib.load("performance_linear_regression.pkl")
perf_scaler = joblib.load("performance_scaler.pkl")

In [21]:
import pandas as pd

new_student = pd.DataFrame([{
    "quiz_avg": 72,
    "assignment_avg": 78,
    "exam_score": 70,
    "login_frequency": 5,
    "time_spent_hours": 18,
    "course_progress": 80,
    "historical_gpa": 3.0,
    "eti_score": 74
}])


In [22]:
X_new_scaled = perf_scaler.transform(new_student)

predicted_score = lr_model.predict(X_new_scaled)
predicted_score

array([72.91763402])

In [23]:
def score_to_grade(score):
    if score >= 85:
        return "A"
    elif score >= 70:
        return "B"
    elif score >= 55:
        return "C"
    elif score >= 40:
        return "D"
    else:
        return "F"

predicted_grade = score_to_grade(predicted_score[0])
predicted_grade


'B'