In [1]:
pip install pandas numpy scikit-learn

Collecting pandas
  Using cached pandas-2.3.3-cp311-cp311-win_amd64.whl.metadata (19 kB)
Collecting numpy
  Using cached numpy-2.4.0-cp311-cp311-win_amd64.whl.metadata (6.6 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.8.0-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.3-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting scipy>=1.10.0 (from scikit-learn)
  Using cached scipy-1.16.3-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.3.0 (from scikit-learn)
  Using cached joblib-1.5.3-py3-none-any.whl.metadata (5.5 kB)
Collecting threadpoolctl>=3.2.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached pandas-2.3.3-cp311-cp311-win_amd64.whl (11.3 MB)
Using cached numpy-2.4.0-cp311-cp311-win_amd64.whl (12.6 MB)
Using cached scikit_learn-1.8.0-cp311-cp311


[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import numpy as np
import pandas as pd

np.random.seed(42)
n = 600

data = []

for i in range(n):
    if i < 200:  # Highly Engaged
        row = [
            i,
            np.random.randint(5, 10),
            np.random.randint(40, 90),
            np.random.randint(5, 15),
            np.random.randint(6, 12),
            np.random.uniform(0.5, 2),
            np.random.randint(0, 3),
            "Highly Engaged"
        ]
    elif i < 400:  # Moderately Engaged
        row = [
            i,
            np.random.randint(2, 5),
            np.random.randint(20, 45),
            np.random.randint(1, 5),
            np.random.randint(2, 6),
            np.random.uniform(2, 5),
            np.random.randint(3, 7),
            "Moderately Engaged"
        ]
    else:  # At-Risk
        row = [
            i,
            np.random.randint(0, 2),
            np.random.randint(5, 20),
            np.random.randint(0, 1),
            np.random.randint(0, 2),
            np.random.uniform(5, 12),
            np.random.randint(7, 30),
            "At-Risk"
        ]
    data.append(row)

columns = [
    "student_id", "login_frequency", "session_duration",
    "forum_participation", "assignment_access",
    "time_gap_avg", "inactivity_days", "engagement_level"
]

df = pd.DataFrame(data, columns=columns)


In [12]:
import numpy as np
import pandas as pd

np.random.seed(42)
n = 2000

data = []

def noisy(val, noise=0.15):
    return max(0, val + np.random.normal(0, val * noise))

for i in range(n):

    persona = np.random.choice(
        ["Highly Engaged", "Moderately Engaged", "At-Risk"],
        p=[0.35, 0.40, 0.25]
    )

    if persona == "Highly Engaged":
        row = [
            i,
            noisy(np.random.randint(4, 10)),
            noisy(np.random.randint(35, 120)),
            noisy(np.random.randint(2, 15)),
            noisy(np.random.randint(4, 12)),
            noisy(np.random.uniform(0.3, 3)),
            noisy(np.random.randint(0, 6)),
            persona
        ]

    elif persona == "Moderately Engaged":
        row = [
            i,
            noisy(np.random.randint(1, 6)),
            noisy(np.random.randint(15, 60)),
            noisy(np.random.randint(0, 6)),
            noisy(np.random.randint(1, 8)),
            noisy(np.random.uniform(1, 6)),
            noisy(np.random.randint(2, 15)),
            persona
        ]

    else:  # At-Risk
        row = [
            i,
            noisy(np.random.randint(0, 3)),
            noisy(np.random.randint(5, 35)),
            noisy(np.random.randint(0, 2)),
            noisy(np.random.randint(0, 3)),
            noisy(np.random.uniform(4, 15)),
            noisy(np.random.randint(5, 45)),
            persona
        ]

    data.append(row)

columns = [
    "student_id",
    "login_frequency",
    "session_duration",
    "forum_participation",
    "assignment_access",
    "time_gap_avg",
    "inactivity_days",
    "engagement_level"
]

df = pd.DataFrame(data, columns=columns)

# Optional: clip extreme noise
df.iloc[:, 1:-1] = df.iloc[:, 1:-1].clip(lower=0)


In [13]:
df.to_csv("student_engagement_data.csv", index=False)

In [14]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

X = df.drop(["student_id", "engagement_level"], axis=1)
y = LabelEncoder().fit_transform(df["engagement_level"])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


LogisticRegression

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

lr_preds = lr.predict(X_test)
lr_acc = accuracy_score(y_test, lr_preds)


RandomForestClassifier

In [16]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)

rf_preds = rf.predict(X_test)
rf_acc = accuracy_score(y_test, rf_preds)


XGBoost

In [17]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)

gb_preds = gb.predict(X_test)
gb_acc = accuracy_score(y_test, gb_preds)

In [18]:
results = pd.DataFrame({
    "Model": ["Logistic Regression", "Random Forest", "Gradient Boosting"],
    "Accuracy": [lr_acc, rf_acc, gb_acc]
})

results

Unnamed: 0,Model,Accuracy
0,Logistic Regression,0.98
1,Random Forest,0.992
2,Gradient Boosting,0.984


In [21]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

kmeans = KMeans(n_clusters=3, random_state=42)
clusters = kmeans.fit_predict(scaler.fit_transform(X))

df["cluster"] = clusters


In [22]:
sil_score = silhouette_score(X, clusters)
sil_score

0.295051497567442

In [23]:
models = {
    "logistic": lr,
    "random_forest": rf,
    "gradient_boosting": gb
}

accuracies = {
    "logistic": lr_acc,
    "random_forest": rf_acc,
    "gradient_boosting": gb_acc
}

best_model_name = max(accuracies, key=accuracies.get)
best_model = models[best_model_name]

best_model_name, accuracies[best_model_name]


('random_forest', 0.992)

In [24]:
import joblib

joblib.dump(best_model, "student_engagement_model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(LabelEncoder().fit(df["engagement_level"]), "label_encoder.pkl")

['label_encoder.pkl']

model testing

In [25]:
model = joblib.load("student_engagement_model.pkl")
scaler = joblib.load("scaler.pkl")
label_encoder = joblib.load("label_encoder.pkl")

In [64]:
new_students = pd.DataFrame([
    {
        "login_frequency": 6,
        "session_duration": 75,
        "forum_participation": 8,
        "assignment_access": 9,
        "time_gap_avg": 1.2,
        "inactivity_days": 1
    },
    {
        "login_frequency": 1,
        "session_duration": 18,
        "forum_participation": 0,
        "assignment_access": 1,
        "time_gap_avg": 7.5,
        "inactivity_days": 20
    },
    {
        "login_frequency": 3,
        "session_duration": 35,
        "forum_participation": 2,
        "assignment_access": 4,
        "time_gap_avg": 3.5,
        "inactivity_days": 6
    }
])

In [27]:
X_new_scaled = scaler.transform(new_students)

pred_encoded = model.predict(X_new_scaled)
pred_labels = label_encoder.inverse_transform(pred_encoded)

new_students["Predicted_Engagement"] = pred_labels
new_students


Unnamed: 0,login_frequency,session_duration,forum_participation,assignment_access,time_gap_avg,inactivity_days,Predicted_Engagement
0,6,75,8,9,1.2,1,Highly Engaged
1,1,18,0,1,7.5,20,At-Risk
2,3,35,2,4,3.5,6,Moderately Engaged


In [28]:
probs = model.predict_proba(X_new_scaled)

probs_df = pd.DataFrame(
    probs,
    columns=label_encoder.classes_
)

final_output = pd.concat([new_students, probs_df], axis=1)
final_output


Unnamed: 0,login_frequency,session_duration,forum_participation,assignment_access,time_gap_avg,inactivity_days,Predicted_Engagement,At-Risk,Highly Engaged,Moderately Engaged
0,6,75,8,9,1.2,1,Highly Engaged,0.0,1.0,0.0
1,1,18,0,1,7.5,20,At-Risk,1.0,0.0,0.0
2,3,35,2,4,3.5,6,Moderately Engaged,0.0,0.0,1.0


In [33]:
def predict_engagement(student_features_df):
    X_scaled = scaler.transform(student_features_df)
    preds = model.predict(X_scaled)
    probs = model.predict_proba(X_scaled)

    result = student_features_df.copy()
    result["Predicted_Engagement"] = label_encoder.inverse_transform(preds)

    for i, cls in enumerate(label_encoder.classes_):
        result[f"prob_{cls}"] = probs[:, i]

    return result


Regression Model For Engagement Score

In [35]:
def compute_engagement_score(row):
    score = (
        row["login_frequency"] * 8 +
        row["session_duration"] * 0.4 +
        row["forum_participation"] * 6 +
        row["assignment_access"] * 7 -
        row["time_gap_avg"] * 6 -
        row["inactivity_days"] * 2
    )

    # Add realism noise
    score += np.random.normal(0, 8)

    return np.clip(score, 0, 100)

df["engagement_score"] = df.apply(compute_engagement_score, axis=1)


In [50]:
X = df.drop(
    ["student_id", "engagement_level", "engagement_score","cluster"],
    axis=1
)
y = df["engagement_score"]

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

scaler_reg = StandardScaler()
X_train = scaler_reg.fit_transform(X_train)
X_test = scaler_reg.transform(X_test)


Linear Regression

In [52]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error

lr_reg = LinearRegression()
lr_reg.fit(X_train, y_train)

pred_lr = lr_reg.predict(X_test)

r2_lr = r2_score(y_test, pred_lr)
mae_lr = mean_absolute_error(y_test, pred_lr)


Random Forest Regressor

In [53]:
from sklearn.ensemble import RandomForestRegressor

rf_reg = RandomForestRegressor(
    n_estimators=200,
    random_state=42
)

rf_reg.fit(X_train, y_train)
pred_rf = rf_reg.predict(X_test)

r2_rf = r2_score(y_test, pred_rf)
mae_rf = mean_absolute_error(y_test, pred_rf)


Gradient Boosting Regressor

In [54]:
from sklearn.ensemble import GradientBoostingRegressor

gb_reg = GradientBoostingRegressor()
gb_reg.fit(X_train, y_train)

pred_gb = gb_reg.predict(X_test)

r2_gb = r2_score(y_test, pred_gb)
mae_gb = mean_absolute_error(y_test, pred_gb)


In [55]:
pd.DataFrame({
    "Model": ["Linear Regression", "Random Forest", "Gradient Boosting"],
    "R2 Score": [r2_lr, r2_rf, r2_gb],
    "MAE (↓ better)": [mae_lr, mae_rf, mae_gb]
})

Unnamed: 0,Model,R2 Score,MAE (↓ better)
0,Linear Regression,0.881088,11.559379
1,Random Forest,0.974214,3.654294
2,Gradient Boosting,0.973334,4.086405


In [56]:
import joblib

joblib.dump(rf_reg, "engagement_score_model.pkl")
joblib.dump(scaler_reg, "engagement_score_scaler.pkl")


['engagement_score_scaler.pkl']

In [59]:
X

Unnamed: 0,login_frequency,session_duration,forum_participation,assignment_access,time_gap_avg,inactivity_days
0,5.407807,31.768994,1.041856,3.454732,1.006823,5.527347
1,1.216235,46.844774,1.840455,3.929515,3.454375,3.864534
2,3.673370,21.349406,3.823961,0.995524,5.303404,6.852696
3,5.927370,40.980941,3.862330,11.642426,1.855927,2.841098
4,5.163979,92.845661,10.060468,12.585614,0.917050,3.568093
...,...,...,...,...,...,...
1995,1.949683,42.017726,0.841910,3.152163,5.440910,14.117451
1996,7.759887,123.083663,8.831432,10.285593,0.871067,0.000000
1997,4.984540,46.221091,2.185351,2.834984,3.104455,9.947205
1998,5.796020,85.104944,10.393548,6.118242,0.794359,1.734208


In [61]:
new_students

Unnamed: 0,login_frequency,session_duration,forum_participation,assignment_access,time_gap_avg,inactivity_days
0,6,75,8,9,1.2,1
1,1,18,0,1,7.5,20
2,3,35,2,4,3.5,6


In [60]:
new_students = new_students.drop(columns=["Predicted_Engagement"], errors='ignore')

In [62]:
reg_model = joblib.load("engagement_score_model.pkl")
reg_scaler = joblib.load("engagement_score_scaler.pkl")

X_new = new_students.copy()
X_new_scaled = reg_scaler.transform(X_new)

engagement_scores = reg_model.predict(X_new_scaled)

new_students["Engagement_Score (%)"] = np.round(engagement_scores, 1)
new_students


Unnamed: 0,login_frequency,session_duration,forum_participation,assignment_access,time_gap_avg,inactivity_days,Engagement_Score (%)
0,6,75,8,9,1.2,1,100.0
1,1,18,0,1,7.5,20,0.0
2,3,35,2,4,3.5,6,47.1


In [65]:
persona_preds = model.predict(scaler.transform(new_students))
persona_labels = label_encoder.inverse_transform(persona_preds)

new_students["Persona"] = persona_labels
new_students["Engagement_Score (%)"] = np.round(engagement_scores, 1)


In [66]:
def engagement_action(score):
    if score >= 80:
        return "No action"
    elif score >= 50:
        return "Light nudges"
    else:
        return "Immediate intervention"

new_students["Action"] = new_students["Engagement_Score (%)"].apply(engagement_action)


In [67]:
new_students

Unnamed: 0,login_frequency,session_duration,forum_participation,assignment_access,time_gap_avg,inactivity_days,Persona,Engagement_Score (%),Action
0,6,75,8,9,1.2,1,Highly Engaged,100.0,No action
1,1,18,0,1,7.5,20,At-Risk,0.0,Immediate intervention
2,3,35,2,4,3.5,6,Moderately Engaged,47.1,Immediate intervention
