In [9]:
# Capstone Project: Student Success & Career Path Prediction
# Scenario
# The university wants to analyze student performance data to:
# Predict exam scores (Regression).
# Classify students into “At Risk” vs. “On Track” categories (Classification).
# Cluster students into groups with similar study habits (Clustering).
# Recommend interventions (extra tutoring, workshops, counseling).
# https://github.com/himanshusar123/Datasets
# Student Success and Career Path

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import classification_report, accuracy_score

df = pd.read_csv("Student Success & Career Path  - Sheet1.csv")

df = df.dropna()

le = LabelEncoder()

df["Gender"] = le.fit_transform(df["Gender"])
df["Pass_Fail"] = le.fit_transform(df["Pass_Fail"])


df = df.drop("Student_ID", axis=1)



X_reg = df.drop("Final_Exam_Score", axis=1)
y_reg = df["Final_Exam_Score"]

X_train, X_test, y_train, y_test = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

reg_model = LinearRegression()
reg_model.fit(X_train_scaled, y_train)

y_pred = reg_model.predict(X_test_scaled)

print("Regression Results")
print("R2 Score:", r2_score(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))

df["status"] = np.where(df["Final_Exam_Score"] < 50, 0, 1)


X_clf = df.drop(["Final_Exam_Score", "status"], axis=1)
y_clf = df["status"]

X_train, X_test, y_train, y_test = train_test_split(
    X_clf, y_clf, test_size=0.2, random_state=42, stratify=y_clf
)

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

clf_model = LogisticRegression(max_iter=1000)
clf_model.fit(X_train_scaled, y_train)

y_pred = clf_model.predict(X_test_scaled)

print("\nClassification Results")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))



cluster_features = df[[
    "Hours_Studied",
    "Attendance (%)",
    "Assignments_Submitted",
    "Participation_Score"
]]

cluster_scaled = scaler.fit_transform(cluster_features)

kmeans = KMeans(n_clusters=3, random_state=42)
df["cluster"] = kmeans.fit_predict(cluster_scaled)

print("\nCluster Distribution:")
print(df["cluster"].value_counts())


def recommend(row):
    if row["status"] == 0 and row["Attendance (%)"] < 60:
        return "Counseling + Attendance Monitoring"
    elif row["status"] == 0:
        return "Extra Tutoring"
    elif row["cluster"] == 2:
        return "Advanced Workshops"
    else:
        return "Regular Monitoring"

df["recommendation"] = df.apply(recommend, axis=1)

print("\nSample Recommendations:")
print(df[["Final_Exam_Score", "status", "cluster", "recommendation"]].head())

Regression Results
R2 Score: 0.9285287923994713
MSE: 9.523538412770446

Classification Results
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         3

    accuracy                           1.00         4
   macro avg       1.00      1.00      1.00         4
weighted avg       1.00      1.00      1.00         4

Accuracy: 1.0

Cluster Distribution:
cluster
0    8
2    6
1    6
Name: count, dtype: int64

Sample Recommendations:
   Final_Exam_Score  status  cluster                      recommendation
0                78       1        0                  Regular Monitoring
1                55       1        2                  Advanced Workshops
2                88       1        0                  Regular Monitoring
3                45       0        1  Counseling + Attendance Monitoring
4                70       1        2                  Advanced Workshops
