Capstone Project: Student Success & Career Path Prediction

Scenario

The university wants to analyze student performance data to:

Predict exam scores (Regression).
Classify students into “At Risk” vs. “On Track” categories (Classification).
Cluster students into groups with similar study habits (Clustering).
Recommend interventions (extra tutoring, workshops, counseling).

Dataset source: https://github.com/himanshusar123/Datasets
Dataset name: Student Success and Career Path

In [5]:
# Colab-only step: prompt for a file upload from the local machine.
# The returned mapping is keyed by uploaded file name; the first key is
# later passed to pd.read_csv as the dataset path.
from google.colab import files
uploaded = files.upload()

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report, mean_squared_error, r2_score, accuracy_score

# --- Load and clean the uploaded dataset ------------------------------------
# Fail early with a readable message if the upload dialog was cancelled,
# instead of an IndexError on an empty key list.
if not uploaded:
    raise ValueError("No file was uploaded; upload the dataset CSV and re-run this cell.")

filename = next(iter(uploaded))
df = pd.read_csv(filename)

# Normalise headers so later lookups (e.g. "final_exam_score") are not broken
# by stray whitespace or capitalisation in the CSV header row.
df.columns = df.columns.str.strip().str.lower()

# Treat the common textual placeholders as real missing values.
df.replace(["NA", "?", "null", ""], np.nan, inplace=True)

# Text columns that only looked non-numeric because of the placeholders above
# are converted to numbers; genuinely categorical columns are left as they are.
for col in df.columns:
    if not pd.api.types.is_numeric_dtype(df[col]):
        try:
            df[col] = pd.to_numeric(df[col])
        except (ValueError, TypeError):
            # Column holds real text values; it is label-encoded further down.
            pass

# Impute missing values: median for numeric columns, most frequent value for
# categorical ones. The result is assigned back to the frame because
# `df[col].fillna(..., inplace=True)` acts on a temporary object (pandas warns
# about it and it stops working in pandas 3.0).
for col in df.columns:
    if pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].fillna(df[col].median())
    else:
        df[col] = df[col].fillna(df[col].mode()[0])

# Encode the remaining categorical columns as integer codes so the
# scikit-learn estimators below can consume them.
for col in df.columns:
    if not pd.api.types.is_numeric_dtype(df[col]):
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])

# Column every model in this notebook is built around.
target_column = "final_exam_score"

# REGRESSION
# Fit an ordinary least-squares model that predicts the target column from
# all remaining columns, then score it on a held-out 20% split.

y_reg = df[target_column]
X_reg = df.drop(columns=target_column)

X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg,
    y_reg,
    test_size=0.2,
    random_state=42,
)

reg_model = LinearRegression().fit(X_train_reg, y_train_reg)
reg_pred = reg_model.predict(X_test_reg)

# Two-row summary table (metric name, metric value) shown in the output section.
reg_r2 = r2_score(y_test_reg, reg_pred)
reg_mse = mean_squared_error(y_test_reg, reg_pred)
regression_results = pd.DataFrame(
    [
        ("R2 Score", reg_r2),
        ("Mean Squared Error", reg_mse),
    ],
    columns=["Metric", "Value"],
)

# CLASSIFICATION

# Binary label derived from the target: 1 when the score is at least 50
# (reported below as "On Track"), 0 otherwise.
df["status"] = np.where(df[target_column] >= 50, 1, 0)

# The target is dropped together with the label because the label is computed
# directly from it; keeping it would leak the answer into the features.
X_clf = df.drop([target_column, "status"], axis=1)
y_clf = df["status"]

X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X_clf, y_clf, test_size=0.2, random_state=42
)

# Fit the scaler on the training split only, then reuse it for the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_clf)
X_test_scaled = scaler.transform(X_test_clf)

clf_model = LogisticRegression()
clf_model.fit(X_train_scaled, y_train_clf)
clf_pred = clf_model.predict(X_test_scaled)

# Request both classes explicitly: with a small test split, class 1 may be
# absent from both the true and predicted labels, in which case the report
# would have no "1" entry and the lookups below would raise KeyError.
# zero_division=0 reports 0 for the resulting undefined metrics.
report = classification_report(
    y_test_clf,
    clf_pred,
    labels=[0, 1],
    zero_division=0,
    output_dict=True,
)

classification_results = pd.DataFrame({
    "Metric": ["Accuracy", "Precision (On Track)", "Recall (On Track)", "F1 Score (On Track)"],
    "Value": [
        accuracy_score(y_test_clf, clf_pred),
        report["1"]["precision"],
        report["1"]["recall"],
        report["1"]["f1-score"]
    ]
})

# CLUSTERING

# Group the rows into three clusters using the same feature set as the
# classifier (X_clf), standardised with a scaler dedicated to this step.
scaler_cluster = StandardScaler()
kmeans = KMeans(random_state=42, n_clusters=3)

scaled_data = scaler_cluster.fit_transform(X_clf)
cluster_labels = kmeans.fit_predict(scaled_data)
df["cluster"] = cluster_labels

# Cluster id -> number of rows assigned to it, as a two-column table.
cluster_sizes = df["cluster"].value_counts()
cluster_summary = cluster_sizes.reset_index()
cluster_summary.columns = ["Cluster", "Number of Students"]


# INTERVENTION

def suggest_intervention(row):
    """Return the intervention label for one row.

    Rows with ``status == 0`` always get tutoring and counseling; the
    cluster is consulted only for the remaining rows, where cluster 2
    maps to the time-management workshop and any other cluster to
    career guidance.
    """
    at_risk = row["status"] == 0
    if at_risk:
        return "Extra Tutoring & Counseling"

    in_workshop_cluster = row["cluster"] == 2
    return "Time Management Workshop" if in_workshop_cluster else "Advanced Career Guidance"

# Label every row with its suggested intervention (row-wise apply).
df["intervention"] = df.apply(suggest_intervention, axis="columns")

# Intervention label -> number of rows receiving it, as a two-column table.
intervention_counts = df["intervention"].value_counts()
intervention_summary = intervention_counts.reset_index()
intervention_summary.columns = ["Intervention Type", "Number of Students"]

# DISPLAY ORGANIZED OUTPUT

# Show each summary table under its banner, in the order the analyses ran.
report_sections = (
    ("\n========== REGRESSION RESULTS ==========", regression_results),
    ("\n========== CLASSIFICATION RESULTS ==========", classification_results),
    ("\n========== CLUSTERING SUMMARY ==========", cluster_summary),
    ("\n========== INTERVENTION SUMMARY ==========", intervention_summary),
)

for banner, table in report_sections:
    print(banner)
    display(table)

Saving Student Success & Career Path  - Sheet1.csv to Student Success & Career Path  - Sheet1 (4).csv



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)


Unnamed: 0,Metric,Value
0,R2 Score,0.927666
1,Mean Squared Error,9.638462





Unnamed: 0,Metric,Value
0,Accuracy,1.0
1,Precision (On Track),1.0
2,Recall (On Track),1.0
3,F1 Score (On Track),1.0





Unnamed: 0,Cluster,Number of Students
0,0,10
1,1,6
2,2,4





Unnamed: 0,Intervention Type,Number of Students
0,Advanced Career Guidance,12
1,Extra Tutoring & Counseling,5
2,Time Management Workshop,3
