In [9]:
import pandas as pd

In [10]:
df = pd.read_csv("Student_Performance.csv")


In [11]:
# find duplicates (all occurrences, including the original)
duplicates = df[df.duplicated(keep=False)]

# print("Number of duplicate rows (including originals):", len(duplicates))

# show them as a proper table
# print(duplicates.sort_values(by=df.columns.tolist()).to_string(index=False))
df = df.drop_duplicates()
# print("Number of duplicate rows (including originals):", len(duplicates))


In [12]:
import numpy as np

# select numeric columns
numeric_cols = df.select_dtypes(include=[np.number])

# calculate skewness
skew_values = numeric_cols.skew()

# # check each column and decide missing handling
# for col, skew in skew_values.items():
#     if -0.5 <= skew <= 0.5:
#         print(f"{col}: skew = {skew:.2f} → Nearly symmetric → Use MEAN (because |skew| ≤ 0.5)")
#     elif skew > 0.5:
#         print(f"{col}: skew = {skew:.2f} → Right skew → Use MEDIAN (because skew > 0.5)")
#     else:  # skew < -0.5
#         print(f"{col}: skew = {skew:.2f} → Left skew → Use MEDIAN (because skew < -0.5)")


In [13]:
def detect_outliers_iqr(df):
    numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
    
    for col in numeric_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1

        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR

        outliers = df[(df[col] < lower) | (df[col] > upper)]
        print(f"{col}: {len(outliers)} outliers found")

# Example usage
detect_outliers_iqr(df)


Hours Studied: 0 outliers found
Previous Scores: 0 outliers found
Sleep Hours: 0 outliers found
Sample Question Papers Practiced: 0 outliers found
Performance Index: 0 outliers found


In [14]:
import matplotlib.pyplot as plt
import seaborn as sns

target = "Performance Index"

# 1. Numeric features vs Performance Index → scatter plots
# numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.drop(target)
# for col in numeric_cols:
#     plt.figure(figsize=(6,4))
#     sns.scatterplot(x=df[col], y=df[target])
#     plt.title(f"{col} vs {target}")
#     plt.show()

# # 2. Categorical features vs Performance Index → boxplots
# categorical_cols = df.select_dtypes(include=['object', 'category']).columns
# for col in categorical_cols:
#     plt.figure(figsize=(6,4))
#     sns.boxplot(x=df[col], y=df[target])
#     plt.title(f"{col} vs {target}")
#     plt.show()


In [22]:
df['Extracurricular Activities Mapped'] = df['Extracurricular Activities'].map({'Yes': 1, 'No': 0})
df["Extracurricular Activities Mapped"].dtype

dtype('int64')

In [None]:
# df = df.drop(columns=['Extracurricular Activities'])


In [73]:
from sklearn.preprocessing import StandardScaler

X = df.drop("Performance Index", axis=1)  # features only
y = df["Performance Index"]               # target


numeric_cols = ["Hours Studied", "Previous Scores", "Sleep Hours", "Sample Question Papers Practiced"]
scaler = StandardScaler()
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

df.head()

Unnamed: 0,Hours Studied,Previous Scores,Sleep Hours,Sample Question Papers Practiced,Performance Index,Extracurricular Activities Mapped
0,0.775566,1.706168,1.454025,-1.249715,1.862979,1.010078
1,-0.383205,0.724912,-1.491315,-0.900925,0.509348,-0.990022
2,1.161822,-1.064438,0.275889,-0.900925,-0.531907,1.010078
3,0.003052,-1.006717,-0.902247,-0.900925,-1.000471,1.010078
4,0.775566,0.320865,0.864957,0.145444,0.561411,-0.990022


In [None]:
from sklearn.model_selection import train_test_split

# Features (everything except Performance Index)
X = df.drop("Performance Index", axis=1)
print(X.dtypes)
# Target (Performance Index)
y = df["Performance Index"]

# Split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Index(['Hours Studied', 'Previous Scores', 'Sleep Hours',
       'Sample Question Papers Practiced',
       'Extracurricular Activities Mapped'],
      dtype='object')


In [75]:
from sklearn.linear_model import LinearRegression

# Initialize model
model = LinearRegression()

# Train (fit) the model
model.fit(X_train, y_train)

In [76]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict on test data
y_pred = model.predict(X_test)

# Evaluation metrics
print("MSE:", mean_squared_error(y_test, y_pred))
print("R2 Score:", r2_score(y_test, y_pred))


MSE: 0.011671265615520251
R2 Score: 0.9884301209927054


In [77]:
# New data as dict (without square brackets)
new_data_dict = {
    "Hours Studied": 8,
    "Previous Scores": 70,
    "Sleep Hours": 6,
    "Sample Question Papers Practiced": 5,
     "Extracurricular Activities Mapped": 1
}

# Convert to DataFrame
X_new = pd.DataFrame([new_data_dict])

import numpy as np

# New data (must be in 2D array and follow same feature order as training)
new_data = np.array([[8, 70, 1, 6, 5]])  
X_new = X_new[model.feature_names_in_]

# Predict
prediction = model.predict(X_new)
print("Predicted Performance Index:", prediction[0])


Predicted Performance Index: 67.78146659054217


In [78]:
import joblib
joblib.dump(model, "std_performance_model.pkl")
joblib.dump(scaler, "scaler.pkl")

['scaler.pkl']

In [79]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
import json

# Predictions
y_pred = model.predict(X_test)

# Regression metrics
r2   = r2_score(y_test, y_pred)
mse  = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae  = mean_absolute_error(y_test, y_pred)

print("R² Score:", r2)
print("MSE:", mse)
print("RMSE:", rmse)
print("MAE:", mae)

# Save to JSON
metrics = {
    "r2_score": r2,
    "mse": mse,
    "rmse": rmse,
    "mae": mae
}

with open("regression_metrics.json", "w") as f:
    json.dump(metrics, f, indent=4)


R² Score: 0.9884301209927054
MSE: 0.011671265615520251
RMSE: 0.10803363187230286
MAE: 0.08574577950768568


In [63]:
import pandas as pd
import joblib

# Load your trained model and scaler
model = joblib.load("std_performance_model.pkl")
scaler = joblib.load("scaler.pkl")

In [81]:
import pandas as pd
import joblib

# Load model and scaler
model = joblib.load("std_performance_model.pkl")
scaler = joblib.load("scaler.pkl")

# New data
new_data = {
    "Hours Studied": 8,
    "Previous Scores": 170,
    "Extracurricular Activities Mapped": 1,
    "Sleep Hours": 6,
    "Sample Question Papers Practiced": 15
}

# Convert to DataFrame
X_new = pd.DataFrame([new_data])

# --- SCALE first: only numeric columns that were scaled during training ---
numeric_cols_to_scale = ["Hours Studied", "Previous Scores", "Sleep Hours", "Sample Question Papers Practiced"]
X_new[numeric_cols_to_scale] = scaler.transform(X_new[numeric_cols_to_scale])

# --- THEN reorder columns for model ---
X_new = X_new[model.feature_names_in_]

# Predict
prediction = model.predict(X_new)[0]

print("Prediction:", prediction)


Prediction: 159.9227511459848
