In [36]:
import pandas as pd

In [37]:
df = pd.read_csv("Student_Performance.csv")


In [38]:
# find duplicates (all occurrences, including the original)
duplicates = df[df.duplicated(keep=False)]

# print("Number of duplicate rows (including originals):", len(duplicates))

# show them as a proper table
# print(duplicates.sort_values(by=df.columns.tolist()).to_string(index=False))
df = df.drop_duplicates()
# print("Number of duplicate rows (including originals):", len(duplicates))


In [40]:
import numpy as np

# select numeric columns
numeric_cols = df.select_dtypes(include=[np.number])

# calculate skewness
skew_values = numeric_cols.skew()

# # check each column and decide missing handling
# for col, skew in skew_values.items():
#     if -0.5 <= skew <= 0.5:
#         print(f"{col}: skew = {skew:.2f} → Nearly symmetric → Use MEAN (because |skew| ≤ 0.5)")
#     elif skew > 0.5:
#         print(f"{col}: skew = {skew:.2f} → Right skew → Use MEDIAN (because skew > 0.5)")
#     else:  # skew < -0.5
#         print(f"{col}: skew = {skew:.2f} → Left skew → Use MEDIAN (because skew < -0.5)")


In [41]:
def detect_outliers_iqr(df):
    numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
    
    for col in numeric_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1

        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR

        outliers = df[(df[col] < lower) | (df[col] > upper)]
        print(f"{col}: {len(outliers)} outliers found")

# Example usage
detect_outliers_iqr(df)


Hours Studied: 0 outliers found
Previous Scores: 0 outliers found
Sleep Hours: 0 outliers found
Sample Question Papers Practiced: 0 outliers found
Performance Index: 0 outliers found


In [43]:
import matplotlib.pyplot as plt
import seaborn as sns

target = "Performance Index"

# 1. Numeric features vs Performance Index → scatter plots
# numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.drop(target)
# for col in numeric_cols:
#     plt.figure(figsize=(6,4))
#     sns.scatterplot(x=df[col], y=df[target])
#     plt.title(f"{col} vs {target}")
#     plt.show()

# # 2. Categorical features vs Performance Index → boxplots
# categorical_cols = df.select_dtypes(include=['object', 'category']).columns
# for col in categorical_cols:
#     plt.figure(figsize=(6,4))
#     sns.boxplot(x=df[col], y=df[target])
#     plt.title(f"{col} vs {target}")
#     plt.show()


In [44]:
df['Extracurricular Activities'] = df['Extracurricular Activities'].map({'Yes': 1, 'No': 0})
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 9873 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype
---  ------                            --------------  -----
 0   Hours Studied                     9873 non-null   int64
 1   Previous Scores                   9873 non-null   int64
 2   Extracurricular Activities        9873 non-null   int64
 3   Sleep Hours                       9873 non-null   int64
 4   Sample Question Papers Practiced  9873 non-null   int64
 5   Performance Index                 9873 non-null   int64
dtypes: int64(6)
memory usage: 539.9 KB


In [47]:
from sklearn.preprocessing import StandardScaler
# Select only numeric columns
numeric_cols = df.select_dtypes(include=[np.number]).columns

scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

df.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,0.775566,1.706168,1.010078,1.454025,-1.249715,1.862979
1,-0.383205,0.724912,-0.990022,-1.491315,-0.900925,0.509348
2,1.161822,-1.064438,1.010078,0.275889,-0.900925,-0.531907
3,0.003052,-1.006717,1.010078,-0.902247,-0.900925,-1.000471
4,0.775566,0.320865,-0.990022,0.864957,0.145444,0.561411


In [None]:
from sklearn.model_selection import train_test_split

# Features (everything except Performance Index)
X = df.drop("Performance Index", axis=1)

# Target (Performance Index)
y = df["Performance Index"]

# Split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [49]:
from sklearn.linear_model import LinearRegression

# Initialize model
model = LinearRegression()

# Train (fit) the model
model.fit(X_train, y_train)


In [50]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict on test data
y_pred = model.predict(X_test)

# Evaluation metrics
print("MSE:", mean_squared_error(y_test, y_pred))
print("R2 Score:", r2_score(y_test, y_pred))


MSE: 0.011671265615520246
R2 Score: 0.9884301209927054


In [None]:
# Hours Studied = 8
# Previous Scores = 70
# Extracurricular Activities = 1 (Yes)
# Sleep Hours = 6
# Sample Question Papers Practiced = 5

import numpy as np

# New data (must be in 2D array and follow same feature order as training)
new_data = np.array([[8, 70, 1, 6, 5]])  

# Predict
prediction = model.predict(new_data)
print("Predicted Performance Index:", prediction[0])


Predicted Performance Index: 67.78146659054217


