# Student Performance Analysis & Prediction
This notebook analyzes student performance data: it performs exploratory data analysis (EDA) and feature engineering, then predicts each student's average score with Random Forest and XGBoost regression models.

# Import libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

# Load and view data

In [None]:
# Read the student records from the CSV file into a DataFrame.
csv_path = "Expanded_data_with_more_features.csv"
df = pd.read_csv(csv_path)

# Preview the first rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0.1,Unnamed: 0,Gender,EthnicGroup,ParentEduc,LunchType,TestPrep,ParentMaritalStatus,PracticeSport,IsFirstChild,NrSiblings,TransportMeans,WklyStudyHours,MathScore,ReadingScore,WritingScore
0,0,female,,bachelor's degree,standard,none,married,regularly,yes,3.0,school_bus,< 5,71,71,74
1,1,female,group C,some college,standard,,married,sometimes,yes,0.0,,5 - 10,69,90,88
2,2,female,group B,master's degree,standard,none,single,sometimes,yes,4.0,school_bus,< 5,87,93,91
3,3,male,group A,associate's degree,free/reduced,none,married,never,no,1.0,,5 - 10,45,56,42
4,4,male,group C,some college,standard,none,married,sometimes,yes,0.0,school_bus,5 - 10,76,78,75


# Rename columns and handle missing values

In [None]:

# Map the CSV's CamelCase headers onto the snake_case names used from here on.
column_renames = {
    'Gender': 'gender',
    'EthnicGroup': 'ethnic_group',
    'ParentEduc': 'parent_education',
    'LunchType': 'lunch_type',
    'TestPrep': 'test_prep',
    'ParentMaritalStatus': 'parent_marital_status',
    'PracticeSport': 'practice_sport',
    'IsFirstChild': 'is_first_child',
    'NrSiblings': 'no_of_siblings',
    'TransportMeans': 'transport_means',
    'WklyStudyHours': 'weekly_study_hours',
    'MathScore': 'math_score',
    'ReadingScore': 'reading_score',
    'WritingScore': 'writing_score',
}
df = df.rename(columns=column_renames)

# Remove the 'Unnamed: 0' column, which the analysis does not use
# (presumably a row index saved with the CSV -- TODO confirm).
# errors='ignore' keeps the cell re-runnable once the column is gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')

# Impute gaps with each column's most frequent value
# (.mode() returns the modes sorted, .iloc[0] takes the first one).
cols_to_fill = [
    'ethnic_group', 'parent_education', 'test_prep',
    'parent_marital_status', 'practice_sport',
    'is_first_child', 'no_of_siblings',
    'transport_means', 'weekly_study_hours',
]
most_frequent = df[cols_to_fill].mode().iloc[0]
df[cols_to_fill] = df[cols_to_fill].fillna(most_frequent)

# Count of values still missing in each column.
df.isna().sum()

Unnamed: 0,0
gender,0
ethnic_group,0
parent_education,0
lunch_type,0
test_prep,0
parent_marital_status,0
practice_sport,0
is_first_child,0
no_of_siblings,0
transport_means,0


# Feature Engineering

In [None]:
# Total & Average scores across the three subject tests
df["total_score"] = df["math_score"] + df["reading_score"] + df["writing_score"]
df["avg_score"] = df["total_score"] / 3

# Performance Levels: Low = [0, 60], Medium = (60, 80], High = (80, 100].
# include_lowest=True closes the first bin on the left; without it pd.cut
# uses (0, 60] and an average of exactly 0 would be labelled NaN.
df["performance_level"] = pd.cut(
    df["avg_score"],
    bins=[0, 60, 80, 100],
    labels=["Low", "Medium", "High"],
    include_lowest=True,
)

# Convert study hours
def convert_study_hours(x):
    """Translate a weekly-study-hours band label into a representative number.

    Args:
        x: Band label; the recognised values are "< 5", "5 - 10" and "> 10".

    Returns:
        2.5, 7.5 or 12 for the recognised labels respectively, and
        ``np.nan`` for anything else.
    """
    band_hours = (
        ("< 5", 2.5),
        ("5 - 10", 7.5),
        ("> 10", 12),
    )
    # Scan in the same order and with the same equality test as an
    # if/elif chain, returning on the first matching label.
    for label, hours in band_hours:
        if x == label:
            return hours
    return np.nan

# Numeric counterpart of the study-hours band, computed element by element.
df['weekly_study_hours_num'] = df['weekly_study_hours'].map(convert_study_hours)

# Exploratory Data Analysis (EDA)

In [None]:
# Basic Distribution Plots

# Gender: draw the bars once and keep the returned Axes so the count labels
# attach to those same bars (a second countplot call on the same axes would
# draw the bars again on top of the first set).
ax = sns.countplot(data=df, x='gender')
ax.bar_label(ax.containers[0])
plt.title('Gender Distribution')
plt.show()

# Number of students in each ethnic group
sns.countplot(data=df, x='ethnic_group')
plt.title('Ethnic Group Distribution')
plt.show()

# Number of students in each Low / Medium / High performance band
sns.countplot(data=df, x="performance_level")
plt.title("Student Performance Levels")
plt.show()

In [None]:
# Correlation heatmap
# Pairwise correlations between the three subject scores and their average.
score_columns = ["math_score", "reading_score", "writing_score", "avg_score"]
corr = df[score_columns].corr()
sns.heatmap(data=corr, cmap="coolwarm", annot=True)
plt.title("Score Correlation Heatmap")
plt.show()


In [None]:
# Group analysis heatmaps
subject_scores = ["math_score", "reading_score", "writing_score"]

# Scores vs Parent Education: mean of each subject per education level
gb = df.groupby('parent_education')[subject_scores].mean()
sns.heatmap(data=gb, annot=True)
plt.title("Scores vs Parent Education")
plt.show()

# Scores vs Parent Marital Status: mean of each subject per marital status
gb1 = df.groupby('parent_marital_status')[subject_scores].mean()
sns.heatmap(data=gb1, annot=True)
plt.title("Scores vs Parent Marital Status")
plt.show()

In [None]:
# Pivot table for study hours and test prep:
# mean average score for every (study-hours band, test-prep status) pair.
pivot_table = df.pivot_table(
    values="avg_score",
    index="weekly_study_hours",
    columns="test_prep",
    aggfunc="mean"
)

# The band labels are strings, so the pivot rows come out sorted as text
# ("5 - 10", "< 5", "> 10"). Put them back into increasing study-time order;
# any label outside the known bands is kept, after the known ones.
study_hours_order = ["< 5", "5 - 10", "> 10"]
row_order = [label for label in study_hours_order if label in pivot_table.index]
row_order += [label for label in pivot_table.index if label not in study_hours_order]
pivot_table = pivot_table.loc[row_order]

sns.heatmap(pivot_table, annot=True, cmap="coolwarm")
plt.title("Study Hours + Test Prep Impact")
plt.show()

In [None]:
# Box plots and Pie chart

# Spread of the average score for each gender
sns.boxplot(x="gender", y="avg_score", data=df)
plt.title("Gender vs Performance")
plt.show()

# Spread of the average score for each lunch type
sns.boxplot(x="lunch_type", y="avg_score", data=df)
plt.title("Lunch Type vs Performance")
plt.show()

# Share of students per ethnic group; the counts supply both the wedge
# sizes and (through their index) the wedge labels.
ethnic_counts = df["ethnic_group"].value_counts()
plt.pie(ethnic_counts, labels=ethnic_counts.index, autopct="%1.1f%%")
plt.title("Ethnic Group Distribution")
plt.show()

In [None]:
#Encoding categorical features
# Work on a copy so the human-readable labels in `df` stay available.
df_encoded = df.copy()
categorical_cols = [
    'gender', 'ethnic_group', 'parent_education', 'lunch_type', 'test_prep',
    'parent_marital_status', 'practice_sport', 'is_first_child', 'transport_means'
]

# Replace each category label with an integer code. The fitted encoder of
# every column is kept so the codes can be translated back later with
# label_encoders[col].inverse_transform(...) or inspected via .classes_.
label_encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    df_encoded[col] = encoder.fit_transform(df_encoded[col])
    label_encoders[col] = encoder

In [None]:
# Feature selection
# Predictors: the encoded categorical columns plus the two numeric columns.
feature_columns = [*categorical_cols, 'no_of_siblings', 'weekly_study_hours_num']
X = df_encoded[feature_columns]

# Target: the average score.
y = df_encoded['avg_score']

In [None]:
# Train - test split
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print(f"Training set size: {len(X_train)} samples")
print(f"Testing set size: {len(X_test)} samples")

In [None]:
# Model Training
# Both regressors are fitted on the training rows with a fixed seed and no
# other settings passed, then asked to predict the held-out test rows.

# Random Forest
rf_model = RandomForestRegressor(random_state=42).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# XGBoost
xgb_model = xgb.XGBRegressor(random_state=42).fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

In [None]:
#Model Evaluation
def evaluate_model(y_true, y_pred, model_name):
    """Print MAE, MSE, RMSE and R2 for one model's predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.
        model_name: Label shown in the report's header line.

    Returns:
        None. The report is written to standard output, each metric rounded
        to four decimals, followed by a blank line.
    """
    abs_err = mean_absolute_error(y_true, y_pred)
    sq_err = mean_squared_error(y_true, y_pred)
    root_sq_err = np.sqrt(sq_err)  # RMSE is the square root of the MSE
    r_squared = r2_score(y_true, y_pred)

    report_lines = [
        f"--- {model_name} Performance ---",
        f"Mean Absolute Error (MAE): {abs_err:.4f}",
        f"Mean Squared Error (MSE): {sq_err:.4f}",
        f"Root Mean Squared Error (RMSE): {root_sq_err:.4f}",
        f"R-squared (R2): {r_squared:.4f}\n",
    ]
    for line in report_lines:
        print(line)

# Report the test-set metrics of each model, Random Forest first.
model_predictions = (
    ("RandomForestRegressor", y_pred_rf),
    ("XGBRegressor", y_pred_xgb),
)
for model_label, predictions in model_predictions:
    evaluate_model(y_test, predictions, model_label)

In [None]:
#Feature Importance
# Pair each XGBoost importance value with its feature name, then draw the
# values as horizontal bars sorted in ascending order.
importance = pd.Series(xgb_model.feature_importances_, index=X.columns)
importance_sorted = importance.sort_values()
importance_sorted.plot.barh(figsize=(10, 6))
plt.title("Feature Importance Affecting Student Performance (XGBoost)")
plt.show()