### Importing Required Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

### Load the Dataset

In [2]:
df = pd.read_csv("student_habits_performance.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'student_habits_performance.csv'

### Initial Data Overview

In [None]:
# Checking the shape of the dataset
print(df.shape)

# Checking for missing values
print(df.isnull().sum())

# Checking mode and value counts of 'parental_education_level'
print(df.parental_education_level.mode())
print(df.parental_education_level.value_counts())

### Handling Missing Values

In [None]:
df1 = df.copy()
df1.parental_education_level = df1.parental_education_level.fillna("High School")

# Confirming missing values have been filled
# print(df1.info())

###  Visualizing Outliers Using Boxplots

In [None]:
fig, axes = plt.subplots(3, 3)
plt.tight_layout(pad=2)

axes[0, 0].boxplot(x=df1.age)
axes[0, 1].boxplot(df1.study_hours_per_day)
axes[0, 2].boxplot(df1.social_media_hours)
axes[1, 0].boxplot(df1.netflix_hours)
axes[1, 1].boxplot(df1.attendance_percentage)
axes[1, 2].boxplot(df1.sleep_hours)
axes[2, 0].boxplot(df1.exercise_frequency)
axes[2, 1].boxplot(df1.mental_health_rating)
axes[2, 2].boxplot(df1.exam_score)

plt.show()

###  Outlier Treatment Using IQR Method

In [None]:
for col in df1.select_dtypes(include=['float64', 'int64']):
    Q1 = df1[col].quantile(0.25)
    Q3 = df1[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    is_outlier = (df1[col] < lower) | (df1[col] > upper)

    # Replace outliers with mean
    df1.loc[is_outlier, col] = df1[col].mean()

###  Analyzing Categorical Features

In [None]:
print(df1.gender.value_counts())
print(df1.part_time_job.value_counts())
print(df1.diet_quality.value_counts())
print(df1.exercise_frequency.value_counts())
print(df1.parental_education_level.value_counts())
print(df1.internet_quality.value_counts())
print(df1.extracurricular_participation.value_counts())

### Visualizing Categorical Distributions

In [None]:
fig, axes = plt.subplots(3, 2, figsize=(12, 10))
plt.tight_layout(pad=2)

sns.countplot(x='gender', data=df1, ax=axes[0, 0])
sns.countplot(x='part_time_job', data=df1, ax=axes[0, 1])
sns.countplot(x='diet_quality', data=df1, ax=axes[1, 0])
sns.countplot(x='exercise_frequency', data=df1, ax=axes[1, 1])
sns.countplot(x='parental_education_level', data=df1, ax=axes[2, 0])
sns.countplot(x='internet_quality', data=df1, ax=axes[2, 1])

plt.show()

sns.countplot(df1.extracurricular_participation)
plt.show()

###  Correlation Heatmap with Encoded Data

In [None]:
df1_numeric = df1.select_dtypes(include=['number']).copy()
df1_numeric['gender'] = df1['gender'].map({'Male': 0, 'Female': 1})
df1_numeric['part_time_job'] = df1['part_time_job'].map({'No': 0, 'Yes': 1})

plt.figure(figsize=(10, 10))
corr = df1_numeric.corr()
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.show()

### Grouped Analysis with Heatmaps

In [None]:
numeric_cols = df.select_dtypes(include='number').columns

grouped1 = df.groupby('parental_education_level')[numeric_cols].mean()
grouped2 = df.groupby('diet_quality')[numeric_cols].mean()
grouped3 = df.groupby('internet_quality')[numeric_cols].mean()

fig, axes = plt.subplots(1, 3, figsize=(16, 6))

sns.heatmap(grouped1, ax=axes[0], annot=True, cmap='YlGnBu')
axes[0].set_title("Heatmap 1: Parental Education Level")

sns.heatmap(grouped2, ax=axes[1], annot=True, cmap='coolwarm')
axes[1].set_title("Heatmap 2: Diet Quality")

sns.heatmap(grouped3, ax=axes[2], annot=True, cmap='coolwarm')
axes[2].set_title("Heatmap 3: Internet Quality")

plt.tight_layout()
plt.show()

### One-Hot Encoding of Categorical Variables

In [None]:
categorical_cols = df1.select_dtypes(include='object').columns
df_encoded = pd.get_dummies(df1, columns=categorical_cols, drop_first=True)

### Train-Test Split

In [None]:
X = df_encoded.drop('exam_score', axis=1)
y = df_encoded['exam_score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Training the Linear Regression Model

In [None]:
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

### Model Evaluation

In [None]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R² Score: {r2}")