# Data Exploration

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw dataset; the path is resolved relative to the notebook's working directory.
DATA_PATH = "student_habits_performance.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# First rows, to eyeball the column layout and value formats.
print(df.head())

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() would add a stray "None" line.
df.info()

# Summary statistics of the numeric columns.
print(df.describe())

# Number of missing values per column.
print(df.isnull().sum())

In [None]:
# Histogram of the target with a kernel-density curve drawn on top.
ax = sns.histplot(df['exam_score'], kde=True)
ax.set_title('Distribution of Exam Scores')
plt.show()

# Preprocessing

In [None]:
# NOTE: this cell rebinds `df`. Re-running it without reloading the CSV fails
# on the drop below (the column is already gone), which is the safe outcome.

# Identifier column, not a feature.
df = df.drop(columns='student_id')

# Two-level categoricals are encoded as a single 0/1 column.
binary_maps = {
    'gender': {'Male': 0, 'Female': 1},
    'part_time_job': {'No': 0, 'Yes': 1},
    'extracurricular_participation': {'No': 0, 'Yes': 1},
}
# Multi-level categoricals are one-hot encoded.
onehot_cols = ['diet_quality', 'parental_education_level', 'internet_quality']

for col, mapping in binary_maps.items():
    observed = set(df[col].dropna().unique())
    if observed <= set(mapping):
        df[col] = df[col].map(mapping)
    else:
        # Series.map turns every value that is not a key of the mapping into
        # NaN; mean-imputing that later would put a fractional value into a
        # 0/1 column. A column with extra levels is one-hot encoded instead.
        onehot_cols.append(col)

# get_dummies ignores NaN by default, so a missing category would become an
# all-zero row, indistinguishable from the baseline level removed by
# drop_first. Give missing values their own explicit level.
df = df.fillna({col: 'Unknown' for col in onehot_cols})
df = pd.get_dummies(df, columns=onehot_cols, drop_first=True, dtype=int)

# A missing target cannot be imputed without fabricating labels, so such rows
# are dropped rather than filled.
df = df.dropna(subset=['exam_score'])

# Mean-impute the remaining gaps in numeric feature columns only; the target
# is excluded from the fill values and therefore left untouched.
# NOTE(review): the means are computed over all rows, before the train/test
# split performed later in the notebook.
feature_means = df.drop(columns='exam_score').mean(numeric_only=True)
df = df.fillna(feature_means)


In [None]:
# Show the preprocessed frame as this cell's output.
df

In [None]:
# Separate the regression target from the feature matrix.
target_col = 'exam_score'
X = df.drop(columns=target_col)
Y = df[target_col]

In [154]:
# Sanity check before modelling: total NaN count in the features and in the target.
missing_in_features = X.isna().sum().sum()
missing_in_target = Y.isna().sum()
print("Number of missing values in X:", missing_in_features)
print("Number of missing values in y:", missing_in_target)

Number of missing values in X: 0
Number of missing values in y: 0


In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise every feature column.
# NOTE(review): the scaler is fitted on all rows of X, including rows that the
# train/test split further down assigns to the test set.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the *unscaled* features first, so the held-out rows play no part in
# estimating the scaling statistics (fitting the scaler on the full data and
# splitting afterwards leaks test-set information into training).
# Same test_size and random_state as before, so the row partition is unchanged.
X_train_raw, X_test_raw, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Fit on the training partition only, then apply that one transform to both
# partitions. `scaler` is rebound so that any later use of it matches the
# scaling the models were trained with.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Model Training

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline, fitted on the training split.
model = LinearRegression()
model.fit(X=X_train, y=Y_train)

## Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest of 100 trees; the fixed seed keeps the fitted forest reproducible.
rf_model = RandomForestRegressor(random_state=42, n_estimators=100)
rf_model.fit(X=X_train, y=Y_train)

# Model Evaluation

## Linear Regression

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Score the linear model on the held-out split.
Y_pred = model.predict(X_test)

mae = mean_absolute_error(y_true=Y_test, y_pred=Y_pred)
r2 = r2_score(y_true=Y_test, y_pred=Y_pred)
mse = mean_squared_error(y_true=Y_test, y_pred=Y_pred)
rmse = np.sqrt(mse)  # square root of the MSE, i.e. back in exam-score units


In [None]:
# Report the linear-regression test metrics, one per line, rounded to two decimals.
print(
    f"Mean Absolute Error (MAE): {mae:.2f}",
    f"Root Mean Squared Error (RMSE): {rmse:.2f}",
    f"R-squared (R²): {r2:.2f}",
    sep="\n",
)

## Random Forest Regressor

In [None]:
# Score the random forest on the same held-out split as the linear model.
Y_rf_pred = rf_model.predict(X_test)

rf_mae = mean_absolute_error(y_true=Y_test, y_pred=Y_rf_pred)
rf_r2 = r2_score(y_true=Y_test, y_pred=Y_rf_pred)
rf_mse = mean_squared_error(y_true=Y_test, y_pred=Y_rf_pred)
rf_rmse = np.sqrt(rf_mse)  # square root of the MSE, i.e. back in exam-score units

In [None]:
# Report the random-forest test metrics, one per line, rounded to two decimals.
print(
    "Random Forest Regressor Performance:",
    f"MAE: {rf_mae:.2f}",
    f"RMSE: {rf_rmse:.2f}",
    f"R²: {rf_r2:.2f}",
    sep="\n",
)

## Model Comparison

In [None]:
# One record per model; kept as a plain list so further models can be appended.
# NOTE(review): the key 'Mode Name' looks like a typo for 'Model Name'. It is
# left unchanged here because other cells may rely on the exact key.
model_results = [
    {'Mode Name': 'Linear Regression', 'MAE': mae, 'RMSE': rmse, 'R2': r2},
    {'Mode Name': 'Random Forest Regressor', 'MAE': rf_mae, 'RMSE': rf_rmse, 'R2': rf_r2},
]

# Side-by-side comparison table of the test metrics.
results_df = pd.DataFrame(model_results)

print(results_df)

In [None]:
import matplotlib.pyplot as plt

def plot_actual_vs_predicted(y_true, y_pred, model_name):
    """Scatter predicted against actual exam scores for one model.

    A dashed red y = x line spanning the range of ``y_true`` is drawn as a
    reference: points on the line are perfect predictions.

    Parameters
    ----------
    y_true : array-like providing ``.min()`` / ``.max()``
        Actual exam scores (x axis).
    y_pred : array-like
        Predicted exam scores, aligned with ``y_true`` (y axis).
    model_name : str
        Name shown in parentheses in the figure title.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(y_true, y_pred, edgecolors=(0, 0, 0))

    # Reference diagonal over the observed range of actual scores.
    lowest, highest = y_true.min(), y_true.max()
    ax.plot([lowest, highest], [lowest, highest], 'r--', lw=2)

    ax.set_xlabel('Actual Exam Scores')
    ax.set_ylabel('Predicted Exam Scores')
    ax.set_title(f'Actual vs Predicted Exam Scores ({model_name})')
    ax.grid(True)
    plt.show()


# One figure per model, both against the same held-out targets.
plot_actual_vs_predicted(Y_test, Y_pred, 'Linear Regression')
plot_actual_vs_predicted(Y_test, Y_rf_pred, 'Random Forest Regressor')

# Hyperparameter Tuning