In [None]:
# -----------------------------
# 1. Import Libraries
# -----------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

# -----------------------------
# 2. Load Dataset
# -----------------------------
# Read the raw CSV; the path is resolved relative to the working directory.
df = pd.read_csv('StudentsPerformance.csv')

# -----------------------------
# 3. Initial Exploration
# -----------------------------
print(f"Dataset Shape: {df.shape}")
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would emit a stray "None" line after the report.
df.info()
print("\nMissing Values:")
print(df.isnull().sum())
print("\nData Types:")
print(df.dtypes)

# -----------------------------
# 4. Create Average Score Column (Optional Bonus)
# -----------------------------
# Sum the three subject scores per row, then divide by the number of subjects.
score_sum = df['math score'] + df['reading score'] + df['writing score']
df['average score'] = score_sum / 3

# -----------------------------
# 5. EDA: Dataset Visualizations
# -----------------------------

# Category frequencies: one count plot per categorical column
categorical_cols = [
    'gender',
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'test preparation course',
]
for column in categorical_cols:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.countplot(x=column, data=df, ax=ax)
    ax.set_title(f'Distribution of {column}')
    # Tilt the tick labels so long category names do not overlap.
    plt.xticks(rotation=45)
    plt.show()

# Score distributions: one 30-bin histogram with a KDE overlay per subject
numerical_cols = ['math score', 'reading score', 'writing score']
for column in numerical_cols:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.histplot(df[column], bins=30, kde=True, ax=ax)
    ax.set_title(f'Distribution of {column}')
    plt.show()

# Annotated heatmap of the pairwise correlations between the subject scores
score_corr = df[numerical_cols].corr()
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(score_corr, cmap='coolwarm', annot=True, ax=ax)
ax.set_title('Correlation Between Scores')
plt.show()

# Share of each gender in the dataset, shown as a pie chart
gender_counts = df['gender'].value_counts()
fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(
    gender_counts,
    labels=gender_counts.index,
    autopct='%1.1f%%',
    startangle=140,
)
ax.set_title('Gender Distribution')
plt.show()

# Math score grouped by a categorical column:
# a box plot by gender, then a violin plot by test preparation course
grouped_plots = (
    (sns.boxplot, 'gender', 'Math Score by Gender'),
    (sns.violinplot, 'test preparation course', 'Math Score by Test Preparation Course'),
)
for draw, group_col, plot_title in grouped_plots:
    fig, ax = plt.subplots(figsize=(8, 6))
    draw(data=df, x=group_col, y='math score', ax=ax)
    ax.set_title(plot_title)
    plt.show()

# -----------------------------
# 6. Preprocessing: Encoding Categorical Variables
# -----------------------------
features = ['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course']
target = 'math score'

# Take an explicit copy of the feature columns: the loop below assigns into X,
# and writing into a bare slice of df raises SettingWithCopyWarning (and leaves
# it ambiguous whether df itself is modified). With .copy(), df keeps its
# original string categories and only X holds the encoded values.
X = df[features].copy()
y = df[target]

# Encode categorical features as integer codes, one encoder per column.
# NOTE(review): integer codes imply an ordering between categories that a
# linear model will treat as meaningful; consider one-hot encoding if that
# is not intended.
label_encoders = {}
for col in features:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le  # Keep the fitted encoder so codes can be mapped back

# -----------------------------
# 7. Train-Test Split
# -----------------------------
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# -----------------------------
# 8. Model Training
# -----------------------------

# fit() returns the estimator itself, so construction and training can be chained.

# Model 1: Linear Regression
lr_model = LinearRegression().fit(X_train, y_train)

# Model 2: Decision Tree Regressor (seeded for reproducible trees)
dt_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# -----------------------------
# 9. Predictions
# -----------------------------
# Score the held-out rows with each model, in the same order as the targets.
y_pred_lr, y_pred_dt = (
    fitted.predict(X_test) for fitted in (lr_model, dt_model)
)

# -----------------------------
# 10. Evaluation
# -----------------------------

# Linear Regression: MSE and R² on the test split, RMSE derived from the MSE
mse_lr, r2_lr = mean_squared_error(y_test, y_pred_lr), r2_score(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)

# Decision Tree: same metrics
mse_dt, r2_dt = mean_squared_error(y_test, y_pred_dt), r2_score(y_test, y_pred_dt)
rmse_dt = np.sqrt(mse_dt)

# Print Evaluation Results (two decimals; a blank line separates the two reports)
print("\nLinear Regression Results:")
print(f"RMSE: {rmse_lr:.2f}", f"R² Score: {r2_lr:.2f}\n", sep="\n")

print("Decision Tree Regressor Results:")
print(f"RMSE: {rmse_dt:.2f}", f"R² Score: {r2_dt:.2f}", sep="\n")

# -----------------------------
# 11. Model Results Visualizations
# -----------------------------

# Actual vs Predicted: one scatter per model, with a dashed red y = x line
# marking where a perfect prediction would fall
predictions_by_model = (
    ('Linear Regression', y_pred_lr),
    ('Decision Tree Regressor', y_pred_dt),
)
for model_label, predicted in predictions_by_model:
    plt.figure(figsize=(8, 5))
    sns.scatterplot(x=y_test, y=predicted)
    plt.title(f'Actual vs Predicted - {model_label}')
    plt.xlabel('Actual Math Score')
    plt.ylabel('Predicted Math Score')
    plt.plot([0, 100], [0, 100], 'r--')
    plt.show()

# Residuals (actual minus predicted) for each model
residuals_lr = y_test - y_pred_lr
residuals_dt = y_test - y_pred_dt

# Residuals Distribution: one 30-bin histogram with a KDE overlay per model
residuals_by_model = (
    ('Linear Regression', residuals_lr),
    ('Decision Tree Regressor', residuals_dt),
)
for model_label, residuals in residuals_by_model:
    plt.figure(figsize=(8, 5))
    sns.histplot(residuals, kde=True, bins=30)
    plt.title(f'Residuals Distribution - {model_label}')
    plt.xlabel('Residuals')
    plt.show()

# Side-by-side model comparison: one bar chart per metric (RMSE, then R²)
models = ['Linear Regression', 'Decision Tree']
rmse_values = [rmse_lr, rmse_dt]
r2_values = [r2_lr, r2_dt]

metric_series = (
    ('RMSE', rmse_values),
    ('R² Score', r2_values),
)
for metric_label, metric_values in metric_series:
    plt.figure(figsize=(8, 5))
    sns.barplot(x=models, y=metric_values)
    plt.title(f'Model {metric_label} Comparison')
    plt.ylabel(metric_label)
    plt.show()
