In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [None]:
# Location of the semicolon-delimited student dataset.
path = "student-math.csv"  # or the actual file path

# Load every column as text; numeric columns are cleaned and converted in a
# later cell.
df = pd.read_csv(
    path,
    sep=";",
    quotechar='"',
    dtype=str,
)

In [None]:
# Show the first three records transposed, so each column is printed as a row.
print(df.head(3).transpose())

In [None]:
def clean_numeric_columns(df, numeric_cols):
    """Convert the given columns of ``df`` to numeric values, in place.

    Each listed column is cast to text, stripped of double-quote characters
    and surrounding whitespace, and then parsed with ``pd.to_numeric``.
    Values that cannot be parsed become NaN (``errors="coerce"``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.
    numeric_cols : iterable of str
        Names of the columns to convert. Every name must exist in ``df``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for name in numeric_cols:
        as_text = df[name].astype(str)
        # Drop literal quote characters, then trim surrounding whitespace.
        tidy = as_text.str.replace('"', '', regex=False).str.strip()
        df[name] = pd.to_numeric(tidy, errors="coerce")
    return df

# Columns that are parsed as numbers; everything else stays as text.
numeric_cols = [
    "age",
    "Medu",
    "Fedu",
    "traveltime",
    "studytime",
    "failures",
    "famrel",
    "freetime",
    "goout",
    "Dalc",
    "Walc",
    "health",
    "absences",
    "G1",
    "G2",
    "G3",
]

# clean_numeric_columns mutates df and returns the same object.
df = clean_numeric_columns(df, numeric_cols)


In [None]:
# Text columns whose values are normalised: lower-cased, stripped of
# double-quote characters and trimmed of surrounding whitespace.
bool_cols = ["schoolsup", "famsup", "paid", "activities", "nursery", "higher", "internet", "romantic", "Pstatus"]
for c in bool_cols:
    if c in df.columns:
        raw = df[c]
        cleaned = raw.astype(str).str.lower().str.replace('"', '', regex=False).str.strip()
        # astype(str) turns missing values into the literal string "nan".
        # Restore them as real NaN so the missing-value report and the
        # imputation cells can still detect them.
        df[c] = cleaned.where(raw.notna())

In [None]:
# Count the missing entries in each column and print them under a heading.
missing_per_column = df.isnull().sum()
print("\nMissing values per column:\n", missing_per_column)

In [None]:
# Rows without a final grade (G3, used later as the regression target y)
# are removed. The explicit .copy() gives an independent frame, so the
# column assignments in the following cells do not trigger
# SettingWithCopyWarning.
df = df.dropna(subset=["G3"]).copy()

In [None]:
# Fill remaining gaps in the numeric columns with each column's median.
# The result is assigned back rather than using fillna(inplace=True) on
# df[c]: the chained in-place form acts on a temporary Series and leaves df
# unchanged under pandas Copy-on-Write.
for c in numeric_cols:
    if c in df.columns:
        df[c] = df[c].fillna(df[c].median())


In [None]:
# Fill gaps in the object-dtype (text) columns with each column's most
# frequent value.
cat_cols = df.select_dtypes(include="object").columns.tolist()
for c in cat_cols:
    modes = df[c].mode()
    if modes.empty:
        # Column has no non-missing values, so there is nothing to impute with.
        continue
    # Assign back instead of chained fillna(inplace=True), which does not
    # reliably update df (it is a no-op under pandas Copy-on-Write).
    df[c] = df[c].fillna(modes.iloc[0])

In [None]:
# Report the (rows, columns) size of the frame after cleaning and imputation.
print(df.shape)


In [None]:
# Convert every text column collected in cat_cols to the pandas "category"
# dtype in a single astype call.
df = df.astype({name: "category" for name in cat_cols})

In [None]:
# Pairwise correlation between the numeric columns.
num_df = df[numeric_cols].copy()
corr = num_df.corr()

# Create, draw and show the figure in one cell. With the inline backend each
# cell finalises its own figures, so splitting these steps across cells
# renders an empty 9x7 figure and draws the heatmap on a separate
# default-size one.
fig, ax = plt.subplots(figsize=(9, 7))
sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", square=True, ax=ax)
ax.set_title("Numeric Feature Correlation Matrix")
plt.show()

In [None]:
# Histogram of the final grade with a kernel density estimate overlay,
# drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(df["G3"], kde=True, ax=ax)
ax.set_title("Distribution of final grade (G3)")
ax.set_xlabel("G3")
plt.show()


In [None]:
# Two side-by-side scatter plots: each earlier grade (G1, G2) against the
# final grade G3.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
panels = (("G1", "G1 vs G3"), ("G2", "G2 vs G3"))
for ax, (x_col, panel_title) in zip(axes, panels):
    sns.scatterplot(x=x_col, y="G3", data=df, ax=ax)
    ax.set_title(panel_title)
plt.show()

In [None]:
# Predictor column for the single-feature design matrix (X_simple).
simple_feature = ["G1"]

# Predictor columns for the multi-feature design matrix (X_multi).
multi_features = [
    "G1",
    "G2",
    "studytime",
    "failures",
]


In [None]:
# Fail fast, naming the first requested feature column that is absent from df.
for col in (*simple_feature, *multi_features):
    if col in df.columns:
        continue
    raise KeyError(f"Expected column '{col}' not found in dataframe")

# Selecting with a list already yields a 2-D array; reshape(-1, 1) forces a
# single-column shape, which matches it for the one-feature case.
X_simple = df[simple_feature].to_numpy().reshape(-1, 1)
X_multi = df[multi_features].to_numpy()

# Regression target: the final grade.
y = df["G3"].to_numpy()



In [None]:
# Train:test ratios to evaluate, keyed by a readable label.
# NOTE(review): the values look like test-set fractions for
# train_test_split(test_size=...); their use is outside this chunk, so confirm.
splits = {
    "80:20": 0.2,
    "70:30": 0.3,
    "60:40": 0.4,
}

# Seed value; its consumers are not visible in this chunk.
random_state = 42