In [3]:
# model_training.ipynb (Jupyter Notebook structure as .py script)

# --------------------------
# Part 1: Data Preprocessing
# --------------------------
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Read the raw table; the CSV is expected in the current working directory.
df = pd.read_csv("StudentsPerformance.csv")

# String-valued columns that are replaced in place by integer codes.
label_cols = [
    'gender',
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'test preparation course',
]

# One encoder per column, keyed by column name and kept after fitting so
# the integer codes can be mapped back to the original labels later.
label_encoders = {col: LabelEncoder() for col in label_cols}
for col, le in label_encoders.items():
    df[col] = le.fit_transform(df[col])

# --------------------------
# Part 2: Linear Regression
# --------------------------
def train_linear_regression(X, y, test_size=0.2, random_state=42):
    """Fit a linear regression on a train split of (X, y) and predict the held-out rows.

    The split is derived from the arguments rather than from module-level
    variables, so the function can be called on its own with any feature
    matrix and target.

    Args:
        X: Feature matrix (all rows, before splitting).
        y: Target values aligned with the rows of ``X``.
        test_size: Fraction of rows held out for prediction.
        random_state: Seed for the split.

    Returns:
        Predictions for the held-out rows, in the order ``train_test_split``
        returns them.
    """
    # The defaults match the split made by the driver loop in this file, so
    # the predictions line up row-for-row with that loop's ``y_test``.
    X_fit, X_holdout, y_fit, _ = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    model = LinearRegression()
    model.fit(X_fit, y_fit)
    return model.predict(X_holdout)

# --------------------------
# Part 3: Random Forest Regression
# --------------------------
def train_random_forest(X, y, test_size=0.2, random_state=42):
    """Fit a 100-tree random forest on a train split of (X, y) and predict the held-out rows.

    The split is derived from the arguments rather than from module-level
    variables, so the function can be called on its own with any feature
    matrix and target.

    Args:
        X: Feature matrix (all rows, before splitting).
        y: Target values aligned with the rows of ``X``.
        test_size: Fraction of rows held out for prediction.
        random_state: Seed for the split.

    Returns:
        Predictions for the held-out rows, in the order ``train_test_split``
        returns them.
    """
    # The defaults match the split made by the driver loop in this file, so
    # the predictions line up row-for-row with that loop's ``y_test``.
    X_fit, X_holdout, y_fit, _ = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    # The forest keeps its own fixed seed (42) regardless of the split seed,
    # so the fitted model is reproducible for a given training set.
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_fit, y_fit)
    return model.predict(X_holdout)

# --------------------------
# Part 4: Evaluation Function
# --------------------------
def evaluate_model(y_true, y_pred, model_name, target_name):
    """Print MAE, MSE and R2 for one model/target pair.

    Args:
        y_true: Reference target values.
        y_pred: Model predictions aligned with ``y_true``.
        model_name: Label for the model, shown in the header line.
        target_name: Label for the target, shown in the header line.

    Returns:
        None. The metrics are written to stdout, followed by a blank line.
    """
    # Target names used in this file already end in "score" (e.g.
    # "math score"); append the word only when it is missing so the header
    # does not read "math score score".
    label = str(target_name)
    if not label.strip().lower().endswith("score"):
        label = f"{label} score"

    print(f"===== {model_name} Evaluation using {label} =====")
    print("MAE:", mean_absolute_error(y_true, y_pred))
    print("MSE:", mean_squared_error(y_true, y_pred))
    print("R2 Score:", r2_score(y_true, y_pred))
    print()

# --------------------------
# Part 5: Run for each target
# --------------------------
targets = ['math score', 'reading score', 'writing score']

# All three score columns are removed from the features for every target,
# so the feature matrix is identical on each pass; build it once.
X = df.drop(columns=targets)

for target in targets:
    y = df[target]
    # Fixed seed and test fraction: the same rows are held out on every
    # rerun. The split variables are deliberately bound at module scope.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Linear Regression
    y_pred_lr = train_linear_regression(X, y)
    evaluate_model(y_test, y_pred_lr, "Linear Regression", target)

    # Random Forest
    y_pred_rf = train_random_forest(X, y)
    evaluate_model(y_test, y_pred_rf, "Random Forest", target)


===== Linear Regression Evaluation using math score score =====
MAE: 11.224583244917547
MSE: 202.87356439155698
R2 Score: 0.16629038248152794

===== Random Forest Evaluation using math score score =====
MAE: 12.371250865682617
MSE: 246.79812300227644
R2 Score: -0.014217743694691976

===== Linear Regression Evaluation using reading score score =====
MAE: 11.047403516657988
MSE: 196.52842652573906
R2 Score: 0.1314980253243776

===== Random Forest Evaluation using reading score score =====
MAE: 11.91545585301048
MSE: 233.92864671312987
R2 Score: -0.03378170372183176

===== Linear Regression Evaluation using writing score score =====
MAE: 10.55520025914006
MSE: 192.73138710353217
R2 Score: 0.20034102534258358

===== Random Forest Evaluation using writing score score =====
MAE: 11.365521771346534
MSE: 220.40921122228065
R2 Score: 0.08550337078008452

