# Modeling
This notebook trains, evaluates, and saves regression models (linear regression and random forest) that predict the `G3` column of the Math dataset.

In [7]:
import sys
from pathlib import Path

# Make the project's local modules importable by putting the sibling "src"
# directory at the front of sys.path.
# NOTE(review): assumes the kernel's working directory sits directly under the
# project root (e.g. a notebooks/ folder) — confirm before running elsewhere.
project_root = Path.cwd().parent
src_path = project_root / "src"
sys.path.insert(0, str(src_path))

from data_loader import load_data
from preprocessing import build_preprocessor
from model import train_model, evaluate_model, save_model
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline



In [8]:
# Load the datasets; only the first returned frame (Math) is modelled here.
# The second return value is unused (the loader's log output suggests it is the
# Portuguese dataset). It is bound to `_por` rather than `_`, because IPython
# uses `_` for the previous cell's output.
mat, _por = load_data()

if mat is None:
    # Fail fast: every later cell dereferences `mat`, so carrying on would only
    # surface as an unrelated AttributeError further down the notebook.
    raise FileNotFoundError("Dataset not found!")

print("Math dataset loaded successfully!")
print("Shape:", mat.shape)

2025-09-29 09:20:48,201 [INFO] Math dataset loaded successfully with shape (395, 33)
2025-09-29 09:20:48,203 [INFO] Portuguese dataset loaded successfully with shape (649, 33)
Math dataset loaded successfully!
Shape: (395, 33)


In [9]:
# Separate the features from the regression target column "G3".
X = mat.drop(columns="G3")
y = mat["G3"]

# Partition the feature columns by dtype for the preprocessor.
# "number" matches every numeric dtype (not only int64/float64), and the
# complementary `exclude` puts every remaining column (object, category, string,
# bool, ...) in the categorical group, so no feature is left out of both lists.
numeric_cols = X.select_dtypes(include="number").columns.tolist()
categorical_cols = X.select_dtypes(exclude="number").columns.tolist()

print("Numeric columns:", len(numeric_cols))
print("Categorical columns:", len(categorical_cols))

Numeric columns: 15
Categorical columns: 17


In [10]:
# Candidate regressors, keyed by the short name that the training loop reuses
# for metric and model file names. The forest's random_state is pinned so
# repeated runs build the same ensemble.
models = dict(
    linear_regression=LinearRegression(),
    random_forest=RandomForestRegressor(n_estimators=100, random_state=42),
)

print("Models to train:", list(models))

Models to train: ['linear_regression', 'random_forest']


In [11]:
# Train, evaluate and persist one pipeline per candidate model.
# `results` maps model name -> the metrics mapping returned by evaluate_model.
METRICS_DIR = "results/metrics"
MODELS_DIR = "results/models"

results = {}

for name, model in models.items():
    print(f"\n=== Training {name} ===")

    # build_preprocessor is called once per model, so each pipeline gets its
    # own preprocessing step in front of the estimator.
    pipeline = Pipeline(
        steps=[
            ("preprocessor", build_preprocessor(numeric_cols, categorical_cols)),
            ("model", model),
        ]
    )

    # train_model hands back the fitted pipeline together with the held-out
    # split that the evaluation below is run on.
    pipeline, X_test, y_test = train_model(X, y, pipeline)

    metrics = evaluate_model(
        pipeline,
        X_test,
        y_test,
        metrics_path=METRICS_DIR,
        dataset_name=f"math_{name}",
    )

    # NOTE(review): per the logged output, save_model also rewrites
    # latest_model.pkl on every call, so "latest" ends up being the model
    # trained last, not necessarily the best-scoring one.
    save_model(pipeline, MODELS_DIR, f"{name}_math.pkl")
    results[name] = metrics

    print(f"{name} training completed!")


=== Training linear_regression ===
2025-09-29 09:20:48,318 [INFO] ✅ Model training completed. Train size: 316, Test size: 79
2025-09-29 09:20:48,347 [INFO] 📊 Model evaluation completed. Metrics saved to: results/metrics\metrics_math_linear_regression.csv
2025-09-29 09:20:48,350 [INFO]    MAE: 1.647, RMSE: 2.378, R²: 0.724


2025-09-29 09:20:48,370 [INFO] [SAVE] Versioned model saved at: results/models\linear_regression_math.pkl
2025-09-29 09:20:48,384 [INFO] [SAVE] Latest model updated at: results/models\latest_model.pkl
linear_regression training completed!

=== Training random_forest ===
2025-09-29 09:20:48,854 [INFO] ✅ Model training completed. Train size: 316, Test size: 79
2025-09-29 09:20:48,901 [INFO] 📊 Model evaluation completed. Metrics saved to: results/metrics\metrics_math_random_forest.csv
2025-09-29 09:20:48,903 [INFO]    MAE: 1.179, RMSE: 1.958, R²: 0.813
2025-09-29 09:20:48,978 [INFO] [SAVE] Versioned model saved at: results/models\random_forest_math.pkl
2025-09-29 09:20:49,050 [INFO] [SAVE] Latest model updated at: results/models\latest_model.pkl
random_forest training completed!


In [12]:
# Print each model's metrics: plain numbers are shown with four decimals,
# anything else (dataset tag, timestamp, ...) is printed as-is.
print("\n=== Model Results ===")
for model_name, metrics in results.items():
    print(f"\n{model_name}:")
    for metric, value in metrics.items():
        shown = f"{value:.4f}" if isinstance(value, (int, float)) else value
        print(f"  {metric}: {shown}")



=== Model Results ===

linear_regression:
  mae: 1.6470
  mse: 5.6570
  rmse: 2.3780
  r2: 0.7240
  dataset: math_linear_regression
  timestamp: 2025-09-29 09:20:48

random_forest:
  mae: 1.1790
  mse: 3.8350
  rmse: 1.9580
  r2: 0.8130
  dataset: math_random_forest
  timestamp: 2025-09-29 09:20:48
