# 04 - Model Training

Train and compare multiple ML models (linear regression, random forest, gradient boosting) for sales forecasting, then save the best one.

In [None]:
import pandas as pd
import numpy as np
import pickle
import sys
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression

sys.path.append('../src')
from train import train_model, save_model
from evaluate import calculate_metrics, print_metrics

## Load Featured Dataset

In [None]:
# Read the feature-engineered table from disk and take a first look at it.
df = pd.read_csv('../data/processed/featured_dataset.csv')

dataset_shape = df.shape
column_names = list(df.columns)
print(f"Dataset shape: {dataset_shape}")
print(f"Columns: {column_names}")
print(f"\nFirst rows:")
# Last expression of the cell, so the notebook renders the preview.
df.head()

## Prepare Data

In [None]:
# Column the models learn to predict; every other column is used as a feature.
target_col = 'quantity'  # Target variable for sales forecasting

# Pull out the target series, then keep the remaining columns as the feature matrix.
y = df[target_col]
X = df.drop(target_col, axis=1)

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Target statistics:")
target_summary = y.describe()
print(target_summary)

## Train-Test Split

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): train_test_split shuffles rows by default - if the dataset is
# time-ordered, confirm that a random (non-chronological) split is intended.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report absolute sizes and their share of the full dataset.
n_total = len(df)
n_train = X_train.shape[0]
n_test = X_test.shape[0]
print(f"Train size: {n_train} samples")
print(f"Test size: {n_test} samples")
print(f"Training ratio: {n_train / n_total * 100:.1f}%")
print(f"Test ratio: {n_test / n_total * 100:.1f}%")

## Feature Scaling

In [None]:
import os

# Standardise the features. The scaler is fitted on the training rows only and
# then applied unchanged to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("✓ Features scaled")
# Aggregate mean/std over the whole scaled training array (sanity check).
print(f"Train mean: {X_train_scaled.mean():.6f}")
print(f"Train std: {X_train_scaled.std():.6f}")

# Save scaler
scaler_path = '../models/scaler.pkl'
# Create the output directory first so open() does not fail when it is missing.
os.makedirs(os.path.dirname(scaler_path), exist_ok=True)
with open(scaler_path, 'wb') as f:
    pickle.dump(scaler, f)
print("✓ Scaler saved")

## Train Models

In [None]:
# Fit each candidate regressor on the scaled training data, register it in
# `models` under a short key, and print its metrics on the held-out test set.
models = {}

print("Training models...\n")

# 1. Linear Regression
print("1. Linear Regression...")
# fit() returns the estimator itself, so construction and fitting can be chained.
lr_model = LinearRegression().fit(X_train_scaled, y_train)
models['linear_regression'] = lr_model
y_pred_lr = lr_model.predict(X_test_scaled)
lr_metrics = calculate_metrics(y_test, y_pred_lr)
print_metrics(lr_metrics)

# 2. Random Forest
print("\n2. Random Forest Regressor...")
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
).fit(X_train_scaled, y_train)
models['random_forest'] = rf_model
y_pred_rf = rf_model.predict(X_test_scaled)
rf_metrics = calculate_metrics(y_test, y_pred_rf)
print_metrics(rf_metrics)

# 3. Gradient Boosting
print("\n3. Gradient Boosting Regressor...")
gb_model = GradientBoostingRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train_scaled, y_train)
models['gradient_boosting'] = gb_model
y_pred_gb = gb_model.predict(X_test_scaled)
gb_metrics = calculate_metrics(y_test, y_pred_gb)
print_metrics(gb_metrics)

## Model Comparison

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Score every trained model on the same held-out test set.
results = {}
for name, model in models.items():
    y_pred = model.predict(X_test_scaled)
    results[name] = {
        'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
        'MAE': mean_absolute_error(y_test, y_pred),
        'R2': r2_score(y_test, y_pred)
    }

# Transpose so that each row is a model and each column a metric.
results_df = pd.DataFrame(results).T
print("\nModel Comparison:")
print(results_df)

# The model with the highest R2 on the test set is selected.
# NOTE(review): the winner is chosen on the same test set it is scored on, so
# the reported score is likely optimistic - consider a separate validation split.
best_model_name = results_df['R2'].idxmax()
print(f"\n🏆 Best Model: {best_model_name}")
best_model = models[best_model_name]

## Save Best Model

In [None]:
import os

# Save the best model
model_path = '../models/best_model.pkl'
# Create the output directory first so open() does not fail when it is missing.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
with open(model_path, 'wb') as f:
    pickle.dump(best_model, f)

# Summarise what was saved, using the metrics row of the selected model.
print(f"✓ Best model saved: {model_path}")
print(f"  Model type: {type(best_model).__name__}")
print(f"  R² Score: {results_df.loc[best_model_name, 'R2']:.4f}")
print(f"  RMSE: {results_df.loc[best_model_name, 'RMSE']:.4f}")
print(f"  MAE: {results_df.loc[best_model_name, 'MAE']:.4f}")

## Feature Importance

In [None]:
# Only some estimators expose `feature_importances_`, so guard on the
# attribute before trying to rank features.
if hasattr(best_model, 'feature_importances_'):
    import os

    importances = best_model.feature_importances_
    # Pair each importance with its column name and rank from most to least important.
    feature_importance_df = pd.DataFrame({
        'feature': X.columns,
        'importance': importances
    }).sort_values('importance', ascending=False)

    print("\nTop 10 Important Features:")
    print(feature_importance_df.head(10))

    # Plot
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots(figsize=(10, 6))
    feature_importance_df.head(10).plot(x='feature', y='importance', kind='barh', ax=ax)
    # barh draws the first row at the bottom; flip the axis so the most
    # important feature appears at the top of the chart.
    ax.invert_yaxis()
    plt.title('Top 10 Feature Importances')
    plt.tight_layout()

    report_path = '../reports/feature_importance.png'
    # Create the output directory first so savefig() does not fail when it is missing.
    os.makedirs(os.path.dirname(report_path), exist_ok=True)
    plt.savefig(report_path, dpi=100)
    plt.show()
else:
    print("Selected model does not have feature importances")