In [None]:
# 04_model_tuning.ipynb
# Model Tuning & Validation (Optimized to account for limited memory)
import gc
import os
import time

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import classification_report, mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV

# Read the pre-split classification features and labels back from disk.
# The four pickles are loaded in the same order as they are unpacked.
print("Loading datasets...")
X_train_clf, X_test_clf, y_train_clf, y_test_clf = (
    joblib.load(pickle_path)
    for pickle_path in (
        "../data/processed/X_train_clf.pkl",
        "../data/processed/X_test_clf.pkl",
        "../data/processed/y_train_clf.pkl",
        "../data/processed/y_test_clf.pkl",
    )
)

# Report the training matrix dimensions (rows = samples, columns = features)
n_samples = X_train_clf.shape[0]
n_features = X_train_clf.shape[1]
print(f"Dataset size: {n_samples} samples, {n_features} features")

# Classification Model Tuning
print("\nTuning Random Forest Classifier...")
# perf_counter() is a monotonic, high-resolution clock, so the elapsed time
# reported below cannot be skewed by system clock adjustments during the run.
start_time = time.perf_counter()

# Base estimator whose hyperparameters are searched below. n_jobs=2 caps the
# parallelism used inside each individual forest fit.
clf = RandomForestClassifier(
    random_state=42,
    n_jobs=2,
    max_features='sqrt'
)

# Simplified parameter grid: two candidate values per hyperparameter,
# i.e. 16 possible combinations in total.
clf_param_grid = {
    "n_estimators": [50, 100],
    "max_depth": [10, 20],
    "min_samples_split": [5, 10],
    "min_samples_leaf": [2, 4]
}

# Use RandomizedSearchCV: sample 5 of the combinations, score each with
# 2-fold cross-validation on weighted F1. The search evaluates candidates
# sequentially (n_jobs=1) and dispatches one job at a time; presumably this
# keeps peak memory low, in line with the notebook's limited-memory goal.
clf_grid = RandomizedSearchCV(
    clf,
    clf_param_grid,
    n_iter=5,
    cv=2,
    scoring="f1_weighted",
    n_jobs=1,
    random_state=42,
    verbose=2,
    pre_dispatch='1*n_jobs'
)

clf_grid.fit(X_train_clf, y_train_clf)
# Stop the clock as soon as the search returns so the reported duration
# covers the tuning itself and nothing after it.
elapsed_seconds = time.perf_counter() - start_time
best_clf = clf_grid.best_estimator_

print(f"\nClassifier tuning completed in {elapsed_seconds:.2f} seconds")
print("Best classifier parameters:")
print(clf_grid.best_params_)
print(f"Best CV score: {clf_grid.best_score_:.4f}")

# Score the tuned model on the held-out test split and show per-class metrics
y_pred_clf = best_clf.predict(X_test_clf)
print("\nTuned Classification Report:")
tuned_report = classification_report(y_test_clf, y_pred_clf)
print(tuned_report)

# Persist the tuned classifier, creating the output directory if it is missing
models_dir = "../models"
classifier_path = "../models/random_forest_classifier.pkl"
os.makedirs(models_dir, exist_ok=True)
joblib.dump(best_clf, classifier_path)
print("Final classification model saved.")

# Clear memory
# Drop the fitted search object and the test-set predictions, then run a
# collection pass. The tuned model itself stays reachable through best_clf.
# (gc is imported with the other modules at the top of the file.)
del clf_grid, y_pred_clf
gc.collect()

print("\n" + "="*50)
print("All models trained and saved successfully!")