In [2]:
import pandas as pd
import numpy as np
import pickle
import os
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
import warnings
warnings.filterwarnings('ignore')

# Create directory
tuned_dir = "../models/tunedmodels/"
os.makedirs(tuned_dir, exist_ok=True)

print("üéØ TUNING 3 MODELS")

üéØ TUNING 3 MODELS


In [3]:
# Load the pre-processed data you already have
print("üìä Using previous X, y data...")

# Assuming you have these variables from previous training
# If not, recreate them:
from sklearn.preprocessing import StandardScaler, LabelEncoder

model_df = pd.read_csv("../data/processed/processed_weekly_modeling.csv")
ref_df = pd.read_csv("../data/processed/processed_weekly_reference.csv")

target_name = 'Target_1w_Price'
feature_cols = [col for col in model_df.columns if not col.startswith('Target_')]

X = model_df[feature_cols].copy()
y = model_df[target_name].copy()

train_mask = ref_df['Is_Train'].values
X_train, X_test = X[train_mask].copy(), X[~train_mask].copy()
y_train, y_test = y[train_mask].copy(), y[~train_mask].copy()

# Clean and scale (same as before)
for col in feature_cols:
    try:
        X_train[col] = pd.to_numeric(X_train[col])
        X_test[col] = pd.to_numeric(X_test[col])
    except:
        le = LabelEncoder()
        X_train[col] = le.fit_transform(X_train[col].astype(str))
        X_test[col] = le.transform(X_test[col].astype(str))

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"‚úÖ Data: {X_train.shape[1]} features, {X_train.shape[0]} samples")

üìä Using previous X, y data...
‚úÖ Data: 64 features, 4080 samples


In [4]:
print("\nüîß Tuning XGBoost...")
import xgboost as xgb

xgb_model = xgb.XGBRegressor(random_state=42, n_jobs=-1, verbosity=0)

# Quick parameter grid
xgb_params = {
    'n_estimators': [100, 200],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.05, 0.1, 0.2],
    'subsample': [0.8, 0.9]
}

# Quick random search
xgb_tuned = RandomizedSearchCV(
    xgb_model, xgb_params, n_iter=10, cv=3,
    scoring='neg_mean_absolute_error', random_state=42, n_jobs=-1
)
xgb_tuned.fit(X_train_scaled, y_train)

# Save tuned model
with open(os.path.join(tuned_dir, "xgboost_tuned.pkl"), 'wb') as f:
    pickle.dump(xgb_tuned.best_estimator_, f)

print(f"‚úÖ XGBoost tuned and saved")


üîß Tuning XGBoost...
‚úÖ XGBoost tuned and saved


In [5]:
print("\nüí° Tuning LightGBM...")
import lightgbm as lgb

lgb_model = lgb.LGBMRegressor(random_state=42, n_jobs=-1, verbose=-1)

# Quick parameter grid
lgb_params = {
    'n_estimators': [100, 200],
    'num_leaves': [31, 63],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.05, 0.1, 0.2],
    'subsample': [0.8, 0.9]
}

# Quick random search
lgb_tuned = RandomizedSearchCV(
    lgb_model, lgb_params, n_iter=10, cv=3,
    scoring='neg_mean_absolute_error', random_state=42, n_jobs=-1
)
lgb_tuned.fit(X_train_scaled, y_train)

# Save tuned model
with open(os.path.join(tuned_dir, "lightgbm_tuned.pkl"), 'wb') as f:
    pickle.dump(lgb_tuned.best_estimator_, f)

print(f"‚úÖ LightGBM tuned and saved")


üí° Tuning LightGBM...
‚úÖ LightGBM tuned and saved


In [6]:
print("\nüê± Tuning CatBoost...")
import catboost as cb

# CatBoost has its own hyperparameter tuning - use CatBoost's CV directly
# Instead of RandomizedSearchCV which has compatibility issues

# Define parameter grid for manual tuning
cat_param_combinations = [
    {'iterations': 100, 'depth': 4, 'learning_rate': 0.1, 'l2_leaf_reg': 3},
    {'iterations': 200, 'depth': 6, 'learning_rate': 0.05, 'l2_leaf_reg': 5},
    {'iterations': 150, 'depth': 8, 'learning_rate': 0.2, 'l2_leaf_reg': 1},
    {'iterations': 100, 'depth': 6, 'learning_rate': 0.1, 'l2_leaf_reg': 10},
    {'iterations': 200, 'depth': 4, 'learning_rate': 0.05, 'l2_leaf_reg': 3},
    {'iterations': 150, 'depth': 6, 'learning_rate': 0.15, 'l2_leaf_reg': 5},
]

best_score = float('inf')
best_params = None
best_model = None

print("Testing 6 parameter combinations...")

for i, params in enumerate(cat_param_combinations):
    print(f"  Combination {i+1}/6: {params}")

    # Create model with current parameters
    model = cb.CatBoostRegressor(
        **params,
        random_seed=42,
        verbose=False,
        allow_writing_files=False
    )

    # Simple train-test split for quick evaluation
    from sklearn.model_selection import train_test_split
    X_temp_train, X_temp_val, y_temp_train, y_temp_val = train_test_split(
        X_train_scaled, y_train, test_size=0.2, random_state=42
    )

    # Train and evaluate
    model.fit(X_temp_train, y_temp_train)
    y_pred = model.predict(X_temp_val)
    mae = np.mean(np.abs(y_pred - y_temp_val))

    print(f"    MAE: {mae:.4f}")

    if mae < best_score:
        best_score = mae
        best_params = params
        best_model = model

print(f"\n‚úÖ Best parameters: {best_params}")
print(f"   Best MAE: {best_score:.4f}")

# Retrain best model on full training data
cat_tuned = cb.CatBoostRegressor(
    **best_params,
    random_seed=42,
    verbose=False,
    allow_writing_files=False
)
cat_tuned.fit(X_train_scaled, y_train)

# Save tuned model
with open(os.path.join(tuned_dir, "catboost_tuned.pkl"), 'wb') as f:
    pickle.dump(cat_tuned, f)

print(f"üíæ CatBoost tuned and saved")


üê± Tuning CatBoost...
Testing 6 parameter combinations...
  Combination 1/6: {'iterations': 100, 'depth': 4, 'learning_rate': 0.1, 'l2_leaf_reg': 3}
    MAE: 2.7362
  Combination 2/6: {'iterations': 200, 'depth': 6, 'learning_rate': 0.05, 'l2_leaf_reg': 5}
    MAE: 2.6076
  Combination 3/6: {'iterations': 150, 'depth': 8, 'learning_rate': 0.2, 'l2_leaf_reg': 1}
    MAE: 2.5520
  Combination 4/6: {'iterations': 100, 'depth': 6, 'learning_rate': 0.1, 'l2_leaf_reg': 10}
    MAE: 2.7196
  Combination 5/6: {'iterations': 200, 'depth': 4, 'learning_rate': 0.05, 'l2_leaf_reg': 3}
    MAE: 2.6901
  Combination 6/6: {'iterations': 150, 'depth': 6, 'learning_rate': 0.15, 'l2_leaf_reg': 5}
    MAE: 2.5437

‚úÖ Best parameters: {'iterations': 150, 'depth': 6, 'learning_rate': 0.15, 'l2_leaf_reg': 5}
   Best MAE: 2.5437
üíæ CatBoost tuned and saved


In [7]:
print("\n" + "="*50)
print("‚úÖ ALL 3 MODELS TUNED AND SAVED")
print("="*50)

print(f"\nüìÅ Files in {tuned_dir}:")
for f in os.listdir(tuned_dir):
    print(f"  ‚Ä¢ {f}")

print("\nüéØ Done! Only 3 model files created.")


‚úÖ ALL 3 MODELS TUNED AND SAVED

üìÅ Files in ../models/tunedmodels/:
  ‚Ä¢ catboost_tuned.pkl
  ‚Ä¢ lightgbm_tuned.pkl
  ‚Ä¢ xgboost_tuned.pkl

üéØ Done! Only 3 model files created.
