In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import GridSearchCV,train_test_split,cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor

In [2]:
!pip install pyxlsb



In [3]:
!pip install xlrd



In [4]:
import pandas as pd
import numpy as np
import warnings

# Silence the two numpy comparison warnings that pandas triggers on NaN-heavy data
for noisy_message in ('invalid value encountered in greater',
                      'invalid value encountered in less'):
    warnings.filterwarnings('ignore', message=noisy_message)

# Also silence every RuntimeWarning raised from pandas' formatting module
warnings.filterwarnings('ignore', category=RuntimeWarning, module='pandas.io.formats.format')

# Load the raw workbook
df = pd.read_excel('rawData.xlsx')

print("Data loaded successfully!")
print(f"Shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

# ---- Data quality report ----
print("\n" + "=" * 50)
print("DATA QUALITY CHECK:")

# Missing values: overall total, then one line per affected column
null_counts = df.isnull().sum()
print(f"Total missing values: {null_counts.sum()}")
print("Columns with missing values:")
for column_name, n_missing in null_counts[null_counts > 0].items():
    print(f"  - {column_name}: {n_missing} missing")

# Declared dtype of every column
print("\nData types:")
for column_name, column_dtype in zip(df.columns, df.dtypes):
    print(f"  - {column_name}: {column_dtype}")

# Object columns whose non-null values are mostly plain decimal numbers
print("\nChecking for mixed data types:")
for column_name in df.columns:
    if df[column_name].dtype != 'object':
        continue  # only string-typed columns can hide numbers

    text_values = df[column_name].dropna().astype(str).str.strip()
    if text_values.empty:
        continue

    looks_numeric = text_values.str.match(r'^-?\d+\.?\d*$')
    if not looks_numeric.any():
        continue

    n_numeric = looks_numeric.sum()
    n_total = len(text_values)
    if n_numeric > n_total * 0.5:  # majority of the values parse as numbers
        print(f"  - {column_name}: Appears to be numeric but stored as object ({n_numeric}/{n_total} numeric)")

# Safe display function that handles problematic data
def safe_display(df, n_rows=5):
    """Return a printable copy of the first ``n_rows`` rows of ``df``.

    In float64/int64 columns, +inf and -inf are swapped for the strings
    'inf' and '-inf'. The input frame is never modified. If anything goes
    wrong while building the copy, the error is printed and the plain
    ``df.head(n_rows)`` is returned instead.
    """
    numeric_kinds = ('float64', 'int64')
    infinities = [np.inf, -np.inf]
    labels = ['inf', '-inf']

    try:
        preview = df.head(n_rows).copy()
        for name in preview.columns:
            if preview[name].dtype not in numeric_kinds:
                continue
            preview[name] = preview[name].replace(infinities, labels)
    except Exception as e:
        print(f"Display error: {e}")
        return df.head(n_rows)

    return preview

print("\n" + "=" * 50)
print("FIRST FEW ROWS (safe display):")
display_data = safe_display(df)
print(display_data)

# ---- Coerce measurement / money columns to numeric ----
print("\n" + "=" * 50)
print("CLEANING NUMERIC COLUMNS:")

# A column is treated as numeric when its name mentions one of these words
numeric_keywords = ('price', 'cost', 'amount', 'width', 'height', 'area')
numeric_cols = [
    name for name in df.columns
    if any(keyword in name.lower() for keyword in numeric_keywords)
]

if numeric_cols:
    print(f"Found potential numeric columns: {numeric_cols}")

    for col in numeric_cols:
        print(f"\nCleaning column: {col}")
        original_type = df[col].dtype

        try:
            # Unparseable entries become NaN instead of raising
            df[col] = pd.to_numeric(df[col], errors='coerce')
        except Exception as e:
            print(f"  - Could not convert {col}: {e}")
        else:
            print(f"  - Converted from {original_type} to {df[col].dtype}")
            print(f"  - NaN values after conversion: {df[col].isnull().sum()}")

# ---- Final summary ----
print("\n" + "=" * 50)
print("FINAL DATA SUMMARY:")
print(f"Shape: {df.shape}")
memory_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
print(f"Memory usage: {memory_mb:.2f} MB")
print(f"Total missing values: {df.isnull().sum().sum()}")

# Descriptive statistics for whatever ended up numeric
numeric_columns = df.select_dtypes(include=[np.number]).columns
if not numeric_columns.empty:
    print("\nNumeric columns summary:")
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        print(df[numeric_columns].describe())

print("\nWarnings should now be suppressed!")
print("\nYour dataframe is ready to use: 'df'")

Data loaded successfully!
Shape: (29668, 19)
Columns: ['Order ID', 'Order Date', 'Order Added in Month Tab', 'Account Name', 'Sign Type', 'Sign Type Broad Category', 'Month (AT)', 'Sign Width (in)', 'Sign Height (in)', 'Selling Price (USD)', 'Withdrawal Amount (USD)', 'Project Name', 'Status', 'Production Line', 'BOM - Material Cost (PKR) - Calculated By Ali Hassan', '📙 BOM - Production Cost (USD)', '📙 BOM - Shipping Cost (USD)', 'Sign Area (sq.ft)', 'Length of Curve (m)']

DATA QUALITY CHECK:
Total missing values: 115117
Columns with missing values:
  - Order ID: 1 missing
  - Order Date: 420 missing
  - Order Added in Month Tab: 5 missing
  - Account Name: 4 missing
  - Sign Type: 24831 missing
  - Sign Type Broad Category: 24826 missing
  - Month (AT): 2 missing
  - Sign Width (in): 731 missing
  - Sign Height (in): 802 missing
  - Selling Price (USD): 669 missing
  - Withdrawal Amount (USD): 29668 missing
  - Project Name: 274 missing
  - Status: 1001 missing
  - Production Line: 1

In [5]:
# Per-column dtype and non-null count overview of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29668 entries, 0 to 29667
Data columns (total 19 columns):
 #   Column                                                Non-Null Count  Dtype         
---  ------                                                --------------  -----         
 0   Order ID                                              29667 non-null  object        
 1   Order Date                                            29248 non-null  datetime64[ns]
 2   Order Added in Month Tab                              29663 non-null  datetime64[ns]
 3   Account Name                                          29664 non-null  object        
 4   Sign Type                                             4837 non-null   object        
 5   Sign Type Broad Category                              4842 non-null   object        
 6   Month (AT)                                            29666 non-null  datetime64[ns]
 7   Sign Width (in)                                       28937 non-null  float6

In [6]:
# Report the frame's dimensions.
row_count, column_count = df.shape
print(f'The number of rows are {row_count} and columns are {column_count}')

The number of rows are 29668 and columns are 19


In [7]:
# List the column labels of the loaded frame.
df.columns

Index(['Order ID', 'Order Date', 'Order Added in Month Tab', 'Account Name',
       'Sign Type', 'Sign Type Broad Category', 'Month (AT)',
       'Sign Width (in)', 'Sign Height (in)', 'Selling Price (USD)',
       'Withdrawal Amount (USD)', 'Project Name', 'Status', 'Production Line',
       'BOM - Material Cost (PKR) - Calculated By Ali Hassan',
       '📙 BOM - Production Cost (USD)', '📙 BOM - Shipping Cost (USD)',
       'Sign Area (sq.ft)', 'Length of Curve (m)'],
      dtype='object')

In [8]:
# Preview the first five rows.
df.head(5)

Unnamed: 0,Order ID,Order Date,Order Added in Month Tab,Account Name,Sign Type,Sign Type Broad Category,Month (AT),Sign Width (in),Sign Height (in),Selling Price (USD),Withdrawal Amount (USD),Project Name,Status,Production Line,BOM - Material Cost (PKR) - Calculated By Ali Hassan,📙 BOM - Production Cost (USD),📙 BOM - Shipping Cost (USD),Sign Area (sq.ft),Length of Curve (m)
0,BS-ET-7833 A,2024-12-30,2024-12-30,ArtfulAdornmentz,Metal - With Backlit,Halo lit Chanel letter sign,2025-01-01,72.0,10.9,550.0,,ETSY Project,Shipped,Business Sign,,207.24,205.81,5,
1,BS-ET-7835,2024-12-31,2024-12-30,COMUNITYTreasures,Blade Sign,Blade Sign,2025-01-01,24.0,24.0,340.0,,ETSY Project,Shipped,Business Sign,,0.0,125.45,4,
2,BS-ET-7837,2024-12-31,2024-12-30,PurelyCraftByDimo,Metal on Acrylic,Flatcut Letters,2025-01-01,44.0,20.8,360.0,,ETSY Project,Shipped,Business Sign,,67.58,74.16,6,
3,BS-ET-7833 B,2024-12-30,2024-12-30,ArtfulAdornmentz,Metal - With Backlit,Halo lit Chanel letter sign,2025-01-01,32.4,36.0,700.0,,ETSY Project,Shipped,Business Sign,,370.56,205.81,8,
4,BS-SM-7798 B,2024-12-24,2025-01-01,Signmakerz-Ads,3D Metal with UV Printed,Halo lit Chanel letter sign,2025-01-01,90.0,36.0,1251.0,,Google Ads Project,Shipped,Business Sign,,268.85,185.47,23,


In [9]:
# Count the missing values in each column.
df.isnull().sum()

Order ID                                                    1
Order Date                                                420
Order Added in Month Tab                                    5
Account Name                                                4
Sign Type                                               24831
Sign Type Broad Category                                24826
Month (AT)                                                  2
Sign Width (in)                                           731
Sign Height (in)                                          802
Selling Price (USD)                                       669
Withdrawal Amount (USD)                                 29668
Project Name                                              274
Status                                                   1001
Production Line                                             1
BOM - Material Cost (PKR) - Calculated By Ali Hassan    29668
📙 BOM - Production Cost (USD)                              17
📙 BOM - 

In [10]:
# Remove fully duplicated rows (first occurrence is kept; index labels are not renumbered).
df = df.drop_duplicates()

In [11]:
# Summary statistics for every column, numeric and non-numeric alike.
df.describe(include='all')

Unnamed: 0,Order ID,Order Date,Order Added in Month Tab,Account Name,Sign Type,Sign Type Broad Category,Month (AT),Sign Width (in),Sign Height (in),Selling Price (USD),Withdrawal Amount (USD),Project Name,Status,Production Line,BOM - Material Cost (PKR) - Calculated By Ali Hassan,📙 BOM - Production Cost (USD),📙 BOM - Shipping Cost (USD),Sign Area (sq.ft),Length of Curve (m)
count,29666,29247,29662,29663,4837,4842,29665,28937.0,28866.0,28998.0,0.0,29393,28666,29666,0.0,29651.0,28357.0,29667.0,24236.0
unique,29609,,,164,119,38,,,,,,7,130,2,,,,,
top,BS-QS-8593 B Re,,,ManhattanNeons,Metal - With Backlit,Halo lit Chanel letter sign,,,,,,ETSY Project,Shipped,Neon Sign,,,,,
freq,3,,,6955,1024,1379,,,,,,22359,21918,24794,,,,,
mean,,2025-04-28 20:16:31.117037824,2025-05-01 11:36:15.632122112,,,,2025-04-16 20:08:03.964267776,31.689671,12.906835,303.247293,,,,,,75.739426,62.831426,3.323524,3.312394
min,,2022-11-29 00:00:00,2022-11-29 00:00:00,,,,2022-11-01 00:00:00,0.0,0.0,-3619.0,,,,,,0.0,0.0,0.0,0.15
25%,,2025-02-20 00:00:00,2025-02-22 00:00:00,,,,2025-02-01 00:00:00,20.0,6.0,124.0,,,,,,25.0,29.51,1.0,1.84
50%,,2025-04-28 00:00:00,2025-04-30 00:00:00,,,,2025-04-01 00:00:00,28.0,8.5,179.0,,,,,,32.55,37.36,2.0,2.7
75%,,2025-07-09 00:00:00,2025-07-11 00:00:00,,,,2025-07-01 00:00:00,38.0,14.4,270.0,,,,,,53.02,60.25,3.0,3.87
max,,2025-12-31 00:00:00,2025-08-30 00:00:00,,,,2025-08-01 00:00:00,600.4,905.0,45000.0,,,,,,4453.31,6163.43,393.0,347.0


In [12]:
pip install skopt

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement skopt (from versions: none)
ERROR: No matching distribution found for skopt


In [13]:
#!/usr/bin/env python3
"""
train_models.py

Converted from Kaggle notebook to run locally in VS Code.

Usage:
    python train_models.py --data data.csv --models_dir ./models
"""

import argparse
import os
import warnings
import pickle
import joblib
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.preprocessing import MaxAbsScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from skopt import BayesSearchCV
import xgboost as xgb
import lightgbm as lgb

# Blanket filter: ignores every warning category for the rest of the process,
# including any raised by the libraries imported above.
warnings.filterwarnings('ignore')

def _slug(model_name):
    """Turn a display name such as 'Random Forest' into the file-name stem 'random_forest'."""
    return model_name.replace(' ', '_').lower()


def _model_specs():
    """Return the tuning recipe for each of the nine regressors.

    Each entry is a tuple ``(name, heading, tag, estimator, search_space, n_iter)``:
    ``name`` is the key used in the results dict and in the metric printouts,
    ``heading`` is the label used in the section banner, ``tag`` is the
    abbreviation used in the "Best ... Parameters" line, and ``n_iter`` is the
    full (non-reduced) number of BayesSearchCV iterations.
    """
    forest_space = {
        'n_estimators': (10, 200),
        'max_depth': (3, 20),
        'min_samples_split': (2, 15),
        'min_samples_leaf': (1, 10),
        'max_features': ['sqrt', 'log2', None],
    }
    boosted_space = {
        'n_estimators': (50, 300),
        'max_depth': (3, 10),
        'learning_rate': (0.01, 0.3),
        'subsample': (0.8, 1.0),
        'colsample_bytree': (0.8, 1.0),
        # Float bounds on purpose: skopt turns an all-integer tuple such as (0, 1)
        # into an Integer dimension, which would only ever try the values 0 and 1.
        'reg_alpha': (0.0, 1.0),
        'reg_lambda': (0.0, 1.0),
    }
    return [
        ('Random Forest', 'Random Forest', 'RF',
         RandomForestRegressor(random_state=42), dict(forest_space), 50),
        ('Gradient Boosting', 'Gradient Boosting', 'GB',
         GradientBoostingRegressor(random_state=42),
         {
             'n_estimators': (50, 300),
             'max_depth': (3, 10),
             'learning_rate': (0.01, 0.3),
             'min_samples_split': (2, 20),
             'min_samples_leaf': (1, 10),
             'subsample': (0.8, 1.0),
         }, 50),
        ('SVR', 'SVR', 'SVR',
         SVR(kernel='rbf'),
         {
             'C': (1, 1000),
             'gamma': (0.001, 1),
             'epsilon': (0.01, 1),
         }, 50),
        ('Decision Tree', 'Decision Tree', 'DT',
         DecisionTreeRegressor(random_state=42),
         {
             'max_depth': (3, 20),
             'min_samples_split': (2, 20),
             'min_samples_leaf': (1, 10),
             'max_features': ['sqrt', 'log2', None],
         }, 50),
        ('XGBoost', 'XGBoost', 'XGB',
         xgb.XGBRegressor(random_state=42, eval_metric='rmse'), dict(boosted_space), 50),
        ('LightGBM', 'LightGBM', 'LGB',
         lgb.LGBMRegressor(random_state=42, verbose=-1),
         dict(boosted_space, num_leaves=(10, 100)), 50),
        ('Extra Trees', 'Extra Trees', 'ET',
         ExtraTreesRegressor(random_state=42), dict(forest_space), 50),
        ('Ridge', 'Ridge Regression', 'Ridge',
         Ridge(random_state=42), {'alpha': (0.1, 100)}, 30),
        ('ElasticNet', 'ElasticNet', 'ElasticNet',
         ElasticNet(random_state=42), {'alpha': (0.1, 10), 'l1_ratio': (0.1, 0.9)}, 30),
    ]


def _tune_and_evaluate(name, tag, estimator, space, n_iter, cv,
                       X_train, X_test, y_train, y_test):
    """Tune one estimator with BayesSearchCV, print its test metrics and return them.

    Returns a dict with keys 'MAE', 'MSE', 'RMSE', 'R2' (computed on the test
    split) and 'model' (the refitted best estimator).
    """
    search = BayesSearchCV(
        estimator=estimator,
        search_spaces=space,
        n_iter=n_iter,
        cv=cv,
        scoring='r2',
        random_state=42,
        n_jobs=-1
    )

    print(f"Training {name}...")
    search.fit(X_train, y_train)
    print(f"Best {tag} Parameters:", search.best_params_)

    predictions = search.best_estimator_.predict(X_test)
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, predictions)

    print(f"{name} Test Metrics:")
    print(f"  MAE: ${mae:.2f}")
    print(f"  MSE: {mse:.2f}")
    print(f"  RMSE: ${rmse:.2f}")
    print(f"  R² Score: {r2:.3f}")

    return {'MAE': mae, 'MSE': mse, 'RMSE': rmse, 'R2': r2, 'model': search.best_estimator_}


def main(data_path: str, models_dir: str, reduce_search: bool):
    """Impute, train nine regressors for shipping cost, compare them and save artefacts.

    Steps: load the CSV, rename the size/shipping columns, KNN-impute the
    modelling columns, split 80/20, scale features with MaxAbsScaler, tune
    each model with BayesSearchCV, print a comparison, and write models plus
    scaler / feature names / results into ``models_dir``.

    Args:
        data_path: Path of the CSV file to load.
        models_dir: Directory that receives the saved files (created if absent).
        reduce_search: When True, halve the number of search iterations
            (never below 10) and use 3 instead of 5 CV folds.

    Raises:
        RuntimeError: If the CSV cannot be read.
        ValueError: If the shipping-cost target has no observed values, or no
            feature column has any observed value.
    """
    # Ensure models directory exists
    os.makedirs(models_dir, exist_ok=True)

    # Load dataset (CSV expected). If different format, replace accordingly.
    try:
        df = pd.read_csv(data_path)
    except Exception as e:
        # Chain the original exception so the real cause stays in the traceback.
        raise RuntimeError(f"Failed to load data from '{data_path}'. Error: {e}") from e
    print(f"Loaded dataset: {data_path} (shape: {df.shape})")

    # === Rename columns to simple names (as in original Kaggle code) ===
    df_clean = df.rename(columns={
        'Sign Width (in)': 'width',
        'Sign Height (in)': 'height',
        'Depth': 'depth',
        '📙 BOM - Shipping Cost (USD)': 'shipping_cost'
    })
    print("Renamed columns (if present).")

    # Print missing values before imputation
    print("Missing values before imputation:")
    for c in ['width', 'height', 'depth', '📙 BOM - Shipping Cost (USD)', 'shipping_cost']:
        if c in df_clean.columns:
            print(f"  {c}: {df_clean[c].isnull().sum()}")

    # === Select features for imputation ===
    target = 'shipping_cost'
    feature_columns = ['width', 'height', 'depth', 'Selling Price (USD)', 'Sign Area (sq.ft)', target]
    missing_features = [c for c in feature_columns if c not in df_clean.columns]
    if missing_features:
        print("Warning: The following expected feature columns are missing from your dataset:", missing_features)
    data_for_imputation = df_clean.reindex(columns=feature_columns).copy()

    # KNNImputer silently drops columns that contain no observed value at all,
    # which would make its output narrower than feature_columns and break the
    # DataFrame construction below. Remove such columns explicitly instead.
    empty_columns = [c for c in feature_columns if data_for_imputation[c].isnull().all()]
    if target in empty_columns:
        raise ValueError(
            f"Target column '{target}' is missing or entirely empty in '{data_path}'; cannot train."
        )
    if empty_columns:
        print("Dropping columns with no observed values (cannot be imputed):", empty_columns)
        data_for_imputation = data_for_imputation.drop(columns=empty_columns)

    model_columns = list(data_for_imputation.columns)
    feature_names = [c for c in model_columns if c != target]
    if not feature_names:
        raise ValueError("None of the expected feature columns contain data; cannot train.")

    print(f"\nApplying KNN Imputation with 4 neighbors...")
    print(f"Dataset shape before imputation: {data_for_imputation.shape}")
    print(f"Total missing values before: {data_for_imputation.isnull().sum().sum()}")

    # KNN Imputer
    # NOTE(review): the target is imputed together with the features, so rows whose
    # shipping cost was unknown are trained and scored on synthetic targets — confirm
    # this is intended.
    knn_imputer = KNNImputer(n_neighbors=4)
    data_imputed_array = knn_imputer.fit_transform(data_for_imputation)
    df_imputed = pd.DataFrame(data_imputed_array, columns=model_columns, index=df_clean.index)

    print("\nAfter KNN Imputation missing value counts:")
    print(df_imputed.isnull().sum())

    # === Prepare X and y ===
    X = df_imputed[feature_names].copy()
    y = df_imputed[target].copy()

    print(f"\nFinal dataset for modeling:")
    print(f"Features shape: {X.shape}")
    print(f"Target shape: {y.shape}")

    # Train/test split first, then fit the scaler on the training rows only so
    # that no statistic of the test rows leaks into the features.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    scaler = MaxAbsScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)
    print(f"\nTrain set: {X_train.shape[0]} samples")
    print(f"Test set: {X_test.shape[0]} samples")

    print("\n" + "="*70)
    print("TRAINING MODELS WITH BAYESIAN OPTIMIZATION...")

    models_results = {}
    cv_folds = 3 if reduce_search else 5

    for step, (name, heading, tag, estimator, space, full_n_iter) in enumerate(_model_specs(), start=1):
        print(f"\n{step}. {heading} with Bayesian Optimization:")
        # Reduced mode halves the iteration budget but never goes below 10.
        n_iter = max(10, int(full_n_iter / 2)) if reduce_search else full_n_iter
        models_results[name] = _tune_and_evaluate(
            name, tag, estimator, space, n_iter, cv_folds,
            X_train, X_test, y_train, y_test
        )

    # === Final comparison ===
    print("\n" + "="*70)
    print("FINAL MODEL COMPARISON:")

    ranked = sorted(models_results.items(), key=lambda item: item[1]['R2'], reverse=True)
    best_model_name = max(models_results.keys(), key=lambda x: models_results[x]['R2'])
    best_model = models_results[best_model_name]['model']

    print(f"Best Model: {best_model_name}")
    print(f"Performance Summary:")
    for name, results in ranked:
        print(f"\n{name}:")
        print(f"  MAE: ${results['MAE']:.2f}")
        print(f"  RMSE: ${results['RMSE']:.2f}")
        print(f"  R² Score: {results['R2']:.3f}")

    print("\n" + "="*70)
    print("SAVING HIGH-ACCURACY MODELS (R² > 0.7):")

    # Save models based on performance
    high_accuracy_models = {name: results for name, results in models_results.items() if results['R2'] > 0.7}

    if high_accuracy_models:
        for model_name, results in high_accuracy_models.items():
            model_filename = os.path.join(models_dir, f"{_slug(model_name)}_model.joblib")
            joblib.dump(results['model'], model_filename)
            print(f"✓ Saved {model_name} model: {model_filename}")

            pickle_filename = os.path.join(models_dir, f"{_slug(model_name)}_model.pkl")
            with open(pickle_filename, 'wb') as f:
                pickle.dump(results['model'], f)
            print(f"✓ Saved {model_name} pickle: {pickle_filename}")
    else:
        print("No models achieved R² > 0.7. Saving top 3 models instead:")
        for model_name, results in ranked[:3]:
            model_filename = os.path.join(models_dir, f"{_slug(model_name)}_model.joblib")
            joblib.dump(results['model'], model_filename)
            print(f"✓ Saved {model_name} model: {model_filename}")

    # The artefacts below are written in both branches: a saved model cannot be
    # used for prediction without the fitted scaler and the feature order.

    # Save the best model separately
    best_model_joblib = os.path.join(models_dir, f"best_model_{_slug(best_model_name)}.joblib")
    joblib.dump(best_model, best_model_joblib)
    print(f"✓ Saved BEST model: {best_model_joblib}")

    # Save scaler
    scaler_filename = os.path.join(models_dir, "scaler.joblib")
    joblib.dump(scaler, scaler_filename)
    print(f"✓ Saved scaler: {scaler_filename}")

    # Save feature names (the columns actually used, in training order)
    features_filename = os.path.join(models_dir, "feature_names.pkl")
    with open(features_filename, 'wb') as f:
        pickle.dump(feature_names, f)
    print(f"✓ Saved feature names: {features_filename}")

    # Save model results summary
    results_filename = os.path.join(models_dir, "model_results.pkl")
    with open(results_filename, 'wb') as f:
        pickle.dump(models_results, f)
    print(f"✓ Saved model results: {results_filename}")

    print("\n" + "="*70)
    print("FEATURE IMPORTANCE ANALYSIS:")

    # Only the tree-based models expose feature_importances_.
    if hasattr(best_model, 'feature_importances_'):
        print(f"\nFeature Importance ({best_model_name}):")
        importance_df = pd.DataFrame({
            'Feature': feature_names,
            'Importance': best_model.feature_importances_
        }).sort_values('Importance', ascending=False)

        for _, row in importance_df.iterrows():
            print(f"  {row['Feature']}: {row['Importance']:.3f}")

    print("\n" + "="*70)
    print("PREDICTION ANALYSIS:")

    # Create final dataset with predictions/imputed values
    df_final = df_clean.copy()
    for col in df_imputed.columns:
        df_final[col] = df_imputed[col]

    print(f"\nDataset Statistics:")
    print(f"Total rows: {len(df_final)}")
    print(f"Complete data (no missing values): {len(df_final) - df_final.isnull().sum(axis=1).gt(0).sum()}")

    if 'shipping_cost' in df_final.columns:
        print(f"\nShipping Cost Statistics (after KNN imputation):")
        print(f"Mean: ${df_final['shipping_cost'].mean():.2f}")
        print(f"Median: ${df_final['shipping_cost'].median():.2f}")
        print(f"Std: ${df_final['shipping_cost'].std():.2f}")
        print(f"Min: ${df_final['shipping_cost'].min():.2f}")
        print(f"Max: ${df_final['shipping_cost'].max():.2f}")

    print(f"\nSample of data (showing first 5 rows):")
    sample_cols = ['width', 'height', 'depth', 'Selling Price (USD)', 'shipping_cost']
    # reindex keeps absent columns as NaN so the format string below never hits a KeyError
    sample_data = df_final.reindex(columns=sample_cols).head()
    for idx, row in sample_data.iterrows():
        print(f"Row {idx}: Width={row['width']:.1f}, Height={row['height']:.1f}, "
              f"Depth={row['depth']:.1f}, Price=${row['Selling Price (USD)']:.2f}, "
              f"Shipping=${row['shipping_cost']:.2f}")

    print("\n" + "="*70)
    print("AVAILABLE OBJECTS:")
    print(" df_final - Complete dataset with KNN imputed values")
    print(" best_model - Best performing model")
    print(" scaler - Fitted MaxAbsScaler")
    print(" models_results - Dictionary with all trained models")

    print("\n" + "="*70)
    print(f"SAVED FILES IN {models_dir}:")
    try:
        for file in os.listdir(models_dir):
            print(f" - {file}")
    except OSError:
        print(" Directory not found or empty")

    print("\n MISSION ACCOMPLISHED!")
    print(" ✓ All missing values filled using KNN Imputer")
    print(f" ✓ Trained {len(models_results)} different models with Bayesian Optimization")
    print(f" ✓ Best model: {best_model_name} (R² = {models_results[best_model_name]['R2']:.3f})")
    print(f" ✓ Models saved to {models_dir}")
    print(" ✓ Ready for further analysis and deployment!")

if __name__ == "__main__":
    # Command-line entry point: collect the three options and hand them to main().
    parser = argparse.ArgumentParser(
        description="Train multiple regression models locally (converted from Kaggle)."
    )
    parser.add_argument(
        "--data",
        required=True,
        help="Path to the CSV dataset (e.g., data.csv)",
    )
    parser.add_argument(
        "--models_dir",
        default="./models",
        help="Directory to save trained models (default: ./models)",
    )
    parser.add_argument(
        "--reduce_search",
        action="store_true",
        help="Reduce BayesSearchCV n_iter and CV folds for faster local testing (useful on laptop).",
    )
    args = parser.parse_args()

    main(args.data, args.models_dir, args.reduce_search)


ModuleNotFoundError: No module named 'lightgbm'