In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import GridSearchCV,train_test_split,cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor

In [2]:
!pip install pyxlsb



In [3]:
!pip install xlrd



In [4]:
import pandas as pd
import numpy as np
import warnings

# Suppress the specific pandas warnings
warnings.filterwarnings('ignore', message='invalid value encountered in greater')
warnings.filterwarnings('ignore', message='invalid value encountered in less')

# Or suppress all RuntimeWarnings from pandas formatting
warnings.filterwarnings('ignore', category=RuntimeWarning, module='pandas.io.formats.format')

# Now read your data
df = pd.read_excel('rawData.xlsx')
df = df[df['Sign Type Broad Category'] == 'Blade Sign']
df = df.reset_index(drop=True)


print("Data loaded successfully!")
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

# Check for data quality issues that might cause these warnings
print("\n" + "="*50)
print("DATA QUALITY CHECK:")

# Check for missing values
print(f"Total missing values: {df.isnull().sum().sum()}")
print(f"Columns with missing values:")
missing_cols = df.isnull().sum()
for col in missing_cols[missing_cols > 0].index:
    print(f"  - {col}: {missing_cols[col]} missing")

# Check data types
print(f"\nData types:")
for col, dtype in df.dtypes.items():
    print(f"  - {col}: {dtype}")

# Check for mixed data types in numeric columns
print(f"\nChecking for mixed data types:")
for col in df.columns:
    if df[col].dtype == 'object':  # String columns might contain mixed types
        # Try to identify if it should be numeric
        sample_values = df[col].dropna().astype(str).str.strip()
        if len(sample_values) > 0:
            # Check if values look numeric
            numeric_pattern = sample_values.str.match(r'^-?\d+\.?\d*$')
            if numeric_pattern.any():
                numeric_count = numeric_pattern.sum()
                total_count = len(sample_values)
                if numeric_count > total_count * 0.5:  # More than 50% numeric
                    print(f"  - {col}: Appears to be numeric but stored as object ({numeric_count}/{total_count} numeric)")

# Safe display function that handles problematic data
def safe_display(df, n_rows=5):
    """Display dataframe without triggering formatting warnings"""
    try:
        # Create a copy for display
        display_df = df.head(n_rows).copy()
        
        # Replace problematic values for display
        for col in display_df.columns:
            if display_df[col].dtype in ['float64', 'int64']:
                # Replace inf and -inf with string representations
                display_df[col] = display_df[col].replace([np.inf, -np.inf], ['inf', '-inf'])
        
        return display_df
    except Exception as e:
        print(f"Display error: {e}")
        return df.head(n_rows)

print(f"\n" + "="*50)
print("FIRST FEW ROWS (safe display):")
display_data = safe_display(df)
print(display_data)

# Clean up numeric columns if needed
print(f"\n" + "="*50)
print("CLEANING NUMERIC COLUMNS:")

numeric_cols = []
for col in df.columns:
    if 'price' in col.lower() or 'cost' in col.lower() or 'amount' in col.lower() or 'width' in col.lower() or 'height' in col.lower() or 'area' in col.lower():
        numeric_cols.append(col)

if numeric_cols:
    print(f"Found potential numeric columns: {numeric_cols}")
    
    for col in numeric_cols:
        if col in df.columns:
            print(f"\nCleaning column: {col}")
            original_type = df[col].dtype
            
            try:
                # Convert to numeric, coercing errors to NaN
                df[col] = pd.to_numeric(df[col], errors='coerce')
                print(f"  - Converted from {original_type} to {df[col].dtype}")
                print(f"  - NaN values after conversion: {df[col].isnull().sum()}")
                
            except Exception as e:
                print(f"  - Could not convert {col}: {e}")

# Final summary
print(f"\n" + "="*50)
print("FINAL DATA SUMMARY:")
print(f"Shape: {df.shape}")
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB")
print(f"Total missing values: {df.isnull().sum().sum()}")

# Display basic statistics for numeric columns
numeric_columns = df.select_dtypes(include=[np.number]).columns
if len(numeric_columns) > 0:
    print(f"\nNumeric columns summary:")
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        print(df[numeric_columns].describe())

print("\nWarnings should now be suppressed!")
print("\nYour dataframe is ready to use: 'df'")

Data loaded successfully!
Shape: (538, 19)
Columns: ['Order ID', 'Order Date', 'Order Added in Month Tab', 'Account Name', 'Sign Type', 'Sign Type Broad Category', 'Month (AT)', 'Sign Width (in)', 'Sign Height (in)', 'Selling Price (USD)', 'Withdrawal Amount (USD)', 'Project Name', 'Status', 'Production Line', 'BOM - Material Cost (PKR) - Calculated By Ali Hassan', '📙 BOM - Production Cost (USD)', '📙 BOM - Shipping Cost (USD)', 'Sign Area (sq.ft)', 'Length of Curve (m)']

DATA QUALITY CHECK:
Total missing values: 1396
Columns with missing values:
  - Sign Width (in): 2 missing
  - Sign Height (in): 2 missing
  - Withdrawal Amount (USD): 538 missing
  - Status: 155 missing
  - BOM - Material Cost (PKR) - Calculated By Ali Hassan: 16 missing
  - 📙 BOM - Shipping Cost (USD): 145 missing
  - Length of Curve (m): 538 missing

Data types:
  - Order ID: object
  - Order Date: datetime64[ns]
  - Order Added in Month Tab: datetime64[ns]
  - Account Name: object
  - Sign Type: object
  - Sign Ty

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 538 entries, 0 to 537
Data columns (total 19 columns):
 #   Column                                                Non-Null Count  Dtype         
---  ------                                                --------------  -----         
 0   Order ID                                              538 non-null    object        
 1   Order Date                                            538 non-null    datetime64[ns]
 2   Order Added in Month Tab                              538 non-null    datetime64[ns]
 3   Account Name                                          538 non-null    object        
 4   Sign Type                                             538 non-null    object        
 5   Sign Type Broad Category                              538 non-null    object        
 6   Month (AT)                                            538 non-null    datetime64[ns]
 7   Sign Width (in)                                       536 non-null    float64   

In [6]:
print(f'The number of rows are {df.shape[0]} and columns are {df.shape[1]}')

The number of rows are 538 and columns are 19


In [7]:
df.columns

Index(['Order ID', 'Order Date', 'Order Added in Month Tab', 'Account Name',
       'Sign Type', 'Sign Type Broad Category', 'Month (AT)',
       'Sign Width (in)', 'Sign Height (in)', 'Selling Price (USD)',
       'Withdrawal Amount (USD)', 'Project Name', 'Status', 'Production Line',
       'BOM - Material Cost (PKR) - Calculated By Ali Hassan',
       '📙 BOM - Production Cost (USD)', '📙 BOM - Shipping Cost (USD)',
       'Sign Area (sq.ft)', 'Length of Curve (m)'],
      dtype='object')

In [8]:
df.head(5)

Unnamed: 0,Order ID,Order Date,Order Added in Month Tab,Account Name,Sign Type,Sign Type Broad Category,Month (AT),Sign Width (in),Sign Height (in),Selling Price (USD),Withdrawal Amount (USD),Project Name,Status,Production Line,BOM - Material Cost (PKR) - Calculated By Ali Hassan,📙 BOM - Production Cost (USD),📙 BOM - Shipping Cost (USD),Sign Area (sq.ft),Length of Curve (m)
0,BS-ET-7835,2024-12-31,2024-12-30,COMUNITYTreasures,Blade Sign,Blade Sign,2025-01-01,24.0,24.0,340.0,,ETSY Project,Shipped,Business Sign,,0.0,125.45,4,
1,BS-SM-7826 A,2024-12-30,2025-01-01,Signmakerz-Ads,3D Blade Sign,Blade Sign,2025-01-01,30.0,30.0,1248.0,,Google Ads Project,Shipped,Business Sign,,380.9,241.3,6,
2,BS-SM-7826 B,2024-12-30,2025-01-01,Signmakerz-Ads,3D Blade Sign,Blade Sign,2025-01-01,30.0,30.0,1248.0,,Google Ads Project,Shipped,Business Sign,,380.9,241.3,6,
3,BS-SM-7842,2024-12-31,2025-01-02,Signmakerz-Ads,Blade Sign,Blade Sign,2025-01-01,24.0,24.0,248.0,,Google Ads Project,Shipped,Business Sign,,108.76,92.73,4,
4,BS-SM-7851,2025-01-01,2025-01-03,Signmakerz-Ads,Blade Sign,Blade Sign,2025-01-01,30.0,30.0,378.0,,Google Ads Project,Shipped,Business Sign,,91.31,165.79,6,


In [9]:
df.isnull().sum()

Order ID                                                  0
Order Date                                                0
Order Added in Month Tab                                  0
Account Name                                              0
Sign Type                                                 0
Sign Type Broad Category                                  0
Month (AT)                                                0
Sign Width (in)                                           2
Sign Height (in)                                          2
Selling Price (USD)                                       0
Withdrawal Amount (USD)                                 538
Project Name                                              0
Status                                                  155
Production Line                                           0
BOM - Material Cost (PKR) - Calculated By Ali Hassan    538
📙 BOM - Production Cost (USD)                             0
📙 BOM - Shipping Cost (USD)             

In [10]:
df.drop_duplicates(inplace=True)

In [11]:
df.describe(include='all')

Unnamed: 0,Order ID,Order Date,Order Added in Month Tab,Account Name,Sign Type,Sign Type Broad Category,Month (AT),Sign Width (in),Sign Height (in),Selling Price (USD),Withdrawal Amount (USD),Project Name,Status,Production Line,BOM - Material Cost (PKR) - Calculated By Ali Hassan,📙 BOM - Production Cost (USD),📙 BOM - Shipping Cost (USD),Sign Area (sq.ft),Length of Curve (m)
count,538,538,538,538,538,538,538,536.0,536.0,538.0,0.0,538,383,538,0.0,538.0,393.0,538.0,0.0
unique,538,,,46,5,1,,,,,,5,13,1,,,,,
top,BS-ET-11443,,,HudsonByDino,Blade Sign,Blade Sign,,,,,,ETSY Project,Shipped,Business Sign,,,,,
freq,1,,,73,461,538,,,,,,452,348,538,,,,,
mean,,2025-04-28 20:33:54.200743680,2025-05-03 21:24:45.501858816,,,,2025-04-18 10:58:26.319702528,20.707836,19.067351,339.624535,,,,,,69.699089,93.559415,3.020446,
min,,2023-05-23 00:00:00,2023-05-23 00:00:00,,,,2023-05-01 00:00:00,3.0,2.5,0.0,,,,,,0.0,0.0,0.0,
25%,,2025-03-03 00:00:00,2025-03-08 00:00:00,,,,2025-03-01 00:00:00,14.0,12.0,199.0,,,,,,28.015,45.5,1.0,
50%,,2025-04-23 00:00:00,2025-04-27 00:00:00,,,,2025-04-01 00:00:00,20.0,18.0,260.0,,,,,,48.91,72.54,2.0,
75%,,2025-07-11 12:00:00,2025-07-15 00:00:00,,,,2025-07-01 00:00:00,24.0,24.0,379.5,,,,,,84.375,107.99,4.0,
max,,2025-08-28 00:00:00,2025-08-30 00:00:00,,,,2025-08-01 00:00:00,60.0,63.0,3384.0,,,,,,610.77,487.18,18.0,


In [12]:
df['depth'] = 1

In [13]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.preprocessing import MaxAbsScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from skopt import BayesSearchCV
import xgboost as xgb
import lightgbm as lgb
import pickle
import joblib
import os
import warnings
warnings.filterwarnings('ignore')

# Create models directory for Kaggle
os.makedirs('/kaggle/working/models', exist_ok=True)

# Rename columns to simple names
df_clean = df.copy()
df_clean = df_clean.rename(columns={
    'Sign Width (in)': 'width',
    'Sign Height (in)': 'height', 
    'Depth': 'depth',
    '📙 BOM - Shipping Cost (USD)': 'shipping_cost'
})

print("Renamed columns successfully!")
print(f"Missing values before imputation:")
print(f"Width: {df_clean['width'].isnull().sum()}")
print(f"Height: {df_clean['height'].isnull().sum()}")
print(f"Shipping Cost: {df_clean['shipping_cost'].isnull().sum()}")

# Select all features including target for KNN imputation
feature_columns = ['width', 'height', 'depth', 'Selling Price (USD)', 'Sign Area (sq.ft)', 'shipping_cost']

# Create dataset with selected features
data_for_imputation = df_clean[feature_columns].copy()

print(f"\nApplying KNN Imputation with 4 neighbors...")
print(f"Dataset shape before imputation: {data_for_imputation.shape}")
print(f"Total missing values before: {data_for_imputation.isnull().sum().sum()}")

# Apply KNN Imputation to fill all missing values
knn_imputer = KNNImputer(n_neighbors=4)
data_imputed = knn_imputer.fit_transform(data_for_imputation)

# Convert back to DataFrame
df_imputed = pd.DataFrame(data_imputed, columns=feature_columns, index=df_clean.index)

print(f"\nAfter KNN Imputation:")
print(f"Width missing: {df_imputed['width'].isnull().sum()}")
print(f"Height missing: {df_imputed['height'].isnull().sum()}")
print(f"Shipping Cost missing: {df_imputed['shipping_cost'].isnull().sum()}")
print(f"Total missing values after: {df_imputed.isnull().sum().sum()}")

# Prepare features and target
X = df_imputed[['width', 'height', 'depth', 'Selling Price (USD)', 'Sign Area (sq.ft)']].copy()
y = df_imputed['shipping_cost'].copy()

print(f"\nFinal dataset for modeling:")
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Scale features using MaxAbsScaler
scaler = MaxAbsScaler()
X_scaled = scaler.fit_transform(X)

# Split data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
print(f"\nTrain set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

print("\n" + "="*70)
print("TRAINING MODELS WITH BAYESIAN OPTIMIZATION...")

# Store all models and results
models_results = {}

# 1. Random Forest with Bayesian Optimization
print("\n1. Random Forest with Bayesian Optimization:")
rf_param_grid = {
    'n_estimators': (10, 200),
    'max_depth': (3, 20),
    'min_samples_split': (2, 15),
    'min_samples_leaf': (1, 10),
    'max_features': ['sqrt', 'log2', None]
}

rf_model = RandomForestRegressor(random_state=42)
rf_bayes_search = BayesSearchCV(
    estimator=rf_model,
    search_spaces=rf_param_grid,
    n_iter=50,
    cv=5,
    scoring='r2',
    random_state=42,
    n_jobs=-1
)

print("Training Random Forest...")
rf_bayes_search.fit(X_train, y_train)
print("Best RF Parameters:", rf_bayes_search.best_params_)

rf_pred = rf_bayes_search.best_estimator_.predict(X_test)
rf_mae = mean_absolute_error(y_test, rf_pred)
rf_mse = mean_squared_error(y_test, rf_pred)
rf_rmse = np.sqrt(rf_mse)
rf_r2 = r2_score(y_test, rf_pred)

print(f"Random Forest Test Metrics:")
print(f"  MAE: ${rf_mae:.2f}")
print(f"  MSE: {rf_mse:.2f}")
print(f"  RMSE: ${rf_rmse:.2f}")
print(f"  R² Score: {rf_r2:.3f}")

models_results['Random Forest'] = {
    'MAE': rf_mae, 'MSE': rf_mse, 'RMSE': rf_rmse, 'R2': rf_r2,
    'model': rf_bayes_search.best_estimator_
}

# 2. Gradient Boosting with Bayesian Optimization
print("\n2. Gradient Boosting with Bayesian Optimization:")
gb_param_grid = {
    'n_estimators': (50, 300),
    'max_depth': (3, 10),
    'learning_rate': (0.01, 0.3),
    'min_samples_split': (2, 20),
    'min_samples_leaf': (1, 10),
    'subsample': (0.8, 1.0)
}

gb_model = GradientBoostingRegressor(random_state=42)
gb_bayes_search = BayesSearchCV(
    estimator=gb_model,
    search_spaces=gb_param_grid,
    n_iter=50,
    cv=5,
    scoring='r2',
    random_state=42,
    n_jobs=-1
)

print("Training Gradient Boosting...")
gb_bayes_search.fit(X_train, y_train)
print("Best GB Parameters:", gb_bayes_search.best_params_)

gb_pred = gb_bayes_search.best_estimator_.predict(X_test)
gb_mae = mean_absolute_error(y_test, gb_pred)
gb_mse = mean_squared_error(y_test, gb_pred)
gb_rmse = np.sqrt(gb_mse)
gb_r2 = r2_score(y_test, gb_pred)

print(f"Gradient Boosting Test Metrics:")
print(f"  MAE: ${gb_mae:.2f}")
print(f"  MSE: {gb_mse:.2f}")
print(f"  RMSE: ${gb_rmse:.2f}")
print(f"  R² Score: {gb_r2:.3f}")

models_results['Gradient Boosting'] = {
    'MAE': gb_mae, 'MSE': gb_mse, 'RMSE': gb_rmse, 'R2': gb_r2,
    'model': gb_bayes_search.best_estimator_
}

# 3. SVR with Bayesian Optimization
print("\n3. SVR with Bayesian Optimization:")
svr_param_grid = {
    'C': (1, 1000),
    'gamma': (0.001, 1),
    'epsilon': (0.01, 1)
}

svr_model = SVR(kernel='rbf')
svr_bayes_search = BayesSearchCV(
    estimator=svr_model,
    search_spaces=svr_param_grid,
    n_iter=50,
    cv=5,
    scoring='r2',
    random_state=42,
    n_jobs=-1
)

print("Training SVR...")
svr_bayes_search.fit(X_train, y_train)
print("Best SVR Parameters:", svr_bayes_search.best_params_)

svr_pred = svr_bayes_search.best_estimator_.predict(X_test)
svr_mae = mean_absolute_error(y_test, svr_pred)
svr_mse = mean_squared_error(y_test, svr_pred)
svr_rmse = np.sqrt(svr_mse)
svr_r2 = r2_score(y_test, svr_pred)

print(f"SVR Test Metrics:")
print(f"  MAE: ${svr_mae:.2f}")
print(f"  MSE: {svr_mse:.2f}")
print(f"  RMSE: ${svr_rmse:.2f}")
print(f"  R² Score: {svr_r2:.3f}")

models_results['SVR'] = {
    'MAE': svr_mae, 'MSE': svr_mse, 'RMSE': svr_rmse, 'R2': svr_r2,
    'model': svr_bayes_search.best_estimator_
}

# 4. Decision Tree with Bayesian Optimization
print("\n4. Decision Tree with Bayesian Optimization:")
dt_param_grid = {
    'max_depth': (3, 20),
    'min_samples_split': (2, 20),
    'min_samples_leaf': (1, 10),
    'max_features': ['sqrt', 'log2', None]
}

dt_model = DecisionTreeRegressor(random_state=42)
dt_bayes_search = BayesSearchCV(
    estimator=dt_model,
    search_spaces=dt_param_grid,
    n_iter=50,
    cv=5,
    scoring='r2',
    random_state=42,
    n_jobs=-1
)

print("Training Decision Tree...")
dt_bayes_search.fit(X_train, y_train)
print("Best DT Parameters:", dt_bayes_search.best_params_)

dt_pred = dt_bayes_search.best_estimator_.predict(X_test)
dt_mae = mean_absolute_error(y_test, dt_pred)
dt_mse = mean_squared_error(y_test, dt_pred)
dt_rmse = np.sqrt(dt_mse)
dt_r2 = r2_score(y_test, dt_pred)

print(f"Decision Tree Test Metrics:")
print(f"  MAE: ${dt_mae:.2f}")
print(f"  MSE: {dt_mse:.2f}")
print(f"  RMSE: ${dt_rmse:.2f}")
print(f"  R² Score: {dt_r2:.3f}")

models_results['Decision Tree'] = {
    'MAE': dt_mae, 'MSE': dt_mse, 'RMSE': dt_rmse, 'R2': dt_r2,
    'model': dt_bayes_search.best_estimator_
}

# 5. XGBoost with Bayesian Optimization
print("\n5. XGBoost with Bayesian Optimization:")
xgb_param_grid = {
    'n_estimators': (50, 300),
    'max_depth': (3, 10),
    'learning_rate': (0.01, 0.3),
    'subsample': (0.8, 1.0),
    'colsample_bytree': (0.8, 1.0),
    'reg_alpha': (0, 1),
    'reg_lambda': (0, 1)
}

xgb_model = xgb.XGBRegressor(random_state=42, eval_metric='rmse')
xgb_bayes_search = BayesSearchCV(
    estimator=xgb_model,
    search_spaces=xgb_param_grid,
    n_iter=50,
    cv=5,
    scoring='r2',
    random_state=42,
    n_jobs=-1
)

print("Training XGBoost...")
xgb_bayes_search.fit(X_train, y_train)
print("Best XGB Parameters:", xgb_bayes_search.best_params_)

xgb_pred = xgb_bayes_search.best_estimator_.predict(X_test)
xgb_mae = mean_absolute_error(y_test, xgb_pred)
xgb_mse = mean_squared_error(y_test, xgb_pred)
xgb_rmse = np.sqrt(xgb_mse)
xgb_r2 = r2_score(y_test, xgb_pred)

print(f"XGBoost Test Metrics:")
print(f"  MAE: ${xgb_mae:.2f}")
print(f"  MSE: {xgb_mse:.2f}")
print(f"  RMSE: ${xgb_rmse:.2f}")
print(f"  R² Score: {xgb_r2:.3f}")

models_results['XGBoost'] = {
    'MAE': xgb_mae, 'MSE': xgb_mse, 'RMSE': xgb_rmse, 'R2': xgb_r2,
    'model': xgb_bayes_search.best_estimator_
}

# 6. LightGBM with Bayesian Optimization
print("\n6. LightGBM with Bayesian Optimization:")
lgb_param_grid = {
    'n_estimators': (50, 300),
    'max_depth': (3, 10),
    'learning_rate': (0.01, 0.3),
    'subsample': (0.8, 1.0),
    'colsample_bytree': (0.8, 1.0),
    'reg_alpha': (0, 1),
    'reg_lambda': (0, 1),
    'num_leaves': (10, 100)
}

lgb_model = lgb.LGBMRegressor(random_state=42, verbose=-1)
lgb_bayes_search = BayesSearchCV(
    estimator=lgb_model,
    search_spaces=lgb_param_grid,
    n_iter=50,
    cv=5,
    scoring='r2',
    random_state=42,
    n_jobs=-1
)

print("Training LightGBM...")
lgb_bayes_search.fit(X_train, y_train)
print("Best LGB Parameters:", lgb_bayes_search.best_params_)

lgb_pred = lgb_bayes_search.best_estimator_.predict(X_test)
lgb_mae = mean_absolute_error(y_test, lgb_pred)
lgb_mse = mean_squared_error(y_test, lgb_pred)
lgb_rmse = np.sqrt(lgb_mse)
lgb_r2 = r2_score(y_test, lgb_pred)

print(f"LightGBM Test Metrics:")
print(f"  MAE: ${lgb_mae:.2f}")
print(f"  MSE: {lgb_mse:.2f}")
print(f"  RMSE: ${lgb_rmse:.2f}")
print(f"  R² Score: {lgb_r2:.3f}")

models_results['LightGBM'] = {
    'MAE': lgb_mae, 'MSE': lgb_mse, 'RMSE': lgb_rmse, 'R2': lgb_r2,
    'model': lgb_bayes_search.best_estimator_
}

# 7. Extra Trees with Bayesian Optimization
print("\n7. Extra Trees with Bayesian Optimization:")
et_param_grid = {
    'n_estimators': (10, 200),
    'max_depth': (3, 20),
    'min_samples_split': (2, 15),
    'min_samples_leaf': (1, 10),
    'max_features': ['sqrt', 'log2', None]
}

et_model = ExtraTreesRegressor(random_state=42)
et_bayes_search = BayesSearchCV(
    estimator=et_model,
    search_spaces=et_param_grid,
    n_iter=50,
    cv=5,
    scoring='r2',
    random_state=42,
    n_jobs=-1
)

print("Training Extra Trees...")
et_bayes_search.fit(X_train, y_train)
print("Best ET Parameters:", et_bayes_search.best_params_)

et_pred = et_bayes_search.best_estimator_.predict(X_test)
et_mae = mean_absolute_error(y_test, et_pred)
et_mse = mean_squared_error(y_test, et_pred)
et_rmse = np.sqrt(et_mse)
et_r2 = r2_score(y_test, et_pred)

print(f"Extra Trees Test Metrics:")
print(f"  MAE: ${et_mae:.2f}")
print(f"  MSE: {et_mse:.2f}")
print(f"  RMSE: ${et_rmse:.2f}")
print(f"  R² Score: {et_r2:.3f}")

models_results['Extra Trees'] = {
    'MAE': et_mae, 'MSE': et_mse, 'RMSE': et_rmse, 'R2': et_r2,
    'model': et_bayes_search.best_estimator_
}

# 8. Ridge Regression with Bayesian Optimization
print("\n8. Ridge Regression with Bayesian Optimization:")
ridge_param_grid = {
    'alpha': (0.1, 100)
}

ridge_model = Ridge(random_state=42)
ridge_bayes_search = BayesSearchCV(
    estimator=ridge_model,
    search_spaces=ridge_param_grid,
    n_iter=30,
    cv=5,
    scoring='r2',
    random_state=42,
    n_jobs=-1
)

print("Training Ridge...")
ridge_bayes_search.fit(X_train, y_train)
print("Best Ridge Parameters:", ridge_bayes_search.best_params_)

ridge_pred = ridge_bayes_search.best_estimator_.predict(X_test)
ridge_mae = mean_absolute_error(y_test, ridge_pred)
ridge_mse = mean_squared_error(y_test, ridge_pred)
ridge_rmse = np.sqrt(ridge_mse)
ridge_r2 = r2_score(y_test, ridge_pred)

print(f"Ridge Test Metrics:")
print(f"  MAE: ${ridge_mae:.2f}")
print(f"  MSE: {ridge_mse:.2f}")
print(f"  RMSE: ${ridge_rmse:.2f}")
print(f"  R² Score: {ridge_r2:.3f}")

models_results['Ridge'] = {
    'MAE': ridge_mae, 'MSE': ridge_mse, 'RMSE': ridge_rmse, 'R2': ridge_r2,
    'model': ridge_bayes_search.best_estimator_
}

# 9. ElasticNet with Bayesian Optimization
print("\n9. ElasticNet with Bayesian Optimization:")
elastic_param_grid = {
    'alpha': (0.1, 10),
    'l1_ratio': (0.1, 0.9)
}

elastic_model = ElasticNet(random_state=42)
elastic_bayes_search = BayesSearchCV(
    estimator=elastic_model,
    search_spaces=elastic_param_grid,
    n_iter=30,
    cv=5,
    scoring='r2',
    random_state=42,
    n_jobs=-1
)

print("Training ElasticNet...")
elastic_bayes_search.fit(X_train, y_train)
print("Best ElasticNet Parameters:", elastic_bayes_search.best_params_)

elastic_pred = elastic_bayes_search.best_estimator_.predict(X_test)
elastic_mae = mean_absolute_error(y_test, elastic_pred)
elastic_mse = mean_squared_error(y_test, elastic_pred)
elastic_rmse = np.sqrt(elastic_mse)
elastic_r2 = r2_score(y_test, elastic_pred)

print(f"ElasticNet Test Metrics:")
print(f"  MAE: ${elastic_mae:.2f}")
print(f"  MSE: {elastic_mse:.2f}")
print(f"  RMSE: ${elastic_rmse:.2f}")
print(f"  R² Score: {elastic_r2:.3f}")

models_results['ElasticNet'] = {
    'MAE': elastic_mae, 'MSE': elastic_mse, 'RMSE': elastic_rmse, 'R2': elastic_r2,
    'model': elastic_bayes_search.best_estimator_
}

print("\n" + "="*70)
print("FINAL MODEL COMPARISON:")

# Find best model based on R2 score
best_model_name = max(models_results.keys(), key=lambda x: models_results[x]['R2'])
best_model = models_results[best_model_name]['model']

print(f"Best Model: {best_model_name}")
print(f"Performance Summary:")
for name, results in sorted(models_results.items(), key=lambda x: x[1]['R2'], reverse=True):
    print(f"\n{name}:")
    print(f"  MAE: ${results['MAE']:.2f}")
    print(f"  RMSE: ${results['RMSE']:.2f}")
    print(f"  R² Score: {results['R2']:.3f}")

print(f"\n" + "="*70)
print("SAVING HIGH-ACCURACY MODELS (R² > 0.7):")

# Save models with high accuracy
high_accuracy_models = {name: results for name, results in models_results.items() if results['R2'] > 0.7}

if high_accuracy_models:
    for model_name, results in high_accuracy_models.items():
        # Save using joblib (recommended for sklearn models)
        model_filename = f"/kaggle/working/models/{model_name.replace(' ', '_').lower()}_model.joblib"
        joblib.dump(results['model'], model_filename)
        print(f"✓ Saved {model_name} model: {model_filename}")
        
        # Also save as pickle for compatibility
        pickle_filename = f"/kaggle/working/models/{model_name.replace(' ', '_').lower()}_model.pkl"
        with open(pickle_filename, 'wb') as f:
            pickle.dump(results['model'], f)
        print(f"✓ Saved {model_name} pickle: {pickle_filename}")
    
    # Save the best model separately
    best_model_joblib = f"/kaggle/working/models/best_model_{best_model_name.replace(' ', '_').lower()}.joblib"
    joblib.dump(best_model, best_model_joblib)
    print(f"✓ Saved BEST model: {best_model_joblib}")
    
    # Save scaler
    scaler_filename = "/kaggle/working/models/scaler.joblib"
    joblib.dump(scaler, scaler_filename)
    print(f"✓ Saved scaler: {scaler_filename}")
    
    # Save feature names
    feature_names = ['width', 'height', 'depth', 'Selling Price (USD)', 'Sign Area (sq.ft)']
    features_filename = "/kaggle/working/models/feature_names.pkl"
    with open(features_filename, 'wb') as f:
        pickle.dump(feature_names, f)
    print(f"✓ Saved feature names: {features_filename}")
    
    # Save model results summary
    results_filename = "/kaggle/working/models/model_results.pkl"
    with open(results_filename, 'wb') as f:
        pickle.dump(models_results, f)
    print(f"✓ Saved model results: {results_filename}")
    
else:
    print("No models achieved R² > 0.7. Saving top 3 models instead:")
    top_models = sorted(models_results.items(), key=lambda x: x[1]['R2'], reverse=True)[:3]
    for model_name, results in top_models:
        model_filename = f"/kaggle/working/models/{model_name.replace(' ', '_').lower()}_model.joblib"
        joblib.dump(results['model'], model_filename)
        print(f"✓ Saved {model_name} model: {model_filename}")

print(f"\n" + "="*70)
print("FEATURE IMPORTANCE ANALYSIS:")

# Feature importance for tree-based models
feature_names = ['width', 'height', 'depth', 'Selling Price (USD)', 'Sign Area (sq.ft)']

if best_model_name in ['Random Forest', 'Gradient Boosting', 'Decision Tree', 'XGBoost', 'LightGBM', 'Extra Trees']:
    print(f"\nFeature Importance ({best_model_name}):")
    if hasattr(best_model, 'feature_importances_'):
        importance_df = pd.DataFrame({
            'Feature': feature_names,
            'Importance': best_model.feature_importances_
        }).sort_values('Importance', ascending=False)
        
        for _, row in importance_df.iterrows():
            print(f"  {row['Feature']}: {row['Importance']:.3f}")

print(f"\n" + "="*70)
print("PREDICTION ANALYSIS:")

# Create final dataset with predictions
df_final = df_clean.copy()
df_final['width'] = df_imputed['width']
df_final['height'] = df_imputed['height']
df_final['shipping_cost'] = df_imputed['shipping_cost']

print(f"\nDataset Statistics:")
print(f"Total rows: {len(df_final)}")
print(f"Complete data (no missing values): {len(df_final)}")

print(f"\nShipping Cost Statistics (after KNN imputation):")
print(f"Mean: ${df_final['shipping_cost'].mean():.2f}")
print(f"Median: ${df_final['shipping_cost'].median():.2f}")
print(f"Std: ${df_final['shipping_cost'].std():.2f}")
print(f"Min: ${df_final['shipping_cost'].min():.2f}")
print(f"Max: ${df_final['shipping_cost'].max():.2f}")

# Show sample predictions
print(f"\nSample of data (showing first 5 rows):")
sample_data = df_final[['width', 'height', 'depth', 'Selling Price (USD)', 'shipping_cost']].head()
for idx, row in sample_data.iterrows():
    print(f"Row {idx}: Width={row['width']:.1f}, Height={row['height']:.1f}, "
          f"Depth={row['depth']:.1f}, Price=${row['Selling Price (USD)']:.2f}, "
          f"Shipping=${row['shipping_cost']:.2f}")

print(f"\n" + "="*70)
print("AVAILABLE OBJECTS:")
print(" df_final - Complete dataset with KNN imputed values")
print(" best_model - Best performing model")
print(" scaler - Fitted MaxAbsScaler")
print(" models_results - Dictionary with all trained models")
print(" All individual model objects (rf_bayes_search, xgb_bayes_search, etc.)")

print(f"\n" + "="*70)
print("SAVED FILES IN /kaggle/working/models/:")
try:
    saved_files = os.listdir('/kaggle/working/models/')
    for file in saved_files:
        print(f" - {file}")
except:
    print(" Directory not found or empty")

print(f"\n MISSION ACCOMPLISHED!")
print(f" ✓ All missing values filled using KNN Imputer")
print(f" ✓ Trained 9 different models with Bayesian Optimization")
print(f" ✓ Best model: {best_model_name} (R² = {models_results[best_model_name]['R2']:.3f})")
print(f" ✓ High-accuracy models saved to /kaggle/working/models/")
print(f" ✓ Ready for further analysis and deployment!")

Renamed columns successfully!
Missing values before imputation:
Width: 2
Height: 2
Shipping Cost: 145

Applying KNN Imputation with 4 neighbors...
Dataset shape before imputation: (538, 6)
Total missing values before: 149

After KNN Imputation:
Width missing: 0
Height missing: 0
Shipping Cost missing: 0
Total missing values after: 0

Final dataset for modeling:
Features shape: (538, 5)
Target shape: (538,)

Train set: 430 samples
Test set: 108 samples

TRAINING MODELS WITH BAYESIAN OPTIMIZATION...

1. Random Forest with Bayesian Optimization:
Training Random Forest...
Best RF Parameters: OrderedDict([('max_depth', 11), ('max_features', 'sqrt'), ('min_samples_leaf', 1), ('min_samples_split', 2), ('n_estimators', 200)])
Random Forest Test Metrics:
  MAE: $18.30
  MSE: 726.13
  RMSE: $26.95
  R² Score: 0.807

2. Gradient Boosting with Bayesian Optimization:
Training Gradient Boosting...
Best GB Parameters: OrderedDict([('learning_rate', 0.049658010809848274), ('max_depth', 4), ('min_sampl