In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from catboost import CatBoostRegressor
import joblib


In [None]:
# Step 1: Read the AQI dataset from CSV into a DataFrame and display it
DATA_PATH = '/content/state_weather_aqi_data_mf2.csv'  # Replace with your actual file path
df = pd.read_csv(DATA_PATH)
df


Unnamed: 0,state,city,station,date,time,PM2.5,PM10,NO2,NH3,SO2,CO,OZONE,AQI,Predominant_Parameter
0,Andhra_Pradesh,Amaravati,"Secretariat, Amaravati - APPCB",03-01-2020,10:00:00,68,64,17,4,28,31,40,68,PM2.5
1,Andhra_Pradesh,Rajamahendravaram,"Anand Kala Kshetram, Rajamahendravaram - APPCB",03-01-2020,10:00:00,67,70,23,2,13,49,77,77,OZONE
2,Andhra_Pradesh,Tirupati,"Tirumala, Tirupati - APPCB",03-01-2020,10:00:00,32,64,26,5,6,19,16,32,PM2.5
3,Andhra_Pradesh,Visakhapatnam,"GVM Corporation, Visakhapatnam - APPCB",03-01-2020,10:00:00,93,93,31,3,9,57,61,93,PM10
4,Andhra_Pradesh,Amaravati,"Secretariat, Amaravati - APPCB",05-01-2020,06:00:00,60,55,20,5,18,29,53,60,PM2.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1560,West_Bengal,Kolkata,"Jadavpur, Kolkata - WBPCB",07-01-2020,03:00:00,163,142,57,7,10,39,8,163,PM2.5
1561,West_Bengal,Kolkata,"Rabindra Bharati University, Kolkata - WBPCB",07-01-2020,03:00:00,279,215,78,11,23,31,53,279,PM2.5
1562,West_Bengal,Kolkata,"Rabindra Sarobar, Kolkata - WBPCB",07-01-2020,03:00:00,162,136,43,4,16,34,44,162,PM2.5
1563,West_Bengal,Kolkata,"Victoria, Kolkata - WBPCB",07-01-2020,03:00:00,213,152,55,5,11,95,64,213,PM2.5


In [None]:
# Remove the location, timestamp and label columns so only pollutant readings and AQI remain.
# errors='ignore' means names that are not present in the frame are skipped silently.
non_feature_columns = ['state', 'city', 'station', 'date', 'time', 'Predominant_Parameter']
df = df.drop(columns=non_feature_columns, errors='ignore')
df

Unnamed: 0,PM2.5,PM10,NO2,NH3,SO2,CO,OZONE,AQI
0,68,64,17,4,28,31,40,68
1,67,70,23,2,13,49,77,77
2,32,64,26,5,6,19,16,32
3,93,93,31,3,9,57,61,93
4,60,55,20,5,18,29,53,60
...,...,...,...,...,...,...,...,...
1560,163,142,57,7,10,39,8,163
1561,279,215,78,11,23,31,53,279
1562,162,136,43,4,16,34,44,162
1563,213,152,55,5,11,95,64,213


In [None]:
# Step 2: Handle missing values
# Replace each missing entry with its column's mean.
# numeric_only=True keeps this cell working even if a non-numeric column is still present
# (plain df.mean() raises TypeError on such columns in pandas >= 2.0).
# Re-binding df instead of using inplace=True keeps the cell safe to re-run.
# NOTE(review): the means are taken over all rows, before the train/test split, and the
# AQI target column is imputed as well - confirm that this is intended.
df = df.fillna(df.mean(numeric_only=True))
df

Unnamed: 0,PM2.5,PM10,NO2,NH3,SO2,CO,OZONE,AQI
0,68,64,17,4,28,31,40,68
1,67,70,23,2,13,49,77,77
2,32,64,26,5,6,19,16,32
3,93,93,31,3,9,57,61,93
4,60,55,20,5,18,29,53,60
...,...,...,...,...,...,...,...,...
1560,163,142,57,7,10,39,8,163
1561,279,215,78,11,23,31,53,279
1562,162,136,43,4,16,34,44,162
1563,213,152,55,5,11,95,64,213


In [None]:
# Step 3: Calculate Epods as a weighted sum of four pollutant columns (example weights)
alpha, beta, gamma, delta = 0.3, 0.25, 0.2, 0.25  # weights for PM2.5, PM10, NO2, SO2 respectively

# Build each weighted term separately, then add them in the same left-to-right order.
pm25_term = alpha * df['PM2.5']
pm10_term = beta * df['PM10']
no2_term = gamma * df['NO2']
so2_term = delta * df['SO2']
df['Epods'] = pm25_term + pm10_term + no2_term + so2_term


In [None]:
# Step 4: Choose the model inputs (seven pollutant columns) and the prediction target (AQI)
feature_columns = ['PM2.5', 'PM10', 'NO2', 'NH3', 'SO2', 'CO', 'OZONE']
target_column = 'AQI'

X = df[feature_columns]
y = df[target_column]


In [None]:


# Step 5: Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 6: Standardise the features. The scaler learns its statistics from the
# training rows only and the same transform is then applied to the test rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 7: Candidate regressors, each paired with the hyperparameter grid to search.
# An empty "params" grid marks a model that is fitted without any tuning.
models_and_params = {
    "Random Forest": dict(
        model=RandomForestRegressor(random_state=42),
        params=dict(
            n_estimators=[50, 100, 200],
            max_depth=[None, 10, 20, 30],
            min_samples_split=[2, 5, 10],
        ),
    ),
    "Gradient Boosting": dict(
        model=GradientBoostingRegressor(random_state=42),
        params=dict(
            n_estimators=[100, 200],
            learning_rate=[0.05, 0.1, 0.2],
            max_depth=[3, 5, 7],
        ),
    ),
    "AdaBoost": dict(
        model=AdaBoostRegressor(random_state=42),
        params=dict(n_estimators=[50, 100], learning_rate=[0.01, 0.1, 1.0]),
    ),
    "Extra Trees": dict(
        model=ExtraTreesRegressor(random_state=42),
        params=dict(
            n_estimators=[100, 200],
            max_depth=[None, 10, 20],
            min_samples_split=[2, 5],
        ),
    ),
    "Linear Regression": dict(model=LinearRegression(), params=dict()),
    "Ridge Regression": dict(model=Ridge(), params=dict(alpha=[0.1, 1.0, 10.0])),
    "Lasso Regression": dict(model=Lasso(), params=dict(alpha=[0.1, 1.0, 10.0])),
    "Support Vector Regression (SVR)": dict(
        model=SVR(),
        params=dict(
            kernel=["linear", "rbf"],
            C=[0.1, 1.0, 10.0],
            gamma=["scale", "auto"],
        ),
    ),
    "Decision Tree": dict(
        model=DecisionTreeRegressor(random_state=42),
        params=dict(max_depth=[None, 10, 20], min_samples_split=[2, 5, 10]),
    ),
    "K-Nearest Neighbors (KNN)": dict(
        model=KNeighborsRegressor(),
        params=dict(n_neighbors=[3, 5, 7], weights=["uniform", "distance"]),
    ),
}

# Step 8: Perform Grid Search for all models
results = {}
fitted_models = {}  # model name -> the fitted estimator that was actually scored below
for name, model_and_params in models_and_params.items():
    model = model_and_params["model"]
    params = model_and_params["params"]

    if params:
        # Tune with 3-fold CV; GridSearchCV refits the best configuration on the
        # whole training set, so best_estimator_ is ready for prediction.
        grid_search = GridSearchCV(estimator=model, param_grid=params, cv=3, n_jobs=-1, scoring="neg_mean_squared_error")
        grid_search.fit(X_train_scaled, y_train)
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
    else:
        # Nothing to tune (e.g. LinearRegression): fit the estimator as configured.
        best_model = model.fit(X_train_scaled, y_train)
        best_params = "N/A"

    # Score the fitted estimator on the held-out test rows.
    y_pred = best_model.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    fitted_models[name] = best_model
    results[name] = {
        "Best Params": best_params,
        "MSE": mse,
        "R²": r2
    }

# Step 9: Display results
results_df = pd.DataFrame(results).T
print("Model Comparison with Grid Search:")
print(results_df)

# Step 10: Save the best model (optional)
# The results frame mixes dicts, strings and floats, so its columns are object dtype;
# convert R² to numbers before looking for the maximum.
# NOTE(review): the winner is picked by test-set R², so its reported test score is optimistic.
best_model_name = pd.to_numeric(results_df['R²']).idxmax()
# Reuse the tuned estimator that produced the reported metrics instead of refitting
# the untuned base estimator from models_and_params.
best_model = fitted_models[best_model_name]

# Save the best model and scaler for future use
joblib.dump(best_model, f'{best_model_name}_best_model.pkl')
joblib.dump(scaler, 'scaler.pkl')


Model Comparison with Grid Search:
                                                                       Best Params  \
Random Forest                    {'max_depth': None, 'min_samples_split': 2, 'n...   
Gradient Boosting                {'learning_rate': 0.1, 'max_depth': 5, 'n_esti...   
AdaBoost                                {'learning_rate': 1.0, 'n_estimators': 50}   
Extra Trees                      {'max_depth': None, 'min_samples_split': 2, 'n...   
Linear Regression                                                              N/A   
Ridge Regression                                                    {'alpha': 1.0}   
Lasso Regression                                                    {'alpha': 0.1}   
Support Vector Regression (SVR)   {'C': 1.0, 'gamma': 'scale', 'kernel': 'linear'}   
Decision Tree                          {'max_depth': None, 'min_samples_split': 2}   
K-Nearest Neighbors (KNN)                {'n_neighbors': 5, 'weights': 'distance'}   

                  

['scaler.pkl']

In [None]:

# Step 5: Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 6: Standardise the features. The scaler learns its statistics from the
# training rows only and the same transform is then applied to the test rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 7: Define models and hyperparameters for Grid Search
# Each entry pairs an unfitted estimator ("model") with the grid to search ("params").
# An empty "params" grid marks a model that is fitted without any tuning.
models_and_params = {
    "Random Forest": {
        "model": RandomForestRegressor(random_state=42),
        "params": {
            "n_estimators": [50, 100, 200],
            "max_depth": [None, 10, 20, 30],
            "min_samples_split": [2, 5, 10],
        },
    },
    "Gradient Boosting": {
        "model": GradientBoostingRegressor(random_state=42),
        "params": {
            "n_estimators": [100, 200],
            "learning_rate": [0.05, 0.1, 0.2],
            "max_depth": [3, 5, 7],
        },
    },
    "AdaBoost": {
        "model": AdaBoostRegressor(random_state=42),
        "params": {"n_estimators": [50, 100], "learning_rate": [0.01, 0.1, 1.0]},
    },
    "Extra Trees": {
        "model": ExtraTreesRegressor(random_state=42),
        "params": {
            "n_estimators": [100, 200],
            "max_depth": [None, 10, 20],
            "min_samples_split": [2, 5],
        },
    },
    "Linear Regression": {"model": LinearRegression(), "params": {}},
    "Ridge Regression": {"model": Ridge(), "params": {"alpha": [0.1, 1.0, 10.0]}},
    "Lasso Regression": {"model": Lasso(), "params": {"alpha": [0.1, 1.0, 10.0]}},
    "Support Vector Regression (SVR)": {
        "model": SVR(),
        "params": {
            "kernel": ["linear", "rbf"],
            "C": [0.1, 1.0, 10.0],
            "gamma": ["scale", "auto"],
        },
    },
    "Decision Tree": {
        "model": DecisionTreeRegressor(random_state=42),
        "params": {"max_depth": [None, 10, 20], "min_samples_split": [2, 5, 10]},
    },
    "K-Nearest Neighbors (KNN)": {
        "model": KNeighborsRegressor(),
        "params": {"n_neighbors": [3, 5, 7], "weights": ["uniform", "distance"]},
    },
    "CatBoost": {
        # verbose=0 silences CatBoost's per-iteration "learn: ..." lines, which otherwise
        # flood the cell output during the grid search. It does not affect the fitted model.
        # cat_features is empty because every feature here is numeric.
        "model": CatBoostRegressor(
            learning_rate=0.1,
            iterations=1000,
            depth=6,
            random_state=42,
            cat_features=[],
            verbose=0,
        ),
        "params": {
            "iterations": [500, 1000],
            "depth": [6, 8, 10],
            "learning_rate": [0.05, 0.1, 0.2],
            "l2_leaf_reg": [1, 3, 5],
        },
    },
}

# Step 8: Perform Grid Search for all models
results = {}
fitted_models = {}  # model name -> the fitted estimator that was actually scored below
for name, model_and_params in models_and_params.items():
    model = model_and_params["model"]
    params = model_and_params["params"]

    if params:
        # Tune with 3-fold CV; GridSearchCV refits the best configuration on the
        # whole training set, so best_estimator_ is ready for prediction.
        grid_search = GridSearchCV(estimator=model, param_grid=params, cv=3, n_jobs=-1, scoring="neg_mean_squared_error")
        grid_search.fit(X_train_scaled, y_train)
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
    else:
        # Nothing to tune (e.g. LinearRegression): fit the estimator as configured.
        best_model = model.fit(X_train_scaled, y_train)
        best_params = "N/A"

    # Score the fitted estimator on the held-out test rows.
    y_pred = best_model.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    fitted_models[name] = best_model
    results[name] = {
        "Best Params": best_params,
        "MSE": mse,
        "R²": r2
    }

# Step 9: Display results
results_df = pd.DataFrame(results).T
print("Model Comparison with Grid Search:")
print(results_df)

# Step 10: Save the best model (optional)
# The results frame mixes dicts, strings and floats, so its columns are object dtype;
# convert R² to numbers before looking for the maximum.
# NOTE(review): the winner is picked by test-set R², so its reported test score is optimistic.
best_model_name = pd.to_numeric(results_df['R²']).idxmax()
# Reuse the tuned estimator that produced the reported metrics instead of refitting
# the untuned base estimator from models_and_params.
best_model = fitted_models[best_model_name]

# Save the best model and scaler for future use
joblib.dump(best_model, f'{best_model_name}_best_model.pkl')
joblib.dump(scaler, 'scaler.pkl')


0:	learn: 94.7433501	total: 48.3ms	remaining: 48.3s
1:	learn: 90.8009630	total: 50.8ms	remaining: 25.3s
2:	learn: 86.7607184	total: 52.4ms	remaining: 17.4s
3:	learn: 82.9093860	total: 53.9ms	remaining: 13.4s
4:	learn: 79.1876035	total: 55.4ms	remaining: 11s
5:	learn: 75.6019191	total: 57ms	remaining: 9.45s
6:	learn: 72.2546022	total: 58.4ms	remaining: 8.29s
7:	learn: 69.1013686	total: 59.8ms	remaining: 7.42s
8:	learn: 66.1871956	total: 61.3ms	remaining: 6.75s
9:	learn: 63.2215775	total: 62.8ms	remaining: 6.22s
10:	learn: 60.5411490	total: 64.3ms	remaining: 5.78s
11:	learn: 58.0711137	total: 65.8ms	remaining: 5.41s
12:	learn: 55.5615839	total: 67.2ms	remaining: 5.1s
13:	learn: 53.3915822	total: 68.6ms	remaining: 4.83s
14:	learn: 51.2488610	total: 70ms	remaining: 4.59s
15:	learn: 48.9746581	total: 71.4ms	remaining: 4.39s
16:	learn: 46.8592847	total: 74ms	remaining: 4.28s
17:	learn: 44.8761442	total: 75.5ms	remaining: 4.12s
18:	learn: 43.0055594	total: 77ms	remaining: 3.97s
19:	learn: 41.

['scaler.pkl']

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
import joblib


# Step 5: Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 6: Standardise the features. The scaler learns its statistics from the
# training rows only and the same transform is then applied to the test rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 7: Candidate estimators and the grids GridSearchCV will search for each.
# One (name, estimator, grid) row per model keeps the two lookup tables below in step.
# NOTE(review): GaussianNB is a classifier, not a regressor; it is kept only as a
# point of comparison and should not be read as a regression baseline.
_candidates = [
    ("Random Forest", RandomForestRegressor(random_state=42),
     {'n_estimators': [50, 100], 'max_depth': [5, 10, None], 'min_samples_split': [2, 5]}),
    ("KNN", KNeighborsRegressor(),
     {'n_neighbors': [3, 5, 7, 9], 'weights': ['uniform', 'distance']}),
    ("Linear Regression", LinearRegression(), {}),  # empty grid: nothing is tuned
    ("SVM", SVR(), {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}),
    ("Naive Bayes", GaussianNB(), {}),  # empty grid: no hyperparameters are searched here
    ("XGBoost", xgb.XGBRegressor(random_state=42),
     {'n_estimators': [50, 100], 'learning_rate': [0.01, 0.1], 'max_depth': [3, 6, 10]}),
]

# Name -> estimator, and name -> hyperparameter grid, in the same order as above.
models = {label: estimator for label, estimator, _grid in _candidates}
param_grids = {label: grid for label, _estimator, grid in _candidates}

# Step 8: Evaluate the models using GridSearchCV with cross-validation
def evaluate_model(model, param_grid, X_train, y_train, X_eval=None, y_eval=None):
    """Tune ``model`` with a 5-fold grid search and score the winner on held-out data.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible estimator.
    param_grid : dict
        Grid handed to ``GridSearchCV``. An empty dict evaluates the estimator as configured.
    X_train, y_train
        Training features and target, used for the grid search and for the
        cross-validated R² score.
    X_eval, y_eval : optional
        Held-out features and target used for the R², MSE and MAE metrics.
        When omitted, the notebook-level ``X_test_scaled`` and ``y_test`` are used,
        so existing four-argument calls behave as before.

    Returns
    -------
    tuple
        ``(best_model, r2, mse, mae, cross_val)``: the refit best estimator, its R²,
        mean squared error and mean absolute error on the held-out data, and the
        mean 5-fold cross-validated R² on the training data.
    """
    # Make the dependency on the hold-out set explicit, with a backward-compatible fallback.
    if X_eval is None:
        X_eval = X_test_scaled
    if y_eval is None:
        y_eval = y_test

    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)

    # GridSearchCV refits the best configuration on all of X_train by default.
    best_model = grid_search.best_estimator_

    # Held-out metrics
    y_pred = best_model.predict(X_eval)
    r2 = r2_score(y_eval, y_pred)
    mse = mean_squared_error(y_eval, y_pred)
    mae = mean_absolute_error(y_eval, y_pred)

    # Mean R² across 5 folds of the training data for the chosen configuration
    cross_val = cross_val_score(best_model, X_train, y_train, cv=5, scoring='r2').mean()

    return best_model, r2, mse, mae, cross_val

# Step 9: Loop through all models and evaluate
results = {}
for model_name, model in models.items():
    # Models without an entry in param_grids are evaluated with an empty grid.
    param_grid = param_grids.get(model_name, {})
    best_model, r2, mse, mae, cross_val = evaluate_model(model, param_grid, X_train_scaled, y_train)

    results[model_name] = {
        "Best Model": best_model,
        "R-squared": r2,
        "MSE": mse,
        "MAE": mae,
        "Cross-Validation R²": cross_val
    }

# Step 10: Display results
results_df = pd.DataFrame(results).T
print("Model Evaluation Results:")
print(results_df)

# Step 11: Save the best model (optional)
# The results frame also holds estimator objects, so its columns are object dtype;
# convert R-squared to numbers before looking for the maximum.
# NOTE(review): the winner is picked by test-set R², so its reported test score is optimistic.
best_model_name = pd.to_numeric(results_df['R-squared']).idxmax()
# The stored estimator is GridSearchCV's best_estimator_, already refit on the full
# training set, so it is saved as-is without another fit.
best_model = results[best_model_name]["Best Model"]

# Save the best model and scaler for future use
joblib.dump(best_model, f'{best_model_name}_best_model.pkl')
joblib.dump(scaler, 'scaler.pkl')




Model Evaluation Results:
                                                          Best Model  \
Random Forest      (DecisionTreeRegressor(max_depth=10, max_featu...   
KNN                KNeighborsRegressor(n_neighbors=7, weights='di...   
Linear Regression                                 LinearRegression()   
SVM                                        SVR(C=1, kernel='linear')   
Naive Bayes                                             GaussianNB()   
XGBoost            XGBRegressor(base_score=None, booster=None, ca...   

                  R-squared          MSE        MAE Cross-Validation R²  
Random Forest      0.998523     13.92485   1.742275            0.996163  
KNN                0.975153   234.273269  11.025096             0.96121  
Linear Regression  0.980912   179.975729    9.32936            0.975026  
SVM                0.977803   209.287062   6.584546            0.968798  
Naive Bayes        0.877622  1153.846645  15.022364            0.862652  
XGBoost            0.9977

['scaler.pkl']

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from catboost import CatBoostRegressor
import joblib

# Step 5: Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 6: Standardise the features. The scaler learns its statistics from the
# training rows only and the same transform is then applied to the test rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 7: Define models and parameter grids for GridSearchCV
# Each entry pairs an unfitted estimator ("model") with the grid to search ("params").
# An empty "params" grid means the estimator is evaluated as configured.
models = {
    "Random Forest": {
        "model": RandomForestRegressor(random_state=42),
        "params": {
            "n_estimators": [50, 100, 200],
            "max_depth": [None, 10, 20, 30],
            "min_samples_split": [2, 5, 10],
        },
    },
    "Gradient Boosting": {
        "model": GradientBoostingRegressor(random_state=42),
        "params": {
            "n_estimators": [100, 200],
            "learning_rate": [0.05, 0.1, 0.2],
            "max_depth": [3, 5, 7],
        },
    },
    "AdaBoost": {
        "model": AdaBoostRegressor(random_state=42),
        "params": {"n_estimators": [50, 100], "learning_rate": [0.01, 0.1, 1.0]},
    },
    "Extra Trees": {
        "model": ExtraTreesRegressor(random_state=42),
        "params": {
            "n_estimators": [100, 200],
            "max_depth": [None, 10, 20],
            "min_samples_split": [2, 5],
        },
    },
    "Linear Regression": {"model": LinearRegression(), "params": {}},
    "Ridge Regression": {"model": Ridge(), "params": {"alpha": [0.1, 1.0, 10.0]}},
    "Lasso Regression": {"model": Lasso(), "params": {"alpha": [0.1, 1.0, 10.0]}},
    "Support Vector Regression (SVR)": {
        "model": SVR(),
        "params": {
            "kernel": ["linear", "rbf"],
            "C": [0.1, 1.0, 10.0],
            "gamma": ["scale", "auto"],
        },
    },
    "Decision Tree": {
        "model": DecisionTreeRegressor(random_state=42),
        "params": {"max_depth": [None, 10, 20], "min_samples_split": [2, 5, 10]},
    },
    "K-Nearest Neighbors (KNN)": {
        "model": KNeighborsRegressor(),
        "params": {"n_neighbors": [3, 5, 7], "weights": ["uniform", "distance"]},
    },
    "CatBoost": {
        # verbose=0 silences CatBoost's per-iteration "learn: ..." lines, which otherwise
        # flood the cell output during the grid search. It does not affect the fitted model.
        # cat_features is empty because every feature here is numeric.
        "model": CatBoostRegressor(
            learning_rate=0.1,
            iterations=1000,
            depth=6,
            random_state=42,
            cat_features=[],
            verbose=0,
        ),
        "params": {
            "iterations": [500, 1000],
            "depth": [6, 8, 10],
            "learning_rate": [0.05, 0.1, 0.2],
            "l2_leaf_reg": [1, 3, 5],
        },
    },
}

# Step 8: Evaluate the models using GridSearchCV with cross-validation
def evaluate_model(model, param_grid, X_train, y_train, X_eval=None, y_eval=None):
    """Tune ``model`` with a 5-fold grid search and score the winner on held-out data.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible estimator.
    param_grid : dict
        Grid handed to ``GridSearchCV``. An empty dict evaluates the estimator as configured.
    X_train, y_train
        Training features and target, used for the grid search and for the
        cross-validated R² score.
    X_eval, y_eval : optional
        Held-out features and target used for the R², MSE and MAE metrics.
        When omitted, the notebook-level ``X_test_scaled`` and ``y_test`` are used,
        so existing four-argument calls behave as before.

    Returns
    -------
    tuple
        ``(best_model, r2, mse, mae, cross_val)``: the refit best estimator, its R²,
        mean squared error and mean absolute error on the held-out data, and the
        mean 5-fold cross-validated R² on the training data.
    """
    # Make the dependency on the hold-out set explicit, with a backward-compatible fallback.
    if X_eval is None:
        X_eval = X_test_scaled
    if y_eval is None:
        y_eval = y_test

    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)

    # GridSearchCV refits the best configuration on all of X_train by default.
    best_model = grid_search.best_estimator_

    # Held-out metrics
    y_pred = best_model.predict(X_eval)
    r2 = r2_score(y_eval, y_pred)
    mse = mean_squared_error(y_eval, y_pred)
    mae = mean_absolute_error(y_eval, y_pred)

    # Mean R² across 5 folds of the training data for the chosen configuration
    cross_val = cross_val_score(best_model, X_train, y_train, cv=5, scoring='r2').mean()

    return best_model, r2, mse, mae, cross_val

# Step 9: Loop through all models and evaluate
results = {}
for model_name, model_data in models.items():
    model = model_data['model']
    param_grid = model_data['params']
    best_model, r2, mse, mae, cross_val = evaluate_model(model, param_grid, X_train_scaled, y_train)

    results[model_name] = {
        "Best Model": best_model,
        "R-squared": r2,
        "MSE": mse,
        "MAE": mae,
        "Cross-Validation R²": cross_val
    }

# Step 10: Display results
results_df = pd.DataFrame(results).T
print("Model Evaluation Results:")
print(results_df)

# Step 11: Save the best model (optional)
# The results frame also holds estimator objects, so its columns are object dtype;
# convert R-squared to numbers before looking for the maximum.
# NOTE(review): the winner is picked by test-set R², so its reported test score is optimistic.
best_model_name = pd.to_numeric(results_df['R-squared']).idxmax()
# The stored estimator is GridSearchCV's best_estimator_, already refit on the full
# training set, so it is saved as-is without another fit.
best_model = results[best_model_name]["Best Model"]

# Save the best model and scaler for future use
joblib.dump(best_model, f'{best_model_name}_best_model.pkl')
joblib.dump(scaler, 'scaler.pkl')


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
39:	learn: 19.1562611	total: 76.2ms	remaining: 1.83s
40:	learn: 18.5712524	total: 77.6ms	remaining: 1.82s
41:	learn: 17.9678915	total: 79ms	remaining: 1.8s
42:	learn: 17.4101891	total: 80.7ms	remaining: 1.79s
43:	learn: 16.8399348	total: 81.7ms	remaining: 1.77s
44:	learn: 16.2760016	total: 83.1ms	remaining: 1.76s
45:	learn: 15.7971974	total: 84.5ms	remaining: 1.75s
46:	learn: 15.3411179	total: 85.9ms	remaining: 1.74s
47:	learn: 14.9467986	total: 87.4ms	remaining: 1.73s
48:	learn: 14.5069819	total: 88.9ms	remaining: 1.73s
49:	learn: 14.1622762	total: 90.3ms	remaining: 1.72s
50:	learn: 13.7903366	total: 91.7ms	remaining: 1.71s
51:	learn: 13.4715978	total: 93.3ms	remaining: 1.7s
52:	learn: 13.1245564	total: 94.8ms	remaining: 1.69s
53:	learn: 12.8555376	total: 96.2ms	remaining: 1.69s
54:	learn: 12.5542094	total: 97.6ms	remaining: 1.68s
55:	learn: 12.2736810	total: 99ms	remaining: 1.67s
56:	learn: 12.0506710	total: 100ms	remai

['scaler.pkl']