In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, mean_squared_log_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import StackingRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from catboost import CatBoostRegressor
import seaborn as sns
from scipy.stats import uniform, randint


In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
train.drop(['id'], axis=1, inplace=True)

In [5]:
X_train = train.drop(['cost', 'prepared_food'], axis=1)
y_train = train[['cost']]
X_test = test.drop(['id', 'prepared_food'], axis=1)

In [6]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

In [7]:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

In [9]:
import xgboost as xgb

xgb_regressor = xgb.XGBRegressor()

param_grid = {
    'n_estimators': range(100, 1001, 100),  # Number of boosting rounds
    'learning_rate': [0.01, 0.1, 0.2, 0.3, 0.5],  # Step size shrinkage
    'max_depth': range(3, 11),  # Maximum depth of a tree
    'min_child_weight': range(1, 6),  # Minimum sum of instance weight (hessian) needed in a child
    'gamma': [0, 0.1, 0.2, 0.3, 0.4],  # Minimum loss reduction required to make a further partition on a leaf node
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],  # Fraction of samples used for fitting the trees
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],  # Fraction of features used for fitting the trees
}

# Create the Randomized Search object
random_search = RandomizedSearchCV(
    xgb_regressor,
    param_distributions=param_grid,
    n_iter=100,  # Number of random combinations to try
    scoring='neg_mean_squared_error',  # Use mean squared error as the evaluation metric
    cv=5,  # Cross-validation folds
    verbose=2,  # Increase for more output
    n_jobs=-1  # Use all available CPU cores
)

# Fit the Randomized Search to your data
random_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters and corresponding score
print("Best hyperparameters found:")
print(random_search.best_params_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.5, max_depth=6, min_child_weight=5, n_estimators=200, subsample=0.7; total time=  10.2s
[CV] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.5, max_depth=6, min_child_weight=5, n_estimators=200, subsample=0.7; total time=  10.6s
[CV] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.5, max_depth=6, min_child_weight=5, n_estimators=200, subsample=0.7; total time=  11.3s
[CV] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.5, max_depth=6, min_child_weight=5, n_estimators=200, subsample=0.7; total time=  11.8s
[CV] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.5, max_depth=6, min_child_weight=5, n_estimators=200, subsample=0.7; total time=  11.8s
[CV] END colsample_bytree=0.6, gamma=0.2, learning_rate=0.01, max_depth=3, min_child_weight=3, n_estimators=400, subsample=0.9; total time=  14.1s
[CV] END colsample_bytree=0.6, gamma=0.2, learning_rate=0.01