In [52]:
# --- Imports -----------------------------------------------------------------
# Standard library
import warnings

# Numerics, data handling and plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import randint

# scikit-learn
# HalvingGridSearchCV / HalvingRandomSearchCV are experimental: the enabling
# import has to run BEFORE they are imported from sklearn.model_selection,
# otherwise that import raises ImportError on a fresh kernel
# (Restart Kernel -> Run All). Keep this line above the model_selection import.
from sklearn.experimental import enable_halving_search_cv  # noqa: F401
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV, KFold, StratifiedKFold, HalvingGridSearchCV, HalvingRandomSearchCV
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_squared_error, r2_score, precision_score, recall_score, f1_score, mean_absolute_error, mean_absolute_percentage_error

# Silence every warning for the rest of the session. This keeps cell output
# short, but it also hides deprecation and convergence warnings.
warnings.filterwarnings("ignore")

# Load seaborn's example "tips" dataset; the bare name on the last line makes
# the notebook render the frame.
df = sns.load_dataset('tips')
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [53]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each column, so a column with k categories yields k-1 indicators.
categoric_Cols = ['sex', 'smoker', 'day', 'time']
df = pd.get_dummies(
    data=df,
    columns=categoric_Cols,
    drop_first=True,
)
df

Unnamed: 0,total_bill,tip,size,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
0,16.99,1.01,2,True,True,False,False,True,True
1,10.34,1.66,3,False,True,False,False,True,True
2,21.01,3.50,3,False,True,False,False,True,True
3,23.68,3.31,2,False,True,False,False,True,True
4,24.59,3.61,4,True,True,False,False,True,True
...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,3,False,True,False,True,False,True
240,27.18,2.00,2,True,False,False,True,False,True
241,22.67,2.00,2,False,False,False,True,False,True
242,17.82,1.75,2,False,True,False,True,False,True


# **GridSearchCV**

In [54]:
# Exhaustive grid search over random-forest hyperparameters for predicting
# `tip`, followed by an evaluation of the best model on a held-out test split.

# Features are every column except the target.
X = df.drop('tip', axis=1)
y = df['tip']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the forest itself. Without a random_state every fit draws different
# bootstrap samples, so the CV scores, the selected hyperparameters and the
# test metrics would change from run to run.
model = RandomForestRegressor(random_state=42)

# 3 * 3 * 3 * 3 = 81 combinations, each evaluated with 5-fold CV.
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(model,
                           param_grid,
                           cv=5,
                           # scikit-learn maximises scores, hence the negated MSE
                           scoring='neg_mean_squared_error',
                           # all CPU cores but one, as in the other searches
                           n_jobs=-2)

grid_search.fit(X_train, y_train)

# best_estimator_ is the winning configuration refit on the full training split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Compute the MSE once and reuse it for the RMSE.
mse = mean_squared_error(y_test, y_pred)

print("Best Hyperparameters:", grid_search.best_params_)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", np.sqrt(mse))
print('MAE',mean_absolute_error(y_test, y_pred))
print('MAPE',mean_absolute_percentage_error(y_test, y_pred))
print("R-squared:", r2_score(y_test, y_pred))

Best Hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 50}
Mean Squared Error: 0.9347359364755895
Root Mean Squared Error: 0.9668174266507558
MAE 0.75504018728195
MAPE 0.32514670722249844
R-squared: 0.2521941082804696


In [55]:
# Reload the raw tips data and one-hot encode its categorical columns
# (drop_first=True leaves out the first level of each column).
categoric_Cols = ['sex', 'smoker', 'day', 'time']
df = pd.get_dummies(
    sns.load_dataset('tips'),
    columns=categoric_Cols,
    drop_first=True,
)
df

Unnamed: 0,total_bill,tip,size,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
0,16.99,1.01,2,True,True,False,False,True,True
1,10.34,1.66,3,False,True,False,False,True,True
2,21.01,3.50,3,False,True,False,False,True,True
3,23.68,3.31,2,False,True,False,False,True,True
4,24.59,3.61,4,True,True,False,False,True,True
...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,3,False,True,False,True,False,True
240,27.18,2.00,2,True,False,False,True,False,True
241,22.67,2.00,2,False,False,False,True,False,True
242,17.82,1.75,2,False,True,False,True,False,True


# **RandomizedSearchCV**

In [56]:
# Randomized search over random-forest hyperparameters for predicting `tip`,
# followed by an evaluation of the best model on a held-out test split.

# Features are every column except the target.
X = df.drop('tip', axis=1)
y = df['tip']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the forest as well as the search. The search's random_state only fixes
# which candidates are sampled; without a seed on the estimator each fit still
# draws different bootstrap samples and the reported numbers are not repeatable.
model = RandomForestRegressor(random_state=42)

# scipy's randint(low, high) samples integers from low up to high - 1.
param_dist = {
    'n_estimators': randint(50, 150),
    'max_depth': [None, 10, 20],
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 5)
}

random_search = RandomizedSearchCV(estimator=model,
                                   param_distributions=param_dist,
                                   n_iter=20,  # Try 20 combinations
                                   cv=5,
                                   # scikit-learn maximises scores, hence the negated MSE
                                   scoring='neg_mean_squared_error',
                                   n_jobs=-2,  # all CPU cores but one
                                   random_state=42)

random_search.fit(X_train, y_train)

# best_estimator_ is the winning configuration refit on the full training split.
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

# Compute the MSE once and reuse it for the RMSE.
mse = mean_squared_error(y_test, y_pred)

print("Best Hyperparameters:", random_search.best_params_)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", np.sqrt(mse))
print('MAE',mean_absolute_error(y_test, y_pred))
print('MAPE',mean_absolute_percentage_error(y_test, y_pred))
print("R-squared:", r2_score(y_test, y_pred))

Best Hyperparameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 88}
Mean Squared Error: 0.9537099022456146
Root Mean Squared Error: 0.976580719779791
MAE 0.7489122630687784
MAPE 0.31021433495341605
R-squared: 0.23701458769243244


# **HalvingRandomSearchCV**

In [57]:
# Reload the raw tips data, then one-hot encode the categorical columns in a
# single chained step (drop_first=True leaves out the first level of each).
categoric_Cols = ['sex', 'smoker', 'day', 'time']
df = sns.load_dataset('tips').pipe(
    pd.get_dummies,
    columns=categoric_Cols,
    drop_first=True,
)
df

Unnamed: 0,total_bill,tip,size,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
0,16.99,1.01,2,True,True,False,False,True,True
1,10.34,1.66,3,False,True,False,False,True,True
2,21.01,3.50,3,False,True,False,False,True,True
3,23.68,3.31,2,False,True,False,False,True,True
4,24.59,3.61,4,True,True,False,False,True,True
...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,3,False,True,False,True,False,True
240,27.18,2.00,2,True,False,False,True,False,True
241,22.67,2.00,2,False,False,False,True,False,True
242,17.82,1.75,2,False,True,False,True,False,True


In [58]:
# Successive-halving randomized search over random-forest hyperparameters for
# predicting `tip`, followed by an evaluation on a held-out test split.

# HalvingRandomSearchCV is experimental and only importable once this enabling
# import has run; it is harmless if the feature was already enabled earlier.
from sklearn.experimental import enable_halving_search_cv  # noqa: F401

# Features are every column except the target.
X = df.drop('tip', axis=1)
y = df['tip']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the forest as well as the search. The search's random_state only fixes
# which candidates are sampled; without a seed on the estimator each fit still
# draws different bootstrap samples and the reported numbers are not repeatable.
model = RandomForestRegressor(random_state=42)

# n_estimators is deliberately absent: it is the resource the search budgets
# (see `resource=` below), so the search sets it on each candidate itself.
# scipy's randint(low, high) samples integers from low up to high - 1.
param_dist = {
    'max_depth': [None, 10, 20],
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 5)
}

random_search = HalvingRandomSearchCV(estimator=model,
                                      param_distributions=param_dist,
                                      factor=3,  # keep roughly the best third each round, with 3x the trees
                                      resource='n_estimators',  # must be an integer hyperparameter
                                      max_resources=150,  # upper bound on trees per candidate
                                      cv=5,
                                      # scikit-learn maximises scores, hence the negated MSE
                                      scoring='neg_mean_squared_error',
                                      random_state=42,
                                      n_jobs=-2)  # all CPU cores but one

random_search.fit(X_train, y_train)

# best_estimator_ is the winning configuration refit on the full training split.
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

# Compute the MSE once and reuse it for the RMSE.
mse = mean_squared_error(y_test, y_pred)

print("Best Hyperparameters:", random_search.best_params_)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", np.sqrt(mse))
print('MAE',mean_absolute_error(y_test, y_pred))
print('MAPE',mean_absolute_percentage_error(y_test, y_pred))
print("R-squared:", r2_score(y_test, y_pred))

Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 81}
Mean Squared Error: 0.9393850642538313
Root Mean Squared Error: 0.9692187907040553
MAE 0.750773054668699
MAPE 0.3145285130825495
R-squared: 0.2484747207954492
