# Hyperparameter tuning and selection of best model

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder

In [2]:
# load and encode the dataset and split it into train and test sets
df = sns.load_dataset('tips')
le = LabelEncoder()
for i in df.columns:
    if df[i].dtype=='category':
        df[i]=le.fit_transform(df[i])
X = df.drop('tip', axis=1)
y = df['tip']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# define our models to be evaluated

models = {
            'LinearRegression': LinearRegression(),
            'Ridge Regression': Ridge(),
            'Lasso Regression': Lasso(),
            'SVR': SVR(),
            'RandomForestRegressor': RandomForestRegressor(),
            'GradientBoostingRegressor': GradientBoostingRegressor(),
            'KNeighborsRegressor': KNeighborsRegressor()
            }

params = {
    'LinearRegression': {'fit_intercept': [True, False]},
    'Ridge Regression': {'alpha': [0.1, 1.0, 10.0]},
    'Lasso Regression': {'alpha': [0.1, 1.0, 10.0]},
    'SVR': {'kernel': ['linear', 'rbf', 'poly'], 'gamma': ['scale', 'auto']},
    'RandomForestRegressor': {'n_estimators': [5, 10, 50], 'max_features': ['sqrt', 'log2']},
    'GradientBoostingRegressor': {'n_estimators': [5, 10, 50], 'max_features': ['sqrt', 'log2']},
    'KNeighborsRegressor': {'n_neighbors': [5, 10, 15], 'weights': ['uniform', 'distance'], 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}}

In [8]:
tuned_models = {}

for name, model in models.items():
    grid = GridSearchCV(model, params[name], cv=5, scoring='neg_mean_squared_error')
    grid.fit(X_train, y_train)
    
    print(f'Best Parameters for {name}: {grid.best_params_}')
    tuned_models[name] = grid.best_estimator_

best_score = float('inf')
best_model_name = None

for name, model in tuned_models.items():
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    if mse < best_score:
        best_score = mse
        best_model_name = name
print(f'Best Model: {best_model_name}')

Best Parameters for LinearRegression: {'fit_intercept': True}
Best Parameters for Ridge Regression: {'alpha': 10.0}
Best Parameters for Lasso Regression: {'alpha': 0.1}
Best Parameters for SVR: {'gamma': 'scale', 'kernel': 'linear'}
Best Parameters for RandomForestRegressor: {'max_features': 'sqrt', 'n_estimators': 10}
Best Parameters for GradientBoostingRegressor: {'max_features': 'sqrt', 'n_estimators': 50}
Best Parameters for KNeighborsRegressor: {'algorithm': 'brute', 'n_neighbors': 15, 'weights': 'distance'}
Best Model: Lasso Regression
