In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [2]:

# Read the tips dataset into a DataFrame. The location is an absolute path,
# so it must be adjusted when the notebook runs in a different environment.
tips_csv_path = '/content/tips.csv'
data = pd.read_csv(tips_csv_path)

# Preview the first rows to confirm the columns were parsed as expected.
print(data.head())


   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4


In [3]:
# Predictors (independent variables): the total_bill and size columns.
feature_columns = ['total_bill', 'size']
X = data[feature_columns]

# Response (dependent variable): the tip column the models will predict.
y = data['tip']


In [4]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# The fixed random_state keeps the partition identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report how many rows landed in each partition.
n_train, n_test = len(X_train), len(X_test)
print(f"Training samples: {n_train}, Testing samples: {n_test}")

Training samples: 195, Testing samples: 49


In [5]:

# Initialize Ridge Regression model using the estimator's default settings.
# NOTE(review): `ridge` is not fitted or referenced by any other cell visible
# here — confirm it is still needed before keeping it.
ridge = Ridge()

In [6]:


# Initialize the Decision Tree regressor that serves as the base estimator
# for both the grid search and the randomized search below.
# random_state is fixed so tree construction is repeatable between runs.
dtree = DecisionTreeRegressor(random_state=42)


In [7]:
# Candidate values for each tree hyperparameter; every combination is tried
# (5 depths x 3 split thresholds x 3 leaf sizes).
param_grid = dict(
    max_depth=[2, 4, 6, 8, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive search scored by negated MSE under 5-fold cross-validation,
# fitted on the training partition only.
grid_search = GridSearchCV(
    estimator=dtree,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)
grid_search.fit(X_train, y_train)

# Report the combination with the best cross-validated score.
best_params_grid = grid_search.best_params_
print("Best parameters from Grid Search:", best_params_grid)

Best parameters from Grid Search: {'max_depth': 2, 'min_samples_leaf': 2, 'min_samples_split': 2}


In [8]:
# Wider search space than the grid: each hyperparameter is drawn from a
# contiguous integer range (upper bounds are exclusive).
param_dist = dict(
    max_depth=np.arange(2, 20),
    min_samples_split=np.arange(2, 20),
    min_samples_leaf=np.arange(1, 10),
)

# Sample 10 candidate settings and score each by negated MSE under 5-fold
# cross-validation; the fixed random_state makes the sampling repeatable.
random_search = RandomizedSearchCV(
    estimator=dtree,
    param_distributions=param_dist,
    n_iter=10,
    cv=5,
    scoring='neg_mean_squared_error',
    random_state=42,
)
random_search.fit(X_train, y_train)

# Report the sampled combination with the best cross-validated score.
best_params_random = random_search.best_params_
print("Best parameters from Randomized Search:", best_params_random)


Best parameters from Randomized Search: {'min_samples_split': 8, 'min_samples_leaf': 1, 'max_depth': 4}


In [11]:
# Take the best estimator found by each search.
best_dtree_grid = grid_search.best_estimator_
best_dtree_random = random_search.best_estimator_

# Predict tips for the held-out test rows with each tuned tree.
y_pred_grid = best_dtree_grid.predict(X_test)
y_pred_random = best_dtree_random.predict(X_test)

# Score both sets of predictions against the true test targets.
mse_grid = mean_squared_error(y_test, y_pred_grid)
mse_random = mean_squared_error(y_test, y_pred_random)

# Show the two test-set errors side by side (lower is better).
print(f"Test MSE (Grid Search): {mse_grid:.4f}")
print(f"Test MSE (Randomized Search): {mse_random:.4f}")

Test MSE (Grid Search): 1.0466
Test MSE (Randomized Search): 1.0333
