# training the ML model to be able to predict the tips that will be given each time someone eat and pay

In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import sklearn

# for preprocessing the data
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, OneHotEncoder

# for splitting the dataset
from sklearn.model_selection import train_test_split, GridSearchCV 

# pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# models
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

# evaluating regression model accuracy
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [2]:
Data = pd.read_csv("C:\\Datasets\Datasets for FREE AI Classes in Every City\\tips.csv")
Data.head()

Unnamed: 0,total_bill,tip,gender,smoker,day,time,size
0,2125.5,360.79,Male,No,Thur,Lunch,1
1,2727.18,259.42,Female,No,Sun,Dinner,5
2,1066.02,274.68,Female,Yes,Thur,Dinner,4
3,3493.45,337.9,Female,No,Sun,Dinner,1
4,3470.56,567.89,Male,Yes,Sun,Lunch,6


In [3]:
print(Data.shape, '\n')
print(Data.info())

(744, 7) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 744 entries, 0 to 743
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  744 non-null    float64
 1   tip         744 non-null    float64
 2   gender      744 non-null    object 
 3   smoker      744 non-null    object 
 4   day         744 non-null    object 
 5   time        744 non-null    object 
 6   size        744 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 40.8+ KB
None


In [9]:
# getting the feature matrix and target vector
X = Data.drop(columns=['tip'])
y = Data['tip']

# to avoid ValueError: Expected 2D array, got 1D array instead, i reshaped y
y_reshaped = np.array(y).reshape(-1, 1)
print(y_reshaped.shape)


(744, 1)


In [21]:
# splitting into train and test set
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size = 0.8, random_state=42)
# Xtrain.head()

In [22]:
# preprocessing the target vector by scaling it

# ytrain_reshaped = ytrain.values.reshape(-1,1)  [it's the same thing as doing it like the one I did below]
# print(ytrain_reshaped.shape)

# changing it into a 2D array
ytrain_reshaped = np.array(ytrain).reshape(-1, 1)
# print(ytrain_reshaped.shape)
ytest_reshaped = np.array(ytest).reshape(-1, 1)
# print(ytest_reshaped.shape)

scaler = MinMaxScaler()
ytrain_scaled = scaler.fit_transform(ytrain_reshaped)
ytest_scaled = scaler.transform(ytest_reshaped)


In [23]:
# preprocessing the feature matrics and building pipeline
num_features  = X.select_dtypes(include=['float64','int64']).columns
cat_features  = X.select_dtypes(include=['object']).columns

num_transformer = MinMaxScaler()
cat_transformer = OneHotEncoder(sparse_output=False)

preprocessor = ColumnTransformer(transformers=[('num',num_transformer,num_features), ('cat',cat_transformer,cat_features)])
pipeline = Pipeline(steps=[('preprocessor',preprocessor), ('model', LinearRegression())])

models = {'linear':LinearRegression(), 'ridge':Ridge(), 
          'lasso':Lasso(), 'forest':RandomForestRegressor(random_state=42), 'svr':SVR()
         }

param_grid  = {'model':list(models.values())}

grid_search  = GridSearchCV(pipeline, param_grid, cv=5)
grid_search.fit(Xtrain, ytrain)

In [24]:
# checking for the best model
best_model = grid_search.best_params_
best_model

{'model': Lasso()}

In [25]:
# # using the best model for the fitting
best_model = grid_search.best_estimator_
best_model

In [26]:
# predicting
y_pred = best_model.predict(Xtest)
print(f'Prediction: {y_pred}')
print(f'Actual: {ytest}')

Prediction: [298.42617334 353.3491254  310.3759255  352.6303127  280.42709834
 331.76166225 297.60286115 350.47957209 381.27567585 295.307245
 328.99165807 283.26441379 295.85715767 270.54986487 297.59797858
 351.22156133 251.33590924 293.44957712 340.73053825 321.57052991
 327.34364853 327.32068914 335.44416284 353.71446866 296.709534
 317.56466134 337.18786212 310.95866741 278.35338399 295.80023096
 368.29681958 341.55995519 334.8115926  312.1327198  289.59504647
 313.83317107 332.73719014 282.32476162 266.36160599 386.74046822
 286.01212265 308.25833819 329.21517871 364.43881917 335.77042886
 344.81639855 327.26362667 319.96289592 310.28717456 324.2461149
 319.37905563 270.00764196 335.15464055 267.89242538 322.93411346
 327.82391276 358.02522338 352.2325911  287.55746841 360.79583255
 348.04163993 301.49300736 303.26740624 364.10979651 337.35562065
 307.42909223 278.13699307 345.33736202 258.23377581 343.7481433
 321.81517422 321.61895833 309.42733332 298.66216997 393.71681755
 353

In [27]:
# Calculate performance metrics
rmse = np.sqrt(mean_squared_error(ytest, y_pred))
r2 = r2_score(ytest, y_pred)
mae = mean_absolute_error(ytest, y_pred)

# print(f"Best model: {grid_search.best_estimator_}")
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")

print(f"R^2 Score: {r2}")

RMSE: 163.2845088246304
MAE: 128.7880256075297
R^2 Score: 0.021074596786469835
