In [None]:
import pandas as pd
import numpy as np
import seaborn as sns


In [None]:
sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'dowjones',
 'exercise',
 'flights',
 'fmri',
 'geyser',
 'glue',
 'healthexp',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'seaice',
 'taxis',
 'tips',
 'titanic']

In [None]:
tips_df=sns.load_dataset("tips")
tips_df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [8]:
tips_df.isna()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...
239,False,False,False,False,False,False,False
240,False,False,False,False,False,False,False
241,False,False,False,False,False,False,False
242,False,False,False,False,False,False,False


# Dividing Features and Labels
Tip is the target attribute , so it is a Label and all other values can be used as features .



In [10]:
x=tips_df.drop(['tip'],axis=1)
y=tips_df['tip']
x.head()


Unnamed: 0,total_bill,sex,smoker,day,time,size
0,16.99,Female,No,Sun,Dinner,2
1,10.34,Male,No,Sun,Dinner,3
2,21.01,Male,No,Sun,Dinner,3
3,23.68,Male,No,Sun,Dinner,2
4,24.59,Female,No,Sun,Dinner,4


In [11]:
y.head()

0    1.01
1    1.66
2    3.50
3    3.31
4    3.61
Name: tip, dtype: float64

# Converting Categorical Data into Numerical Data

In [12]:
numerical=x.drop(['sex','smoker','day','time'],axis=1)
numerical.head()

Unnamed: 0,total_bill,size
0,16.99,2
1,10.34,3
2,21.01,3
3,23.68,2
4,24.59,4


#Separating the Categorical Data

In [13]:
categorical=x.filter(['sex','smoke','day','time'])
categorical.head()

Unnamed: 0,sex,day,time
0,Female,Sun,Dinner
1,Male,Sun,Dinner
2,Male,Sun,Dinner
3,Male,Sun,Dinner
4,Female,Sun,Dinner


In [14]:
categorical['day'].value_counts()

Sat     87
Sun     76
Thur    62
Fri     19
Name: day, dtype: int64

#Converting the Categorical Data into numerical Data
#Method used  : Hot Encoding

In [16]:
cat_numerical=pd.get_dummies(categorical)
cat_numerical.head()

Unnamed: 0,sex_Male,sex_Female,day_Thur,day_Fri,day_Sat,day_Sun,time_Lunch,time_Dinner
0,0,1,0,0,0,1,0,1
1,1,0,0,0,0,1,0,1
2,1,0,0,0,0,1,0,1
3,1,0,0,0,0,1,0,1
4,0,1,0,0,0,1,0,1


# Joining the numerical columns with One-Hot encoded cols

In [26]:
X= pd.concat([numerical, cat_numerical],axis=1)

X.head()

Unnamed: 0,total_bill,size,sex_Male,sex_Female,day_Thur,day_Fri,day_Sat,day_Sun,time_Lunch,time_Dinner
0,16.99,2,0,1,0,0,0,1,0,1
1,10.34,3,1,0,0,0,0,1,0,1
2,21.01,3,1,0,0,0,0,1,0,1
3,23.68,2,1,0,0,0,0,1,0,1
4,24.59,4,0,1,0,0,0,1,0,1


# Dividing the Data set into Traning and Testing Set

In [27]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test= train_test_split(X,y,test_size=0.20, random_state=0)

#Normalizing the Data




In [28]:
from sklearn.preprocessing import StandardScaler

sc=StandardScaler()
X_train= sc.fit_transform(X_train)
X_test= sc.transform(X_test)

# Applying Linear Regression and Traning the Model using the Data Set

In [29]:
from sklearn.linear_model import LinearRegression
lin_reg= LinearRegression()

regressor= lin_reg.fit(X_train, y_train)

y_pred=regressor.predict(X_test)

# Checking the Accuraccy of the Model Via  Accuracy Mertics :
Mean Absolute Error(MAE) :-is calculated by taking the average of absolute error obtained by subtracting real values from predicted values.

Mean Squared Error(MSE) :- is similar to MAE. However, the error for each record is squared in case of MSE

Root Mean Squared Error(RMSE) :- is the under the root of mean squared error

In [30]:
from sklearn import metrics

print('Mean Absolute Error: ',metrics.mean_absolute_error(y_test, y_pred))

print('Mean Squared Error: ', metrics.mean_squared_error(y_test, y_pred))

print('Root Mean Squared Error: ',np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error:  0.7087874126122272
Mean Squared Error:  0.8972841371283601
Root Mean Squared Error:  0.9472508311573867


# Applying Random Forest Regression

In [31]:
from sklearn.ensemble import RandomForestRegressor

rf_reg= RandomForestRegressor(random_state=42, n_estimators=500)

regressor=rf_reg.fit(X_train, y_train)

y_pred= regressor.predict(X_test)

In [32]:
from sklearn import metrics

print('Mean Absolute Error: ',metrics.mean_absolute_error(y_test, y_pred))

print('Mean Squared Error: ', metrics.mean_squared_error(y_test, y_pred))

print('Root Mean Squared Error: ',np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error:  0.7442636734693878
Mean Squared Error:  0.9914399220734731
Root Mean Squared Error:  0.9957107622565265


# Conclusion
By looking at the MAE, it can be concluded that, on average there is an error of 0.70 for predictions, which means that on average there is an error of 0.70 for predictions, which means that on average, the predicted tip values are 0.70$ more or less than the actual tip values.