In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib

Using matplotlib backend: Qt5Agg


In [4]:
sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'exercise',
 'flights',
 'fmri',
 'gammas',
 'geyser',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'tips',
 'titanic']

In [5]:
df=sns.load_dataset('tips')

In [7]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [15]:
#now we are working with tips datasets we take the tip is a y and other in an input(x)
#divide the data into feature and label
x=df.drop(['tip'],axis=1)
y=df[['tip']]

In [24]:
# Converting Categorical Data to Numbers
#Machine learning algorithms can only work with numbers. Therefore, it is important to convert 
#categorical data into a numeric format.
#first step to create the datasets of all numeric values
numerical=x.drop(['sex','smoker','day','time'],axis=1)
numerical

Unnamed: 0,total_bill,sex,smoker,day,time,size
0,16.99,Female,No,Sun,Dinner,2
1,10.34,Male,No,Sun,Dinner,3
2,21.01,Male,No,Sun,Dinner,3
3,23.68,Male,No,Sun,Dinner,2
4,24.59,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...
239,29.03,Male,No,Sat,Dinner,3
240,27.18,Female,Yes,Sat,Dinner,2
241,22.67,Male,Yes,Sat,Dinner,2
242,17.82,Male,No,Sat,Dinner,2


In [27]:
#you need to create a dataframe that contains only categorical columns.
categorical=x.filter(['sex','smoker','day','time'])
categorical

Unnamed: 0,sex,smoker,day,time
0,Female,No,Sun,Dinner
1,Male,No,Sun,Dinner
2,Male,No,Sun,Dinner
3,Male,No,Sun,Dinner
4,Female,No,Sun,Dinner
...,...,...,...,...
239,Male,No,Sat,Dinner
240,Female,Yes,Sat,Dinner
241,Male,Yes,Sat,Dinner
242,Male,No,Sat,Dinner


In [29]:
#one of the most common method to convert the categorical columns to numeric values is one hot coding
cat_numerical=pd.get_dummies(categorical,drop_first=True)
cat_numerical

Unnamed: 0,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
0,1,1,0,0,1,1
1,0,1,0,0,1,1
2,0,1,0,0,1,1
3,0,1,0,0,1,1
4,1,1,0,0,1,1
...,...,...,...,...,...,...
239,0,1,0,1,0,1
240,1,0,0,1,0,1
241,0,0,0,1,0,1
242,0,1,0,1,0,1


In [35]:
#The final step is to join the numerical columns with the one-hot encoded columns.
    x=pd.concat([numerical,cat_numerical],axis=1)

In [36]:
x

Unnamed: 0,total_bill,size,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
0,16.99,2,1,1,0,0,1,1
1,10.34,3,0,1,0,0,1,1
2,21.01,3,0,1,0,0,1,1
3,23.68,2,0,1,0,0,1,1
4,24.59,4,1,1,0,0,1,1
...,...,...,...,...,...,...,...,...
239,29.03,3,0,1,0,1,0,1
240,27.18,2,1,0,0,1,0,1
241,22.67,2,0,0,0,1,0,1
242,17.82,2,0,1,0,1,0,1


In [37]:
# Divide Data into Training and Test Sets
#the train_test_split is a function in sklearn which divid the data in subsets train and test
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [44]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.20,random_state = 0)

Unnamed: 0,total_bill,size,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
7,26.88,4,0,1,0,0,1,1
83,32.68,2,0,0,0,0,0,0
176,17.89,2,0,0,0,0,1,1
106,20.49,2,0,0,0,1,0,1
156,48.17,6,0,1,0,0,1,1


In [53]:
#Data Scaling/Normalization
#this is the optional step but it's the better approached to do b/s our datasets sometime contain
#small values and sometime very larges values
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
x_train=sc.fit_transform(x_train)
x_test=sc.fit_transform(x_test)

In [54]:
reg=LinearRegression()
reg.fit(x_train,y_train)

LinearRegression()

In [60]:
y_pred=reg.predict(x_test)

In [62]:
reg.score(x_test,y_test)

0.6432148543626198

In [64]:
from sklearn import metrics
print(metrics.mean_absolute_error(y_test,y_pred))
print(metrics.mean_squared_error(y_test,y_pred))
print(np.sqrt(metrics.mean_absolute_error(y_test,y_pred)))
#here mean absolute error mean that by average there is 63% error in prediction
#it's mean that on a average the prediction values is 63% less or more from the actual tips values

0.6366762541802362
0.7159134231087625
0.7979199547449833
