## Applying Naive Bayes on the tips Dataset
## Predict `time` (Dinner vs. Lunch) from all other features

In [2]:
import numpy as np
import pandas as pd 
import seaborn as sns

In [3]:
# Load seaborn's example "tips" dataset into a DataFrame
df1=sns.load_dataset("tips")

In [4]:
# Preview the first rows to see which columns are numeric and which are categorical
df1.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [5]:
# List the available column names
df1.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

In [6]:
# Inspect the four categorical columns that will be one-hot encoded below
df1[['sex', 'smoker', 'day', 'time']]

Unnamed: 0,sex,smoker,day,time
0,Female,No,Sun,Dinner
1,Male,No,Sun,Dinner
2,Male,No,Sun,Dinner
3,Male,No,Sun,Dinner
4,Female,No,Sun,Dinner
...,...,...,...,...
239,Male,No,Sat,Dinner
240,Female,Yes,Sat,Dinner
241,Male,Yes,Sat,Dinner
242,Male,No,Sat,Dinner


In [7]:
from sklearn.preprocessing import OneHotEncoder

# Create a OneHotEncoder with default settings; it is fitted on the categorical columns in the next cell
encoder = OneHotEncoder()

In [8]:
# Fit the encoder on the four categorical columns so it learns the categories of each one.
# Note that "time" (the prediction target) is encoded here together with the feature columns.
encoder.fit(df1[['sex','smoker','day',"time"]])

In [10]:
# One-hot encode the same four categorical columns the encoder was fitted on,
# then convert the encoder's output to a dense NumPy array.
one_hot_encoded_data = encoder.transform(
    df1[["sex", "smoker", "day", "time"]]
).toarray()

In [11]:
# Show the dense one-hot array (one row per original row)
one_hot_encoded_data

array([[1., 0., 1., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       ...,
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       [1., 0., 1., ..., 1., 1., 0.]])

In [15]:
# Ask the fitted encoder for the name of each generated column (e.g. "sex_Female")
column_names = encoder.get_feature_names_out()

# Wrap the encoded array in a DataFrame labelled with those generated names
df2 = pd.DataFrame(
    data=one_hot_encoded_data,
    columns=column_names,
)

In [16]:
# Check the encoded columns: one 0/1 indicator column per category
df2.head()

Unnamed: 0,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [14]:
# Remove the raw categorical columns from df1; their one-hot encoded versions are used instead.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this cell safe to
# re-run: a second execution is a no-op rather than a KeyError on the already-dropped columns.
df1 = df1.drop(columns=["sex", "smoker", "day", "time"], errors="ignore")

In [17]:
# Join the remaining columns of df1 with the one-hot columns side by side (axis=1).
# NOTE(review): concat aligns on the index; assumes df1 and df2 share the same default row index — confirm.
data=pd.concat([df1,df2],axis=1)

In [18]:
# Preview the combined modelling table
data.head()

Unnamed: 0,total_bill,tip,size,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
0,16.99,1.01,2,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,10.34,1.66,3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,21.01,3.5,3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,23.68,3.31,2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,24.59,3.61,4,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [20]:
# Confirm the final column order; the two "time_*" columns come last
data.columns

Index(['total_bill', 'tip', 'size', 'sex_Female', 'sex_Male', 'smoker_No',
       'smoker_Yes', 'day_Fri', 'day_Sat', 'day_Sun', 'day_Thur',
       'time_Dinner', 'time_Lunch'],
      dtype='object')

In [38]:
# Select features and target by column name instead of by position, so the selection
# does not silently change if the column order of `data` ever changes.
# Both one-hot "time" columns are excluded from the features, because either one
# fully determines the target and would leak it into the model.
x = data.drop(columns=["time_Dinner", "time_Lunch"])
# Target: 1.0 when the meal is Dinner, 0.0 when it is Lunch
y = data["time_Dinner"]

In [39]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)

In [40]:
# Check the (rows, columns) shape of the training and test feature sets
x_train.shape,x_test.shape

((170, 11), (74, 11))

In [41]:
from sklearn.naive_bayes import GaussianNB

In [42]:
# Gaussian Naive Bayes classifier with default settings.
# NOTE(review): most features here are 0/1 one-hot indicators, which GaussianNB treats as
# continuous Gaussian variables — confirm this modelling choice is intended.
gnb=GaussianNB()

In [43]:

# Train the classifier on the training split only
gnb.fit(x_train,y_train)


In [44]:
# Predict the target for the held-out test rows
y_pred=gnb.predict(x_test)

In [45]:
# Predicted labels for the test set (1.0 = Dinner, 0.0 = Lunch)
y_pred

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 0.,
       1., 0., 1., 1., 1., 0., 1., 0., 0., 1., 0., 1., 1., 1., 0., 1., 1.,
       1., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 0.,
       1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 0., 1.,
       1., 1., 1., 1., 1., 1.])

In [46]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [47]:
# Report test-set performance in order: confusion matrix, per-class
# precision/recall/F1 report, and finally the overall accuracy.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, y_pred))

[[13  5]
 [ 1 55]]
              precision    recall  f1-score   support

         0.0       0.93      0.72      0.81        18
         1.0       0.92      0.98      0.95        56

    accuracy                           0.92        74
   macro avg       0.92      0.85      0.88        74
weighted avg       0.92      0.92      0.92        74

0.918918918918919


## ~92% Accuracy (0.919 on the held-out test set)