## Naive Bayes Algorithm

In [23]:
# Importing Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [24]:
# Load seaborn's "penguins" example dataset and preview its first five rows
df = sns.load_dataset(name="penguins")
print(df.head(n=5))

  species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0  Adelie  Torgersen            39.1           18.7              181.0   
1  Adelie  Torgersen            39.5           17.4              186.0   
2  Adelie  Torgersen            40.3           18.0              195.0   
3  Adelie  Torgersen             NaN            NaN                NaN   
4  Adelie  Torgersen            36.7           19.3              193.0   

   body_mass_g     sex  
0       3750.0    Male  
1       3800.0  Female  
2       3250.0  Female  
3          NaN     NaN  
4       3450.0  Female  


In [25]:
# Summarise the dataset: index range, column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [26]:
# Count the missing values in each column, listing the most incomplete columns first
print(df.isna().sum().sort_values(ascending=False))

sex                  11
bill_depth_mm         2
bill_length_mm        2
flipper_length_mm     2
body_mass_g           2
island                0
species               0
dtype: int64


In [27]:
# Preview the first five rows again before imputing (row 3 contains NaN entries)
print(df.head(n=5))

  species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0  Adelie  Torgersen            39.1           18.7              181.0   
1  Adelie  Torgersen            39.5           17.4              186.0   
2  Adelie  Torgersen            40.3           18.0              195.0   
3  Adelie  Torgersen             NaN            NaN                NaN   
4  Adelie  Torgersen            36.7           19.3              193.0   

   body_mass_g     sex  
0       3750.0    Male  
1       3800.0  Female  
2       3250.0  Female  
3          NaN     NaN  
4       3450.0  Female  


In [28]:
# Fill missing values: the categorical "sex" column gets its most frequent value,
# each numeric measurement column gets its own column mean.
# NOTE(review): these statistics are computed on the full dataset, before the
# train/test split further down, so test rows influence the imputed values.
df["sex"] = df["sex"].fillna(df["sex"].mode()[0])
for col in ["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]:
    df[col] = df[col].fillna(df[col].mean())


In [29]:
# Confirm the imputation worked: every column should now report zero missing values
print(df.isna().sum().sort_values(ascending=False))

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64


In [30]:
# Integer-encode every categorical column, keeping one fitted encoder per column.
# A single shared LabelEncoder is refit on each column, so only the mapping of the
# last column survives; storing the encoders by column name keeps every mapping
# available, e.g. label_encoders["island"].classes_ or .inverse_transform(...)
# to turn the integer codes in later reports back into names.
label_encoders = {}

for col in df.select_dtypes(include=['object', 'category']).columns:
    # `le` stays bound to the most recently fitted encoder after the loop
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [31]:
# The target is the (encoded) island; every remaining column is used as a feature
X, y = df.drop(columns="island"), df["island"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Gaussian NB


In [34]:
# Create the Gaussian Naive Bayes classifier and fit it on the training split
# (fit returns the estimator itself, so both steps can be chained)
gnb = GaussianNB().fit(X_train, y_train)

# Predict the island label for every held-out test row
y_predict = gnb.predict(X_test)

# Evaluate the predictions: confusion matrix, overall accuracy, per-class report
cm = confusion_matrix(y_test, y_predict)
acc = accuracy_score(y_test, y_predict)
cr = classification_report(y_test, y_predict)

# Show the three metrics
print("Confusion Matrix:\n", cm)
print("Accuracy Score:", acc)
print("Classification Report:\n", cr)

Confusion Matrix:
 [[21  0 10]
 [ 0 16  9]
 [ 0  0 13]]
Accuracy Score: 0.7246376811594203
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.68      0.81        31
           1       1.00      0.64      0.78        25
           2       0.41      1.00      0.58        13

    accuracy                           0.72        69
   macro avg       0.80      0.77      0.72        69
weighted avg       0.89      0.72      0.75        69



## Multinomial NB

In [36]:
# Create the Multinomial Naive Bayes classifier and fit it on the training split
MNB = MultinomialNB().fit(X_train, y_train)

# Predict the island label for every held-out test row
y_predict = MNB.predict(X_test)

# Evaluate the predictions against the true test labels
cm = confusion_matrix(y_true=y_test, y_pred=y_predict)
acc = accuracy_score(y_true=y_test, y_pred=y_predict)
cr = classification_report(y_true=y_test, y_pred=y_predict)

# Show the confusion matrix, the overall accuracy and the per-class report
print("Confusion Matrix:\n", cm)
print("Accuracy Score:", acc)
print("Classification Report:\n", cr)

Confusion Matrix:
 [[23  4  4]
 [ 2 21  2]
 [ 2  8  3]]
Accuracy Score: 0.6811594202898551
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.74      0.79        31
           1       0.64      0.84      0.72        25
           2       0.33      0.23      0.27        13

    accuracy                           0.68        69
   macro avg       0.61      0.60      0.60        69
weighted avg       0.68      0.68      0.67        69



## Bernoulli NB

In [38]:
import warnings

# Initialising the Bernoulli model
# NOTE(review): BernoulliNB binarises features at a threshold of 0.0 by default
# (binarize=0.0), so every strictly positive measurement collapses to 1 - confirm
# this model is appropriate for continuous features.
BNB = BernoulliNB()
# Training the model
BNB.fit(X_train, y_train)
# Predicting with the model
y_predict = BNB.predict(X_test)

# Confusion Matrix
cm = confusion_matrix(y_test, y_predict)
print("Confusion Matrix:\n", cm)

# Accuracy Score
acc = accuracy_score(y_test, y_predict)
print("Accuracy Score:", acc)

# Classification Report
# Class 2 is never predicted, so its precision is undefined and scikit-learn emits
# a warning while reporting it as 0. Silence warnings only around this call
# instead of disabling them process-wide for the rest of the session.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    cr = classification_report(y_test, y_predict)
print("Classification Report:\n", cr)

Confusion Matrix:
 [[21 10  0]
 [16  9  0]
 [ 0 13  0]]
Accuracy Score: 0.43478260869565216
Classification Report:
               precision    recall  f1-score   support

           0       0.57      0.68      0.62        31
           1       0.28      0.36      0.32        25
           2       0.00      0.00      0.00        13

    accuracy                           0.43        69
   macro avg       0.28      0.35      0.31        69
weighted avg       0.36      0.43      0.39        69

