In [1]:
import pandas as pd
import numpy as np
from math import *

In [2]:
# Load the training split of the penguins dataset into a DataFrame.
df = pd.read_csv(filepath_or_buffer="Training_Penguins_data.csv")

In [3]:
# Preview the first five rows of the training data.
df.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_kg,sex
0,Adelie,Torgersen,41.1,17.6,182,3.757,female
1,Adelie,Torgersen,38.6,21.2,191,3.758,male
2,Adelie,Torgersen,34.6,21.1,198,3.759,male
3,Adelie,Torgersen,36.6,17.8,185,3.76,female
4,Adelie,Torgersen,38.7,19.0,195,3.761,female


In [4]:
# Summarize the training frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286 entries, 0 to 285
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            286 non-null    object 
 1   island             286 non-null    object 
 2   bill_length_mm     286 non-null    float64
 3   bill_depth_mm      286 non-null    float64
 4   flipper_length_mm  286 non-null    int64  
 5   body_mass_kg       286 non-null    float64
 6   sex                286 non-null    object 
dtypes: float64(3), int64(1), object(3)
memory usage: 15.8+ KB


In [5]:
# 2.1.1 Training Phase of the Naive Bayes Algorithm
# a) Split the training data into one sub-dataset per value of the "sex" label.
df_f = df.loc[df["sex"].eq("female")]  # female samples only
df_m = df.loc[df["sex"].eq("male")]  # male samples only

In [6]:
# Renumber both sub-datasets from 0 so their row labels are contiguous again;
# the old labels inherited from `df` are dropped rather than kept as a column.
df_f = df_f.reset_index(drop=True)
df_m = df_m.reset_index(drop=True)
df_m.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_kg,sex
0,Adelie,Torgersen,38.6,21.2,191,3.758,male
1,Adelie,Torgersen,34.6,21.1,198,3.759,male
2,Adelie,Torgersen,42.5,20.7,197,3.762,male
3,Adelie,Torgersen,46.0,21.5,194,3.764,male
4,Adelie,Biscoe,38.8,17.2,180,3.769,male


In [7]:
# b) Class priors P(male) and P(female):
#    P(class) = number of training samples of that class / total number of training samples
n_samples = len(df)

p_male = len(df_m) / n_samples
p_female = len(df_f) / n_samples

p_female

0.493006993006993

In [8]:
# Prior probability of the male class: male training samples / all training samples.
p_male

0.506993006993007

In [9]:
# c) Computation of each feature's probability for each class.
# -I- Discrete features: P(value | class) is the relative frequency of the
#     value among the training samples of that class.

def _category_probabilities(column):
    """Return ``{category: relative frequency}`` for one categorical column.

    Parameters
    ----------
    column : pandas.Series
        Values of a discrete feature for the samples of a single class.

    Returns
    -------
    dict
        Maps every distinct value of ``column`` (in order of first appearance)
        to ``count(value) / len(column)``.
    """
    counts = column.value_counts()  # counted once, not once per category
    n_rows = len(column)
    return {category: counts[category] / n_rows for category in column.unique()}


# P(species | sex) for the female and male classes
species_p_f = _category_probabilities(df_f.species)
species_p_m = _category_probabilities(df_m.species)

# P(island | sex) for the female and male classes
island_p_f = _category_probabilities(df_f.island)
island_p_m = _category_probabilities(df_m.island)
    

In [10]:
# P(species | female): share of each species among the female training samples.
species_p_f

{'Adelie': 0.41134751773049644,
 'Gentoo': 0.375886524822695,
 'Chinstrap': 0.2127659574468085}

In [11]:
# P(species | male): share of each species among the male training samples.
species_p_m

{'Adelie': 0.4,
 'Gentoo': 0.38620689655172413,
 'Chinstrap': 0.21379310344827587}

In [12]:
# P(island | female): share of each island among the female training samples.
island_p_f

{'Torgersen': 0.14184397163120568,
 'Biscoe': 0.48936170212765956,
 'Dream': 0.36879432624113473}

In [13]:
# P(island | male): share of each island among the male training samples.
island_p_m

{'Torgersen': 0.13793103448275862,
 'Biscoe': 0.4896551724137931,
 'Dream': 0.3724137931034483}

In [14]:
# -II- Continuous or integer-valued features

# Per-class parameter tables, filled in below with one entry per feature name.
cont_m = dict()  # male class
cont_f = dict()  # female class

## Normal (or Gaussian) distribution
def cont_pro(m,v,f):
    """Gaussian probability density of ``f`` given mean ``m`` and variance ``v``.

    Evaluates ``1 / sqrt(2 * pi * v) * exp(-(f - m)**2 / (2 * v))``.

    Parameters
    ----------
    m : float
        Mean of the distribution.
    v : float
        Variance of the distribution (standard deviation squared); must be > 0.
    f : float
        Feature value at which the density is evaluated.

    Returns
    -------
    float
        The density value at ``f``.

    Raises
    ------
    ValueError
        If ``v`` is zero or negative, for which the density is undefined.
    """
    if v <= 0:
        raise ValueError("variance must be positive, got %r" % (v,))
    # The normalising constant uses pi, not the rounded value 3.
    normalizer = 1 / sqrt(2 * pi * v)
    return normalizer * exp(-0.5 * (f - m) ** 2 / v)

# Names of the continuous / integer-valued feature columns
cont = ["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_kg"]

# For each class, store [mean, variance] per feature:
# index 0 is the mean, index 1 is the variance (pandas' default `var()`).
cont_f.update({name: [df_f[name].mean(), df_f[name].var()] for name in cont})  # female class
cont_m.update({name: [df_m[name].mean(), df_m[name].var()] for name in cont})  # male class

In [15]:
# [mean, variance] of each continuous feature over the male training samples.
cont_m

{'bill_length_mm': [46.293793103448266, 28.191280651341003],
 'bill_depth_mm': [17.79448275862068, 3.4137193486590034],
 'flipper_length_mm': [205.66206896551725, 208.62806513409956],
 'body_mass_kg': [3.9294275862068977, 0.008318607567049805]}

In [16]:
# [mean, variance] of each continuous feature over the female training samples.
cont_f

{'bill_length_mm': [42.310638297872316, 24.664100303951372],
 'bill_depth_mm': [16.305673758865254, 3.080539007092198],
 'flipper_length_mm': [198.40425531914894, 154.02826747720366],
 'body_mass_kg': [3.927439716312055, 0.008455990982776091]}

In [17]:
# Test phase

# Load the test split and preview its first rows.
test = pd.read_csv(filepath_or_buffer="Testing_Penguins_data.csv")
test.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_kg,sex
0,Adelie,Torgersen,39.1,18.7,181,3.75,male
1,Adelie,Torgersen,39.5,17.4,186,3.751,female
2,Adelie,Torgersen,40.3,18.0,196,3.752,female
3,Adelie,Torgersen,36.7,19.3,193,3.753,female
4,Adelie,Torgersen,39.3,20.6,190,3.754,male


In [18]:
# Naive Bayes class scores (each is the prior times the per-feature likelihoods):
# score(male   | F1..F6) = p(C1) * p(F1|C1) * p(F2|C1) * p(F3|C1) * p(F4|C1) * p(F5|C1) * p(F6|C1)
# score(female | F1..F6) = p(C2) * p(F1|C2) * p(F2|C2) * p(F3|C2) * p(F4|C2) * p(F5|C2) * p(F6|C2)

# 2.1.6 a) Guess class of each test sample in the test set with a trained model
continuous_features = ["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_kg"]

y_predd = []
for row in range(len(test)):
    species = test.loc[row, "species"]
    island = test.loc[row, "island"]

    # Male score: prior p(C1), then the discrete likelihoods p(F1|C1), p(F2|C1).
    score_m = p_male
    score_m *= species_p_m[species]
    score_m *= island_p_m[island]

    # Female score: prior p(C2), then the discrete likelihoods p(F1|C2), p(F2|C2).
    score_f = p_female
    score_f *= species_p_f[species]
    score_f *= island_p_f[island]

    # Continuous likelihoods p(F3|C) .. p(F6|C) from the per-class Gaussian.
    for name in continuous_features:
        value = test.loc[row, name]
        mean_m, var_m = cont_m[name]
        mean_f, var_f = cont_f[name]
        score_m *= cont_pro(mean_m, var_m, value)
        score_f *= cont_pro(mean_f, var_f, value)

    # Pick the class with the strictly greater score; ties go to "female".
    y_predd.append("male" if score_m > score_f else "female")



In [19]:
# Results: tally the confusion-matrix cells, with "female" as the positive class.
#   tp: real female, predicted female     fp: real male,   predicted female
#   fn: real female, predicted male       tn: real male,   predicted male
tn = fp = fn = tp = 0

for idx, predicted in enumerate(y_predd):
    actual_is_male = test.loc[idx, "sex"] == "male"

    if predicted == "male" and actual_is_male:
        tn += 1  # real: male, prediction: male
    elif predicted == "male":
        fn += 1  # real: female, prediction: male
    elif actual_is_male:
        fp += 1  # real: male, prediction: female
    else:
        tp += 1  # real: female, prediction: female


In [20]:
#2.1.6 b. Compute Total Accuracy
# correctly classified samples / all test samples
n_correct = tp + tn
n_total = tp + fp + tn + fn
accuracy = n_correct / n_total

#2.1.6 c. Compute each class Accuracy
# correctly predicted members of a class / actual members of that class
n_actual_female = tp + fn
n_actual_male = tn + fp
f_acc = tp / n_actual_female  # female class accuracy
m_acc = tn / n_actual_male  # male class accuracy


In [21]:
#2.1.6 d. Report the total accuracy and class accuracy values

# Total accuracy: (tp + tn) over all test samples.
accuracy

0.673469387755102

In [22]:
# Female class accuracy: females predicted as female / all actual females.
f_acc

0.84

In [23]:
# Male class accuracy: males predicted as male / all actual males.
m_acc

0.5

In [24]:
# 2.1.6 e. Report the Confusion matrix
#
# Layout (rows = predicted class, columns = actual class):
#   tp fp
#   fn tn

row_labels = ["predicted_female", "predicted_male"]
column_labels = ["actual_female", "actual_male"]
# Created empty here; the four cells are filled in afterwards.
confusion_matrix = pd.DataFrame(index=row_labels, columns=column_labels)

In [25]:
# Fill the matrix cell by cell: (predicted row, actual column, count).
cell_counts = [
    ("predicted_female", "actual_female", tp),
    ("predicted_female", "actual_male", fp),
    ("predicted_male", "actual_female", fn),
    ("predicted_male", "actual_male", tn),
]
for predicted_label, actual_label, count in cell_counts:
    confusion_matrix.loc[predicted_label, actual_label] = count

In [26]:
# Confusion matrix: rows are the predicted class, columns the actual class.
confusion_matrix

Unnamed: 0,actual_female,actual_male
predicted_female,21,12
predicted_male,4,12
