In [1]:
import pandas as pd

df = pd.read_csv('penguins.csv')

In [2]:
df.head()

Unnamed: 0,species,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g
0,Adelie,39.1,18.7,181,3750
1,Adelie,39.5,17.4,186,3800
2,Adelie,40.3,18,195,3250
3,Adelie,x,x,x,x
4,Adelie,36.7,19.3,193,3450


In [3]:
df = df.replace('x', pd.np.nan)
df = df.dropna()

  df = df.replace('x', pd.np.nan)


In [4]:
df.head()

Unnamed: 0,species,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g
0,Adelie,39.1,18.7,181,3750
1,Adelie,39.5,17.4,186,3800
2,Adelie,40.3,18.0,195,3250
4,Adelie,36.7,19.3,193,3450
5,Adelie,39.3,20.6,190,3650


In [5]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
df['species'] = encoder.fit_transform(df['species'])

In [6]:
X = df.drop('species', axis=1)
y = df['species']

In [7]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X = scaler.fit_transform(X)

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [9]:
import numpy as np

class_priors = np.zeros(3)
for i in range(3):
    class_priors[i] = np.sum(y_train == i) / len(y_train)


In [10]:
class_means = np.zeros((3, X_train.shape[1]))
class_variances = np.zeros((3, X_train.shape[1]))

for i in range(3):
    X_class = X_train[y_train == i]
    class_means[i] = np.mean(X_class, axis=0)
    class_variances[i] = np.var(X_class, axis=0)


In [11]:
def naive_bayes(x):
    log_prob = np.zeros(3)
    for i in range(3):
        log_prob[i] = np.sum(-0.5 * np.log(2*np.pi*class_variances[i]) - 0.5 * ((x-class_means[i])**2) / class_variances[i]) + np.log(class_priors[i])
    return np.argmax(log_prob)


In [12]:
y_pred = np.zeros(len(y_test))
for i in range(len(X_test)):
    y_pred[i] = naive_bayes(X_test[i])


In [13]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average=None)
recall = recall_score(y_test, y_pred, average=None)
cm = confusion_matrix(y_test, y_pred)


In [14]:
print("accuracy: " , accuracy)
print("precision: " , precision)
print("recall: " , recall)
print("cm: " , cm)


accuracy:  0.9509803921568627
precision:  [0.94       0.88235294 1.        ]
recall:  [0.95918367 0.83333333 1.        ]
cm:  [[47  2  0]
 [ 3 15  0]
 [ 0  0 35]]


In [15]:
### Using SK learn
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

clf = GaussianNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

accuracy1 = accuracy_score(y_test, y_pred)
precision1 = precision_score(y_test, y_pred, average=None)
recall1 = recall_score(y_test, y_pred, average=None)
cm1 = confusion_matrix(y_test, y_pred)


print("accuracy1: " , accuracy1)
print("precision1: " , precision1)
print("recall1: " , recall1)
print("cm1: " , cm1)


accuracy1:  0.9509803921568627
precision1:  [0.94       0.88235294 1.        ]
recall1:  [0.95918367 0.83333333 1.        ]
cm1:  [[47  2  0]
 [ 3 15  0]
 [ 0  0 35]]


In [16]:
#### 1 vs ALL , using SK learn

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

clf = GaussianNB()
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Compute performance metrics for each class
classes = np.unique(y_test)
accuracy = []
precision = []
recall = []
cm = []
for c in classes:
    
    y_test_c = (y_test == c)
    y_pred_c = (y_pred == c)
    # Compute metrics for the current class
    accuracy_c = accuracy_score(y_test_c, y_pred_c)
    precision_c = precision_score(y_test_c, y_pred_c)
    recall_c = recall_score(y_test_c, y_pred_c)
    cm_c = confusion_matrix(y_test_c, y_pred_c)
    # Append metrics to the lists
    accuracy.append(accuracy_c)
    precision.append(precision_c)
    recall.append(recall_c)
    cm.append(cm_c)

# Convert the lists to arrays
accuracy3 = np.array(accuracy)
precision3 = np.array(precision)
recall3 = np.array(recall)
cm3 = np.array(cm)


In [17]:
print("accuracy1: " , accuracy3)
print("precision1: " , precision3)
print("recall1: " , recall3)
print("cm1: " , cm3)

accuracy1:  [0.95098039 0.95098039 1.        ]
precision1:  [0.94       0.88235294 1.        ]
recall1:  [0.95918367 0.83333333 1.        ]
cm1:  [[[50  3]
  [ 2 47]]

 [[82  2]
  [ 3 15]]

 [[67  0]
  [ 0 35]]]


In [None]:
################## metrics withput using library

# y_pred: predicted labels
# y_true: true labels

# Accuracy
accuracy = (y_pred == y_true).sum() / len(y_true)

# Confusion Matrix
classes = np.unique(y_true)
conf_mat = np.zeros((len(classes), len(classes)), dtype=int)
for i, true_label in enumerate(classes):
    for j, pred_label in enumerate(classes):
        conf_mat[i, j] = np.sum((y_true == true_label) & (y_pred == pred_label))

# Precision
precision = np.zeros(len(classes))
for i, label in enumerate(classes):
    precision[i] = conf_mat[i, i] / np.sum(conf_mat[:, i])

# Recall
recall = np.zeros(len(classes))
for i, label in enumerate(classes):
    recall[i] = conf_mat[i, i] / np.sum(conf_mat[i, :])
