In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the dataset and drop rows with missing values.
# The cleaned frame is produced in one expression (no dropna(inplace=True)),
# so the loaded data is never mutated in place and the cell stays idempotent.
# The original row index is kept as-is (it is not reset after dropping rows).
data = pd.read_csv('penguins.csv').dropna()

# Encode categorical columns as integer codes.
# Each column gets its OWN LabelEncoder, stored in `label_encoders`, so the
# integer codes can later be mapped back to the original labels, e.g.
#   label_encoders['species'].inverse_transform(predicted_codes)
# Re-fitting a single shared encoder would overwrite its classes_ on every
# iteration and keep only the mapping of the last column.
categorical_columns = ['species', 'island', 'sex']
label_encoders = {}
for column in categorical_columns:
    # `label_encoder` remains bound after the loop (to the encoder fitted on
    # the last column), so the name is still available to later cells.
    label_encoder = LabelEncoder()
    data[column] = label_encoder.fit_transform(data[column])
    label_encoders[column] = label_encoder

# Display the first few rows of the processed dataset
print(data.head())


   species  island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0        0       2            39.1           18.7              181.0   
1        0       2            39.5           17.4              186.0   
2        0       2            40.3           18.0              195.0   
4        0       2            36.7           19.3              193.0   
5        0       2            39.3           20.6              190.0   

   body_mass_g  sex  
0       3750.0    1  
1       3800.0    0  
2       3250.0    0  
4       3450.0    0  
5       3650.0    1  


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, classification_report

# Separate the encoded species label (target) from the remaining columns (features).
y = data['species']
X = data.drop('species', axis=1)

# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit Gaussian Naive Bayes on the training rows, then predict the held-out rows.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

# Confusion matrix: rows are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:", conf_matrix, sep="\n")

# Per-class precision, recall and f1-score plus overall accuracy.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

# Parameters of the fitted per-class Gaussians: one row per class, one column
# per feature. These are the quantities GaussianNB uses to evaluate the
# likelihood of a feature value given a class.
means, variances = gnb.theta_, gnb.var_

print("Means (Likelihoods) for each class:", means, sep="\n")
print("\nVariances (Likelihoods) for each class:", variances, sep="\n")


Confusion Matrix:
[[46  2  0]
 [ 1 22  0]
 [ 0  0 29]]
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.96      0.97        48
           1       0.92      0.96      0.94        23
           2       1.00      1.00      1.00        29

    accuracy                           0.97       100
   macro avg       0.97      0.97      0.97       100
weighted avg       0.97      0.97      0.97       100

Means (Likelihoods) for each class:
[[9.89795918e-01 3.89295918e+01 1.82765306e+01 1.90112245e+02
  3.67525510e+03 4.89795918e-01]
 [1.00000000e+00 4.87933333e+01 1.83888889e+01 1.95466667e+02
  3.73833333e+03 4.66666667e-01]
 [0.00000000e+00 4.76211111e+01 1.49900000e+01 2.17477778e+02
  5.10111111e+03 5.55555556e-01]]

Variances (Likelihoods) for each class:
[[6.02626516e-01 7.68522231e+00 1.39146554e+00 4.33656419e+01
  2.23360905e+05 2.50585700e-01]
 [6.89823146e-04 1.12217565e+01 1.17234414e+00 5.39829120e+01
  1.65905556e+05 2.49

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes, trained and evaluated on the X_train / X_test /
# y_train / y_test split created in an earlier cell.
# NOTE(review): MultinomialNB is designed for count-like, non-negative
# features; here it receives continuous measurements and integer category
# codes -- presumably as a baseline for comparison; confirm this is intended.
mnb = MultinomialNB()
mnb.fit(X_train, y_train)
y_pred_mnb = mnb.predict(X_test)

# Confusion matrix: rows are true classes, columns are predicted classes.
conf_matrix_mnb = confusion_matrix(y_test, y_pred_mnb)
print("Confusion Matrix (Multinomial NB):", conf_matrix_mnb, sep="\n")

# Per-class precision, recall and f1-score plus overall accuracy.
print(
    "Classification Report (Multinomial NB):",
    classification_report(y_test, y_pred_mnb),
    sep="\n",
)


Confusion Matrix (Multinomial NB):
[[44  2  2]
 [ 9 14  0]
 [ 1  0 28]]
Classification Report (Multinomial NB):
              precision    recall  f1-score   support

           0       0.81      0.92      0.86        48
           1       0.88      0.61      0.72        23
           2       0.93      0.97      0.95        29

    accuracy                           0.86       100
   macro avg       0.87      0.83      0.84       100
weighted avg       0.86      0.86      0.85       100



In [None]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import Binarizer

# Binarize the features for Bernoulli Naive Bayes.
#
# Binarizer() with its default threshold of 0.0 maps every strictly positive
# value to 1. All physical measurements (bill length/depth, flipper length,
# body mass) are positive, so they would collapse into constant columns of
# ones and contribute nothing to the classifier. Instead, every feature is
# thresholded at its own mean. The means are computed on the training rows
# only, so nothing about the test rows leaks into the preprocessing.
# For columns that only hold 0/1 codes the mean lies between the two values,
# so those columns keep their meaning.

# Split the raw features first (same test_size and seed as the other cells)
X_train_raw, X_test_raw, y_train_bin, y_test_bin = train_test_split(X, y, test_size=0.3, random_state=42)

# Per-feature thresholds learned from the training rows
bin_thresholds = X_train_raw.mean()

# 1 where a value lies above its feature's training mean, 0 otherwise
X_train_bin = X_train_raw.gt(bin_thresholds, axis='columns').astype(int)
X_test_bin = X_test_raw.gt(bin_thresholds, axis='columns').astype(int)

# Binarized version of the full feature frame, kept under its previous name
X_bin = X.gt(bin_thresholds, axis='columns').astype(int)

# Apply Bernoulli Naive Bayes classifier
bnb = BernoulliNB()
bnb.fit(X_train_bin, y_train_bin)

# Predict the labels for the test set
y_pred_bnb = bnb.predict(X_test_bin)

# Confusion matrix (rows: true classes, columns: predicted classes)
conf_matrix_bnb = confusion_matrix(y_test_bin, y_pred_bnb)
print("Confusion Matrix (Bernoulli NB):")
print(conf_matrix_bnb)

# Classification report
print("Classification Report (Bernoulli NB):")
print(classification_report(y_test_bin, y_pred_bnb))


Confusion Matrix (Bernoulli NB):
[[34  0 14]
 [23  0  0]
 [ 0  0 29]]
Classification Report (Bernoulli NB):
              precision    recall  f1-score   support

           0       0.60      0.71      0.65        48
           1       0.00      0.00      0.00        23
           2       0.67      1.00      0.81        29

    accuracy                           0.63       100
   macro avg       0.42      0.57      0.48       100
weighted avg       0.48      0.63      0.54       100



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Example: derive a single boolean feature by thresholding bill_length_mm.
# The cut-off is named so it is easy to find and tune.
BILL_LENGTH_THRESHOLD_MM = 40

# Build the boolean feature in its own frame instead of adding a column to X.
# X is shared with the other cells; mutating it here would silently change
# what they see when they are re-run after this cell.
X_bool = (
    (X[['bill_length_mm']] > BILL_LENGTH_THRESHOLD_MM)
    .astype(int)
    .rename(columns={'bill_length_mm': 'bill_length_bool'})
)

# Split the single-feature dataset (same test_size and seed as the other cells)
X_train_bool, X_test_bool, y_train_bool, y_test_bool = train_test_split(X_bool, y, test_size=0.3, random_state=42)

# Apply Bernoulli Naive Bayes for boolean features
bnb_bool = BernoulliNB()
bnb_bool.fit(X_train_bool, y_train_bool)

# Predict the labels for the test set
y_pred_bnb_bool = bnb_bool.predict(X_test_bool)

# Confusion matrix (rows: true classes, columns: predicted classes)
conf_matrix_bnb_bool = confusion_matrix(y_test_bool, y_pred_bnb_bool)
print("Confusion Matrix (Bernoulli NB with Boolean Feature):")
print(conf_matrix_bnb_bool)

# Classification report.
# A single binary feature can produce at most two distinct predictions, so
# with three species at least one class is never predicted and its precision
# is undefined. zero_division=0 reports that precision as 0 explicitly
# instead of emitting an UndefinedMetricWarning.
print("Classification Report (Bernoulli NB with Boolean Feature):")
print(classification_report(y_test_bool, y_pred_bnb_bool, zero_division=0))


Confusion Matrix (Bernoulli NB with Boolean Feature):
[[32  0 16]
 [ 0  0 23]
 [ 0  0 29]]
Classification Report (Bernoulli NB with Boolean Feature):
              precision    recall  f1-score   support

           0       1.00      0.67      0.80        48
           1       0.00      0.00      0.00        23
           2       0.43      1.00      0.60        29

    accuracy                           0.61       100
   macro avg       0.48      0.56      0.47       100
weighted avg       0.60      0.61      0.56       100



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
