<a href="https://colab.research.google.com/github/Charlee0616/Data-Mining/blob/main/C%26W_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import math
from scipy.stats import norm
import numpy as np

In [2]:
penguins = pd.read_csv("https://github.com/mbrudd/csci290/raw/refs/heads/main/data/penguins.csv")

In [3]:
penguins.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,Adelie,Torgersen,,,,,,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007


In [4]:
def Naive_Bayes_Classifier(X, y, instance, numerical, categorical):
# X:df, y:target, instance:unseen instance, numerical:numerical attribute, categorical:categorical attributes

  priors = {}
  likelihoods = {}


  for clss in X[y].unique(): # Loops through the unique values in the target feature
    data = X[X[y] == clss] # Create a dataset for the rows where the clss is True
    priors[clss] = len(data) / len(X) # Calculate the priors for the dataset it created
    likelihoods[clss] = {} # Makes a dictionary where the clss is the key for the likelihoods
    for cat in categorical: # Gets likelihoods for categorical features
      likelihoods[clss][cat] = data[cat].value_counts(normalize=True).to_dict() # Gets the probabilites of the value counts in the cat attribute
    for num in numerical: # Gets likelihoods for numerical features
      likelihoods[clss][num] = norm( loc=data[num].mean(), scale = data[num].std()) # Gets the mean and standard deviation for the num attribute


    posterior = {} # Create a dictionary for calculating the posteriors

    for clss in priors:
      post = priors[clss] # Sets post equal to priors for that clss
      for cat in categorical: # Categorical features
        if cat in instance: # Checks if the cat is in the unseen instance
          if instance[cat] in likelihoods[clss][cat]: # Checks if the feature in instance is in the likelihoods
            post *= likelihoods[clss][cat][instance[cat]] # Multiplies post by the likelihoods for the cat features and then sets the post equal to that
          else:
            post *= 1
        else:
          pass
      for num in numerical: # Numerical features
        post *= likelihoods[clss][num].pdf(instance[num]) # Multiplies post by the likelihoods for the num features and then sets the post equal to that

      posterior[clss] = post # Sets the posterior for that clss equal to the post

  return max(posterior, key=posterior.get) # Returns the class with the highest posterior

In [5]:
# Dataset
df = pd.read_csv("https://github.com/WilliamPoe/CSCI-290/raw/refs/heads/main/Data/penguins.csv") # Penguins dataset
df.head()
# Tagret column in the dataset
target = 'species'
# Unseen Instance
instance = {'bill_length_mm':40.9, 'bill_depth_mm':13.7, 'flipper_length_mm':214	, 'body_mass_g':4650, 'sex':'female'}
# Numerical features
numerical = []
# Categorical features
categorical = []

## Remove [2:-1] if you want to test on all columns ##
for attrib in df.columns:
  if attrib != target:
    # Checks for numerical data types
    if df[attrib].dtype in ['float64', 'int64']:
      # Considers the column numerical if there are more than 10 unique values
      if len( df[attrib].unique() ) > 10:
        numerical.append(attrib)
    else:
      categorical.append(attrib)

print(numerical)
print(categorical)
# Calls Naive Bayes Classifier function
Naive_Bayes_Classifier(df, target, instance, numerical, categorical)

['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
['island', 'sex']


'Gentoo'

In [6]:
#Synthetic dataset
np.random.seed(42)
X = pd.DataFrame( {"x1": np.random.randn( 200 ), "x2": 2*np.random.randn( 200) - 5, "x3": np.random.randint(3, size=200) } )
X["target"] = (X["x1"] > -2)*(X["x2"] > -7 )*(X["x3"]!=1)*1

Starget = 'target'
# Unseen Instance
instance = {'x1':-3, 'x2':-8, 'x3':1}
# Numerical features
numerical = []
# Categorical features
categorical = []

## Remove [2:-1] if you want to test on all columns ##
for attrib in X.columns:
  if attrib != Starget:
    # Checks for numerical data types
    if X[attrib].dtype in ['float64', 'int64']:
      # Considers the column numerical if there are more than 10 unique values
      if len( X[attrib].unique() ) > 10:
        numerical.append(attrib)
      elif len(X[attrib].unique()) < 10:
        categorical.append(attrib)
    elif X[attrib].dtype in ['object']:
      categorical.append(attrib)

print(numerical)
print(categorical)
# Calls Naive Bayes Classifier function
Naive_Bayes_Classifier(X, Starget, instance, numerical, categorical)



['x1', 'x2']
['x3']


0

# **SciKit Learn Comparison**

In [7]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

**Test on Penguins**  

In [23]:
df.dropna(inplace=True)
data = pd.get_dummies(df, columns = ['species', 'island', 'sex'], drop_first=True)

In [24]:
X = df[['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
model = GaussianNB()
model.fit(X_train, y_train)

In [26]:
y_pred = model.predict(X_test)
y_pred

array(['Adelie', 'Chinstrap', 'Adelie', 'Gentoo', 'Adelie', 'Chinstrap',
       'Chinstrap', 'Gentoo', 'Gentoo', 'Gentoo', 'Adelie', 'Adelie',
       'Adelie', 'Adelie', 'Chinstrap', 'Adelie', 'Adelie', 'Gentoo',
       'Adelie', 'Chinstrap', 'Adelie', 'Adelie', 'Chinstrap', 'Gentoo',
       'Adelie', 'Adelie', 'Gentoo', 'Chinstrap', 'Gentoo', 'Chinstrap',
       'Gentoo', 'Chinstrap', 'Adelie', 'Adelie', 'Chinstrap',
       'Chinstrap', 'Gentoo', 'Gentoo', 'Adelie', 'Adelie', 'Adelie',
       'Adelie', 'Gentoo', 'Gentoo', 'Adelie', 'Adelie', 'Chinstrap',
       'Adelie', 'Adelie', 'Chinstrap', 'Adelie', 'Gentoo', 'Gentoo',
       'Adelie', 'Adelie', 'Gentoo', 'Adelie', 'Adelie', 'Gentoo',
       'Gentoo', 'Adelie', 'Chinstrap', 'Chinstrap', 'Adelie', 'Adelie',
       'Chinstrap', 'Adelie'], dtype='<U9')

In [21]:
user_pred = model.predict([[40.9, 13.7, 214	, 4650]])
user_pred



array(['Gentoo'], dtype='<U9')

In [27]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

print('Classification Report:')
print(classification_report(y_test, y_pred))

print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.97
Classification Report:
              precision    recall  f1-score   support

      Adelie       0.94      1.00      0.97        31
   Chinstrap       1.00      0.89      0.94        18
      Gentoo       1.00      1.00      1.00        18

    accuracy                           0.97        67
   macro avg       0.98      0.96      0.97        67
weighted avg       0.97      0.97      0.97        67

Confusion Matrix:
[[31  0  0]
 [ 2 16  0]
 [ 0  0 18]]


**Test on synthetic dataset**

In [17]:
#Synthetic dataset
np.random.seed(42)
S = pd.DataFrame( {"x1": np.random.randn( 200 ), "x2": 2*np.random.randn( 200) - 5, "x3": np.random.randint(3, size=200) } )
S["target"] = (S["x1"] > -2)*(S["x2"] > -7 )*(S["x3"]!=1)*1

In [18]:
cnb = CategoricalNB().fit( np.array( S[["x3"]] ).reshape(-1,1), np.array( S["target"] ).reshape(-1,1) )
cnb.predict( np.array([0,1,2] ).reshape(-1,1) )

  y = column_or_1d(y, warn=True)


array([1, 0, 1])

In [19]:
gnb = GaussianNB().fit( np.array( S[["x1","x2"]] ), np.array( S["target"] ).reshape(-1,1))
gnb.predict( np.array( pd.DataFrame( {"x1": [-3,-1.1], "x2": [-8,-3] } ) ) )

  y = column_or_1d(y, warn=True)


array([0, 1])