Continuous (dogs)

In [5]:
import numpy as np
import pandas as pd
from scipy.stats import binom

# Read the dog measurements from disk
data = pd.read_csv('dogs.csv')

# Show the header so the label column can be confirmed by eye
print("Column names in the dataset:", data.columns)

# Column that holds the class label; edit here if the CSV names it differently
category_column = 'class'

# Stop early when the label column is missing from the loaded frame
if not (category_column in data.columns):
    raise KeyError(f"The column '{category_column}' does not exist in the dataset.")

# Calculate parameters
def calculate_gaussian_params(data):
    """Estimate Gaussian parameters for a collection of numeric observations.

    Args:
        data: Array-like of numbers.

    Returns:
        tuple: ``(mean, std)``. ``std`` comes from ``np.std`` with its default
        arguments, i.e. the population (``ddof=0``) standard deviation.
    """
    return np.mean(data), np.std(data)

def calculate_binomial_params(data):
    """Estimate binomial parameters ``(n, p)`` from observed counts.

    ``n`` is taken to be the largest observed value and ``p`` is the sample
    mean divided by ``n``.

    Args:
        data: Non-empty iterable of counts.

    Returns:
        tuple: ``(n, p)``. When the largest observation is zero the result is
        ``(0, 0.0)`` rather than the NaN that dividing by zero would produce.

    Raises:
        ValueError: If ``data`` is empty (raised by ``max``).
    """
    n = max(data)
    if n == 0:
        # mean / 0 would yield NaN; a NaN parameter turns every probability
        # computed from it into NaN as well.
        return n, 0.0
    p = np.mean(data) / n
    return n, p

def calculate_uniform_params(data):
    """Return the bounds of the observed range for a uniform model.

    Args:
        data: Array-like of numbers.

    Returns:
        tuple: ``(a, b)`` with ``a`` the smallest and ``b`` the largest value
        in ``data``.
    """
    lower, upper = np.min(data), np.max(data)
    return lower, upper

# Fit every feature distribution separately for each class label
params = {}
categories = data[category_column].unique()

for category in categories:
    # Rows whose label equals the current category
    category_data = data.loc[data[category_column] == category]
    params[category] = dict(
        height=calculate_gaussian_params(category_data['height']),
        weight=calculate_gaussian_params(category_data['weight']),
        bark_days=calculate_binomial_params(category_data['bark_days']),
        ear_head_ratio=calculate_uniform_params(category_data['ear_head_ratio']),
    )

def gaussian_pdf(x, mean, std):
    """Evaluate the normal density with the given mean and standard deviation at ``x``.

    Args:
        x: Point at which to evaluate the density.
        mean: Mean of the distribution.
        std: Standard deviation of the distribution; it appears in both
            denominators, so it must be non-zero.

    Returns:
        The density value ``exp(-(x - mean)^2 / (2 std^2)) / (sqrt(2 pi) std)``.
    """
    normaliser = 1 / (np.sqrt(2 * np.pi) * std)
    exponent = -((x - mean) ** 2) / (2 * std ** 2)
    return normaliser * np.exp(exponent)

def binomial_pmf(x, n, p):
    """Probability of exactly ``x`` successes in ``n`` trials with success rate ``p``.

    Thin wrapper around ``scipy.stats.binom.pmf``.

    Args:
        x: Number of successes.
        n: Number of trials.
        p: Success probability of a single trial.

    Returns:
        The binomial probability mass at ``x``.
    """
    probability = binom.pmf(x, n=n, p=p)
    return probability

def uniform_pdf(x, a, b):
    """Density of the uniform distribution on ``[a, b]`` evaluated at ``x``.

    Args:
        x: Point at which to evaluate the density.
        a: Lower bound (inclusive).
        b: Upper bound (inclusive).

    Returns:
        ``1 / (b - a)`` when ``a <= x <= b``, otherwise ``0``.
    """
    inside_support = a <= x <= b
    if not inside_support:
        return 0
    # NOTE: a == b makes this denominator zero for x == a.
    return 1 / (b - a)

def naive_bayes_classification(sample, params, priors=None):
    """Return the category with the highest naive-Bayes score for ``sample``.

    The score of a category is the product of the per-feature likelihoods
    (features are treated as independent), optionally multiplied by the
    category's prior probability.

    Args:
        sample: Mapping of feature name to observed value. Features that a
            category has no parameters for, or whose name is not one of
            ``height``, ``weight``, ``bark_days`` or ``ear_head_ratio``, do not
            affect the score.
        params: Mapping of category to a mapping of feature name to the fitted
            parameter tuple: ``(mean, std)`` for ``height``/``weight``,
            ``(n, p)`` for ``bark_days`` and ``(a, b)`` for ``ear_head_ratio``.
        priors: Optional mapping of category to prior probability. When
            omitted every category starts from a score of 1, so only the
            likelihoods are compared.

    Returns:
        The best-scoring category; ties go to the category that comes first in
        ``params``. ``None`` when ``params`` is empty or every score is NaN.

    Raises:
        KeyError: If ``priors`` is given but lacks an entry for a category.
    """
    best_category = None
    max_prob = -1  # below any valid score, so the first comparable category wins
    for category, feature_params in params.items():
        # Start from the class prior when one is supplied.
        prob = 1 if priors is None else priors[category]
        for feature, value in sample.items():
            if feature not in feature_params:
                continue
            if feature in ('height', 'weight'):
                mean, std = feature_params[feature]
                prob *= gaussian_pdf(value, mean, std)
            elif feature == 'bark_days':
                n, p = feature_params[feature]
                prob *= binomial_pmf(value, n, p)
            elif feature == 'ear_head_ratio':
                a, b = feature_params[feature]
                prob *= uniform_pdf(value, a, b)
        # Strict '>' keeps the earliest category on ties; a NaN score never
        # compares greater, so such a category is never selected.
        if prob > max_prob:
            max_prob = prob
            best_category = category
    return best_category

# Classify one hand-written sample with the fitted parameters
sample = dict(height=50, weight=20, bark_days=3, ear_head_ratio=0.2)
category = naive_bayes_classification(sample, params)
print(f"The sample belongs to category: {category}")


Column names in the dataset: Index(['height', 'weight', 'bark_days', 'ear_head_ratio', 'class'], dtype='object')
The sample belongs to category: 2


Discrete (emails)

training

In [6]:
import pandas as pd
from collections import Counter
import re

# Load the labelled e-mail corpus; later code reads its 'text' and 'spam' columns
emails = pd.read_csv('emails.csv')

# Print the header so the expected column names can be verified
print("Column names in the dataset:", emails.columns)

# Preprocess text
def preprocess_text(text):
    """Lower-case ``text`` and split it into word tokens.

    Every run of non-word characters (anything the regex ``\\W`` matches, such
    as punctuation and whitespace) is collapsed to a single space before
    splitting, so letters, digits and underscores are kept inside tokens.

    Args:
        text: The string to tokenize.

    Returns:
        list[str]: The lower-cased tokens in their original order.
    """
    return re.sub(r'\W+', ' ', text.lower()).split()

# Per-class word frequencies and total token counts
spam_words = Counter()
ham_words = Counter()
spam_count = 0
ham_count = 0

# Walk the two columns in lockstep instead of iterrows(), which would build a
# Series object for every row just to read two fields.
for email_text, spam_label in zip(emails['text'], emails['spam']):
    words = preprocess_text(email_text)
    if spam_label == 1:
        spam_words.update(words)
        spam_count += len(words)
    else:
        # Any label other than 1 is counted as ham.
        ham_words.update(words)
        ham_count += len(words)

# Relative frequency of every word within each class
spam_probs = {word: occurrences / spam_count for word, occurrences in spam_words.items()}
ham_probs = {word: occurrences / ham_count for word, occurrences in ham_words.items()}

# Share of emails carrying each label
total_emails = len(emails)
spam_prior = int((emails['spam'] == 1).sum()) / total_emails
ham_prior = int((emails['spam'] == 0).sum()) / total_emails

print("Spam Probabilities:", spam_probs)
print("Ham Probabilities:", ham_probs)


Column names in the dataset: Index(['text', 'spam'], dtype='object')


classification

In [7]:
import numpy as np

def classify_email(text, spam_probs, ham_probs, spam_prior, ham_prior, unseen_prob=1e-6):
    """Label ``text`` as ``'spam'`` or ``'ham'`` by comparing log-posterior scores.

    Each score is the log of the class prior plus the sum of the log
    probabilities of the words in ``text``; working in log space avoids
    underflow from multiplying many small numbers.

    Args:
        text: Raw email text; tokenized with ``preprocess_text``.
        spam_probs: Mapping of word to its probability in spam emails.
        ham_probs: Mapping of word to its probability in ham emails.
        spam_prior: Prior probability of the spam class.
        ham_prior: Prior probability of the ham class.
        unseen_prob: Probability used for a word missing from a class's
            mapping. Defaults to ``1e-6``.

    Returns:
        str: ``'spam'`` when the spam score is strictly greater than the ham
        score, otherwise ``'ham'`` (so ties are labelled ham).
    """
    words = preprocess_text(text)
    log_spam_prob = np.log(spam_prior)
    log_ham_prob = np.log(ham_prior)

    for word in words:
        # A word absent from a class falls back to the same small probability
        # in both classes, so it never zeroes out a score.
        log_spam_prob += np.log(spam_probs.get(word, unseen_prob))
        log_ham_prob += np.log(ham_probs.get(word, unseen_prob))

    return 'spam' if log_spam_prob > log_ham_prob else 'ham'

# Classify one hand-written message with the trained word probabilities
new_email = "Congratulations! You have won a free lottery ticket."
classification = classify_email(
    text=new_email,
    spam_probs=spam_probs,
    ham_probs=ham_probs,
    spam_prior=spam_prior,
    ham_prior=ham_prior,
)
print(f"The email is classified as: {classification}")


The email is classified as: spam


: 