# In [None]:
# Q1. The problem asks for the probability that an employee is a smoker given that they use the health insurance plan.
# In general, a conditional probability like this can be obtained from Bayes' theorem:
#
# P(Smoker | Uses Health Insurance) = P(Uses Health Insurance | Smoker) * P(Smoker) / P(Uses Health Insurance)
#
# Here, however, the requested quantity is stated directly in the problem, so no computation is required.

# Given:
P_Uses_Health_Insurance = 0.70  # 70% of employees use the health insurance plan (not needed for the answer)
P_Smoker_given_Uses_Health_Insurance = 0.40  # 40% of employees who use the plan are smokers

# "40% of the employees who use the plan are smokers" is, by definition, P(Smoker | Uses Health Insurance),
# so the answer is read off directly: 0.40, i.e. 40%.
print(f"The probability that an employee is a smoker given they use the health insurance plan: {P_Smoker_given_Uses_Health_Insurance}")

# Q2. The difference between Bernoulli Naive Bayes and Multinomial Naive Bayes:
# - Bernoulli Naive Bayes is used when the features are binary (0 or 1) and models binary events.
# - Multinomial Naive Bayes is used for discrete counts (e.g., word counts in text classification).
#   It assumes the features represent the frequencies of events.

# Q3. How does Bernoulli Naive Bayes handle missing values?
# Bernoulli Naive Bayes has no built-in mechanism for missing values: it expects every feature to be binary (0 or 1).
# Scikit-learn's BernoulliNB rejects input containing NaN, so missing values must be handled before fitting, either by imputation (e.g., filling with the most frequent value) or by encoding "missing" as its own category via an extra binary indicator feature.

# Q4. Can Gaussian Naive Bayes be used for multi-class classification?
# Yes. Gaussian Naive Bayes handles multi-class classification naturally: for each class it estimates a prior plus a per-feature mean and variance, and it predicts the class with the highest posterior probability, however many classes there are.
# It works well with multi-class problems where the features are continuous and approximately Gaussian-distributed within each class.

# Q5. Assignment: Implement Bernoulli Naive Bayes, Multinomial Naive Bayes, and Gaussian Naive Bayes classifiers on the "Spambase" dataset.

import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.datasets import fetch_openml

# Step 1: Load the dataset
# The CSV is read straight from OpenML, so this step needs network access.
url = "https://www.openml.org/data/get_csv/345/Spambase.arff"
data = pd.read_csv(url)

# Step 2: Prepare data
X = data.drop('class', axis=1)  # Features: every column except the label
y = data['class']  # Target variable

# Step 3: Train-test split (use 70% for training, 30% for testing)
# random_state is fixed so the split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 4: Create Naive Bayes classifiers
bernoulli_nb = BernoulliNB()
multinomial_nb = MultinomialNB()
gaussian_nb = GaussianNB()

# Step 5: Evaluate models using 10-fold cross-validation
# Cross-validation runs on the training portion only, so the test set stays untouched for Step 6.
bernoulli_cv = cross_val_score(bernoulli_nb, X_train, y_train, cv=10, scoring='accuracy')
multinomial_cv = cross_val_score(multinomial_nb, X_train, y_train, cv=10, scoring='accuracy')
gaussian_cv = cross_val_score(gaussian_nb, X_train, y_train, cv=10, scoring='accuracy')

# Report the cross-validation results (mean and spread over the 10 folds);
# without this the scores above are computed but never shown.
print(f"Bernoulli Naive Bayes 10-fold CV accuracy: {bernoulli_cv.mean():.4f} (+/- {bernoulli_cv.std():.4f})")
print(f"Multinomial Naive Bayes 10-fold CV accuracy: {multinomial_cv.mean():.4f} (+/- {multinomial_cv.std():.4f})")
print(f"Gaussian Naive Bayes 10-fold CV accuracy: {gaussian_cv.mean():.4f} (+/- {gaussian_cv.std():.4f})")

# Step 6: Evaluate models on the test set
# Each classifier is fitted on the full training split before predicting on the test split.
bernoulli_nb.fit(X_train, y_train)
multinomial_nb.fit(X_train, y_train)
gaussian_nb.fit(X_train, y_train)

bernoulli_y_pred = bernoulli_nb.predict(X_test)
multinomial_y_pred = multinomial_nb.predict(X_test)
gaussian_y_pred = gaussian_nb.predict(X_test)

# Calculate performance metrics for each model
def get_metrics(y_true, y_pred):
    """Return accuracy, precision, recall and F1 for one set of predictions.

    Both arguments are passed unchanged to each scikit-learn scorer.
    The result is a dict keyed by the metric's display name.
    """
    scorers = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )
    return {label: scorer(y_true, y_pred) for label, scorer in scorers}

# Score the test-set predictions of the three models, in Bernoulli / Multinomial / Gaussian order
bernoulli_metrics, multinomial_metrics, gaussian_metrics = (
    get_metrics(y_test, predictions)
    for predictions in (bernoulli_y_pred, multinomial_y_pred, gaussian_y_pred)
)

# Show one line of metrics per model
print("Bernoulli Naive Bayes Metrics:", bernoulli_metrics)
print("Multinomial Naive Bayes Metrics:", multinomial_metrics)
print("Gaussian Naive Bayes Metrics:", gaussian_metrics)

# Step 7: Discussion and Conclusion
# - Bernoulli Naive Bayes performs well for binary features, and is commonly used in text classification problems where features represent word presence.
# - Multinomial Naive Bayes performs well with count-based features, such as word counts, and is often used for text classification.
# - Gaussian Naive Bayes is suitable for continuous features, assuming they follow a normal distribution, and works well for real-valued features.

# Discussion:
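# The Spambase features are word/character frequencies and capital-letter run lengths, not binary indicators. BernoulliNB binarizes them (by default every value above 0 becomes 1), so it only sees the presence or absence of each word; if presence alone is what separates spam from non-spam, Bernoulli Naive Bayes will likely perform the best.
# Multinomial Naive Bayes might perform better if the magnitudes of the frequencies carry additional signal, since it models how often each feature occurs.
# Gaussian Naive Bayes could struggle because it assumes that each feature is normally distributed within a class, which might not be the case for this dataset (frequency features tend to be skewed, with many zeros).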

# Future Work Suggestions:
# - Try feature engineering or feature selection to improve model performance.
# - Experiment with other models, like Logistic Regression or Support Vector Machines.
# - Hyperparameter tuning can be performed using GridSearchCV or RandomizedSearchCV to optimize the models further.
