Task 1: Theory Questions


In [None]:
# 1. Naive Bayes relies on the fundamental assumption that every input feature contributes independently to the output class, given the label.

# 2. GaussianNB suits continuous-valued inputs with Gaussian distribution, MultinomialNB works well with frequency/count-based features, while BernoulliNB expects binary (0/1) input, typical in text classification.

# 3. The model’s simplicity and independence assumption make it highly scalable and efficient for datasets with thousands of features (e.g., words in documents).


Task 2: Spam Detection using MultinomialNB

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix

# Load dataset: a tab-separated file without a header row, read here as two
# columns (the label and the message text).
# NOTE: this is fetched over HTTP, so the cell needs network access to run.
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
dataset = pd.read_csv(url, sep='\t', header=None, names=["Category", "Message"])

# Split the data: 30% is held out for evaluation; the fixed seed keeps the
# split identical across runs.
X_msg = dataset['Message']
y_label = dataset['Category']
X_msg_train, X_msg_test, y_label_train, y_label_test = train_test_split(
    X_msg, y_label, test_size=0.3, random_state=0
)

# Vectorize using TF-IDF. The vectorizer is fitted on the training messages
# only and then reused to transform the test messages, so the test set does
# not influence the learned vocabulary or weights.
vectorizer = TfidfVectorizer()
X_train_vect = vectorizer.fit_transform(X_msg_train)
X_test_vect = vectorizer.transform(X_msg_test)

# Train classifier
nb_model = MultinomialNB()
nb_model.fit(X_train_vect, y_label_train)
y_predicted = nb_model.predict(X_test_vect)

# Evaluate. The class order is taken from the fitted model and passed to both
# metrics explicitly, so the report rows and the confusion-matrix rows/columns
# are guaranteed to refer to the same classes in the same order (instead of
# relying on a hard-coded name list that is only matched by position).
class_labels = nb_model.classes_
class_names = [str(label) for label in class_labels]
print("Classification Report:\n", classification_report(y_label_test, y_predicted, labels=class_labels, target_names=class_names))
print("Confusion Matrix:\n", confusion_matrix(y_label_test, y_predicted, labels=class_labels))


Classification Report:
               precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1451
        spam       1.00      0.71      0.83       221

    accuracy                           0.96      1672
   macro avg       0.98      0.85      0.90      1672
weighted avg       0.96      0.96      0.96      1672

Confusion Matrix:
 [[1451    0]
 [  65  156]]


Task 3: GaussianNB with Iris or Wine Dataset

In [3]:
from sklearn.datasets import load_iris
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load Iris data
iris = load_iris()
X_data, y_data = iris.data, iris.target

# Train-test split: 25% held out, seeded so every model below is scored on
# the same reproducible test set.
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.25, random_state=42)

# Gaussian NB
model_gnb = GaussianNB()
model_gnb.fit(X_train, y_train)
gnb_acc = accuracy_score(y_test, model_gnb.predict(X_test))

# Logistic Regression (max_iter raised to 200 to give the solver more
# iterations than the library default)
model_lr = LogisticRegression(max_iter=200)
model_lr.fit(X_train, y_train)
lr_acc = accuracy_score(y_test, model_lr.predict(X_test))

# Decision Tree: seeded like the split. Tree fitting permutes features at
# each split and can break ties between equally good splits differently, so
# without a fixed random_state the reported accuracy is not guaranteed to be
# reproducible across kernel restarts.
model_tree = DecisionTreeClassifier(random_state=42)
model_tree.fit(X_train, y_train)
tree_acc = accuracy_score(y_test, model_tree.predict(X_test))

# Report the test accuracy of the three models side by side.
print(f"GaussianNB Accuracy: {gnb_acc:.3f}")
print(f"Logistic Regression Accuracy: {lr_acc:.3f}")
print(f"Decision Tree Accuracy: {tree_acc:.3f}")


GaussianNB Accuracy: 1.000
Logistic Regression Accuracy: 1.000
Decision Tree Accuracy: 1.000
