In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import precision_recall_fscore_support as score
import string

# Load the SMS Spam Collection dataset from a CSV file
sms_df = pd.read_csv('C:/Users/Driti/Downloads/smsspamcollection/SMSSpamCollection.csv', sep='\t', names=['label', 'message'])

#Split data into training and testing sets
# Separate spam and ham samples
spam_data = sms_df[sms_df["label"] == "spam"]
ham_data = sms_df[sms_df["label"] == "ham"]

# Calculate the number of samples for each set
spam_train_samples = int(len(spam_data) * 0.8)
spam_test_samples = len(spam_data) - spam_train_samples
ham_train_samples = int(len(ham_data) * 0.8)
ham_test_samples = len(ham_data) - ham_train_samples

# Randomly select samples for training and testing from spam and ham data
spam_train_data = spam_data.sample(n=spam_train_samples, random_state=42)
spam_test_data = spam_data.drop(spam_train_data.index).sample(n=spam_test_samples, random_state=42)
ham_train_data = ham_data.sample(n=ham_train_samples, random_state=42)
ham_test_data = ham_data.drop(ham_train_data.index).sample(n=ham_test_samples, random_state=42)

# Combine spam and ham samples for training and testing sets
train_data = pd.concat([spam_train_data, ham_train_data])
test_data = pd.concat([spam_test_data, ham_test_data])

# Shuffle the training and testing sets
train_data = train_data.sample(frac=1, random_state=42).reset_index(drop=True)
test_data = test_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Define the vocabulary
my_vocab = string.ascii_uppercase + string.ascii_lowercase + string.digits + string.punctuation

# Generate character n-gram features with custom vocabulary
vectorizer = CountVectorizer(analyzer="char_wb", ngram_range=(1, 3), vocabulary=my_vocab, lowercase=True)
train_features = vectorizer.fit_transform(train_data["message"])
test_features = vectorizer.transform(test_data["message"])

# Train a Naive Bayes classifier
classifier = MultinomialNB(alpha=1.0)
classifier.fit(train_features, train_data["label"])

# Get the vocabulary
vocabulary = vectorizer.get_feature_names_out()

# Calculate the probabilities
probabilities = {}
for i, label in enumerate(classifier.classes_):
    feature_prob = np.exp(classifier.feature_log_prob_[i])  # Convert log probabilities to linear scale
    probabilities[label] = dict(zip(vocabulary, feature_prob))

# Print the probabilities
for label, prob in probabilities.items():
    print(f"Probabilities for class '{label}':")
    for feature, feature_prob in prob.items():
        print(f"Feature '{feature}': {feature_prob}")
        
    
# Create the confusion matrix
predictions = classifier.predict(test_features)
cm = confusion_matrix(test_data["label"], predictions) 

accuracy = accuracy_score(test_data["label"], predictions)
precision = precision_score(test_data["label"], predictions, pos_label="spam", zero_division=0)
recall = recall_score(test_data["label"], predictions, pos_label="spam", zero_division=0)
f1score = f1_score(test_data["label"], predictions, average='macro')
precision_macro, recall_macro, f1score_macro, _ = score(test_data["label"], predictions, average='macro')

# Print the results
print("Accuracy: {:.3%}".format(accuracy))
print("Precision: {:.3%}".format(precision))
print("Recall: {:.3%}".format(recall))
print("F1-score: {:.3%}".format(f1score))
print("Confusion Matrix:\n", cm)
print("Macro-Average Precision: {:.3f}".format(precision_macro))
print("Macro-Average Recall: {:.3f}".format(recall_macro))
print("Macro-Average F1-score: {:.3f}".format(f1score_macro))



Probabilities for class 'ham':
Feature 'A': 4.507672057842451e-06
Feature 'B': 4.507672057842451e-06
Feature 'C': 4.507672057842451e-06
Feature 'D': 4.507672057842451e-06
Feature 'E': 4.507672057842451e-06
Feature 'F': 4.507672057842451e-06
Feature 'G': 4.507672057842451e-06
Feature 'H': 4.507672057842451e-06
Feature 'I': 4.507672057842451e-06
Feature 'J': 4.507672057842451e-06
Feature 'K': 4.507672057842451e-06
Feature 'L': 4.507672057842451e-06
Feature 'M': 4.507672057842451e-06
Feature 'N': 4.507672057842451e-06
Feature 'O': 4.507672057842451e-06
Feature 'P': 4.507672057842451e-06
Feature 'Q': 4.507672057842451e-06
Feature 'R': 4.507672057842451e-06
Feature 'S': 4.507672057842451e-06
Feature 'T': 4.507672057842451e-06
Feature 'U': 4.507672057842451e-06
Feature 'V': 4.507672057842451e-06
Feature 'W': 4.507672057842451e-06
Feature 'X': 4.507672057842451e-06
Feature 'Y': 4.507672057842451e-06
Feature 'Z': 4.507672057842451e-06
Feature 'a': 0.07289807251942815
Feature 'b': 0.01485277943