In [2]:
import pandas as pd
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils import resample
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
import string
import re

from imblearn.over_sampling import RandomOverSampler

# Load SMS data
sms_data = pd.read_csv("C:/Users/dell/AppData/Local/Programs/Python/Python311/SMSSpamCollection.csv", sep="\t", header=None, names=["label", "text"])

# Preprocess the text column
sms_data['text'] = sms_data['text'].fillna('')  # Replace missing values with empty string
sms_data['text'] = sms_data['text'].str.lower()  # Convert all characters to lowercase
sms_data['text'] = sms_data['text'].str.replace('[^\w\s]', '')  # Remove punctuation
sms_data['text'] = sms_data['text'].str.strip()  # Remove leading/trailing white space

# Save the preprocessed dataset
sms_data.to_csv("preprocessed_dataset.csv", index=False)

# Load the preprocessed dataset
preprodata = pd.read_csv("preprocessed_dataset.csv")

# Split data into training and testing sets
# Separate spam and ham samples
spam_data = preprodata[preprodata["label"] == "spam"]
ham_data = preprodata[preprodata["label"] == "ham"]

# Upsample minority class
spam_data_upsampled = resample(spam_data, 
                               replace=True,     # sample with replacement
                               n_samples=len(ham_data),    # to match majority class
                               random_state=123) # reproducible results

ham_train, ham_test = train_test_split(ham_data, test_size=0.3)
spam_train, spam_test = train_test_split(spam_data_upsampled, test_size=0.3)

#print("Shape of ham_data:", ham_data.shape)
#print("Shape of spam_data:", spam_data.shape)

#print("Shape of ham_train:", ham_train.shape)
#print("Shape of spam_train:", spam_train.shape)

#print("Shape of ham_test:", ham_test.shape)
#print("Shape of spam_test:", spam_test.shape)

# Concatenate ham_train and spam_train
train_data = pd.concat([ham_train, spam_train], axis=0)
train_data = train_data.sample(frac=1, random_state=42)
#print(train_data.iloc[:10, :10].to_string(index=False))

# Concatenate ham_test and spam_test
test_data = pd.concat([ham_test, spam_test], axis=0)
test_data = test_data.sample(frac=1, random_state=42)

# Define the vocabulary
custom_vocab = list(string.ascii_lowercase + string.digits + ' ')
#custom_vocab = list(set("".join(train_data["text"].tolist())))

# Generate character n-gram features with custom vocabulary
vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(1, 3), vocabulary=custom_vocab, lowercase=True)
X_train = vectorizer.fit_transform(train_data['text'])

# Fit and transform the training data to generate features
#X_train = vectorizer.fit_transform(train_data['text'])

#print(vectorizer.get_feature_names_out())
#print("X_train", X_train.toarray()[:10, :10])

#Set print options to print entire array
#np.set_printoptions(threshold=np.inf)

#Print X_train
#print(train_data[['label', 'text']].head(500))    
#print(X_train.toarray())

# Fit and transform the training data to generate features
X_train = vectorizer.fit_transform(train_data['text'])
y_train = train_data['label']

# Train a Multinomial Naive Bayes classifier with Laplace smoothing
nb_classifier = MultinomialNB(alpha=1.0, fit_prior=True)
nb_classifier.fit(X_train, y_train)

# Transform the test data to generate features
X_test = vectorizer.transform(test_data['text'])
y_test = test_data['label']

# Make predictions on the test data
y_pred = nb_classifier.predict(X_test)

print("Number of predicted 'spam' samples:", sum(y_pred == 'spam'))
print("Number of predicted 'ham' samples:", sum(y_pred == 'ham'))

# Get the vocabulary
vocabulary = vectorizer.get_feature_names_out()

# Calculate the probabilities
probabilities = {}
#for i, label in enumerate(nb_classifier.classes_):
 #   feature_prob = np.exp(nb_classifier.feature_log_prob_[i])  # Convert log probabilities to linear scale
  #  probabilities[label] = dict(zip(vocabulary, feature_prob))

for i, label in enumerate(nb_classifier.classes_):
    feature_prob = (nb_classifier.feature_count_[i] + 1) / (nb_classifier.class_count_[i] + X_train.shape[1])
    probabilities[label] = dict(zip(vocabulary, feature_prob))    

# Print the probabilities
for label, prob in probabilities.items():
    print(f"Probabilities for class '{label}':")
    for feature, feature_prob in prob.items():
        print(f"Feature '{feature}': {feature_prob}")
    print()

# Compute evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label="spam")
recall = recall_score(y_test, y_pred, pos_label="spam")
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("Confusion matrix:\n", conf_matrix)

Number of predicted 'spam' samples: 1290
Number of predicted 'ham' samples: 1606
Probabilities for class 'ham':
Feature 'a': 0.19992573678921643
Feature 'b': 0.05599915506005214
Feature 'c': 0.07160365497395692
Feature 'd': 0.09969470902289301
Feature 'e': 0.2605703847173161
Feature 'f': 0.05522651149625809
Feature 'g': 0.08869783971548248
Feature 'h': 0.1304426925462195
Feature 'i': 0.18592784389929393
Feature 'j': 0.020829213817627236
Feature 'k': 0.06715069794239252
Feature 'l': 0.13570036433931082
Feature 'm': 0.09468500193357754
Feature 'n': 0.17226397607867458
Feature 'o': 0.22369708902725138
Feature 'p': 0.059979615227022005
Feature 'q': 0.004393307861373149
Feature 'r': 0.13546159796943077
Feature 's': 0.14846104996171602
Feature 't': 0.21306836617140731
Feature 'u': 0.10578560154834703
Feature 'v': 0.03733751086876598
Feature 'w': 0.07643235704023699
Feature 'x': 0.01329327048732097
Feature 'y': 0.09814985292064876
Feature 'z': 0.008772253754163666
Feature '0': 0.0019164258397