#### Import Libraries

In [2]:
import numpy as np
from skimage import feature, color, util, io
import os
import matplotlib.pyplot as plt
import torch
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from torch.nn.functional import one_hot

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#### Data preprocessing

In [3]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import string

#remove the punctuations and stopwords

def text_process(text):

    text = text.translate(str.maketrans('', '', string.punctuation))
    text = [word.lower() for word in text.split() if word.lower() not in stopwords.words('english')]

    return " ".join(text)

[nltk_data] Downloading package stopwords to
[nltk_data]     /media/commlab/TenTB/home/dmytro/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Sepatate sms labels and sms text

In [4]:
# Set the path to your text file
file_path = '../smsspamcollection/SMSSpamCollection.txt'

# Initialize empty lists for data and labels
data_list = []
label_list = []

# Read in the file line by line
with open(file_path, 'r') as f:
    lines = f.readlines()
    for line in lines:
        # Split each line into data and label using a single space as the delimiter
        line = line.strip().split('\t')
        # Append data and label to their respective lists
        data_list.append(text_process(line[1]))
        label_list.append(line[0])
        

#### Get information on labels

In [5]:
n = len(label_list)
classNames = set(label_list)
classCount = {}
numbClasses = len(classNames)

for c in label_list: 
    classCount[c] =  classCount.get(c, 0) + 1

y = []
for c in label_list:
    if c == "ham":
        y.append(0)
    else:
        y.append(1)

print(classCount)
print(label_list)

{'ham': 4827, 'spam': 747}
['ham', 'ham', 'spam', 'ham', 'ham', 'spam', 'ham', 'ham', 'spam', 'spam', 'ham', 'spam', 'spam', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'spam', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'spam', 'ham', 'ham', 'spam', 'spam', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'spa

#### MAIN

In [14]:
%run ../heatmap.py
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
import random

x = data_list

k = 5  # number of folds

# Create a list of shuffled indices
shuffled_indices = random.sample(range(n), n)

# Use the shuffled indices to shuffle both lists
x_shuffled = [x[i] for i in shuffled_indices]
y_shuffled = [y[i] for i in shuffled_indices]

folds = [[] for _ in range(k)]
for i in range(k):
    left = round((i/k)*n)
    right = round(((i+1)/k)*n)
    folds[i] = [x_shuffled[left:right], y_shuffled[left:right]]

# Create a CountVectorizer object to convert the sentences into bag-of-words features
vectorizer = CountVectorizer()

avg_acc = 0
avg_precision = 0
avg_recall = 0
avg_f1 = 0
cm_total = np.zeros((2, 2))

for i in range(k):
    x_train = []
    y_train = []
    # split the data into training and test sets
    for j in range(k):
        if j != i: 
            x_train = x_train + folds[j][0]
            y_train = y_train + folds[j][1]

    x_test = folds[i][0]
    y_test = folds[i][1]

    # Convert the sentences into bag-of-words feature vectors
    x_train = vectorizer.fit_transform(x_train)
    x_test = vectorizer.transform(x_test)

    # Create a Naive Bayes classifier object
    clf = MultinomialNB(alpha = 100.0)
    # Fit the classifier on the training data
    clf.fit(x_train, y_train)

    # Make predictions on the testing data
    y_pred = clf.predict(x_test)
    cm = confusion_matrix(y_test, y_pred, labels = [0, 1])

    cm_total = cm_total + cm
    # Evaluate the performance of the classifier
    avg_acc += accuracy_score(y_test, y_pred)
    avg_precision += precision_score(y_test, y_pred)
    avg_recall += recall_score(y_test, y_pred)
    avg_f1 += f1_score(y_test, y_pred)

print(f"Accuracy: {avg_acc/k:.2f}")
print(f"Precision: {avg_precision/k:.2f}")
print(f"Recall: {avg_recall/k:.2f}")
print(f"F1-Score: {avg_f1/k:.2f}")

grid_labels = ['ham', 'spam']
# compute the sum of each row
row_sums = cm_total.sum(axis=1)
cm_normalized = cm_total / row_sums[:, np.newaxis]
im, cbar = heatmap(cm_normalized, grid_labels, grid_labels)
annotate_heatmap(im)
plt.savefig("confusionMatrix_alpha = 100.0.png")
plt.close()

Accuracy: 0.91
Precision: 1.00
Recall: 0.35
F1-Score: 0.52


<b> alpha = 0.00001 </b> <br>
Accuracy: 0.98 <br>
Precision: 0.94 <br>
Recall: 0.90 <br>
F1-Score: 0.92

<b>alpha = 1.0 </b> <br>
Accuracy: 0.98 <br>
Precision: 0.96 <br>
Recall: 0.91 <br>
F1-Score: 0.93

<b>alpha = 10.0 </b> <br>
Accuracy: 0.97 <br>
Precision: 0.99 <br>
Recall: 0.79 <br>
F1-Score: 0.88

<b>alpha = 100.0 </b> <br>
Accuracy: 0.91 <br>
Precision: 1.00 <br>
Recall: 0.35 <br>
F1-Score: 0.52

In [48]:
if torch.cuda.is_available():
    print(f'{torch.cuda.device_count()} GPU(s) are available!')
    for i in range(torch.cuda.device_count()):
        print(f'GPU {i}: {torch.cuda.get_device_name(i)}')
        print(f'\tCompute capability: {torch.cuda.get_device_capability(i)}')
        print(f'\tMemory: {torch.cuda.get_device_properties(i).total_memory / 1024 ** 2:.2f} MB')
else:
    print('CUDA is not available.')


2 GPU(s) are available!
GPU 0: NVIDIA GeForce RTX 2080 Ti
	Compute capability: (7, 5)
	Memory: 11019.56 MB
GPU 1: NVIDIA GeForce RTX 2080 Ti
	Compute capability: (7, 5)
	Memory: 11018.25 MB
