# Team Work 2 : Spam Filtering

**Authors:** CHRETIEN Jérémy, DAVIDSON Colin, LAFAGE Adrien, REMBUSCH Gabrielle and WILBRINK Aurore.

In [1]:
import numpy as np
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords as stpw
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
import time

[nltk_data] Downloading package stopwords to /home/eisti/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/eisti/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


**Starts time count**

In [2]:
start = time.time()

**Loads data from CSV file.**

In [3]:
data = pd.read_csv("SMS_source.csv")
data.head()

Unnamed: 0,labels,contents
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Data Splitting

In [4]:
np.random.seed(0) # to get the same randomness
threshold = np.random.rand(len(data)) < 0.7
train_set = data[threshold]
test_set = data[~threshold]
print(f"{len(train_set)} training instances (~70%).\n{len(test_set)} testing instances (~30%).")

3915 training instances (~70%).
1659 testing instances (~30%).


### Train split distribution

In [5]:
ham_prop = len(train_set[train_set.labels == "ham"])/len(train_set) * 100
spam_prop = len(train_set[train_set.labels == "spam"])/len(train_set) * 100

print(f"ham: {ham_prop:.4}%\nspam: {spam_prop:.4}%")

ham: 86.54%
spam: 13.46%


### Test split distribution

In [6]:
ham_prop = len(test_set[test_set.labels == "ham"])/len(test_set) * 100
spam_prop = len(test_set[test_set.labels == "spam"])/len(test_set) * 100

print(f"ham: {ham_prop:.4}%\nspam: {spam_prop:.4}%")

ham: 86.74%
spam: 13.26%


## Data Processing

In [7]:
def preprocess(content):
    content = content.lower()
    content = content.translate(str.maketrans("","", string.punctuation))
    content = re.sub(r'\d+ *|\b[a-z]\b *', "", content) # remove isolated letters
    content = content.strip()
    tokens = word_tokenize(content)
    stopwords = set(stpw.words("english"))
    # removes stopwords and duplicates
    content = " ".join(
        list(dict.fromkeys([t for t in tokens if not t in stopwords]))
    )
    return content

In [8]:
train_set.contents = train_set.contents.apply(preprocess)

In [9]:
train_set.head()

Unnamed: 0,labels,contents
0,ham,go jurong point crazy available bugis great wo...
2,spam,free entry wkly comp win fa cup final tkts st ...
3,ham,dun say early hor already
4,ham,nah dont think goes usf lives around though
5,spam,freemsg hey darling weeks word back id like fu...


**Splits train set by labels.**

In [10]:
spam_data = train_set[train_set.labels == "spam"]
spam_freq = len(spam_data)/len(train_set)
ham_data = train_set[train_set.labels == "ham"]
ham_freq = len(ham_data)/len(train_set)
print(f"Number of spam instances: {len(spam_data)}.\nNumber of ham instances: {len(ham_data)}.")

Number of spam instances: 527.
Number of ham instances: 3388.


**Computes word occurences for each class.**

In [11]:
# Spam
spam_contents = (" ".join(spam_data.contents)).split(" ")
uniq_words = list(dict.fromkeys(spam_contents))
occurence_spam_data = pd.DataFrame(
    {
        'word': uniq_words,
        'occurence': [ spam_contents.count(word) for word in uniq_words],
    }
)
occurence_spam_data = occurence_spam_data.sort_values(by="occurence", ascending=False)

In [12]:
# Ham
ham_contents = (" ".join(ham_data.contents)).split(" ")
uniq_words = list(dict.fromkeys(ham_contents))
occurence_ham_data = pd.DataFrame(
    {
        'word': uniq_words,
        'occurence': [ ham_contents.count(word) for word in uniq_words],
    }
)
occurence_ham_data = occurence_ham_data.sort_values(by="occurence", ascending=False)

**Concatenates the words of each class into one list avoiding duplicates.**

In [13]:
uniq_words = list(dict.fromkeys(list(occurence_ham_data.word)+list(occurence_spam_data.word)))
print(f"Total number of words: {len(uniq_words)}.")

Total number of words: 7148.


**Creates the final DataFrame used in the next section. (it takes some time)**

In [14]:
knowledge = pd.DataFrame(
    {
        "words": uniq_words,
        "ham": [
            occurence_ham_data[occurence_ham_data.word == word].occurence.item()
            if len(occurence_ham_data[occurence_ham_data.word == word]) > 0 else 0
            for word in uniq_words
        ],
        "spam": [
            occurence_spam_data[occurence_spam_data.word == word].occurence.item()
            if len(occurence_spam_data[occurence_spam_data.word == word]) > 0 else 0
            for word in uniq_words
        ]
    }
)

knowledge.head()

Unnamed: 0,words,ham,spam
0,im,291,7
1,get,223,53
2,ok,173,3
3,go,168,21
4,dont,167,15


In [15]:
test_set.contents = test_set.contents.apply(preprocess)
test_set.head()

Unnamed: 0,labels,contents
1,ham,ok lar joking wif oni
7,ham,per request melle oru minnaminunginte nurungu ...
8,spam,winner valued network customer selected receiv...
10,ham,im gon na home soon dont want talk stuff anymo...
13,ham,ive searching right words thank breather promi...


## Naive Bayes Classifier

In [16]:
class NaiveBayes:
    """
    Classificator based on Naive Bayes' method to make predictions on tweets (Spam filtering)
    Parameters
    ----------
    knowledge: Pandas DataFrame
        DataFrame of the word occurences for each label/class.
    labels: Pandas Series
        Serie of labels.
    """
    def __init__(self, knowledge, labels):
        self.knowledge = knowledge
        self.labels = labels
        self.alpha = 1
        self.N = len(self.knowledge.words)
        self.priors = self.computes_priors(self.labels)

    def computes_priors(self, labels):
        """ Computes prior probabilities.
        Parameter
        ---------
        labels: Pandas Series
            Serie of labels.
        Recall
        ------
        prior is P(label=l_i).
        """
        priors = []
        for count in labels.value_counts():
            priors.append(count/len(labels))            

        return priors

    def computes_likelihood(self, word, label):
        """ Computes likelihood of the existence of a word in a sentence
        knowing a the sentence's label.
        Parameters
        ----------
        word: string

        label: string
        """
        occ = self.knowledge.loc[self.knowledge.words == word, label].item()
        total = self.labels.value_counts()[label]
        
        return (occ + self.alpha) / (total + self.alpha * self.N)

    def predict(self, sentence):
        """ Predicts a label for a sentence.
        Parameter
        ---------
        sentence: string list.
        """
        ham_p = self.priors[0]
        spam_p = self.priors[1]

        for word in sentence:
            if word in list(self.knowledge.words):
                ham_p *= self.computes_likelihood(word, "ham")
                spam_p *= self.computes_likelihood(word, "spam")
        
        if ham_p > spam_p:
            return "ham"
        elif spam_p > ham_p:
            return "spam"
        else:
            return "unknown"

    def accuracy(self, predicted, expected):
        """ Computes accuracy according to both lists of
        expected and predicted values.
        Parameters
        ----------
        predicted: list

        expected: list
        """
        data = pd.DataFrame({
            'expected': expected,
            'predicted': predicted
        })
        return len(data[data.expected == data.predicted])/len(data)

    def precision(self, predicted, expected, label):
        """ Computes the precision on a label according to
        both lists of expected and predicted values.
        Parameters
        ----------
        predicted: list

        expected: list

        label: string
        """
        data = pd.DataFrame({
            'expected': expected,
            'predicted': predicted
        })
        data = data[data.expected == label]
        return len(data[data.expected == data.predicted])/len(data)

    def recall(self, predicted, expected, label):
        """ Computes recall on a label according to both lists
        of expected and predicted values.
        Parameters
        ----------
        predicted: list

        expected: list

        label: string
        """
        data = pd.DataFrame({
            'expected': expected,
            'predicted': predicted
        })
        data = data[data.predicted == label]
        return len(data[data.expected == data.predicted])/len(data)

    def f1_score(self, predicted, expected, label):
        """ Computes F1 score on a label according to both lists of
        expected and predicted values.
        Parameters
        ----------
        predicted: list

        expected: list

        label: string
        """
        precision_ = self.precision(predicted, expected, label)
        recall_ = self.recall(predicted, expected, label)
        return 2 * (precision_*recall_) / (precision_ + recall_)

    def eval(self, predicted, expected):
        """
        Prints infos on classificator's efficiency based on expectations and results
        """
        print(f"Acccuracy: {self.accuracy(predicted, expected)}")
        print(f"Precision on ham label: {self.precision(predicted, expected, 'ham')}")
        print(f"Precision on spam label: {self.precision(predicted, expected, 'spam')}")
        print(f"Recall on ham label: {self.recall(predicted, expected, 'ham')}")
        print(f"Recall on spam label: {self.recall(predicted, expected, 'spam')}")
        print(f"F1 score on ham label: {self.f1_score(predicted, expected, 'ham')}")
        print(f"F1 score on spam label: {self.f1_score(predicted, expected, 'spam')}")


In [17]:
clf = NaiveBayes(knowledge, train_set.labels)

In [18]:
clf.predict(["im"])

'ham'

In [19]:
clf.predict(["call", "free"])

'spam'

## Test classifier performance

In [20]:
expected = list(test_set.labels)
predicted = [clf.predict(sentence.split(" ")) for sentence in test_set.contents]

In [21]:
clf.eval(predicted, expected)

Acccuracy: 0.9783001808318263
Precision on ham label: 1.0
Precision on spam label: 0.8363636363636363
Recall on ham label: 0.9755932203389831
Recall on spam label: 1.0
F1 score on ham label: 0.9876458476321209
F1 score on spam label: 0.9108910891089108


**Time performance**

In [22]:
print("Execution time : " + str(time.time() - start))

Execution time : 85.97044634819031
