In [4]:
# load the dataset
import pandas as pd

df = pd.read_csv("/home/artur/Desktop/Spam_Classifier/src/utils/spam.csv", encoding='ISO-8859-1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [163]:
df["v1"].value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

In [17]:
import nltk # natural language toolkit

nltk.download("wordnet")
nltk.download("stopwords")


[nltk_data] Downloading package wordnet to /home/artur/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/artur/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
tokenizer = nltk.RegexpTokenizer(r"\w+")
test_message = "Hey,,, GGggGG feet it going? <HTML><bads> bads 'randoms' badly"

test_message_tokenized = tokenizer.tokenize(test_message) # creates an array with each string without adding caractheres
test_message_tokenized

['Hey',
 'GGggGG',
 'feet',
 'it',
 'going',
 'HTML',
 'bads',
 'bads',
 'randoms',
 'badly']

In [14]:
test_message_lowercased = [t.lower() for t in test_message_tokenized] # set all string in test_message_tokenized to lowercase
test_message_lowercased

['hey',
 'gggggg',
 'feet',
 'it',
 'going',
 'html',
 'bads',
 'bads',
 'randoms',
 'badly']

In [21]:
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer 

lemmatizer = WordNetLemmatizer()

test_message_lemmatized_tokens = [lemmatizer.lemmatize(t) for t in test_message_lowercased] # put words in infinitive
test_message_lemmatized_tokens

[nltk_data] Downloading package omw-1.4 to /home/artur/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


['hey',
 'gggggg',
 'foot',
 'it',
 'going',
 'html',
 'bad',
 'bad',
 'randoms',
 'badly']

In [23]:
from nltk.corpus import stopwords

stopwords = stopwords.words("english") # get regular words that can be ignored for example (it, from, to, how)

test_message_usefull_tokens = [t for t in test_message_lemmatized_tokens if t not in stopwords]
test_message_usefull_tokens

['hey', 'gggggg', 'foot', 'going', 'html', 'bad', 'bad', 'randoms', 'badly']

In [25]:
# transform a message (email) into a list of tokens
def message_to_token_list(s):
    tokens = tokenizer.tokenize(s)
    lowercased_tokens = [t.lower() for t in tokens]
    lemmatized_tokens = [lemmatizer.lemmatize(t) for t in lowercased_tokens]
    useful_tokens = [t for t in lemmatized_tokens if t not in stopwords]
    return useful_tokens


message_to_token_list(test_message)

['hey', 'gggggg', 'foot', 'going', 'html', 'bad', 'bad', 'randoms', 'badly']

In [30]:
df = df.sample(frac=1, random_state=1)
df = df.reset_index(drop=True)

split_index = int(len(df) * 0.7) # the first 70% is for train and the last 30 is for test

train_df, test_df = df[:split_index], df[split_index:]

train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

train_df, test_df

(        v1                                                 v2 Unnamed: 2  \
 0      ham  Wen ur lovable bcums angry wid u, dnt take it ...        NaN   
 1      ham                    Is there any training tomorrow?        NaN   
 2      ham                            Howz that persons story        NaN   
 3      ham  I don't know, same thing that's wrong everyso ...        NaN   
 4      ham  Must come later.. I normally bathe him in da a...        NaN   
 ...    ...                                                ...        ...   
 3895  spam  Update_Now - Xmas Offer! Latest Motorola, Sony...        NaN   
 3896   ham                    Just glad to be talking to you.        NaN   
 3897   ham  Nah can't help you there, I've never had an ip...        NaN   
 3898   ham                             Yes.i'm in office da:)        NaN   
 3899   ham  Book which lesson? then you msg me... I will c...        NaN   
 
      Unnamed: 3 Unnamed: 4  
 0           NaN        NaN  
 1           N

In [34]:
token_counter = {}

for message in train_df["v2"]:
    message_as_token_lst = message_to_token_list(message)
    
    for token in message_as_token_lst:
        if token in token_counter:
            token_counter[token] += 1
        else:
            token_counter[token] = 1

len(token_counter)  # amount of unique token in token_counter

6599

In [154]:
token_counter

{'wen': 16,
 'ur': 289,
 'lovable': 6,
 'bcums': 2,
 'angry': 7,
 'wid': 16,
 'u': 907,
 'dnt': 7,
 'take': 94,
 'seriously': 7,
 'coz': 14,
 'childish': 4,
 'n': 105,
 'true': 15,
 'way': 70,
 'showing': 3,
 'deep': 8,
 'affection': 3,
 'care': 48,
 'luv': 26,
 'kettoda': 2,
 'manda': 2,
 'nice': 42,
 'day': 194,
 'da': 109,
 'training': 3,
 'tomorrow': 69,
 'howz': 5,
 'person': 35,
 'story': 13,
 'know': 190,
 'thing': 84,
 'wrong': 13,
 'everyso': 1,
 'often': 2,
 'panicks': 1,
 'start': 31,
 'goin': 17,
 'bout': 13,
 'bein': 2,
 'good': 187,
 'enough': 19,
 'û_': 14,
 'must': 17,
 'come': 191,
 'later': 85,
 'normally': 2,
 'bathe': 8,
 'afternoon': 21,
 'mah': 8,
 'ha': 93,
 'û': 27,
 'either': 17,
 'clever': 1,
 'simple': 11,
 'pear': 1,
 'perfect': 3,
 'christmas': 11,
 'hmm': 10,
 'yeah': 60,
 'grooved': 1,
 'im': 57,
 'looking': 22,
 'forward': 6,
 'pound': 22,
 'special': 38,
 'babe': 57,
 'lost': 11,
 'try': 33,
 'rebooting': 1,
 'hi': 104,
 'dude': 17,
 'hw': 9,
 'r': 132,

In [41]:
# check how many times the token appears in token_counter

#processed_token -> each token in token_counter
#threshold > positive integer
def keep_token(processed_token, threshold):
    if processed_token not in token_counter:
        return False
    else:
        return token_counter[processed_token] > threshold
    
# keep_token("euro", 10) -> return false because the token "euro" does not appear more than 10 times
keep_token("euro", 1)

True

In [236]:
features = set() # bag of words

for token in token_counter:
    if keep_token(token, 200):
        features.add(token)

features

{'2', '4', 'call', 'get', 'go', 'gt', 'lt', 'ok', 'u', 'ur', 'å'}

In [237]:
features = list(features)
features

['ur', 'å', 'get', '4', 'u', 'lt', 'call', 'go', 'gt', 'ok', '2']

In [238]:
#create a map with the bag of words (features) and give an index to each one
token_to_index_mapping = {t:i for t, i in zip(features, range(len(features)))}
token_to_index_mapping

{'ur': 0,
 'å': 1,
 'get': 2,
 '4': 3,
 'u': 4,
 'lt': 5,
 'call': 6,
 'go': 7,
 'gt': 8,
 'ok': 9,
 '2': 10}

In [239]:
message_to_token_list('ur u <br> .com get font font lt randoms ok ok')

['ur', 'u', 'br', 'com', 'get', 'font', 'font', 'lt', 'randoms', 'ok', 'ok']

In [240]:

# Bag of words (counts)

# -> ur, å, get, 4,  u, lt, call, go, gt, ok,  2]
# -> 0  1   2    3   4  5     6   7   8   9   10 [tokens index in token_to_index_mapping]
# -> 1  0   1    0   1  1     0   0   0   2   0 [times each token appeared in message/email]

In [241]:
import numpy as np

#create vector and count appearences of tokens in email
def message_to_count_vector(message):
    count_vector = np.zeros(len(features))
    processed_list_of_tokens = message_to_token_list(message)
    
    for token in processed_list_of_tokens:
        if token not in features:
            continue
        index = token_to_index_mapping[token]
        count_vector[index] += 1
    
    return count_vector

#message_to_count_vector('ur u <br> .com get font font lt randoms ok ok')
# train_df['v2'].iloc[12],   message_to_count_vector(train_df['v2'].iloc[1000])

In [242]:
def df_to_X_y(dff):
    y = dff['v1']
    
    message_col = dff["v2"]
    count_vectors = []
    
    for message in message_col:
        count_vector = message_to_count_vector(message)
        count_vectors.append(count_vector)
        
    x = np.array(count_vectors).astype(int)
    
    return x, y

In [243]:
x_train, y_train = df_to_X_y(train_df)

x_test, y_test = df_to_X_y(test_df)

x_train.shape, y_train.shape, x_test.shape, y_test.shape

((3900, 11), (3900,), (1672, 11), (1672,))

In [244]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler().fit(x_train)

x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)

x_train

array([[0.16666667, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.5       , ..., 0.        , 0.        ,
        0.        ]])

In [245]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

lr = LogisticRegression().fit(x_train, y_train)
print(classification_report(y_test, lr.predict(x_test)))

              precision    recall  f1-score   support

         ham       0.91      0.99      0.95      1441
        spam       0.89      0.38      0.53       231

    accuracy                           0.91      1672
   macro avg       0.90      0.69      0.74      1672
weighted avg       0.91      0.91      0.89      1672



In [246]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier().fit(x_train, y_train)
print(classification_report(y_test, rf.predict(x_test)))

              precision    recall  f1-score   support

         ham       0.92      1.00      0.96      1441
        spam       0.95      0.46      0.62       231

    accuracy                           0.92      1672
   macro avg       0.93      0.73      0.79      1672
weighted avg       0.92      0.92      0.91      1672

