# Spam classification
An email spam classifier built in Python with NLTK (the Natural Language Toolkit). Each message is tokenized, lowercased, lemmatized (lemmatization is used here rather than stemming), and stripped of stop words; the remaining tokens are then encoded with the bag-of-words (BoW) approach to build the model.

Dataset: https://www.kaggle.com/chandramoulinaidu/spam-classification-for-basic-nlp

In [2]:
import pandas as pd

# Read the raw e-mail dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/Spam Email raw text for NLP.csv")
df.head()

Unnamed: 0,CATEGORY,MESSAGE,FILE_NAME
0,1,"Dear Homeowner,\n\n \n\nInterest Rates are at ...",00249.5f45607c1bffe89f60ba1ec9f878039a
1,1,ATTENTION: This is a MUST for ALL Computer Use...,00373.ebe8670ac56b04125c25100a36ab0510
2,1,This is a multi-part message in MIME format.\n...,00214.1367039e50dc6b7adb0f2aa8aba83216
3,1,IMPORTANT INFORMATION:\n\n\n\nThe new domain n...,00210.050ffd105bd4e006771ee63cabc59978
4,1,This is the bottom line. If you can GIVE AWAY...,00033.9babb58d9298daa2963d4f514193d7d6


In [3]:
df.tail()

Unnamed: 0,CATEGORY,MESSAGE,FILE_NAME
5791,0,"I'm one of the 30,000 but it's not working ver...",00609.dd49926ce94a1ea328cce9b62825bc97
5792,0,Damien Morton quoted:\n\n>W3C approves HTML 4 ...,00957.e0b56b117f3ec5f85e432a9d2a47801f
5793,0,"On Mon, 2002-07-22 at 06:50, che wrote:\n\n\n\...",01127.841233b48eceb74a825417d8d918abf8
5794,0,"Once upon a time, Manfred wrote :\n\n\n\n> I w...",01178.5c977dff972cd6eef64d4173b90307f0
5795,0,"If you run Pick, and then use the ""New FTOC"" b...",00747.352d424267d36975a7b40b85ffd0885e


In [4]:
df["CATEGORY"].value_counts() # 1 = spam, 0 = not spam

CATEGORY
0    3900
1    1896
Name: count, dtype: int64

In [5]:
import nltk

# Corpora needed later in the notebook: WordNet is used by WordNetLemmatizer,
# and the stop-word lists are read through nltk.corpus.stopwords.
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/eriklarsson/nltk_data...
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/eriklarsson/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
# r"\w+" keeps only runs of word characters, so punctuation, quotes and
# angle brackets never end up as tokens.
tokenizer = nltk.RegexpTokenizer(r"\w+")
test_message = "Hey,, GGggGG feet it going <HTML><bads> bads 'randoms' badly"

test_message_tokenized = tokenizer.tokenize(test_message)
test_message_tokenized

['Hey',
 'GGggGG',
 'feet',
 'it',
 'going',
 'HTML',
 'bads',
 'bads',
 'randoms',
 'badly']

In [8]:
# Case-fold every token so that "Hey" and "hey" count as the same word.
test_message_lowercased = list(map(str.lower, test_message_tokenized))
test_message_lowercased

['hey',
 'gggggg',
 'feet',
 'it',
 'going',
 'html',
 'bads',
 'bads',
 'randoms',
 'badly']

In [9]:
from nltk.stem import WordNetLemmatizer

# A single lemmatizer instance, reused by the cells further down.
lemmatizer = WordNetLemmatizer()

# Reduce each token to its dictionary form (e.g. "feet" -> "foot").
test_message_lemmatized = list(map(lemmatizer.lemmatize, test_message_lowercased))
test_message_lemmatized

['hey',
 'gggggg',
 'foot',
 'it',
 'going',
 'html',
 'bad',
 'bad',
 'randoms',
 'badly']

In [10]:
# Import the corpus reader under an alias: binding the word list to the name
# `stopwords` would otherwise overwrite the reader itself, and re-running this
# cell would then fail because a list has no `.words` method.
from nltk.corpus import stopwords as stopwords_corpus

# English stop words as a plain list; later cells filter tokens against it.
stopwords = stopwords_corpus.words('english')
stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [11]:
# Remove stop words from the lemmatized sample (here only 'it' is dropped).
test_message_useful_tokens = [t for t in test_message_lemmatized if t not in stopwords]
test_message_useful_tokens

['hey', 'gggggg', 'foot', 'going', 'html', 'bad', 'bad', 'randoms', 'badly']

In [12]:
def message_to_token_list(s):
    """Turn a raw message into a list of cleaned tokens.

    Steps: tokenize with the notebook's `tokenizer`, lowercase, lemmatize
    with `lemmatizer`, then drop every token found in `stopwords`.

    Args:
        s: The message text.

    Returns:
        The remaining tokens in their original order (duplicates kept).
    """
    # Build the lookup set once per call so each token costs a hash lookup
    # rather than a linear scan of the stop-word list.
    stop_set = set(stopwords)

    # NOTE(review): stop words are removed after lemmatization, so a stop word
    # whose lemma differs from its listed form would survive the filter;
    # confirm this ordering is intended.
    lemmas = (lemmatizer.lemmatize(raw.lower()) for raw in tokenizer.tokenize(s))
    return [lemma for lemma in lemmas if lemma not in stop_set]

message_to_token_list(test_message)

['hey', 'gggggg', 'foot', 'going', 'html', 'bad', 'bad', 'randoms', 'badly']

In [14]:
# Shuffle into a new frame rather than overwriting `df`: re-running this cell
# then always reproduces the same split instead of reshuffling data that was
# already shuffled by a previous run.
shuffled_df = df.sample(frac=1, random_state=1).reset_index(drop=True)

# First 80 % of the shuffled rows for training, the remainder for testing.
split_index = int(len(shuffled_df) * 0.8)
train_df = shuffled_df.iloc[:split_index].reset_index(drop=True)
test_df = shuffled_df.iloc[split_index:].reset_index(drop=True)

train_df, test_df

(      CATEGORY                                            MESSAGE  \
 0            1  <HTML>\n\n<BODY>\n\n<META HTTP-EQUIV=3D"Conten...   
 1            1  <html>\n\n\n\n\n\n</head>\n\n\n\n<body>\n\n\n\...   
 2            1  <HTML><HEAD><TITLE>New Web Technology</TITLE>\...   
 3            1  \n\n<HTML>\n\n<HEAD>\n\n<TITLE>bizmagoffer</TI...   
 4            0  URL: http://boingboing.net/#85485824\n\nDate: ...   
 ...        ...                                                ...   
 4631         0  Doom 3 will be based on a peer to peer archite...   
 4632         1  \n\n\n\n<HTML>\n\n<HEAD>\n\n<TITLE>Cards In Ad...   
 4633         1  <BODY BGCOLOR="#FFFFFF">\n\n\n\n<DIV ALIGN="ce...   
 4634         0  Yes - great minds think alike. But even withpu...   
 4635         0  On Tue, Aug 06, 2002 at 04:21:22 +0100, David ...   
 
                                    FILE_NAME  
 0     00275.87c74dc27e397ccd3b2b581bbefef515  
 1     00531.f3fffa4504c7009a03dd0d44a4562a84  
 2     01078.5

In [15]:
from collections import Counter

# Token frequencies counted over the training messages only.
# Counter is a dict subclass, so `in`, indexing and len() keep working.
token_counter = Counter()

for message in train_df['MESSAGE']:
    token_counter.update(message_to_token_list(message))

len(token_counter)

76915

In [19]:
def keep_token(proccessed_token, threshold, counts=None):
    """Decide whether a token is frequent enough to be used as a feature.

    Args:
        proccessed_token: A token as produced by `message_to_token_list`.
        threshold: The token is kept only if its count is strictly greater
            than this value.
        counts: Optional token -> count mapping. When omitted, the notebook's
            global `token_counter` is used.

    Returns:
        True if the token's count exceeds `threshold`; False otherwise,
        including whenever the token was never counted.
    """
    if counts is None:
        counts = token_counter
    # Unknown tokens are rejected outright, regardless of the threshold.
    if proccessed_token not in counts:
        return False
    return counts[proccessed_token] > threshold

print(keep_token('random', 10))
print(keep_token('random', 100))

True
False


In [20]:
# Keep only the tokens that occur more than 10,000 times in the training messages.
features = {token for token in token_counter if keep_token(token, 10000)}

features

{'3d', 'b', 'br', 'com', 'font', 'http', 'nbsp', 'p', 'size', 'td', 'tr'}

In [21]:
# Freeze the set into a list so each feature token has a fixed position.
# NOTE(review): the iteration order of a set of strings is not guaranteed to be
# the same across interpreter runs, so the order displayed below may change on
# re-run; consider sorted(features) if a stable column order matters.
features = list(features)
features

['tr', 'com', 'http', 'nbsp', 'size', 'font', 'br', '3d', 'td', 'b', 'p']

In [22]:
# Column index of each feature token inside the count vector.
token_to_index_mapping = {token: position for position, token in enumerate(features)}
token_to_index_mapping

{'tr': 0,
 'com': 1,
 'http': 2,
 'nbsp': 3,
 'size': 4,
 'font': 5,
 'br': 6,
 '3d': 7,
 'td': 8,
 'b': 9,
 'p': 10}

In [23]:
message_to_token_list('3d b <br> .com bad font font com randoms')

['3d', 'b', 'br', 'com', 'bad', 'font', 'font', 'com', 'randoms']

In [24]:
# "Bag of Words" (counts vector)
#
# Worked example for the message '3d b <br> .com bad font font com randoms'.
# The column order shown here is illustrative only and need not match
# token_to_index_mapping.
#
# token ->  http  tr  size  3d  font  br  com  td   p   b
# index ->    0    1    2    3   4    5    6    7   8   9
# count ->   [0,   0,   0,   1,  2,   1,   2,   0,  0,  1]

[0.,  0.,  0.,   1., 2.,  1., 2.,  0., 0., 1.]

[0.0, 0.0, 0.0, 1.0, 2.0, 1.0, 2.0, 0.0, 0.0, 1.0]

In [25]:
import numpy as np

def message_to_count_vector(message):
    """Encode a message as a bag-of-words count vector.

    Args:
        message: Raw message text.

    Returns:
        A float NumPy array of length `len(features)`; position `i` holds how
        many times the feature mapped to index `i` occurs in the message.
        Tokens that are not features are ignored.
    """
    count_vector = np.zeros(len(features))

    for token in message_to_token_list(message):
        # A single dictionary lookup both tests membership and yields the
        # column, avoiding a linear scan of the `features` list per token.
        index = token_to_index_mapping.get(token)
        if index is not None:
            count_vector[index] += 1

    return count_vector

message_to_count_vector('3d b <br> .com bad font font com randoms')

array([0., 2., 0., 0., 0., 2., 1., 1., 0., 1., 0.])

In [26]:
message_to_count_vector(train_df['MESSAGE'].iloc[3])

array([32., 15., 13.,  7., 12., 30.,  7.,  0., 36., 14.,  7.])

In [27]:
train_df.iloc[3]

CATEGORY                                                     1
MESSAGE      \n\n<HTML>\n\n<HEAD>\n\n<TITLE>bizmagoffer</TI...
FILE_NAME               00627.4e9619c454da17a27d4a66c87583dd49
Name: 3, dtype: object

In [28]:
def df_to_X_y(dff):
    """Convert a message frame into model inputs.

    Args:
        dff: DataFrame with a 'MESSAGE' text column and a 'CATEGORY' label column.

    Returns:
        (X, y): X is an integer array holding one count vector per row of
        `dff`; y is the integer array of 'CATEGORY' labels.
    """
    y = dff['CATEGORY'].to_numpy().astype(int)

    # One count vector per message, stacked into a 2-D array.
    rows = [message_to_count_vector(text) for text in dff['MESSAGE']]
    X = np.array(rows).astype(int)

    return X, y

In [29]:
# Vectorize both splits with the same feature list and token-to-index mapping.
X_train, y_train = df_to_X_y(train_df)

X_test, y_test = df_to_X_y(test_df)

# Shapes of the feature matrices and label vectors for both splits.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((4636, 11), (4636,), (1160, 11), (1160,))

In [30]:
from sklearn.preprocessing import MinMaxScaler

# Fit the min-max scaler on the training features only, then apply that same
# scaling to both splits.
scaler = MinMaxScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

X_train

array([[0.        , 0.        , 0.00386847, ..., 0.        , 0.2254902 ,
        0.02439024],
       [0.        , 0.00947867, 0.00386847, ..., 0.        , 0.31372549,
        0.1097561 ],
       [0.20472441, 0.01421801, 0.00580271, ..., 0.13468013, 0.2745098 ,
        0.04065041],
       ...,
       [0.14173228, 0.12322275, 0.04448743, ..., 0.09427609, 0.        ,
        0.00406504],
       [0.        , 0.00473934, 0.00386847, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.00386847, ..., 0.        , 0.        ,
        0.        ]])

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Baseline: logistic regression with scikit-learn's default settings.
lr = LogisticRegression()
lr.fit(X_train, y_train)
print(classification_report(y_test, lr.predict(X_test)))

              precision    recall  f1-score   support

           0       0.77      1.00      0.87       790
           1       0.99      0.35      0.52       370

    accuracy                           0.79      1160
   macro avg       0.88      0.68      0.69      1160
weighted avg       0.84      0.79      0.76      1160



In [32]:
# Compare logistic regression to random forest

from sklearn.ensemble import RandomForestClassifier

# Seed the forest so the reported metrics are reproducible across runs
# (same seed value as the shuffle used for the train/test split).
rf = RandomForestClassifier(random_state=1).fit(X_train, y_train)
print(classification_report(y_test, rf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.83      0.96      0.89       790
           1       0.88      0.59      0.70       370

    accuracy                           0.84      1160
   macro avg       0.85      0.77      0.80      1160
weighted avg       0.85      0.84      0.83      1160

