**Spam Classification using NLP**

In [22]:
import pandas as pd 

df = pd.read_csv("Spam Email raw text for NLP.csv")
df.head()

Unnamed: 0,CATEGORY,MESSAGE,FILE_NAME
0,1,"Dear Homeowner,\n\n \n\nInterest Rates are at ...",00249.5f45607c1bffe89f60ba1ec9f878039a
1,1,ATTENTION: This is a MUST for ALL Computer Use...,00373.ebe8670ac56b04125c25100a36ab0510
2,1,This is a multi-part message in MIME format.\n...,00214.1367039e50dc6b7adb0f2aa8aba83216
3,1,IMPORTANT INFORMATION:\n\n\n\nThe new domain n...,00210.050ffd105bd4e006771ee63cabc59978
4,1,This is the bottom line. If you can GIVE AWAY...,00033.9babb58d9298daa2963d4f514193d7d6


In [23]:
df.tail()

Unnamed: 0,CATEGORY,MESSAGE,FILE_NAME
5791,0,"I'm one of the 30,000 but it's not working ver...",00609.dd49926ce94a1ea328cce9b62825bc97
5792,0,Damien Morton quoted:\n\n>W3C approves HTML 4 ...,00957.e0b56b117f3ec5f85e432a9d2a47801f
5793,0,"On Mon, 2002-07-22 at 06:50, che wrote:\n\n\n\...",01127.841233b48eceb74a825417d8d918abf8
5794,0,"Once upon a time, Manfred wrote :\n\n\n\n> I w...",01178.5c977dff972cd6eef64d4173b90307f0
5795,0,"If you run Pick, and then use the ""New FTOC"" b...",00747.352d424267d36975a7b40b85ffd0885e


In [24]:
df['CATEGORY'].value_counts()

CATEGORY
0    3900
1    1896
Name: count, dtype: int64

In [1]:
import nltk

nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to C:\Users\Tejas
[nltk_data]     H\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Tejas
[nltk_data]     H\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
tokenizer = nltk.RegexpTokenizer(r'\w+')
test_message = "Hey..GGggGG feet it going? <HTML><bads> bads 'randoms' badly"

test_message_tokenized = tokenizer.tokenize(test_message)
test_message_tokenized

['Hey',
 'GGggGG',
 'feet',
 'it',
 'going',
 'HTML',
 'bads',
 'bads',
 'randoms',
 'badly']

In [4]:
test_message_lowercase = [t.lower() for t in test_message_tokenized]
test_message_lowercase 

['hey',
 'gggggg',
 'feet',
 'it',
 'going',
 'html',
 'bads',
 'bads',
 'randoms',
 'badly']

In [5]:
from nltk.stem import WordNetLemmatizer

lemmatized = WordNetLemmatizer()

test_message_lemmatized_tokens = [lemmatized.lemmatize(t) for t in test_message_lowercase]
test_message_lemmatized_tokens

['hey',
 'gggggg',
 'foot',
 'it',
 'going',
 'html',
 'bad',
 'bad',
 'randoms',
 'badly']

In [6]:
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
test_message_useful_tokens = [t for t in test_message_lemmatized_tokens if t not in stopwords]
test_message_useful_tokens

['hey', 'gggggg', 'foot', 'going', 'html', 'bad', 'bad', 'randoms', 'badly']

In [10]:
def message_to_token_list(s):
    tokens = tokenizer.tokenize(s)
    lowercased_tokens = [t.lower() for t in tokens]
    lemmatized_tokens = [lemmatized.lemmatize(t) for t in lowercased_tokens]
    useful_tokens = [t for t in lemmatized_tokens if t not in stopwords]
                                  
    return useful_tokens
message_to_token_list(test_message)

['hey', 'gggggg', 'foot', 'going', 'html', 'bad', 'bad', 'randoms', 'badly']

In [25]:
df = df.sample(frac=1, random_state=1)
df = df.reset_index(drop=True)

split_index = int(len(df)*0.8)
train_df,test_df = df[:split_index], df[split_index:]

train_df = train_df.reset_index(drop=True)
train_df = test_df.reset_index(drop=True)

train_df,test_df

(      CATEGORY                                            MESSAGE  \
 0            0  This is just an semi-educated guess - if I'm w...   
 1            1  ------=_NextPart_000_00B0_58C75D0E.A4523D08\n\...   
 2            0  I seem to be getting the known spam message nu...   
 3            0  \n\n\n\n>>>>> On Mon, 30 Sep 2002, "Ted" == Te...   
 4            1  This is a multi-part message in MIME format.\n...   
 ...        ...                                                ...   
 1155         1  <html>\n\n\n\n<body>\n\n\n\n<font size="2" PTS...   
 1156         0  \n\n\n\nformail did the trick. Thanks to those...   
 1157         0  URL: http://www.askbjoernhansen.com/archives/2...   
 1158         1  <html>\n\n<head>\n\n   <meta http-equiv=3D"Con...   
 1159         0  >>>>> "E" == Elias Sinderson <elias@cse.ucsc.e...   
 
                                    FILE_NAME  
 0     01503.5e13994a5676296ed31b14e83367031c  
 1     00441.3b9c3055e08bda4c0f7eea43749e324c  
 2     00623.8

In [26]:
token_counter = {}

for message in train_df['MESSAGE']:
    message_as_token_list = message_to_token_list(message)

    for token in message_as_token_list:
        if token in token_counter:
            token_counter[token] += 1
        else:
            token_counter[token] = 1
    
len(token_counter)

34408

In [27]:
token_counter

{'semi': 4,
 'educated': 5,
 'guess': 35,
 'wrong': 77,
 'someone': 105,
 'please': 412,
 'correct': 28,
 'spamd': 35,
 'setuid': 3,
 'user': 659,
 'running': 97,
 'spamc': 19,
 'since': 177,
 'calling': 31,
 'global': 71,
 'procmailrc': 20,
 'file': 256,
 'run': 170,
 'root': 82,
 'likely': 29,
 'called': 81,
 'open': 108,
 'user_prefs': 9,
 'man': 74,
 'page': 202,
 'u': 920,
 'username': 4,
 'argument': 35,
 'ha': 679,
 'obsoleted': 1,
 'use': 528,
 'per': 125,
 'config': 28,
 'whose': 28,
 'load': 39,
 'though': 107,
 'eg': 5,
 'mail': 872,
 'nobody': 19,
 'cyrus': 2,
 'etc': 165,
 'still': 182,
 'flag': 11,
 'solution': 62,
 'set': 170,
 'dropprivs': 4,
 'yes': 120,
 'procmail': 34,
 'drop': 30,
 'privilege': 8,
 'might': 132,
 'suid': 2,
 'sgid': 2,
 'useful': 34,
 'want': 333,
 'guarantee': 68,
 'bottom': 156,
 'half': 65,
 'executed': 6,
 'behalf': 33,
 'recipient': 28,
 'hope': 60,
 'help': 219,
 'also': 397,
 'right': 470,
 'st': 36,
 'original': 132,
 'message': 599,
 'spama

In [28]:
def keep_token(processed_token, threshold):
    if processed_token not in token_counter:
        return False
    else:
        return token_counter[processed_token] > threshold
    
keep_token('second',92)

False

In [15]:
features = set()

for token in token_counter:
    if keep_token(token,1000):
        features.add(token)

features

set()

In [16]:
features = list(features)
features

[]

In [17]:
token_to_index_mapping = {t:i for t,i in zip(features, range(len(features)))}
token_to_index_mapping

{}

In [18]:
message_to_token_list('3d b <br> .com bad font font com randoms')

['3d', 'b', 'br', 'com', 'bad', 'font', 'font', 'com', 'randoms']

In [18]:
# bag of words (counts vector)

# -> http  tr  size  3d  font  br  com  td  p  b
# ->  0     1    2    3    4   5    6    7  8  9
# -> [0,    0    0,   1,   2,  1,   2,   1, 0, 1]


In [19]:
import numpy as np  

def message_to_count_vector(message):
    count_vector = np.zeros(len(features))

    processed_list_of_tokens = message_to_token_list(message)

    for token in processed_list_of_tokens:
        if token not in features:
            continue
        index = token_to_index_mapping[token]
        count_vector[index] += 1

    return count_vector
message_to_count_vector('3d b <br> .com bad font font com randoms')

array([], dtype=float64)

In [29]:
message_to_count_vector(train_df['MESSAGE'].iloc[0])


array([], dtype=float64)

In [34]:
train_df.iloc[1]               # 1 - spam  0 - not spam 

CATEGORY                                                     1
MESSAGE      ------=_NextPart_000_00B0_58C75D0E.A4523D08\n\...
FILE_NAME               00441.3b9c3055e08bda4c0f7eea43749e324c
Name: 1, dtype: object

In [28]:
def df_to_x_y(dff):
    y = dff['CATEGORY'].to_numpy().astype(int)

    message_col = dff['MESSAGE']
    count_vectors = []

    for message in message_col:
        count_vector = message_to_count_vector(message)
        count_vectors.append(count_vector)

    X = np.array(count_vectors).astype(int)

    return X,y


In [29]:
x_train, y_train = df_to_x_y(train_df)

x_test, y_test = df_to_x_y(test_df)

x_train.shape , y_train.shape, x_test.shape, y_test.shape

((1160, 26), (1160,), (1160, 26), (1160,))

In [30]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler().fit(x_train)

x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)

x_train

array([[0.        , 0.        , 0.        , ..., 0.02272727, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.00355872, 0.        , ..., 0.06818182, 0.        ,
        0.        ],
       ...,
       [0.0078125 , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.03558719, 0.14049587, ..., 0.04545455, 0.03333333,
        0.05839416],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

lr = LogisticRegression().fit(x_train,y_train)
print(classification_report(y_test, lr.predict(x_test)))

              precision    recall  f1-score   support

           0       0.77      1.00      0.87       788
           1       0.99      0.37      0.54       372

    accuracy                           0.80      1160
   macro avg       0.88      0.68      0.70      1160
weighted avg       0.84      0.80      0.76      1160



In [32]:
# compare logistic regression with random forest

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier().fit(x_train,y_train)
print(classification_report(y_test,rf.predict(x_test)))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       788
           1       0.99      0.92      0.96       372

    accuracy                           0.97      1160
   macro avg       0.98      0.96      0.97      1160
weighted avg       0.97      0.97      0.97      1160

