In [1]:
import pandas as pd
from dictionary import dictionary as dictionary
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import classification_report
from re import sub as regex
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer as Stemmer

In [2]:
df = pd.read_csv("undersampled_spam.csv", encoding='latin')
df = df.drop(columns=["Unnamed: 0"])
df.head()

Unnamed: 0,type,sms,processedSMS,filteredSMS
0,0,ok. I am a gentleman and will treat you with d...,ok gentleman treat digniti respect,"['ok', 'gentleman', 'treat', 'digniti', 'respe..."
1,0,Lol its ok I didn't remember til last nite,lol ok rememb til last nite,"['lol', 'ok', 'rememb', 'til', 'last', 'nite']"
2,0,Apo all other are mokka players only,apo mokka player,"['apo', 'mokka', 'player']"
3,0,Hey cutie. How goes it? Here in WALES its kind...,hey cuti goe wale kinda ok like hill shit stil...,"['hey', 'cuti', 'goe', 'wale', 'kinda', 'ok', ..."
4,0,How much i gave to you. Morning.,much gave morn,"['much', 'gave', 'morn']"


In [3]:
def embed(df):
    embedding_layer = []
    for sentences in df:
        sentence = []
        if len(sentences) < 1:
            embedding_layer.append([dictionary['[empty]']])
        else:
            for word in sentences:
                if any(charater.isdigit() for charater in word):
                    sentence.append(dictionary['[number]'])
                elif word in dictionary:
                    sentence.append(dictionary[word])
            embedding_layer.append(sentence)
    
    embedding_dim = []
    for i in embedding_layer:
        padding = []
        for j in range(200 - len(i)):
            padding.append(0)
        padding.extend(i)
        embedding_dim.append(padding)
    return embedding_dim
    
df['embed'] = embed(df['filteredSMS'])
df.head()  

Unnamed: 0,type,sms,processedSMS,filteredSMS,embed
0,0,ok. I am a gentleman and will treat you with d...,ok gentleman treat digniti respect,"['ok', 'gentleman', 'treat', 'digniti', 'respe...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,0,Lol its ok I didn't remember til last nite,lol ok rememb til last nite,"['lol', 'ok', 'rememb', 'til', 'last', 'nite']","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,0,Apo all other are mokka players only,apo mokka player,"['apo', 'mokka', 'player']","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,0,Hey cutie. How goes it? Here in WALES its kind...,hey cuti goe wale kinda ok like hill shit stil...,"['hey', 'cuti', 'goe', 'wale', 'kinda', 'ok', ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,0,How much i gave to you. Morning.,much gave morn,"['much', 'gave', 'morn']","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [4]:
features, labels = embed(df['filteredSMS']), df["type"]
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.15)
x_train, x_val, y_train, y_val = train_test_split(features, labels, test_size=0.30)

In [5]:
model = ComplementNB().fit(x_train, y_train)

In [6]:
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.91      0.93      0.92       126
           1       0.91      0.88      0.89        99

    accuracy                           0.91       225
   macro avg       0.91      0.90      0.90       225
weighted avg       0.91      0.91      0.91       225



In [7]:
y_pred = model.predict(x_val)
print(classification_report(y_val, y_pred))


              precision    recall  f1-score   support

           0       0.87      0.91      0.89       219
           1       0.91      0.87      0.89       230

    accuracy                           0.89       449
   macro avg       0.89      0.89      0.89       449
weighted avg       0.89      0.89      0.89       449



In [8]:
def processSMS(sms):
    # Removes special characters - only letters and numbers remain + lowers text
    sms = regex(r'[^a-zA-Z0-9]', ' ', sms.lower())
    # Utilizes NLTK to tokenize text
    sms = word_tokenize(sms)
    # Utilizes removes NLTK stopwords from sms
    sms = [word for word in sms if word not in stopwords.words('english')]
    # Utilizes NLTK Stemmer to stem words
    sms = [Stemmer().stem(word) for word in sms]
    return [sms]

In [9]:
sms = 'free amazon gift card'
model.predict(embed(processSMS(sms)))

array([0], dtype=int64)

In [10]:
sms = 'how are you today'
model.predict(embed(processSMS(sms)))

array([0], dtype=int64)