In [1]:
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv("undersampled_spam.csv", encoding='latin')
df = df.drop(columns=["Unnamed: 0"])
df.head()

Unnamed: 0,type,sms,processedSMS,filteredSMS
0,0,ok. I am a gentleman and will treat you with d...,ok gentleman treat digniti respect,"['ok', 'gentleman', 'treat', 'digniti', 'respe..."
1,0,Lol its ok I didn't remember til last nite,lol ok rememb til last nite,"['lol', 'ok', 'rememb', 'til', 'last', 'nite']"
2,0,Apo all other are mokka players only,apo mokka player,"['apo', 'mokka', 'player']"
3,0,Hey cutie. How goes it? Here in WALES its kind...,hey cuti goe wale kinda ok like hill shit stil...,"['hey', 'cuti', 'goe', 'wale', 'kinda', 'ok', ..."
4,0,How much i gave to you. Morning.,much gave morn,"['much', 'gave', 'morn']"


In [3]:
embedder = spacy.load('en_core_web_sm')
embedding_layer = [embedder(i).vector.reshape(1, -1)[0] for i in df['processedSMS']]

In [4]:
features, labels = embedding_layer , df['type']
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.15)
x_train, x_val, y_train, y_val = train_test_split(features, labels, test_size=0.30)

In [5]:
pipeline = Pipeline([('Normalizing', MinMaxScaler()), ('Model', ComplementNB())]).fit(x_train, y_train) 

In [6]:
y_pred = pipeline.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.89      0.90       120
           1       0.88      0.90      0.89       105

    accuracy                           0.90       225
   macro avg       0.90      0.90      0.90       225
weighted avg       0.90      0.90      0.90       225



In [7]:
y_pred = pipeline.predict(x_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.90      0.92       238
           1       0.89      0.92      0.91       211

    accuracy                           0.91       449
   macro avg       0.91      0.91      0.91       449
weighted avg       0.91      0.91      0.91       449



In [8]:
input = "free amazon gift card"
pipeline.predict(embedder(input).vector.reshape(1, -1))

array([1], dtype=int64)

In [9]:
input = "how are you"
pipeline.predict(embedder(input).vector.reshape(1, -1))

array([0], dtype=int64)