In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [15]:
df = pd.read_csv("undersampled_spam.csv", encoding='latin')
df = df.drop(columns=["Unnamed: 0"])
df.head()

Unnamed: 0,type,sms,processedSMS,filteredSMS
0,0,Congrats ! Treat pending.i am not on mail for ...,congrat treat pend mail 2 day mail thru respec...,"['congrat', 'treat', 'pend', 'mail', '2', 'day..."
1,0,U GOIN OUT 2NITE?,u goin 2nite,"['u', 'goin', '2nite']"
2,0,Please dont say like that. Hi hi hi,pleas dont say like hi hi hi,"['pleas', 'dont', 'say', 'like', 'hi', 'hi', '..."
3,0,Ard 4 lor...,ard 4 lor,"['ard', '4', 'lor']"
4,0,"My friend, she's studying at warwick, we've pl...",friend studi warwick plan go shop concert tmw ...,"['friend', 'studi', 'warwick', 'plan', 'go', '..."


In [16]:
features, labels = df['processedSMS'], df['type']
X_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.15)
X_train, x_val, y_train, y_val = train_test_split(features, labels, test_size=0.30)

In [17]:
pipe = Pipeline([('vectorizer', TfidfVectorizer()), ('model', MultinomialNB())]).fit(X_train, y_train)

In [18]:
y_pred = pipe.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97       122
           1       0.96      0.96      0.96       103

    accuracy                           0.96       225
   macro avg       0.96      0.96      0.96       225
weighted avg       0.96      0.96      0.96       225



In [19]:
y_pred = pipe.predict(x_val)
print(classification_report(y_val, y_pred))


              precision    recall  f1-score   support

           0       0.93      0.94      0.94       212
           1       0.95      0.94      0.94       237

    accuracy                           0.94       449
   macro avg       0.94      0.94      0.94       449
weighted avg       0.94      0.94      0.94       449



In [20]:
pipe.predict(['amazon free gift card'])

array([1], dtype=int64)

In [21]:
pipe.predict(['how are you today'])

array([0], dtype=int64)