In [1]:
import pandas as pd
df = pd.read_csv('../data/cleaned_spam.csv')

In [2]:
print(df)

     label                                            message
0      ham  go jurong point crazy available bugis n great ...
1      ham                            ok lar joking wif u oni
2     spam  free entry wkly comp win fa cup final tkts st ...
3      ham                u dun say early hor u c already say
4      ham  nah dont think he goes usf he lives around though
...    ...                                                ...
5164  spam  nd time tried contact u u pound prize claim ea...
5165   ham                          b going esplanade fr home
5166   ham                        pity mood soany suggestions
5167   ham  guy bitching acted like id interested buying s...
5168   ham                                     rofl true name

[5169 rows x 2 columns]


In [3]:
df.head()

Unnamed: 0,label,message
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry wkly comp win fa cup final tkts st ...
3,ham,u dun say early hor u c already say
4,ham,nah dont think he goes usf he lives around though


In [4]:
#converting labels to numbers so that computers can understand 

In [5]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['label_num'] = le.fit_transform(df['label'])

In [6]:
df.head()

Unnamed: 0,label,message,label_num
0,ham,go jurong point crazy available bugis n great ...,0
1,ham,ok lar joking wif u oni,0
2,spam,free entry wkly comp win fa cup final tkts st ...,1
3,ham,u dun say early hor u c already say,0
4,ham,nah dont think he goes usf he lives around though,0


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
df.isnull().values.any()

True

In [10]:
df.head().sum()

label                                         hamhamspamhamham
message      go jurong point crazy available bugis n great ...
label_num                                                    1
dtype: object

In [11]:
df.isnull().sum()

label        0
message      4
label_num    0
dtype: int64

In [14]:
df = df[df['message'].str.strip() != '']

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5169 entries, 0 to 5168
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   label      5169 non-null   object
 1   message    5165 non-null   object
 2   label_num  5169 non-null   int32 
dtypes: int32(1), object(2)
memory usage: 101.1+ KB


In [16]:
df = df[df['message'].fillna('').str.strip() != '']

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5165 entries, 0 to 5168
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   label      5165 non-null   object
 1   message    5165 non-null   object
 2   label_num  5165 non-null   int32 
dtypes: int32(1), object(2)
memory usage: 141.2+ KB


In [18]:
df.isnull().sum()

label        0
message      0
label_num    0
dtype: int64

In [19]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['message'])
y = df['label_num']

In [20]:
print(X)

  (0, 7877)	0.1972980531970151
  (0, 240)	0.34525506405018613
  (0, 2863)	0.1616666051245841
  (0, 1249)	0.29121408023733514
  (0, 922)	0.3294490895338877
  (0, 3818)	0.2825154247549802
  (0, 8115)	0.24838761381461125
  (0, 2902)	0.19394266442118066
  (0, 924)	0.29121408023733514
  (0, 476)	0.26670945023868176
  (0, 1532)	0.26670945023868176
  (0, 5392)	0.26939895310026485
  (0, 3666)	0.34525506405018613
  (0, 2806)	0.15626419361886204
  (1, 4980)	0.5462151914163192
  (1, 8008)	0.430209087941822
  (1, 3634)	0.5230695217542618
  (1, 3854)	0.4067004135288511
  (1, 4948)	0.2784998283043077
  (2, 5082)	0.2516925448044852
  (2, 329)	0.17669502363853803
  (2, 5739)	0.2516925448044852
  (2, 7535)	0.13262968221826538
  (2, 5686)	0.2516925448044852
  (2, 5793)	0.17380253256410205
  :	:
  (5161, 2584)	0.5618041558188724
  (5161, 2824)	0.3637632589188606
  (5161, 3195)	0.3679330800605978
  (5162, 6936)	0.5152106794457366
  (5162, 6558)	0.5152106794457366
  (5162, 5315)	0.5152106794457366
  (5162,

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [22]:
print(X_train)

  (0, 3167)	0.34558815810673343
  (0, 3748)	0.3545065785997939
  (0, 2277)	0.43323366009779823
  (0, 744)	0.43323366009779823
  (0, 8132)	0.3137640908684269
  (0, 7946)	0.22941753888251615
  (0, 1945)	0.2788061872981628
  (0, 8224)	0.24925122390013293
  (0, 5863)	0.29761002615090215
  (1, 5660)	0.612784296821067
  (1, 5529)	0.546164177071234
  (1, 6205)	0.3663442152951116
  (1, 7857)	0.35066582100900795
  (1, 8280)	0.26272703543697135
  (2, 3904)	0.41875104432347465
  (2, 916)	0.4760347779539389
  (2, 3861)	0.3144895589406708
  (2, 4759)	0.2970405757237268
  (2, 7199)	0.2995131265896444
  (2, 2452)	0.3470347816387963
  (2, 2172)	0.3727534374676103
  (2, 2863)	0.24862680655321864
  (3, 3654)	0.4575279608311305
  (3, 3272)	0.47947877362476243
  (3, 463)	0.47947877362476243
  :	:
  (4129, 4306)	0.17398974711464302
  (4130, 4776)	0.25463692484725947
  (4130, 83)	0.2668536371494205
  (4130, 6853)	0.2668536371494205
  (4130, 1988)	0.2154893728167696
  (4130, 6978)	0.22910773876541873
  (4130

In [23]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB(class_prior=[0.5, 0.5])
model.fit(X_train, y_train)

In [24]:
y_pred = model.predict(X_test)

In [25]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib 

In [26]:
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.9409486931268151


In [27]:
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Confusion Matrix:
 [[850  52]
 [  9 122]]


In [28]:
print("Classification Report:\n", classification_report(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.94      0.97       902
           1       0.70      0.93      0.80       131

    accuracy                           0.94      1033
   macro avg       0.85      0.94      0.88      1033
weighted avg       0.95      0.94      0.94      1033



In [31]:
joblib.dump(model, '../models/spam_classifier.pkl')
joblib.dump(vectorizer, '../models/vectorizer.pkl')


['../models/vectorizer.pkl']