In [11]:
import pandas as pd
from sklearn.utils import resample

df = pd.read_csv('data/spam.csv', encoding='latin-1')
df = df.rename(columns={'v1': 'label', 'v2': 'text'})
df = df[['label', 'text']]  

df['label'] = df['label'].map({'ham': 0, 'spam': 1})

print(df.columns)
print(df['label'].value_counts())

df_ham = df[df['label'] == 0]
df_spam = df[df['label'] == 1]

df_spam_upsampled = resample(df_spam,
                             replace=True,
                             n_samples=len(df_ham),
                             random_state=42)

df_balanced = pd.concat([df_ham, df_spam_upsampled])
print(df_balanced['label'].value_counts())
print(df_balanced.columns)


Index(['label', 'text'], dtype='object')
label
0    4825
1     767
Name: count, dtype: int64
label
0    4825
1    4825
Name: count, dtype: int64
Index(['label', 'text'], dtype='object')


In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

nltk.download('stopwords')

ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = re.sub('[^a-zA-Z]', ' ', text) 
    words = text.split()
    filtered_words = [ps.stem(word) for word in words if word not in stop_words]
    return ' '.join(filtered_words)

df_balanced['cleaned_text'] = df_balanced['text'].apply(clean_text)

print(df_balanced.columns)
print(df_balanced[['text', 'cleaned_text']].head(3))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Index(['label', 'text', 'cleaned_text'], dtype='object')
                                                text  \
0  Go until jurong point, crazy.. Available only ...   
1                      Ok lar... Joking wif u oni...   
3  U dun say so early hor... U c already then say...   

                                        cleaned_text  
0  go jurong point crazi avail bugi n great world...  
1                              ok lar joke wif u oni  
3                u dun say earli hor u c alreadi say  


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df_balanced['cleaned_text']).toarray()
y = df_balanced['label']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.989119170984456
Confusion Matrix:
 [[976   9]
 [ 12 933]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       985
           1       0.99      0.99      0.99       945

    accuracy                           0.99      1930
   macro avg       0.99      0.99      0.99      1930
weighted avg       0.99      0.99      0.99      1930



In [15]:
import pickle

with open('pkl/vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

with open('pkl/model.pkl', 'wb') as f:
    pickle.dump(model, f)
