In [8]:
import pandas as pd
import numpy as np
import pickle
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score

# Fetch the NLTK resources used below: the 'punkt' tokenizer data (needed by
# nltk.word_tokenize) and the stopword corpus.
for _nltk_resource in ('punkt', 'stopwords'):
    nltk.download(_nltk_resource)

# Shared by transform_text: English stopwords held in a set for O(1)
# membership tests, plus a single reusable Porter stemmer.
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

# Load the dataset, keep only the label column (v1) and the message column
# (v2), rename them, and encode the labels as ham -> 0, spam -> 1.
df = pd.read_csv('spam.csv', encoding='latin1')[['v1', 'v2']]
df.columns = ['target', 'text']
df['target'] = df['target'].map({'ham': 0, 'spam': 1})

def transform_text(text):
    """Normalize a message into a space-joined string of stemmed tokens.

    The text is lower-cased and split with ``nltk.word_tokenize``. Tokens
    that are not purely alphanumeric (per ``str.isalnum``) or that appear in
    the module-level ``stop_words`` set are dropped; every remaining token is
    stemmed with the module-level Porter stemmer ``ps``.

    Args:
        text: The raw message string.

    Returns:
        The surviving stems joined by single spaces (an empty string when no
        token survives the filters).
    """
    stems = []
    for token in nltk.word_tokenize(text.lower()):
        # Skip punctuation / mixed-symbol tokens and stopwords.
        if not token.isalnum() or token in stop_words:
            continue
        stems.append(ps.stem(token))
    return " ".join(stems)

# Pre-process every message once; the cleaned text feeds the vectorizer.
df['transformed_text'] = df['text'].apply(transform_text)

# HashingVectorizer is stateless (no fit step), so transform() can be called
# directly. alternate_sign=False turns off the sign-flipping hash so feature
# values are not negated.
hv = HashingVectorizer(n_features=3000, alternate_sign=False)
X = hv.transform(df['transformed_text'])
y = df['target'].values

# The dataset is imbalanced (see the printed weights), so weight each class
# with scikit-learn's 'balanced' heuristic and pass the result as an explicit
# dict to the classifier.
classes = np.array([0, 1])
weights = compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=y
)
class_weights = dict(zip(classes, weights))
print("Class Weights:", class_weights)

# Logistic-regression loss trained with SGD so the model can be updated
# incrementally via partial_fit.
model = SGDClassifier(
    loss='log_loss',
    class_weight=class_weights,
    random_state=42
)

# Single pass over the data in mini-batches, in file order. `classes` is
# supplied on every call so the first batch does not need to contain both
# labels.
batch_size = 500
for i in range(0, X.shape[0], batch_size):
    X_batch = X[i : i + batch_size]
    y_batch = y[i : i + batch_size]
    model.partial_fit(X_batch, y_batch, classes=classes)

# NOTE: this score is computed on the same rows the model was trained on, so
# it is a training accuracy, not a held-out estimate.
y_pred = model.predict(X)
print("Final Accuracy:", accuracy_score(y, y_pred))

# Persist both artifacts. Context managers guarantee the files are flushed
# and closed even if pickling raises (the bare open() calls used previously
# leaked the handles until garbage collection).
with open('hv_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(hv, vectorizer_file)
with open('online_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

print("Model and Vectorizer saved successfully!")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Class Weights: {np.int64(0): np.float64(0.5774093264248704), np.int64(1): np.float64(3.7295850066934406)}
Final Accuracy: 0.9685929648241206
Model and Vectorizer saved successfully!
