In [1]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

df = pd.read_csv('spam.csv', encoding='latin-1')

df = df.iloc[:, :2]
df.columns = ['class', 'text']
df['class'] = df['class'].map({'ham': 0, 'spam': 1})

nltk.download('stopwords')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = text.lower().split()
    text = [lemmatizer.lemmatize(word) for word in text if word not in stop_words]
    return ' '.join(text)

df['text'] = df['text'].apply(clean_text)

print(df.head())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


   class                                               text
0      0  go jurong point crazy available bugis n great ...
1      0                            ok lar joking wif u oni
2      1  free entry wkly comp win fa cup final tkts st ...
3      0                u dun say early hor u c already say
4      0                nah think go usf life around though


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

X = vectorizer.fit_transform(df['text'])

y = df['class']

print("Features created successfully!")

Features created successfully!


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = MultinomialNB()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

print(f"Model ki Accuracy: {accuracy * 100:.2f}%")

Model ki Accuracy: 96.23%


In [4]:
new_message = "Your brother is a genius"

cleaned_message = clean_text(new_message)

new_message_vectorized = vectorizer.transform([cleaned_message])

prediction = model.predict(new_message_vectorized)

if prediction[0] == 1:
    print("This message is Spam. 🚫")
else:
    print("This message is Not Spam (Ham). ✅")

This message is Not Spam (Ham). ✅


In [5]:
import joblib

# Yahan 'model' tumhara trained classifier hai (e.g., Naive Bayes)
# Aur 'vectorizer' tumhara TfidfVectorizer hai

# Model ko save karo
joblib.dump(model, 'spam_detector_model.pkl')

# Vectorizer ko save karo (yeh bhi zaroori hai)
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

print("Model aur Vectorizer save ho gaye hain!")

Model aur Vectorizer save ho gaye hain!
