In [55]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score
import string
import nltk
from nltk.corpus import stopwords

In [56]:
# Load the dataset 
data = pd.read_csv("spam.csv")
data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [57]:
data.shape

(5572, 2)

In [58]:
# Encode labels ('spam' -> 1, 'ham' -> 0)
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

In [59]:
data['Category'] = encoder.fit_transform(data['Category'])

In [60]:
data.head()

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [61]:
# missing values
data.isnull().sum()

Category    0
Message     0
dtype: int64

In [62]:
# duplicates values
data.duplicated().sum()

np.int64(415)

In [63]:
# rremove duplicates
data = data.drop_duplicates(keep = 'first')

In [64]:
data.duplicated().sum()

np.int64(0)

In [65]:
data.shape

(5157, 2)

In [66]:
data.head()

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [67]:
data['Category'].value_counts()

Category
0    4516
1     641
Name: count, dtype: int64

In [71]:
# Preprocess the text
def preprocess_text(text):
    text = text.lower()                      # Convert to lowercase
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    tokens = text.split()                    # Tokenize
    tokens = [word for word in tokens if word not in stopwords.words('english')]  # Remove stopwords
    return " ".join(tokens)

data['clean_text'] = data['Message'].apply(preprocess_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['clean_text'] = data['Message'].apply(preprocess_text)


In [83]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(data['clean_text'], data['Category'], test_size=0.2)

In [84]:
# TF-IDF vectorization
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [85]:
# Train model (Naive Bayes)
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

In [86]:
# Evaluate
predictions = model.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, predictions))
print("Precision:", precision_score(y_test, predictions))
print("Recall:", recall_score(y_test, predictions))

Accuracy: 0.9534883720930233
Precision: 1.0
Recall: 0.6595744680851063


In [87]:
# Prediction on new email
def predict_email(text):
    cleaned = preprocess_text(text)
    vector = vectorizer.transform([cleaned])
    return model.predict(vector)[0]

In [88]:
print("Spam" if predict_email("Congratulations! You've won a $1000 Walmart gift card.") else "Ham")

Ham


project/
│
├── app.py
├── model.pkl
├── vectorizer.pkl
└── templates/
    └── index.html



In [89]:
!pip install flask



In [91]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import pickle

# Example dataset (replace with your actual dataset)
X = df['Message']
y = df['Category']  # Assuming already encoded with LabelEncoder

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Text vectorization
vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)
X_train_features = vectorizer.fit_transform(X_train)
X_test_features = vectorizer.transform(X_test)

# Train the model
classifier = MultinomialNB()
classifier.fit(X_train_features, y_train)

# Save the model and vectorizer
pickle.dump(classifier, open('model.pkl', 'wb'))
pickle.dump(vectorizer, open('vectorizer.pkl', 'wb'))
