In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df = pd.read_csv('spam.csv', encoding='latin-1')
df.sample(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
2886,ham,K I'll take care of it,,,
2848,spam,YOUR CHANCE TO BE ON A REALITY FANTASY SHOW ca...,,,
2803,ham,Lil fever:) now fine:),,,
3153,spam,Not heard from U4 a while. Call 4 rude chat pr...,,,
5380,ham,Its sunny in california. The weather's just cool,,,


In [3]:
column=["Unnamed: 2","Unnamed: 3","Unnamed: 4"]
df.drop(columns=column,inplace=True)

df.rename(columns={'v1': 'label', 'v2': 'message'}, inplace=True)

df['label'] = df['label'].map({'ham': 0, 'spam': 1})

df.sample(5)

Unnamed: 0,label,message
4615,0,ÌÏ called dad oredi...
3922,0,Okay lor... Will they still let us go a not ah...
5527,0,"Total disappointment, when I texted you was th..."
1844,0,Da is good good player.why he is unsold.
2125,0,"Sorry im getting up now, feel really bad- tota..."


In [4]:
df['label'].value_counts()

label
0    4825
1     747
Name: count, dtype: int64

In [5]:
# TF-IDF Vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["message"])
y = df["label"]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

In [17]:
from imblearn.under_sampling import RandomUnderSampler
# Apply Random Under-Sampling
undersampler = RandomUnderSampler(random_state=42)
X_train_res, y_train_res = undersampler.fit_resample(X_train, y_train)

print("Original dataset shape:", y_train.value_counts())
print("Resampled dataset shape:", y_train_res.value_counts())


Original dataset shape: label
0    3860
1     597
Name: count, dtype: int64
Resampled dataset shape: label
0    597
1    597
Name: count, dtype: int64


In [8]:
model = LogisticRegression(class_weight='balanced')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(y_pred)
print("Recall:", recall_score(y_test, y_pred))

[0 0 1 ... 0 0 1]
Recall: 0.8933333333333333


In [None]:
# Allow user to input their own email content
new_mail = [input("Enter your email content: ")]  # Wrap input in a list to make it a single document

# Transform the input email content using the same vectorizer
new_data_features = vectorizer.transform(new_mail)

# Predict if the email is spam or not
prediction = model.predict(new_data_features)
print("Prediction:", prediction)
# Print the result
print("Spam" if prediction[0] == 1 else "Not Spam")


'# Allow user to input their own email content\nnew_mail = [input("Enter your email content: ")]  # Wrap input in a list to make it a single document\n\n# Transform the input email content using the same vectorizer\nnew_data_features = vectorizer.transform(new_mail)\n\n# Predict if the email is spam or not\nprediction = model.predict(new_data_features)\nprint("Prediction:", prediction)\n# Print the result\nprint("Spam" if prediction[0] == 1 else "Not Spam")\n'

In [21]:
import joblib

# Assuming your model and vectorizer are named clf and tfidf
joblib.dump(model, 'spam_model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')


['vectorizer.pkl']