In [17]:
import pandas as pd
# Load the raw dataset into a DataFrame. The file is decoded as latin-1
# (presumably because it is not valid UTF-8 -- confirm against the source file).
df = pd.read_csv(
    "/content/spam.csv",
    encoding='latin-1',
)
# Preview the first rows to inspect the raw column layout.
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


# **Cleaning**

In [18]:
# Keep only the label/text columns under descriptive names.
# Renaming first (rename() ignores keys that are not present) and then selecting
# by the new names makes this cell safe to re-run: selecting 'v1'/'v2' directly
# raises KeyError on a second run because those columns have already been
# renamed. The trailing copy() yields an independent frame for the column
# assignments made later in the notebook.
df = df.rename(columns={'v1': 'label', 'v2': 'message'})[['label', 'message']].copy()
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [19]:
# Convert labels to binary (ham -> 0, spam -> 1).
# Series.map turns every value that is missing from the dict into NaN, so:
#   * the encoding is skipped when the column already holds only 0/1
#     (e.g. this cell was re-run), instead of wiping every label to NaN;
#   * unexpected label values raise instead of silently becoming NaN.
label_codes = {'ham': 0, 'spam': 1}
if not df['label'].isin(list(label_codes.values())).all():
    encoded_labels = df['label'].map(label_codes)
    unmapped = encoded_labels.isna()
    if unmapped.any():
        unknown_labels = sorted(set(df.loc[unmapped, 'label'].astype(str)))
        raise ValueError(f"Unexpected label values (expected 'ham'/'spam'): {unknown_labels}")
    # map() without NaN already yields integers; the cast just makes that explicit.
    df['label'] = encoded_labels.astype('int64')

# **Machine Learning**

In [25]:
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import plotly.figure_factory as ff

In [26]:
# Train-test split: hold out 20% of the messages for evaluation (80/20).
# random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df['message'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

In [27]:
# Text vectorization using TF-IDF, with English stop words removed.
vectorizer = TfidfVectorizer(stop_words="english")

# The vocabulary and IDF weights are learned from the training messages only;
# the test messages are merely projected onto that fitted vocabulary.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [29]:
# Multinomial Naive Bayes trained on the TF-IDF features of the training split.
# fit() returns the estimator itself, so construction and training are chained.
model = MultinomialNB().fit(X_train_tfidf, y_train)
# Leave the fitted estimator as the cell's displayed value.
model

In [30]:
# Predict a label (0 = ham, 1 = spam) for every message in the held-out test split.
y_pred = model.predict(X_test_tfidf)

In [33]:
# Fraction of test messages whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class precision / recall / F1 (class 0 = ham, class 1 = spam).
print(
    "\nClassification Report:\n",
    classification_report(y_test, y_pred),
)

# Re-express the score as a percentage.
# NOTE: from here on `accuracy` holds the percentage, not the 0-1 fraction.
accuracy *= 100
print("Accuracy in percentage:", accuracy, "%")

Accuracy: 0.9668161434977578

Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.75      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115

Accuracy in percentage: 96.68161434977578 %


In [34]:
# Confusion matrix of the test predictions: rows are the actual classes,
# columns are the predicted classes.
cm = confusion_matrix(y_test, y_pred)
labels = ["Ham", "Spam"]

# Annotated heatmap so each cell shows its count.
fig = ff.create_annotated_heatmap(z=cm, x=labels, y=labels, colorscale='Blues', showscale=True)
fig.update_layout(
    title="Confusion Matrix",
    xaxis_title="Predicted",
    yaxis_title="Actual",
)
fig.show()

In [35]:
# Class balance across the whole dataset (not just the test split):
# one bar per encoded label value.
fig2 = px.histogram(
    df,
    x="label",
    color="label",
    barmode="group",
    title="Distribution of Ham vs Spam Emails",
    labels={"label": "Email Type (0=Ham, 1=Spam)"},
)
fig2.show()

# **Model Implementation**

In [38]:
def check_email(message):
    """Classify a single message as "Spam" or "Ham".

    The text is vectorized with the notebook-level ``vectorizer`` and scored by
    the notebook-level ``model``; both must already be fitted.

    Args:
        message: The raw message text to classify.

    Returns:
        "Spam" when the predicted label equals 1, otherwise "Ham".
    """
    # The vectorizer expects an iterable of documents, hence the one-item list.
    features = vectorizer.transform([message])
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return "Spam"
    return "Ham"

In [39]:
# Smoke-test the classifier on a hand-written, promotion-style message:
# echo the input first, then print the label returned by check_email().
sample_email = "Congratulations! You've won a $1000 gift card. Click here to claim."
print("Sample Email:", sample_email)
print("Prediction:", check_email(sample_email))

Sample Email: Congratulations! You've won a $1000 gift card. Click here to claim.
Prediction: Spam
