<a href="https://colab.research.google.com/github/AnjanaAbY/CBTCIP/blob/main/Spam_Email_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [71]:
#Importing basic libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [72]:
# Read the CSV, keeping only its first two columns, then show the frame
csv_path = '/content/SpamEmailDetection.csv'
dataframe = pd.read_csv(csv_path, usecols=[0, 1])
print(dataframe)

        v1                                                 v2
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
...    ...                                                ...
5567  spam  This is the 2nd time we have tried 2 contact u...
5568   ham              Will �_ b going to esplanade fr home?
5569   ham  Pity, * was in mood for that. So...any other s...
5570   ham  The guy did some bitching but I acted like i'd...
5571   ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [73]:
# Summary statistics (count / unique / top / freq) for the loaded columns
description = dataframe.describe()
print("Description:\n", description)

Description:
           v1                      v2
count   5572                    5572
unique     2                    5163
top      ham  Sorry, I'll call later
freq    4825                      30


In [74]:
# Count the missing entries in each column
null_counts = dataframe.isnull().sum()
print("Null values:\n", null_counts)

Null values:
 v1    0
v2    0
dtype: int64


In [75]:
# Give the two columns descriptive names: v1 -> label, v2 -> EmailText
column_names = {'v1': 'label', 'v2': 'EmailText'}
dataframe = dataframe.rename(columns=column_names)

In [76]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    dataframe['EmailText'],
    dataframe['label'],
    test_size=0.2,
    random_state=42,
)

In [80]:
# Import necessary libraries for text processing and machine learning
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
import re
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.naive_bayes import MultinomialNB

In [82]:
#for text preprocessing
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Stateless scikit-learn transformer that normalises raw message text.

    Each message is cleaned in this order: URL-like tokens are removed, every
    character that is not an ASCII letter or whitespace is deleted, the text
    is lower-cased, and words listed in ``custom_stopwords`` are dropped.

    Parameters
    ----------
    custom_stopwords : iterable of str or None, default=None
        Words to remove after lower-casing. ``None`` selects
        ``DEFAULT_STOPWORDS`` ('click', 'free', 'win', 'prize').
    """

    # Words removed when no custom list is supplied.
    DEFAULT_STOPWORDS = ('click', 'free', 'win', 'prize')

    # Compiled once instead of on every message. Matching is case-sensitive.
    # A separate ``https\S+`` alternative is unnecessary because ``http\S+``
    # already matches every string that alternative would match.
    _URL_PATTERN = re.compile(r'http\S+|www\S+')
    _NON_ALPHA_PATTERN = re.compile(r'[^a-zA-Z\s]')

    def __init__(self, custom_stopwords=None):
        # Stored untouched so BaseEstimator.get_params()/clone() can round-trip it.
        self.custom_stopwords = custom_stopwords

    def fit(self, x, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, x):
        """Clean every message in ``x``.

        Parameters
        ----------
        x : pandas.Series or any iterable of str
            Raw messages.

        Returns
        -------
        The result of ``x.apply(...)`` when ``x`` provides an ``apply``
        method (e.g. a pandas Series); otherwise a list of cleaned strings.
        """
        stopwords = self._resolve_stopwords()

        def clean(text):
            return self._preprocess(text, stopwords)

        # Objects with .apply keep the original code path; plain lists,
        # tuples and arrays (which have no .apply) are handled by iteration.
        apply = getattr(x, 'apply', None)
        if callable(apply):
            return apply(clean)
        return [clean(text) for text in x]

    def _resolve_stopwords(self):
        """Return the active stop words as a lower-cased frozenset."""
        # getattr guards instances that were created without running __init__.
        words = getattr(self, 'custom_stopwords', None)
        if words is None:
            words = self.DEFAULT_STOPWORDS
        elif isinstance(words, str):
            # A bare string would otherwise be iterated character by character.
            words = (words,)
        return frozenset(word.lower() for word in words)

    def _preprocess(self, text, stopwords=None):
        """Clean a single message and return it as a space-joined string.

        ``stopwords`` lets ``transform`` build the set once per call; when it
        is omitted the set is resolved here.
        """
        if stopwords is None:
            stopwords = self._resolve_stopwords()
        # Remove URLs
        text = self._URL_PATTERN.sub('', text)
        # Remove non-alphabetic characters
        text = self._NON_ALPHA_PATTERN.sub('', text)
        # Convert to lower case
        text = text.lower()
        # Drop the configured stop words; split() also collapses whitespace runs
        return ' '.join(word for word in text.split() if word not in stopwords)

In [84]:
# Chain text cleaning, TF-IDF vectorisation and a multinomial Naive Bayes classifier
pipeline_steps = [
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', TfidfVectorizer(stop_words='english', max_df=0.7)),
    ('classifier', MultinomialNB()),
]
model = Pipeline(pipeline_steps)

In [85]:
# Fit every pipeline step on the training split
model.fit(X=x_train, y=y_train)

In [86]:
# Predict labels for the held-out test messages
y_pred = model.predict(X=x_test)

In [87]:
# Compare the predictions with the true test labels
report = classification_report(y_test, y_pred, target_names=['ham', 'spam'])
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

In [88]:
# Print each metric under its heading
results = (
    ("Accuracy:", accuracy),
    ("Confusion Matrix:\n", conf_matrix),
    ("Classification Report:\n", report),
)
for heading, value in results:
    print(heading, value)

Accuracy: 0.9668161434977578
Confusion Matrix:
 [[965   0]
 [ 37 113]]
Classification Report:
               precision    recall  f1-score   support

         ham       0.96      1.00      0.98       965
        spam       1.00      0.75      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115



In [92]:
# Hand-written messages used to try out the trained pipeline
sample_emails = [
    "Congratulations! You've won a free ticket to the Bahamas. Click here to claim your prize.",
    "Hey, just wanted to check in and see how you're doing. Let's catch up soon!",
    "Reminder: Your subscription to XYZ service will expire soon. Renew now to continue enjoying the benefits.",
    "Urgent! Your account has been compromised. Please verify your identity by clicking this link.",
]

# Wrap the list in a pandas Series before handing it to the pipeline
sample_emails_series = pd.Series(sample_emails)

# Run the whole pipeline on the sample messages
sample_predictions = model.predict(sample_emails_series)

# Show each message next to its predicted label
for email, label in zip(sample_emails, sample_predictions):
    message = f"\nEmail: {email}\nPrediction: {label}\n"
    print(message)



Email: Congratulations! You've won a free ticket to the Bahamas. Click here to claim your prize.
Prediction: spam


Email: Hey, just wanted to check in and see how you're doing. Let's catch up soon!
Prediction: ham


Email: Reminder: Your subscription to XYZ service will expire soon. Renew now to continue enjoying the benefits.
Prediction: ham


Email: Urgent! Your account has been compromised. Please verify your identity by clicking this link.
Prediction: spam

