<a href="https://colab.research.google.com/github/AparnaMounikau763/OIBSIP/blob/main/Email_spam_Detection_with_Machine_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Import Libraries**

In [None]:
import string

import joblib
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB


**Load the Dataset**

In [None]:
# Read the CSV (decoded as latin-1), keep only the two columns used below,
# and give them descriptive names:
#   v1 -> label   (ham/spam)
#   v2 -> message (message text)
df = (
    pd.read_csv('/content/spam.csv', encoding='latin-1')
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'message'})
)

# Preview the first few rows
print(df.head())


  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


**Data Preprocessing**

In [None]:
# Convert labels to binary values (ham -> 0, spam -> 1).
# The mapping is guarded so that re-running this cell is safe: mapping the
# already-numeric labels a second time would silently turn every label into NaN.
if not pd.api.types.is_numeric_dtype(df['label']):
    df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Check for missing values. Any label other than 'ham'/'spam' becomes NaN in
# the mapping above, so this also catches unexpected label values. Fail here,
# with the per-column counts, rather than later inside the vectorizer or model.
missing_counts = df.isnull().sum()
if missing_counts.any():
    raise ValueError(f"Missing or unmapped values found:\n{missing_counts}")

# Text preprocessing: lowercase, then strip ASCII punctuation.
# Both steps are idempotent. Any text scored by the model later must be
# normalised the same way, otherwise it will not match the training vocabulary.
df['message'] = df['message'].str.lower()
df['message'] = df['message'].str.translate(str.maketrans('', '', string.punctuation))

# Display processed data
print(df.head())


   label                                            message
0      0  go until jurong point crazy available only in ...
1      0                            ok lar joking wif u oni
2      1  free entry in 2 a wkly comp to win fa cup fina...
3      0        u dun say so early hor u c already then say
4      0  nah i dont think he goes to usf he lives aroun...


**Split the Data**

In [None]:
# Hold out 20% of the messages for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['message'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# Report how many samples ended up on each side of the split
print(f"Training samples: {X_train.shape[0]}, Testing samples: {X_test.shape[0]}")



Training samples: 4457, Testing samples: 1115


**Feature Extraction**

In [None]:
# TF-IDF features with English stop words removed
vectorizer = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and IDF weights from the training messages only, then
# reuse that fitted vectorizer for the test messages so no test information
# leaks into the features. The tuple is evaluated left to right, so the fit
# happens before the test transform.
X_train_vect, X_test_vect = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)


**Train the Model**

In [None]:
# Build a multinomial Naive Bayes classifier and fit it on the TF-IDF
# training features (fit() returns the fitted estimator itself).
model = MultinomialNB().fit(X_train_vect, y_train)

# Predicted labels (0 = ham, 1 = spam) for the held-out test messages
y_pred = model.predict(X_test_vect)


**Evaluate the Model**

In [None]:
# Overall share of test messages that were classified correctly
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100:.2f}%")

# Per-class precision, recall and F1 (class 0 = ham, class 1 = spam)
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

# Confusion matrix: rows are the true class, columns the predicted class
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")


Accuracy: 96.77%
Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.76      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115

Confusion Matrix:
[[965   0]
 [ 36 114]]


**Predict New Samples**

In [None]:
new_emails = ["Free entry in 2 a weekly competition to win FA Cup final tickets! Text FA to 87121 to receive entry question",
              "Hello, how are you?"]

# Normalise the new samples exactly as the training messages were normalised
# (lowercase, ASCII punctuation removed) before vectorising. The vocabulary was
# learned from punctuation-stripped text (e.g. "don't" was seen as "dont"), so
# raw text containing apostrophes, URLs, etc. would otherwise be split into
# tokens that do not match what the model was trained on.
punctuation_table = str.maketrans('', '', string.punctuation)
new_emails_clean = [email.lower().translate(punctuation_table) for email in new_emails]

# Transform new samples with the already-fitted vectorizer (no re-fitting)
new_emails_vect = vectorizer.transform(new_emails_clean)

# Make predictions (0 = ham, 1 = spam)
predictions = model.predict(new_emails_vect)

# Output predictions next to the original, un-normalised text
for i, email in enumerate(new_emails):
    print(f"Email: {email}\nPrediction: {'Spam' if predictions[i] == 1 else 'Ham'}\n")


Email: Free entry in 2 a weekly competition to win FA Cup final tickets! Text FA to 87121 to receive entry question
Prediction: Spam

Email: Hello, how are you?
Prediction: Ham



**Save the Model and Vectorizer**

In [None]:
# Persist both fitted objects: the classifier alone is not enough, because new
# text must be vectorised with the same fitted vocabulary and IDF weights.
# (joblib is imported with the other libraries at the top of the notebook.)

# Save the model
joblib.dump(model, '/content/spam_detector_model.pkl')

# Save the vectorizer (as the last expression, its return value, the list of
# written file paths, is what the cell displays)
joblib.dump(vectorizer, '/content/vectorizer.pkl')


['/content/vectorizer.pkl']

**Load and Use the Model**

In [None]:
# Load the model and vectorizer from the same absolute paths they were saved
# to, so this cell does not depend on the current working directory.
# NOTE: joblib.load unpickles the file; only load artifacts you created or trust.
model = joblib.load('/content/spam_detector_model.pkl')
vectorizer = joblib.load('/content/vectorizer.pkl')

# Normalise the samples the same way the training messages were normalised
# (lowercase, ASCII punctuation removed) so their tokens match the vocabulary
# stored in the loaded vectorizer.
punctuation_table = str.maketrans('', '', string.punctuation)
new_emails_clean = [email.lower().translate(punctuation_table) for email in new_emails]

# Transform new samples and predict (0 = ham, 1 = spam)
new_emails_vect = vectorizer.transform(new_emails_clean)
predictions = model.predict(new_emails_vect)

# Output predictions next to the original, un-normalised text
for i, email in enumerate(new_emails):
    print(f"Email: {email}\nPrediction: {'Spam' if predictions[i] == 1 else 'Ham'}\n")


Email: Free entry in 2 a weekly competition to win FA Cup final tickets! Text FA to 87121 to receive entry question
Prediction: Spam

Email: Hello, how are you?
Prediction: Ham

