<a href="https://colab.research.google.com/github/Aditya2040/Calci/blob/main/EmailSpamCheck.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Sample data
data = {
    'email_content': [
        "Congratulations! You've won a free iPhone. Click here to claim.",
        "Meeting reminder for tomorrow at 10 AM.",
        "Urgent: Your account has been compromised. Verify your details now.",
        "Project proposal submission deadline is next Friday.",
        "Click now to win a vacation package!",
        "Regarding the report from last week.",
        "Your Amazon shipment is on its way.",
        "Exclusive offer: Get 50% off on all products!",
        "Let's discuss the next steps for the marketing campaign.",
        "Limited time offer! Claim your prize now."
    ],
    'label': [
        'spam',
        'not spam',
        'spam',
        'not spam',
        'spam',
        'not spam',
        'not spam',
        'spam',
        'not spam',
        'spam'
    ]
}

df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)
print("\nValue Counts of Labels:")
print(df['label'].value_counts())

Original DataFrame:
                                       email_content     label
0  Congratulations! You've won a free iPhone. Cli...      spam
1            Meeting reminder for tomorrow at 10 AM.  not spam
2  Urgent: Your account has been compromised. Ver...      spam
3  Project proposal submission deadline is next F...  not spam
4               Click now to win a vacation package!      spam
5               Regarding the report from last week.  not spam
6                Your Amazon shipment is on its way.  not spam
7      Exclusive offer: Get 50% off on all products!      spam
8  Let's discuss the next steps for the marketing...  not spam
9          Limited time offer! Claim your prize now.      spam

Value Counts of Labels:
label
spam        5
not spam    5
Name: count, dtype: int64


In [3]:
# Separate features (X) and target (y)
X = df['email_content']
y = df['label']

# Initialize CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the email content
X_vectorized = vectorizer.fit_transform(X)

print("\nShape of Vectorized Data (Number of emails, Number of unique words):", X_vectorized.shape)
print("\nFirst 5 rows of vectorized data (sparse matrix representation):")
print(X_vectorized[:5])
print("\nFeatures (unique words) extracted:")
print(vectorizer.get_feature_names_out())


Shape of Vectorized Data (Number of emails, Number of unique words): (10, 62)

First 5 rows of vectorized data (sparse matrix representation):
<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 39 stored elements and shape (5, 62)>
  Coords	Values
  (0, 12)	1
  (0, 60)	1
  (0, 54)	1
  (0, 59)	1
  (0, 18)	1
  (0, 24)	1
  (0, 10)	1
  (0, 23)	1
  (0, 50)	1
  (0, 9)	1
  (1, 31)	1
  (1, 43)	1
  (1, 17)	1
  (1, 51)	1
  (1, 6)	1
  (1, 0)	1
  (1, 4)	1
  (2, 52)	1
  (2, 61)	2
  (2, 2)	1
  (2, 22)	1
  (2, 7)	1
  (2, 11)	1
  (2, 55)	1
  (2, 14)	1
  (2, 33)	1
  (3, 40)	1
  (3, 41)	1
  (3, 47)	1
  (3, 13)	1
  (3, 25)	1
  (3, 32)	1
  (3, 19)	1
  (4, 10)	1
  (4, 50)	1
  (4, 33)	1
  (4, 58)	1
  (4, 53)	1
  (4, 37)	1

Features (unique words) extracted:
['10' '50' 'account' 'all' 'am' 'amazon' 'at' 'been' 'campaign' 'claim'
 'click' 'compromised' 'congratulations' 'deadline' 'details' 'discuss'
 'exclusive' 'for' 'free' 'friday' 'from' 'get' 'has' 'here' 'iphone' 'is'
 'its' 'last' 'let' 'limi

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)

print(f"\nTraining set size: {X_train.shape[0]} emails")
print(f"Testing set size: {X_test.shape[0]} emails")


Training set size: 8 emails
Testing set size: 2 emails


In [5]:
# Initialize the Multinomial Naive Bayes classifier
model = MultinomialNB()

# Train the model using the training data
model.fit(X_train, y_train)

print("\nModel trained successfully!")


Model trained successfully!


In [6]:
# Make predictions on the test set
y_pred = model.predict(X_test)

print("\nPredictions on the test set:")
for i in range(len(y_test)):
    print(f"Actual: {y_test.iloc[i]}, Predicted: {y_pred[i]}")


Predictions on the test set:
Actual: not spam, Predicted: not spam
Actual: not spam, Predicted: not spam


In [7]:
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy Score: {accuracy:.2f}")


Accuracy Score: 1.00
