<a href="https://colab.research.google.com/github/AvniKal/SpamDetectionMailsPrediction-ak/blob/master/Spam_detection1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Download and extract the dataset from Kaggle via kagglehub.
import kagglehub

DATASET_HANDLE = "ashfakyeafi/spam-email-classification"

# `path` is the local directory holding the downloaded files; later cells read from it.
path = kagglehub.dataset_download(DATASET_HANDLE)
print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/spam-email-classification


In [None]:
# Inspect the download directory to see which files and folders the dataset contains.
import os

dataset_files = os.listdir(path)  # entry names (files and folders) inside `path`
dataset_files

['email.csv']

In [None]:
import os

import pandas as pd

# Read the dataset's CSV file into a DataFrame, decoding it as latin-1.
csv_path = os.path.join(path, 'email.csv')
df = pd.read_csv(csv_path, encoding='latin-1')

# Preview the first few rows.
print(df.head())

  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...


In [None]:
# Count missing values per column (`isna` is the same check as `isnull`).
df.isna().sum()

Unnamed: 0,0
Category,0
Message,0


In [None]:
# Encode the labels numerically: 'ham' -> 0, 'spam' -> 1.
# Series.map turns any value that is not a key of the mapping into NaN.
label_codes = {'ham': 0, 'spam': 1}
df['Category'] = df['Category'].map(label_codes)

In [None]:
import string
import re
# re is used to work with regular expressions (for finding/removing patterns like numbers).
# string helps handle characters like punctuation

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# This is the tool you’ll use later to convert cleaned text into numbers.

In [None]:
# Translation table that deletes every ASCII punctuation character.
# Built once here instead of on every clean_text() call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalize a raw message for vectorization.

    Steps, in order: lowercase, remove runs of digits, remove ASCII
    punctuation, strip leading/trailing whitespace.

    Parameters
    ----------
    text : str or any scalar
        The raw message. ``None`` and float NaN (the usual pandas marker for
        a missing cell) are treated as an empty message; any other
        non-string value is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned message (possibly empty).
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    text = text.lower()                          # lowercase
    text = re.sub(r'\d+', '', text)              # remove digits
    text = text.translate(_PUNCTUATION_TABLE)    # remove punctuation
    return text.strip()                          # trim surrounding whitespace

# Store the cleaned version of every message in a new column, one call per row.
df['clean_message'] = df['Message'].map(clean_text)

In [None]:
# Number of rows whose label is missing after the ham/spam encoding.
df['Category'].isna().sum()

np.int64(1)

In [None]:
# Keep only the rows that have a label; rows with a missing Category are discarded.
df = df[df['Category'].notna()]

**How the punctuation removal in `clean_text` works**

- `string.punctuation` is a string of all ASCII punctuation characters: ``!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~``
- `str.maketrans('', '', string.punctuation)` creates a translation table telling Python: "for every character in `string.punctuation`, remove it." The two empty strings mean "don't replace anything with anything else"; the third argument lists the characters to delete.
- `text.translate(...)` applies that translation table to the actual message.

In [None]:
# Convert the cleaned text to TF-IDF features, ignoring English stop words.
vectorizer = TfidfVectorizer(stop_words='english')
messages = df['clean_message']
X = vectorizer.fit_transform(messages)  # learns the vocabulary/IDF and builds the feature matrix

# Labels
y = df['Category']

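`stop_words='english'` tells the vectorizer to ignore common English words (called stop words) such as "is", "and", "the", "a", etc.

**Term Frequency (TF):** how often a word appears in a message.

**Inverse Document Frequency (IDF):** how rare a word is across all messages. So IDF increases the weight of rare words and decreases the weight of common ones.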

In [None]:
# Train a classification model: split the data, then build the features.
from sklearn.model_selection import train_test_split

# Split the cleaned *text* rather than the precomputed matrix X. X was produced by a
# vectorizer fitted on every message, so its vocabulary and IDF weights already contain
# information from the rows that end up in the test set (data leakage).
# random_state=42 makes the split the same on every run (for reproducibility).
text_train, text_test, y_train, y_test = train_test_split(
    df['clean_message'], y, test_size=0.2, random_state=42
)

# Re-fit the vectorizer on the training messages only, then reuse that fitted
# vocabulary/IDF for the test messages. This replaces the vectorizer fitted on the
# whole corpus, so later calls to vectorizer.transform() match the trained model.
vectorizer = TfidfVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

In [None]:
# Train a multinomial Naive Bayes classifier (simple and well suited to text features).
from sklearn.naive_bayes import MultinomialNB

# fit() returns the estimator itself, so construction and training can be chained.
model = MultinomialNB().fit(X_train, y_train)
model

In [None]:
# Evaluate the model on the held-out test set.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_pred = model.predict(X_test)

# Compute each metric first, then print them in a fixed order.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)
print("\nConfusion Matrix:\n", matrix)


Accuracy: 0.9632286995515695

Classification Report:
               precision    recall  f1-score   support

         0.0       0.96      1.00      0.98       966
         1.0       1.00      0.72      0.84       149

    accuracy                           0.96      1115
   macro avg       0.98      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115


Confusion Matrix:
 [[966   0]
 [ 41 108]]


In [None]:
# Test with your own messages
def predict_spam(message):
    """Classify a single raw message as "Spam" or "Not Spam".

    The message is cleaned with ``clean_text``, converted to features with the
    fitted ``vectorizer``, and passed to the trained ``model``.
    """
    features = vectorizer.transform([clean_text(message)])
    label = model.predict(features)[0]
    if label == 1:
        return "Spam"
    return "Not Spam"

# Run the classifier on a couple of hand-written examples.
sample_messages = [
    "Congratulations! You've won a free iPhone. Click here to claim.",  # likely spam
    "Hey, are we still on for lunch today?",  # likely not spam
]
for sample in sample_messages:
    print(predict_spam(sample))

Spam
Not Spam
