<a href="https://colab.research.google.com/github/Avdhoot1574/Spam-Mail-Detection-Model/blob/main/Spam_Message_Detection_ML_Project_By_Avdhoot_Nakod.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# Import pandas
import pandas as pd

# Load the CSV file with proper encoding to avoid errors
df = pd.read_csv('spam.csv', encoding='latin-1')

# Show the first 5 rows of the dataset
df.head()


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [6]:
# Keep only useful columns
df = df[['v1', 'v2']]  # 'v1' is the label, 'v2' is the message

# Rename them for clarity
df.columns = ['label', 'message']

# Display a few random rows
df.sample(5)


Unnamed: 0,label,message
51,ham,"A gram usually runs like &lt;#&gt; , a half e..."
3038,ham,"Wishing you and your family Merry \X\"" mas and..."
1619,ham,Friends that u can stay on fb chat with
3589,ham,I am in escape theatre now. . Going to watch K...
4924,ham,Ok... Let u noe when i leave my house.


In [7]:
# Map the labels: ham = 0, spam = 1
df['label_num'] = df['label'].map({'ham': 0, 'spam': 1})

# Check the label distribution
df['label'].value_counts()


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
ham,4825
spam,747


In [8]:
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [10]:
# Create a function to clean text
def clean_text(msg):
    # Lowercase
    msg = msg.lower()
    # Remove punctuation
    msg = ''.join([char for char in msg if char not in string.punctuation])
    # Remove stopwords
    words = msg.split()
    words = [word for word in words if word not in stopwords.words('english')]
    return " ".join(words)

# Create a new column with cleaned messages
df['cleaned_message'] = df['message'].apply(clean_text)

# Show a few examples
df[['message', 'cleaned_message']].sample(5)



Unnamed: 0,message,cleaned_message
4185,"Hmm ok, i'll stay for like an hour cos my eye ...",hmm ok ill stay like hour cos eye really sore
4986,No rushing. I'm not working. I'm in school so ...,rushing im working im school rush go hungry
687,"Dear,Me at cherthala.in case u r coming cochin...",dearme cherthalain case u r coming cochin pls ...
1467,I wont touch you with out your permission.,wont touch permission
1057,Ard 515 like dat. Y?,ard 515 like dat


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize vectorizer
tfidf = TfidfVectorizer()

# Fit and transform the cleaned messages
X = tfidf.fit_transform(df['cleaned_message'])

# Labels (0 = ham, 1 = spam)
y = df['label_num']


In [12]:
print("Vectorized shape:", X.shape)


Vectorized shape: (5572, 9376)


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [15]:
# 80% for training, 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training data size:", X_train.shape)
print("Testing data size:", X_test.shape)

# Initialize the model
model = MultinomialNB()

# Train the model
model.fit(X_train, y_train)
# Predict on test data
y_pred = model.predict(X_test)



Training data size: (4457, 9376)
Testing data size: (1115, 9376)


In [16]:
# Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))

# Confusion Matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Classification Report
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9659192825112107
Confusion Matrix:
 [[965   0]
 [ 38 112]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.75      0.85       150

    accuracy                           0.97      1115
   macro avg       0.98      0.87      0.92      1115
weighted avg       0.97      0.97      0.96      1115



In [17]:
import joblib

In [18]:
# Save the trained model
joblib.dump(model, 'spam_classifier_model.pkl')

# Save the TF-IDF vectorizer
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']

In [23]:
def predict_spam(msg):
    # Clean the message
    cleaned = clean_text(msg)
    # Vectorize
    vectorized = tfidf.transform([cleaned])
    # Predict
    result = model.predict(vectorized)
    return "Spam" if result[0] == 1 else "Ham"

# Testing Fuction

print(predict_spam("Congratulations! You have won a free ticket. Call now to claim it!"))
print(predict_spam("Hey bro, let's catch up tomorrow."))



Spam
Ham


In [24]:
!pip install gradio


Collecting gradio
  Downloading gradio-5.25.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.8.0 (from gradio)
  Downloading gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.5-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 (

In [29]:
import gradio as gr
import joblib

# Load model and vectorizer
model = joblib.load('spam_classifier_model.pkl')
vectorizer = joblib.load('tfidf_vectorizer.pkl')

# Text preprocessing
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

def clean_text(msg):
    msg = msg.lower()
    msg = ''.join([ch for ch in msg if ch not in string.punctuation])
    words = msg.split()
    words = [w for w in words if w not in stopwords.words('english')]
    return " ".join(words)

# Prediction function
def predict_spam_gradio(message):
    cleaned = clean_text(message)
    vectorized = vectorizer.transform([cleaned])
    prediction = model.predict(vectorized)
    return "🚫Alert ! This message could be Spam" if prediction[0] == 1 else "✅ Not to Worry ! The Message is not Spam"

# Gradio Interface
interface = gr.Interface(
    fn=predict_spam_gradio,
    inputs="text",
    outputs="text",
    title="📩 Spam Message Detector",
    description="Type a message to check whether it's spam or not."
)

interface.launch(share=True)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://c3d313565ee2f08f26.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


