<a href="https://colab.research.google.com/github/Arpan255/CodeClauseInternship_Spam_classification/blob/main/Spam_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import nltk
# Fetch the NLTK resources that preprocess_text relies on.
# 'punkt' is the tokenizer model used by older NLTK releases; newer releases
# make word_tokenize load 'punkt_tab' instead, so both are requested here.
# A resource unknown to the installed NLTK version is reported by
# nltk.download and skipped rather than raising.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load the labelled e-mail dataset. The path is the Colab upload location;
# adjust it when running outside Colab.
data = pd.read_csv('/content/spam_ham_dataset.csv')
# Stop-word set and stemmer are built on first use and then reused, so that
# applying preprocess_text to every row does not re-read the stop-word corpus
# or rebuild the stemmer for each e-mail.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the cached ``(stop_words, stemmer)`` pair, creating it on first use."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE['stemmer'] = PorterStemmer()
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['stemmer']


def preprocess_text(text):
    """Normalise one e-mail body into a space-separated string of stemmed tokens.

    Steps: lowercase, delete every character that is not an ASCII letter or
    whitespace, tokenize with NLTK, drop English stop words, Porter-stem the
    remaining tokens and join them with single spaces.

    Parameters
    ----------
    text : str or None or float('nan')
        Raw e-mail text. ``None`` and NaN (how pandas represents a missing
        cell) are treated as an empty document.

    Returns
    -------
    str
        The cleaned text; ``''`` for missing or empty input.
    """
    # `text != text` is only true for NaN; missing cells would otherwise
    # fail on `.lower()` with an AttributeError.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = text.lower()
    # Non-letters are deleted, not replaced by a space, so "a,b" becomes "ab".
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    stop_words, stemmer = _preprocess_resources()
    stems = [
        stemmer.stem(word)
        for word in word_tokenize(text)
        if word not in stop_words
    ]
    return ' '.join(stems)
# Clean every e-mail body and store the result next to the raw text.
data['processed_text'] = data['text'].apply(preprocess_text)

# Plain-text preview of the raw text, the cleaned text and the label.
preview_columns = ['text', 'processed_text', 'label']
print(data[preview_columns].head())


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


                                                text  \
0  Subject: enron methanol ; meter # : 988291\r\n...   
1  Subject: hpl nom for january 9 , 2001\r\n( see...   
2  Subject: neon retreat\r\nho ho ho , we ' re ar...   
3  Subject: photoshop , windows , office . cheap ...   
4  Subject: re : indian springs\r\nthis deal is t...   

                                      processed_text label  
0  subject enron methanol meter follow note gave ...   ham  
1  subject hpl nom januari see attach file hplnol...   ham  
2  subject neon retreat ho ho ho around wonder ti...   ham  
3  subject photoshop window offic cheap main tren...  spam  
4  subject indian spring deal book teco pvr reven...   ham  


In [None]:
# Show the first rows of the full frame, including the new processed_text column.
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,label,text,label_num,processed_text
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0,subject enron methanol meter follow note gave ...
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0,subject hpl nom januari see attach file hplnol...
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0,subject neon retreat ho ho ho around wonder ti...
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1,subject photoshop window offic cheap main tren...
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0,subject indian spring deal book teco pvr reven...


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Cap on the number of vocabulary terms kept by the vectorizer.
MAX_TFIDF_FEATURES = 5000

# Turn the cleaned e-mails into a TF-IDF matrix.
# NOTE(review): this fits the vocabulary and IDF weights on every row of
# `data`, including rows that are later held out as a test split.
tfidf_vectorizer = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)
processed_corpus = data['processed_text']
tfidf_features = tfidf_vectorizer.fit_transform(processed_corpus)


In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes classifier; alpha=1.0 is the library default
# smoothing, written out here so the setting is visible.
naive_bayes = MultinomialNB(alpha=1.0)

In [None]:
from sklearn.model_selection import train_test_split

# Split the cleaned *text* (not a pre-computed TF-IDF matrix) so that the
# vectorizer can be fitted on the training rows only. Fitting it on the whole
# dataset would let the held-out e-mails shape the vocabulary and IDF weights,
# which leaks test information into training and inflates the reported scores.
text_train, text_test, y_train, y_test = train_test_split(
    data['processed_text'], data['label'], test_size=0.2, random_state=42
)

# fit_transform discards any earlier fit of tfidf_vectorizer, so from here on
# it reflects the training rows only; later cells that call
# tfidf_vectorizer.transform(...) therefore produce features the model knows.
X_train = tfidf_vectorizer.fit_transform(text_train)
X_test = tfidf_vectorizer.transform(text_test)

naive_bayes.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Predict labels for the held-out rows, then compute both summaries before
# printing anything.
y_pred = naive_bayes.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)

# Overall accuracy first, followed by per-class precision/recall/F1.
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", report)

Accuracy: 0.96
Classification Report:
               precision    recall  f1-score   support

         ham       0.98      0.97      0.97       742
        spam       0.92      0.94      0.93       293

    accuracy                           0.96      1035
   macro avg       0.95      0.95      0.95      1035
weighted avg       0.96      0.96      0.96      1035



In [None]:
# Example 1: a routine business e-mail.
user_input = """Subject: tenaska iv july
darren :
please remove the price on the tenaska iv sale , deal 384258 , for july and enter the demand fee . the amount should be $ 3 , 902 , 687 . 50 .
thanks ,
megan"""

# Apply the same cleaning and the already-fitted vectorizer used for training.
preprocessed_input = preprocess_text(user_input)
tfidf_input = tfidf_vectorizer.transform([preprocessed_input])

# predict returns one label per input row; exactly one row was passed, so take
# that single label instead of comparing the whole array with a string and
# relying on the truthiness of a one-element array.
prediction = naive_bayes.predict(tfidf_input)
predicted_label = prediction[0]
if predicted_label == "spam":
    print("This email is classified as spam.")
else:
    print("This email is not spam.")

This email is not spam.


In [None]:
# Example 2: a message made of a sales subject line plus unrelated filler words.
user_input = """Subject: photoshop , windows , office . cheap . main trending
abasements darer prudently fortuitous undergone
lighthearted charm orinoco taster
railroad affluent pornographic cuvier
irvin parkhouse blameworthy chlorophyll
robed diagrammatic fogarty clears bayda
inconveniencing managing represented smartness hashish
academies shareholders unload badness
danielson pure caffein
spaniard chargeable levin

"""

# Apply the same cleaning and the already-fitted vectorizer used for training.
preprocessed_input = preprocess_text(user_input)
tfidf_input = tfidf_vectorizer.transform([preprocessed_input])

# predict returns one label per input row; exactly one row was passed, so take
# that single label instead of comparing the whole array with a string and
# relying on the truthiness of a one-element array.
prediction = naive_bayes.predict(tfidf_input)
predicted_label = prediction[0]
if predicted_label == "spam":
    print("This email is classified as spam.")
else:
    print("This email is not spam.")

This email is classified as spam.
