In [10]:
import nltk
import pandas as pd

In [11]:
# Load the SMS dataset: tab-separated, with column names supplied explicitly.
# NOTE(review): default quoting is in effect, so a stray `"` inside a message
# could merge rows — confirm the row count matches the source dataset.
messages = pd.read_csv(
    'SMSSpamCollection.csv',
    sep='\t',
    names=['label', 'message'],
)

In [12]:
# Preview the first rows to confirm the label/message columns parsed as expected.
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [13]:
nltk.download('stopwords')

import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

ps = PorterStemmer()

# Build the stop-word lookup once: calling stopwords.words('english') per token
# would rebuild the list and scan it linearly for every single word.
english_stopwords = set(stopwords.words('english'))

# One cleaned, stemmed, space-joined string per SMS, in the same order as `messages`.
corpus = []

# Iterate over the column values directly; messages['message'][i] is a label
# lookup and only works while the frame keeps its default 0..n-1 index.
for message in messages['message']:
    # Replace every non-letter character with a space, then lowercase.
    review = re.sub('[^a-zA-Z]', ' ', message).lower()
    # NOTE: word_tokenize relies on NLTK's Punkt tokenizer data, which this
    # cell does not download.
    review = nltk.word_tokenize(review)
    # Drop English stop words and reduce the remaining tokens to their stems.
    review = [ps.stem(word) for word in review if word not in english_stopwords]
    corpus.append(' '.join(review))

[nltk_data] Downloading package stopwords to /Users/mac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Alternative kept for reference: binary bag-of-words over 1- to 3-grams.
# cv = CountVectorizer(max_features=2500, binary=True, ngram_range=(1,3))

# TF-IDF vectoriser limited to at most 2500 features.
cv = TfidfVectorizer(max_features=2500)

# Learn the vocabulary and IDF weights from the whole corpus, then densify.
# NOTE(review): the vectoriser is fitted before the train/test split performed
# later in the notebook, so its IDF statistics also see the held-out messages.
tfidf_matrix = cv.fit_transform(corpus)
X = tfidf_matrix.toarray()

In [15]:
# Label the TF-IDF columns with their vocabulary terms and preview the first rows.
feature_names = cv.get_feature_names_out()
pd.DataFrame(X, columns=feature_names).head()

Unnamed: 0,aah,aathi,abi,abiola,abl,abt,abta,ac,acc,accept,...,yo,yoga,yogasana,yor,yr,yummi,yun,yuo,yup,zed
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Encode the target: ham -> 0, spam -> 1. Series.map already returns a Series,
# so no extra pd.Series(...) wrapper is needed.
label_to_int = {'ham': 0, 'spam': 1}
y = messages['label'].map(label_to_int)

# map() turns any label missing from the dict into NaN; fail here with a clear
# message instead of letting NaN targets reach the model.
assert y.notna().all(), "messages['label'] contains values other than 'ham'/'spam'"

In [17]:
# Display the encoded target to check its values and length.
y

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: label, Length: 5572, dtype: int64

In [18]:

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Fit a multinomial Naive Bayes classifier on the training features.
spam_detect_model = MultinomialNB()
spam_detect_model.fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred = spam_detect_model.predict(X_test)

# Compute the evaluation metrics first, then report them.
accuracy_pct = accuracy_score(y_test, y_pred) * 100
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f" Test Accuracy Score : {accuracy_pct}%")
print(f" Confusion Matrix : {conf_matrix}")
print(f" Classification Report : {report}")

 Test Accuracy Score : 97.9372197309417%
 Confusion Matrix : [[955   0]
 [ 23 137]]
 Classification Report :               precision    recall  f1-score   support

           0       0.98      1.00      0.99       955
           1       1.00      0.86      0.92       160

    accuracy                           0.98      1115
   macro avg       0.99      0.93      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [19]:
raw_sentence = "25000 INR Cashback on iPhone 12 Pro Max. Click here to avail the offer."

# Apply the same cleaning steps that were used to build the training corpus:
# letters only -> lowercase -> nltk.word_tokenize -> stop-word removal -> stemming.
# Using word_tokenize here (rather than str.split) keeps inference tokens
# consistent with the tokens the vectoriser was fitted on.
english_stopwords = set(stopwords.words('english'))  # built once, O(1) lookups
cleaned_sentence = re.sub('[^a-zA-Z]', ' ', raw_sentence).lower()
tokens = nltk.word_tokenize(cleaned_sentence)
stemmed_tokens = [ps.stem(word) for word in tokens if word not in english_stopwords]

# Vectorise with the already-fitted vectoriser; `test_sentence` ends up holding
# the dense feature row that is passed to the model.
test_sentence = cv.transform([' '.join(stemmed_tokens)]).toarray()

result = spam_detect_model.predict(test_sentence)

# The label encoding used for training maps spam to 1.
if result[0] == 1:
    print("Spam")
else:
    print("Not Spam")


Spam
