In [1]:
import pandas as pd
messages=pd.read_csv("SMSSpamCollection.csv",sep='\t',names=["labels","messages"])

In [2]:
# Data Cleaning 
import re 
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ankitgupta/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

In [4]:
corpus=[]

for i in range(0,len(messages)):
    review=re.sub('[a^zA-Z]',' ',messages['messages'][i])
    # What re.sub() does:
    # re.sub(pattern, replacement, text)
    # ðŸ‘‰ It finds all characters matching pattern and replaces them with replacement.
    review=review.lower()
    review=review.split()
    review=[ps.stem(word) for word in review if not word in  stopwords.words('english')]
    # IN this line apply each words stopwords if this is not in stopwords replace word with poster stemming
    review=' '.join(review)
    corpus.append(review)
   

In [5]:
# Dependent Features
y=pd.get_dummies(messages['labels']).astype(int)
y=y.iloc[:,1].values

In [7]:
# Train Test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(corpus,y,test_size=0.20,random_state=42)

In [8]:
## Creating TF-IDF Model
from sklearn.feature_extraction.text import TfidfVectorizer
tv=TfidfVectorizer(max_features=2500,ngram_range=(1,2))
X_train=tv.fit_transform(X_train).toarray()
X_test=tv.transform(X_test).toarray()

In [9]:
X_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [10]:
tv.vocabulary_

{'epli': 500,
 'win': 2415,
 '100': 30,
 'orld': 1460,
 'end': 483,
 'servic': 1837,
 'ello': 469,
 'town': 2185,
 'lre': 1092,
 'dy': 444,
 'dont': 416,
 'rush': 1792,
 'home': 760,
 'ting': 2136,
 'ill': 828,
 'let': 991,
 'know': 961,
 'et': 516,
 'lre dy': 1093,
 'let know': 992,
 'ow': 1501,
 'come': 285,
 'ng': 1291,
 'go': 662,
 'tell': 2041,
 'her': 737,
 'hen': 733,
 'told': 2156,
 'ow come': 1503,
 'ey': 548,
 'thi': 2098,
 'till': 2127,
 'dint': 396,
 'meet': 1131,
 'even': 525,
 'singl': 1883,
 'time': 2128,
 'situ': 1888,
 'tion': 2138,
 'nge': 1292,
 'bring': 198,
 'rington': 1735,
 'rt': 1778,
 'free': 605,
 'hit': 750,
 'ch': 227,
 'week': 2372,
 'ington': 863,
 'ic': 798,
 'stop': 1986,
 'receiv': 1662,
 'repli': 1694,
 'it': 893,
 'mu': 1226,
 'ite': 900,
 'everyon': 532,
 'get': 640,
 'ke': 923,
 'shower': 1866,
 'li': 998,
 'obil': 1374,
 'rded': 1653,
 '000': 2,
 'onu': 1428,
 'ller': 1044,
 'ri': 1724,
 '02': 5,
 '03': 7,
 '2nd': 90,
 'ttempt': 2210,
 'cont': 309,

In [None]:
# Dependent Features


In [None]:
# Train Test Split


In [11]:
from sklearn.naive_bayes import MultinomialNB
spam_tfidf_model=MultinomialNB().fit(X_train,y_train)
spam_tfidf_model

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [12]:
# prediction
y_pred=spam_tfidf_model.predict(X_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [13]:
from sklearn.metrics import accuracy_score,classification_report
accuracy_score(y_pred,y_test)

0.9757847533632287

In [14]:
classification_report(y_pred,y_test)

'              precision    recall  f1-score   support\n\n           0       1.00      0.98      0.99       987\n           1       0.84      0.98      0.90       128\n\n    accuracy                           0.98      1115\n   macro avg       0.92      0.98      0.94      1115\nweighted avg       0.98      0.98      0.98      1115\n'