In [7]:
import pandas as pd
messages=pd.read_csv("SMSSpamCollection.csv",sep='\t',names=["labels","messages"])

In [8]:
# Data Cleaning 
import re 
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ankitgupta/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

In [10]:
corpus=[]

for i in range(0,len(messages)):
    review=re.sub('[a^zA-Z]',' ',messages['messages'][i])
    # What re.sub() does:
    # re.sub(pattern, replacement, text)
    # ðŸ‘‰ It finds all characters matching pattern and replaces them with replacement.
    review=review.lower()
    review=review.split()
    review=[ps.stem(word) for word in review if not word in  stopwords.words('english')]
    # IN this line apply each words stopwords if this is not in stopwords replace word with poster stemming
    review=' '.join(review)
    corpus.append(review)
   

In [11]:
## Creating TF-IDF Model
from sklearn.feature_extraction.text import TfidfVectorizer
tv=TfidfVectorizer(max_features=2500,ngram_range=(1,2))
X=tv.fit_transform(corpus).toarray()

In [12]:
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [13]:
tv.vocabulary_

{'point': 1638,
 'cr': 330,
 'il': 844,
 'ble': 183,
 'gre': 728,
 'world': 2443,
 'ine': 880,
 'got': 715,
 'il ble': 845,
 'oke': 1453,
 'wif': 2410,
 'oni': 1484,
 'ree': 1728,
 'entri': 506,
 'wkli': 2430,
 'comp': 299,
 'win': 2414,
 'fin': 594,
 'ext': 562,
 'receiv': 1720,
 'question': 1696,
 'std': 2006,
 'txt': 2241,
 'te': 2062,
 'ppli': 1662,
 'ree entri': 1729,
 'std txt': 2007,
 'txt te': 2245,
 'te ppli': 2066,
 'dun': 450,
 'rli': 1795,
 'lre': 1106,
 'dy': 455,
 'lre dy': 1107,
 'think': 2128,
 'goe': 701,
 'usf': 2324,
 'live': 1044,
 'round': 1813,
 'though': 2140,
 'sg': 1893,
 'ey': 566,
 'rling': 1798,
 'week': 2367,
 'nd': 1247,
 'word': 2436,
 'ck': 262,
 'like': 1029,
 'fun': 647,
 'still': 2012,
 'ok': 1452,
 'send': 1876,
 '50': 106,
 'ree sg': 1731,
 'week nd': 2369,
 'ven': 2341,
 'brother': 210,
 'spe': 1981,
 'me': 1132,
 'hey': 762,
 'tre': 2205,
 'id': 817,
 'per': 1597,
 'request': 1756,
 'ell': 481,
 'ru': 1840,
 'ett': 533,
 'set': 1886,
 'llertun': 1

In [15]:
# Dependent Features
y=pd.get_dummies(messages['labels']).astype(int)
y=y.iloc[:,1].values

In [16]:
# Train Test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=42)

In [17]:
from sklearn.naive_bayes import MultinomialNB
spam_tfidf_model=MultinomialNB().fit(X_train,y_train)
spam_tfidf_model

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [21]:
# prediction
y_pred=spam_tfidf_model.predict(X_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [19]:
from sklearn.metrics import accuracy_score,classification_report
accuracy_score(y_pred,y_test)

0.979372197309417

In [20]:
classification_report(y_pred,y_test)

'              precision    recall  f1-score   support\n\n           0       1.00      0.98      0.99       985\n           1       0.86      0.98      0.92       130\n\n    accuracy                           0.98      1115\n   macro avg       0.93      0.98      0.95      1115\nweighted avg       0.98      0.98      0.98      1115\n'