In [1]:
import pandas as pd

In [4]:
pd.set_option('display.max_colwidth', None) # print all string data on notebook 

In [5]:
df = pd.read_csv('SMSSpamCollection', sep='\t',
                names=["label", "message"])

In [6]:
df.head(10)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"
5,spam,"FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv"
6,ham,Even my brother is not like to speak with me. They treat me like aids patent.
7,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune
8,spam,WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.
9,spam,Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030


In [9]:
df["label"].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [10]:
import re # data cleaning and preprocessing 
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Abhijeet
[nltk_data]     Rane\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [12]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from tqdm.notebook import tqdm_notebook as tqdm


In [15]:
ps = PorterStemmer()

In [16]:
len(stopwords.words('english'))

179

In [17]:
corpus = []
with tqdm(total=len(df)) as pbar:
    for i in range(0, len(df)):
        review = re.sub('[^a-zA-z]', ' ', df['message'][i])
        review = review.lower()
        review = review.split()
        review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
        review = ' '.join(review)
        corpus.append(review)
        pbar.update(1)

HBox(children=(FloatProgress(value=0.0, max=5572.0), HTML(value='')))




In [18]:
from sklearn.feature_extraction.text import CountVectorizer

In [25]:
cv = CountVectorizer(max_features=2500)
X = cv.fit_transform(corpus).toarray()
y=pd.get_dummies(df['label'])
y=y.iloc[:,1].values


In [26]:
# train Test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [27]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [28]:
y_pred=spam_detect_model.predict(X_test)

In [29]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [30]:
cm = confusion_matrix(y_test,y_pred)
cm

array([[946,   9],
       [  8, 152]], dtype=int64)

In [31]:
acc = accuracy_score(y_test, y_pred)
acc

0.9847533632286996

In [32]:
rep = classification_report(y_test, y_pred)
print(rep)

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       955
           1       0.94      0.95      0.95       160

    accuracy                           0.98      1115
   macro avg       0.97      0.97      0.97      1115
weighted avg       0.98      0.98      0.98      1115

