<a href="https://colab.research.google.com/github/AshmithaMB/NLP/blob/main/Spam_or_Ham.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#####Natural Language Processing & Machine Learning

###Detecting Spam & Ham Emails 

#####Importing the Libraries

In [24]:
import pandas as pd 
import string #to handle punctuations and special characters
from nltk.corpus import stopwords #to andle stopwords dictionary
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics

In [25]:
df_spam = pd.read_csv("/content/NB.csv" , encoding = 'ISO-8859-1')

In [26]:
df_spam.head(10)

Unnamed: 0,type,text
0,ham,Hope you are having a good week. Just checking in
1,ham,K..give back my thanks.
2,ham,Am also doing in cbe only. But have to pay.
3,spam,"complimentary 4 STAR Ibiza Holiday or å£10,000..."
4,spam,okmail: Dear Dave this is your final notice to...
5,ham,Aiya we discuss later lar... Pick u up at 4 is...
6,ham,Are you this much buzy
7,ham,Please ask mummy to call father
8,spam,Marvel Mobile Play the official Ultimate Spide...
9,ham,"fyi I'm at usf now, swing by the room whenever"


In [27]:
df_spam.shape

(5559, 2)

#### Data pre-processing + Feature Extraction

In [28]:
def message_text_process(message):
  no_punctuation = [char for char in message if char not in string.punctuation]
  no_punctuation = ''.join(no_punctuation)
  return [word for word in no_punctuation.split() if word.lower() not in stopwords.words('english')]

In [29]:
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [30]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [31]:
df_spam['text'].head(11).apply(message_text_process)

0                          [Hope, good, week, checking]
1                                 [Kgive, back, thanks]
2                                      [also, cbe, pay]
3     [complimentary, 4, STAR, Ibiza, Holiday, å£100...
4     [okmail, Dear, Dave, final, notice, collect, 4...
5               [Aiya, discuss, later, lar, Pick, u, 4]
6                                          [much, buzy]
7                    [Please, ask, mummy, call, father]
8     [Marvel, Mobile, Play, official, Ultimate, Spi...
9                 [fyi, Im, usf, swing, room, whenever]
10    [Sure, thing, big, man, hockey, elections, 6, ...
Name: text, dtype: object

In [33]:
bag_words = CountVectorizer(analyzer = message_text_process).fit(df_spam['text'])

In [36]:
print(len(bag_words.vocabulary_))

11356


In [37]:
print(bag_words)

CountVectorizer(analyzer=<function message_text_process at 0x7f5b26024c10>)


In [39]:
message_bagwords = bag_words.transform(df_spam['text'])

In [40]:
print(message_bagwords)

  (0, 2254)	1
  (0, 5414)	1
  (0, 6847)	1
  (0, 10943)	1
  (1, 2489)	1
  (1, 4865)	1
  (1, 10332)	1
  (2, 4611)	1
  (2, 5352)	1
  (2, 8719)	1
  (3, 262)	1
  (3, 605)	1
  (3, 1296)	1
  (3, 2244)	1
  (3, 2329)	1
  (3, 2588)	1
  (3, 3523)	1
  (3, 4066)	1
  (3, 5331)	1
  (3, 5554)	1
  (3, 5601)	1
  (3, 7847)	1
  (3, 8343)	1
  (3, 11304)	1
  (4, 224)	1
  :	:
  (5557, 509)	1
  (5557, 965)	1
  (5557, 1721)	1
  (5557, 1777)	1
  (5557, 1778)	1
  (5557, 2403)	1
  (5557, 2404)	1
  (5557, 3278)	1
  (5557, 3397)	2
  (5557, 3486)	1
  (5557, 4454)	1
  (5557, 7076)	1
  (5557, 7594)	1
  (5557, 7688)	1
  (5557, 8015)	1
  (5557, 9120)	1
  (5557, 9755)	1
  (5557, 10077)	1
  (5557, 10637)	1
  (5557, 10715)	1
  (5557, 11121)	1
  (5558, 3627)	1
  (5558, 5243)	1
  (5558, 5867)	1
  (5558, 6606)	1


Importing Term Frequency Inverse Document Frequency Transformer

In [42]:
tfidf_transformer = TfidfTransformer().fit(message_bagwords)

In [43]:
message_tfidf = tfidf_transformer.transform(message_bagwords)

In [44]:
print(message_tfidf.shape)
print(message_tfidf)

(5559, 11356)
  (0, 10943)	0.4379039903544475
  (0, 6847)	0.4079211665788139
  (0, 5414)	0.6229255062832731
  (0, 2254)	0.5037898676138289
  (1, 10332)	0.53343447866409
  (1, 4865)	0.3941617905371652
  (1, 2489)	0.7483876935472058
  (2, 8719)	0.5395126957976536
  (2, 5352)	0.6872527142806397
  (2, 4611)	0.486425490478287
  (3, 11304)	0.2886588513405475
  (3, 8343)	0.26016042466750977
  (3, 7847)	0.26016042466750977
  (3, 5601)	0.26016042466750977
  (3, 5554)	0.2332211897066975
  (3, 5331)	0.20699668006135605
  (3, 4066)	0.2169360073935419
  (3, 3523)	0.32915177840434773
  (3, 2588)	0.2780553934168467
  (3, 2329)	0.2953789049732685
  (3, 2244)	0.24076955349243523
  (3, 1296)	0.32915177840434773
  (3, 605)	0.147541827236046
  (3, 262)	0.32915177840434773
  (4, 8523)	0.2839777476652211
  :	:
  (5557, 10715)	0.11021052039206913
  (5557, 10637)	0.08306947846909804
  (5557, 10077)	0.2120492716727746
  (5557, 9755)	0.2417954431547904
  (5557, 9120)	0.1353368128310807
  (5557, 8015)	0.17033997

Building a Model

In [45]:
X = message_tfidf
print(X)
y = df_spam['type']
print(y)

  (0, 10943)	0.4379039903544475
  (0, 6847)	0.4079211665788139
  (0, 5414)	0.6229255062832731
  (0, 2254)	0.5037898676138289
  (1, 10332)	0.53343447866409
  (1, 4865)	0.3941617905371652
  (1, 2489)	0.7483876935472058
  (2, 8719)	0.5395126957976536
  (2, 5352)	0.6872527142806397
  (2, 4611)	0.486425490478287
  (3, 11304)	0.2886588513405475
  (3, 8343)	0.26016042466750977
  (3, 7847)	0.26016042466750977
  (3, 5601)	0.26016042466750977
  (3, 5554)	0.2332211897066975
  (3, 5331)	0.20699668006135605
  (3, 4066)	0.2169360073935419
  (3, 3523)	0.32915177840434773
  (3, 2588)	0.2780553934168467
  (3, 2329)	0.2953789049732685
  (3, 2244)	0.24076955349243523
  (3, 1296)	0.32915177840434773
  (3, 605)	0.147541827236046
  (3, 262)	0.32915177840434773
  (4, 8523)	0.2839777476652211
  :	:
  (5557, 10715)	0.11021052039206913
  (5557, 10637)	0.08306947846909804
  (5557, 10077)	0.2120492716727746
  (5557, 9755)	0.2417954431547904
  (5557, 9120)	0.1353368128310807
  (5557, 8015)	0.17033997821294367
  (5

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2)

In [48]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(4447, 11356)
(1112, 11356)
(4447,)
(1112,)


In [49]:
spam_detect = svm.SVC(kernel = 'linear', C= 1, gamma = 10). fit(X_train, y_train)

In [51]:
predicted = spam_detect.predict(X_test)

In [52]:
predicted

array(['ham', 'spam', 'ham', ..., 'spam', 'ham', 'ham'], dtype=object)

In [53]:
expected = y_test
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       952
        spam       0.99      0.88      0.93       160

    accuracy                           0.98      1112
   macro avg       0.98      0.94      0.96      1112
weighted avg       0.98      0.98      0.98      1112

[[950   2]
 [ 19 141]]


In [54]:
metrics.accuracy_score(expected, predicted)

0.9811151079136691