In [1]:
import pandas as pd
import re
import nltk

In [2]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\arigilasrinivas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\arigilasrinivas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\arigilasrinivas\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
messages = pd.read_csv("SMSSpamCollection.tsv", sep='\t', names=["label","message"])

In [4]:
messages.head()

Unnamed: 0,label,message
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [5]:
messages.shape

(5568, 2)

In [6]:
messages['label'].unique()

array(['ham', 'spam'], dtype=object)

In [7]:
messages['label'].nunique()

2

In [8]:
messages['label'].value_counts()

ham     4822
spam     746
Name: label, dtype: int64

In [9]:
## taking one message and perform basic explorations

In [10]:
messages['message'][0]

"I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times."

In [11]:
sentences = nltk.sent_tokenize(messages['message'][0])
sentences

["I've been searching for the right words to thank you for this breather.",
 'I promise i wont take your help for granted and will fulfil my promise.',
 'You have been wonderful and a blessing at all times.']

In [12]:
words = nltk.word_tokenize(messages['message'][0])
words

['I',
 "'ve",
 'been',
 'searching',
 'for',
 'the',
 'right',
 'words',
 'to',
 'thank',
 'you',
 'for',
 'this',
 'breather',
 '.',
 'I',
 'promise',
 'i',
 'wont',
 'take',
 'your',
 'help',
 'for',
 'granted',
 'and',
 'will',
 'fulfil',
 'my',
 'promise',
 '.',
 'You',
 'have',
 'been',
 'wonderful',
 'and',
 'a',
 'blessing',
 'at',
 'all',
 'times',
 '.']

In [13]:
len(sentences),len(words)

(3, 41)

In [14]:
## Stemmer and Lemmatizer

In [15]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [16]:
stemmer=PorterStemmer()
lemmatizer=WordNetLemmatizer()

In [17]:
stem_sent = sentences
print(f'sentence before stemmer {stem_sent}')
for i in range(len(stem_sent)):
    words = nltk.word_tokenize(sentences[i])
    words = [stemmer.stem(word) for word in words if word not in set(stopwords.words('english'))]
    stem_sent[i] = ' '.join(words)

print(f'sentence after stemmer {stem_sent}')

sentence before stemmer ["I've been searching for the right words to thank you for this breather.", 'I promise i wont take your help for granted and will fulfil my promise.', 'You have been wonderful and a blessing at all times.']
sentence after stemmer ["I 've search right word thank breather .", 'I promis wont take help grant fulfil promis .', 'you wonder bless time .']


In [18]:
lemma_sent = sentences
print(f'sentence before lemmatizer {lemma_sent}')
for i in range(len(lemma_sent)):
    words = nltk.word_tokenize(sentences[i])
    words = [lemmatizer.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]
    lemma_sent[i] = ' '.join(words)
    
print(f'sentence after lemmatizer {lemma_sent}')    

sentence before lemmatizer ["I 've search right word thank breather .", 'I promis wont take help grant fulfil promis .', 'you wonder bless time .']
sentence after lemmatizer ["I 've search right word thank breather .", 'I promis wont take help grant fulfil promis .', 'wonder bless time .']


In [19]:
corpus = []

for i in range(len(sentences)):
    review = re.sub('[^a-z A-Z]',' ',sentences[i])
    review = review.lower()
    review = review.split()
    print(review)
    review = [lemmatizer.lemmatize(word) for word in review if word not in set(stopwords.words('english'))]
    review = ' '.join(review)
    print(review)
    corpus.append(review)
    
print(f'corpus : {corpus} and length is {len(corpus)}') 

['i', 've', 'search', 'right', 'word', 'thank', 'breather']
search right word thank breather
['i', 'promis', 'wont', 'take', 'help', 'grant', 'fulfil', 'promis']
promis wont take help grant fulfil promis
['wonder', 'bless', 'time']
wonder bless time
corpus : ['search right word thank breather', 'promis wont take help grant fulfil promis', 'wonder bless time'] and length is 3


In [20]:
## Bag of Words

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
X = cv.fit_transform(corpus).toarray()

X

array([[0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1],
       [0, 0, 1, 1, 1, 2, 0, 0, 1, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0]], dtype=int64)

In [22]:
## TF IDF

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer()
X = tf.fit_transform(corpus).toarray()

X

array([[0.        , 0.4472136 , 0.        , 0.        , 0.        ,
        0.        , 0.4472136 , 0.4472136 , 0.        , 0.4472136 ,
        0.        , 0.        , 0.        , 0.4472136 ],
       [0.        , 0.        , 0.33333333, 0.33333333, 0.33333333,
        0.66666667, 0.        , 0.        , 0.33333333, 0.        ,
        0.        , 0.        , 0.33333333, 0.        ],
       [0.57735027, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.57735027, 0.57735027, 0.        , 0.        ]])

In [24]:
### for SPAM ClASSIFIER MODEL

In [25]:
corpus = []

for i in range(len(messages)):
    review = re.sub('[^a-z A-Z]',' ',messages['message'][i])
    review = review.lower()
    review = review.split()

    review = [lemmatizer.lemmatize(word) for word in review if word not in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)


tf = TfidfVectorizer()
X = tf.fit_transform(corpus).toarray()
print(X)
print(len(X))

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
5568


In [26]:
y=pd.get_dummies(messages['label'])
print(y)

y=y.iloc[:,1].values
print(y)
len(y)

      ham  spam
0       1     0
1       0     1
2       1     0
3       1     0
4       1     0
...   ...   ...
5563    0     1
5564    1     0
5565    1     0
5566    1     0
5567    1     0

[5568 rows x 2 columns]
[0 1 0 ... 0 0 0]


5568

In [27]:
from sklearn.model_selection import train_test_split

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.20, random_state=0)

In [29]:
from sklearn.naive_bayes import MultinomialNB

In [30]:
NB_model = MultinomialNB()

In [31]:
span_detect_model = NB_model.fit(X_train,y_train)

In [32]:
y_train_pred = span_detect_model.predict(X_train)
y_train_pred

array([0, 1, 0, ..., 0, 0, 0], dtype=uint8)

In [33]:
y_test_pred = span_detect_model.predict(X_test)
y_test_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=uint8)

In [34]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [35]:
conf_mat = confusion_matrix(y_test, y_test_pred)
conf_mat

array([[955,   0],
       [ 33, 126]], dtype=int64)

In [36]:
class_rep = classification_report(y_test, y_test_pred)
print(class_rep)

              precision    recall  f1-score   support

           0       0.97      1.00      0.98       955
           1       1.00      0.79      0.88       159

    accuracy                           0.97      1114
   macro avg       0.98      0.90      0.93      1114
weighted avg       0.97      0.97      0.97      1114



In [37]:
model_accuracy = accuracy_score(y_test, y_test_pred)
model_accuracy

0.9703770197486535