# Spam Classifier using Natural Language Processing

## Model accuracy is ~98%

In [1]:
# Importing pandas to read the csv file
import pandas as pd

# Load the tab-separated SMS dataset. The file carries no header row, so the
# two columns are named explicitly: 'label' (ham/spam) and 'message' (raw text).
# NOTE(review): with pandas' default quoting, a field that opens with a double
# quote can absorb the lines that follow it -- confirm the loaded row count
# matches the dataset's documented size.
messages = pd.read_csv(
    'SMSSpamCollection.txt',
    sep='\t',
    names=['label', 'message'],
)

In [2]:
# Display the loaded frame (pandas truncates the middle rows) to sanity-check
# the 'label' and 'message' columns.
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


## Data Cleaning and Data Processing

In [3]:
# Regex module: used by the cleaning loop to replace non-letter characters.
import re
import nltk

# English stop-word list and the Porter stemmer used during text cleaning.
# NOTE(review): stopwords.words(...) needs the NLTK "stopwords" corpus to be
# available locally; no nltk.download(...) call is visible in this notebook --
# confirm it has been fetched in the target environment.
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# One shared stemmer instance, reused for every word of every message.
ps = PorterStemmer()

### Steps for data cleaning:

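* Replace every character other than the letters A-Z and a-z (digits, punctuation, special characters) with a space
* Convert each message to lower case
* Split the text into individual words
* Drop English stop words and apply stemming to the remaining words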

In [11]:
# Build the cleaned corpus: one normalized, stemmed string per message.
#
# The stop-word list is loaded once and stored in a set. Calling
# stopwords.words('english') inside the comprehension re-read the list for
# every single word and searched it linearly, which dominated the runtime.
stop_words = set(stopwords.words('english'))

corpus = []

# Iterate over the column values directly instead of label-indexing with
# messages['message'][i]; the order is the same and it does not depend on the
# frame having a 0..n-1 index.
for message in messages['message']:
    # Replace anything that is not an ASCII letter with a space, then
    # lower-case and split on whitespace to get the raw tokens.
    letters_only = re.sub('[^a-zA-Z]', ' ', message)
    tokens = letters_only.lower().split()

    # Drop stop words and reduce every remaining token to its Porter stem.
    stems = [ps.stem(word) for word in tokens if word not in stop_words]

    # Re-join into a single string, the input format CountVectorizer expects.
    corpus.append(' '.join(stems))

In [15]:
# Bag-of-words model: turn each cleaned message into a vector of term counts.

from sklearn.feature_extraction.text import CountVectorizer

# Restrict the vocabulary to the 5,000 most frequent terms across the corpus;
# max_features is the knob to tune for a larger or smaller vocabulary.
cv = CountVectorizer(max_features=5000)

# fit_transform learns the vocabulary and returns a sparse count matrix;
# it is converted to a dense NumPy array here.
term_counts = cv.fit_transform(corpus)
x = term_counts.toarray()
x

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [16]:
# (rows, columns) of the count matrix: one row per message in `corpus`,
# one column per vocabulary term kept by the vectorizer.
x.shape

(5572, 5000)

In [17]:
# The classifier needs numeric targets, so the text labels are one-hot encoded.
# dtype='uint8' is requested explicitly so the result is a 0/1 integer array
# regardless of the pandas version's default dummy dtype.
label_dummies = pd.get_dummies(messages['label'], dtype='uint8')

# Keep only the 'spam' indicator (1 = spam, 0 = ham). The column is selected
# by name rather than by position so the target does not silently depend on
# the sort order of the label values; a missing 'spam' label raises KeyError.
y = label_dummies['spam'].values
y

array([0, 0, 1, ..., 0, 0, 0], dtype=uint8)

In [21]:
import nltk
import sklearn

# Record the library versions this notebook was run with, for reproducibility.
version_reports = (
    ('The nltk version is {}.', nltk.__version__),
    ('The scikit-learn version is {}.', sklearn.__version__),
)
for template, version in version_reports:
    print(template.format(version))

The nltk version is 3.5.
The scikit-learn version is 0.23.1.


In [23]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for evaluation. The fixed random_state keeps
# the shuffle, and therefore the reported metrics, reproducible.
split_parts = train_test_split(x, y, test_size=0.20, random_state=0)
x_train, x_test, y_train, y_test = split_parts

In [26]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes over the word-count features. fit() returns the
# estimator itself, so spam_detect_model is the trained classifier.
nb_classifier = MultinomialNB()
spam_detect_model = nb_classifier.fit(x_train, y_train)

In [27]:
# Predict spam (1) / ham (0) for the held-out test messages.
y_pred = spam_detect_model.predict(x_test)

In [28]:
# Evaluate the classifier with a confusion matrix.
# Rows are the true classes and columns the predicted classes, both ordered
# ham (0) then spam (1).

from sklearn.metrics import confusion_matrix

confusion_m = confusion_matrix(y_true=y_test, y_pred=y_pred)
confusion_m

array([[939,  16],
       [  8, 152]], dtype=int64)

In [30]:
# Overall accuracy: the fraction of test messages whose predicted label
# matches the true label.

from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

0.97847533632287