# Spam Classifier
SMS spam collection Data set

[Download: Data Folder, Data set Description](https://archive.ics.uci.edu/dataset/228/sms+spam+collection)

# Importing dataset

In [1]:
import pandas as pd

In [2]:
# Load the tab-separated SMS Spam Collection file. It has no header row, so the
# two column names are supplied explicitly.
# NOTE(review): this load has 4825 ham rows, but I recall the dataset
# documentation listing 4,827 ham (5,574 messages) - TODO confirm. Unbalanced
# double quotes inside messages may be merging lines; check whether
# quoting=csv.QUOTE_NONE is needed.
messages = pd.read_csv(
    'sms_spam_collection/SMSSpamCollection',
    sep='\t',
    header=None,
    names=["Label", "message"],
)

In [3]:
# Dimensions of the loaded frame as (rows, columns).
messages.shape

(5572, 2)

In [4]:
# Preview the first rows to check that labels and message text were parsed into the right columns.
messages.head()

Unnamed: 0,Label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Summary of both text columns: row count, number of distinct values, most frequent value and its frequency.
messages.describe()

Unnamed: 0,Label,message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


# Data cleaning and preprocessing

In [6]:
import re
import nltk
#nltk.download('stopwords')

In [7]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

# Build the English stop-word lookup once and keep it in a set. Evaluating
# stopwords.words('english') inside the comprehension repeats the call for
# every single word and then scans the resulting list linearly, which is what
# made this cell slow.
english_stopwords = set(stopwords.words('english'))

corpus = []

# Iterate over the message values directly: messages['message'][i] is a label
# lookup and only works while the index is exactly 0..n-1.
for message in messages['message']:
    # Replace everything except ASCII letters with a space, lower-case the
    # text and split it into whitespace-separated tokens.
    review = re.sub('[^a-zA-Z]', ' ', message)
    review = review.lower().split()

    # Drop stop words and reduce each remaining word to its Porter stem.
    review = [ps.stem(word) for word in review if word not in english_stopwords]

    # Store the cleaned message as a single space-joined string.
    corpus.append(' '.join(review))


# Creating the Bag of Words model

In [8]:
# Creating the Bag of Words model: each cleaned message becomes a vector of term counts.
from sklearn.feature_extraction.text import CountVectorizer
# max_features=5000 caps the vocabulary at the 5000 most frequent terms in the
# corpus (drop the argument to keep every term).
cv = CountVectorizer(max_features=5000)
# NOTE(review): the vocabulary is fitted on the whole corpus, i.e. before the
# train/test split performed later - confirm this leakage is acceptable.
# .toarray() turns the sparse count matrix into a dense ndarray, which later cells index directly.
X = cv.fit_transform(corpus).toarray()

In [9]:
# Inspect the first cleaned message (lower-cased, stop words removed, words stemmed).
corpus[0]

'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'

In [10]:
# Shape of the document-term matrix: (number of messages, vocabulary size).
X.shape

(5572, 5000)

In [11]:
# Sanity check: one cleaned entry per input message.
len(corpus)

5572

In [12]:
# Spot-check a single cell of the document-term matrix: the count stored for
# message row 491 and vocabulary column 41.
element = X[491, 41]
print("Element at (491, 41):", element)


Element at (491, 41): 0


In [13]:
import numpy as np
# Tally how often each distinct count value occurs across the whole matrix X,
# which shows how sparse the bag-of-words representation is.
unique_elements, counts = np.unique(X, return_counts=True)
print("Unique elements:", unique_elements)
print("Counts:", counts)


Unique elements: [ 0  1  2  3  4  5  6  8 10 15 18]
Counts: [27816169    41654     1871      245       32       12       12        1
        1        1        2]


In [14]:
# One-hot encode the 'Label' column into integer indicator columns, one per
# distinct label ('ham' and 'spam').
y = pd.get_dummies(messages['Label'], dtype='int64')

In [15]:
# Preview the indicator columns produced by get_dummies.
y.head()

Unnamed: 0,ham,spam
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0


In [16]:
# Summary statistics of the indicator columns; the mean of each column is the share of that class.
y.describe()

Unnamed: 0,ham,spam
count,5572.0,5572.0
mean,0.865937,0.134063
std,0.340751,0.340751
min,0.0,0.0
25%,1.0,0.0
50%,1.0,0.0
75%,1.0,0.0
max,1.0,1.0


In [17]:
# Binary target: 1 for spam, 0 for ham.
# Built from `messages` rather than by re-slicing `y`, so re-running this cell
# always gives the same Series; selecting 'spam' by name also does not depend
# on the order of the indicator columns.
y = pd.get_dummies(messages['Label'], dtype='int64')['spam']

In [18]:
# Display the final target Series (the 'spam' indicator column).
y

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: spam, Length: 5572, dtype: int64

# Train test split

In [19]:
# train test split: hold out 20% of the rows for evaluation; random_state=0
# fixes the shuffle so the split is reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2, random_state= 0)

# Naive Bayes Classifier

In [20]:
# Training model using Naive Bayes Classifier
# MultinomialNB is fitted with its default settings on the training rows of the term-count matrix.

from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [21]:
# Predict a label for every held-out test message.
y_pred = spam_detect_model.predict(X_test)

In [22]:
# Show the predicted labels (NumPy abbreviates the array when printing).
print(y_pred)

[0 1 0 ... 0 1 0]


In [23]:
# One prediction per test row.
y_pred.shape

(1115,)

# Accuracy metrics

In [24]:
# metrics to measure the accuracy
# Confusion matrix comparing the actual test labels (y_test) with the predictions (y_pred).
from sklearn.metrics import confusion_matrix
confusion_m = confusion_matrix(y_test, y_pred) 

In [25]:
# Rows are actual classes and columns are predicted classes (scikit-learn convention).
confusion_m

array([[943,  12],
       [  8, 152]])

In [26]:
# accuracy score: fraction of test messages whose predicted label matches the actual label.
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)

In [27]:
# Display the test-set accuracy computed above.
accuracy

0.9820627802690582

# Conclusion
- The accuracy of the spam classifier model using a **Naive Bayes classifier with stemmed features is 98.2%.**
- The **code spent most of its time in the data cleaning and preprocessing step** due to the large dataset and nested loops.
- In the data cleaning step, **stop words such as (and, that, in, ...) were removed** as they do not contribute to the training process.
- A **Bag of Words** model was used to convert the tokenized text into a numerical representation.
- A split of 80% training data and 20% testing data was specified in `train_test_split`.