<a href="https://colab.research.google.com/github/Alvinkariuki/Sms-spam-detection/blob/main/SMS_spam_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


Spam Detection Algorithm



In [None]:
# Import libraries
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Load the data
# from google.colab import files
# uploaded = files.upload()

Saving spam.csv to spam (1).csv


In [None]:
# Read CSV file
df = pd.read_csv('spam.csv', encoding='latin-1')

#Print first 5 rows
df.head(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


Start preprocessing by making some adjustments to the dataset by extracting what we need from the dataset and leave out NaN values

In [None]:
# Begin preprocessing our data
email_df = pd.DataFrame()
email_df['spam'] = pd.Series(df.v1)
email_df['text'] = pd.Series(df.v2)

email_df.head(5)

Unnamed: 0,spam,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Encoding
We notice that the spam column contains discrete text, either ham or spam where spam may denote 1 and ham 0



In [None]:
lb = LabelEncoder()
email_df['spam'] = lb.fit_transform(email_df['spam'])

# Display first 5 values
email_df.head(5)

Unnamed: 0,spam,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Print the shape (number of rows and columns)
email_df.shape

(5572, 2)

In [None]:
# Get the column names 
email_df.columns

Index(['spam', 'text'], dtype='object')

In [None]:
# Check for duplicates and remove
email_df.drop_duplicates(inplace=True)

In [None]:
# Show new shape of df after dropping duplicate values
df.shape

(5572, 5)

There were no duplicate values in this dataset hence the number of rows remains the same

In [None]:
# Show number of missing data from each column (NAN , NaN, na)
email_df.isnull().sum()

spam    0
text    0
dtype: int64

In [None]:
# Download the stopwords package
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
def process_text(text):
  # Remove punctuation from text
  no_puncts = [char for char in text if char not in string.punctuation]
  no_puncts = ''.join(no_puncts)

  # Remove stopwords from text (tokenization) 
  clean_wrds = [word for word in no_puncts.split() if word.lower() not in stopwords.words('english')]

  # Return list of clean text words
  return clean_wrds

In [None]:
# Show tokenization (list of tokens)
email_df['text'].head().apply(process_text)

0    [Go, jurong, point, crazy, Available, bugis, n...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3        [U, dun, say, early, hor, U, c, already, say]
4    [Nah, dont, think, goes, usf, lives, around, t...
Name: text, dtype: object

In [34]:
# Convert a collection of text to matrix of tokens
from sklearn.feature_extraction.text import CountVectorizer
messages_bag_o_words = CountVectorizer(analyzer=process_text).fit_transform(email_df['text'])


In [35]:
# Split data into 80% train 20% test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(messages_bag_o_words, email_df['spam'], test_size=0.20, random_state=0)


In [36]:
# Get the shape of the messages_bag_o_words
messages_bag_o_words.shape

(5169, 11304)

In [37]:
# Create and train Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB().fit(X_train, y_train)

In [40]:
# Print the predictions
print(classifier.predict(X_train))

# Print actual target values
print(y_train.values)

[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]


# Model Evaluation

We will now check how well our model performs on the dataset

In [44]:
# Evaluate model on train data
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
predct = classifier.predict(X_train)
print(classification_report(y_train, predct))

print()

print("Confusion Matrix: \n", confusion_matrix(y_train, predct))

print()

print("Accuracy: ", accuracy_score(y_train, predct))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3631
           1       0.98      0.98      0.98       504

    accuracy                           1.00      4135
   macro avg       0.99      0.99      0.99      4135
weighted avg       1.00      1.00      1.00      4135


Confusion Matrix: 
 [[3623    8]
 [  11  493]]

Accuracy:  0.9954050785973397


# Test Data performance

We now evaluate how the model performs on the train data

In [45]:
# Print the predictions
print(classifier.predict(X_test))

# Print actual target values
print(y_test.values)

[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]


In [47]:
# Evaluate model on train data
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
predct = classifier.predict(X_test)
print(classification_report(y_test, predct))

print()

print("Confusion Matrix: \n", confusion_matrix(y_test, predct))

print()

print("Accuracy: ", accuracy_score(y_test, predct))

              precision    recall  f1-score   support

           0       0.99      0.96      0.97       885
           1       0.80      0.93      0.86       149

    accuracy                           0.96      1034
   macro avg       0.89      0.94      0.92      1034
weighted avg       0.96      0.96      0.96      1034


Confusion Matrix: 
 [[850  35]
 [ 11 138]]

Accuracy:  0.9555125725338491
