<a href="https://colab.research.google.com/github/GKS07/NLP-Projects/blob/main/SMS_Spam_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

In [2]:
# Load the SMS Spam Collection from a local tab-separated file.
# The file carries no header row, so the two column names are given explicitly.

messages = pd.read_csv(
    '/content/SMSSpamCollection',
    sep='\t',
    names=['label', 'message'],
)

In [3]:
# Preview the first rows to confirm the label and message columns were parsed.
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Data cleaning and preprocessing: set up the regex and NLTK tooling.

import re
import nltk
# Fetch the NLTK stop-word corpus used by the cleaning loops in later cells.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

# Build the stop-word collection once. Calling stopwords.words('english')
# inside the comprehension rebuilds the list for every single token, and a
# set gives constant-time membership tests instead of a linear list scan.
english_stopwords = set(stopwords.words('english'))

corpus = []

# Iterate over the message values directly instead of looking each one up by
# integer label, so the loop does not depend on the frame having a default
# 0..n-1 index.
for message in messages['message']:
  # Keep letters only (digits and punctuation become spaces), lowercase the
  # text and split it into whitespace-separated tokens.
  words = re.sub('[^a-zA-Z]', ' ', message).lower().split()

  # Drop stop words, then reduce every remaining token to its Porter stem.
  stems = [ps.stem(word) for word in words if word not in english_stopwords]
  corpus.append(' '.join(stems))

In [7]:
# Inspect the cleaned form of the message at index 2.
corpus[2]

'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli'

In [8]:
# Show the raw text of the message at index 2, for comparison with corpus[2].
messages['message'][2]

"Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"

In [9]:
# Bag of Words: turn each cleaned message into a vector of token counts,
# limited to at most 5000 vocabulary entries.
# NOTE(review): the vocabulary is learned from every message, including the
# ones that later land in the test split - confirm this is acceptable.

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=5000)

bow_matrix = cv.fit_transform(corpus)
x = bow_matrix.toarray()

In [11]:
# Peek at the count vector produced for the message at index 2.
x[2]

array([0, 0, 0, ..., 0, 0, 0])

In [22]:
# Encode the label column as indicator columns, one per distinct label.
y = pd.get_dummies(messages['label'])
# y.head()  # uncomment to preview the indicator columns

In [23]:
# A single indicator column is enough: 1 marks spam, 0 marks ham.
# Derive it straight from the label column so that the cell can be re-run
# safely (`y = y.iloc[:, 1].values` fails on a second run because `y` is
# already an array by then), the result does not depend on the order of the
# dummy columns, and the dtype is uint8 regardless of the pandas version.

y = (messages['label'] == 'spam').astype('uint8').values

In [25]:
# Hold out 20% of the messages for evaluation; the fixed random_state keeps
# the split reproducible between runs.

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=0,
)

In [26]:
# Train the model using a multinomial Naive Bayes classifier.

from sklearn.naive_bayes import MultinomialNB

nb_classifier = MultinomialNB()
# fit() returns the fitted estimator itself, so both names refer to one model.
spam_detect_model = nb_classifier.fit(x_train, y_train)

In [27]:
# Predict labels for the held-out test features.

y_pred = spam_detect_model.predict(x_test)

In [29]:
# Display the predicted labels.
y_pred

array([0, 1, 0, ..., 0, 1, 0], dtype=uint8)

In [30]:
# Display the true labels of the test split for a visual comparison.
y_test

array([0, 1, 0, ..., 0, 1, 0], dtype=uint8)

In [31]:
# Confusion matrix: rows are the true classes, columns the predicted classes.

from sklearn.metrics import confusion_matrix

CM = confusion_matrix(y_true=y_test, y_pred=y_pred)
CM



array([[946,   9],
       [  8, 152]])

In [32]:
# Fraction of test messages whose predicted label matches the true label.
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy)

0.9847533632286996


So, by using stemming and Bag of Words, our model reaches an accuracy of about 98.5%.

## **Now let's try lemmatization and TF-IDF**

In [35]:
# Lemmatization

from nltk.stem import WordNetLemmatizer

# WordNetLemmatizer looks words up in the WordNet corpus, which is a separate
# download from the stop-word list. Without it a fresh kernel raises
# LookupError on the first lemmatize() call.
# NOTE(review): some NLTK releases also need nltk.download('omw-1.4') -
# confirm against the installed version.
nltk.download('wordnet')

lematizer = WordNetLemmatizer()

# Build the stop-word collection once. Calling stopwords.words('english')
# inside the comprehension rebuilds the list for every single token, and a
# set gives constant-time membership tests instead of a linear list scan.
english_stopwords = set(stopwords.words('english'))

corpus = []

# Iterate over the message values directly instead of looking each one up by
# integer label, so the loop does not depend on the frame having a default
# 0..n-1 index.
for message in messages['message']:
  # Keep letters only (digits and punctuation become spaces), lowercase the
  # text and split it into whitespace-separated tokens.
  words = re.sub('[^a-zA-Z]', ' ', message).lower().split()

  # Drop stop words, then map every remaining token to its WordNet lemma.
  lemmas = [lematizer.lemmatize(word) for word in words if word not in english_stopwords]
  corpus.append(' '.join(lemmas))

In [37]:
# TF-IDF: weight each token by its frequency in a message against how common
# it is across all messages, limited to at most 5000 vocabulary entries.
# NOTE(review): the vocabulary and IDF weights are learned from every message,
# including the ones that later land in the test split - confirm this is
# acceptable.

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000)

tfidf_matrix = tfidf.fit_transform(corpus)
x_lem = tfidf_matrix.toarray()

In [38]:
# Split the TF-IDF features with the same test size and seed as before.
# This rebinds x_train, x_test, y_train and y_test.

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x_lem,
    y,
    test_size=0.20,
    random_state=0,
)

In [39]:
# Train a fresh multinomial Naive Bayes classifier on the TF-IDF features.
# This rebinds spam_detect_model.

from sklearn.naive_bayes import MultinomialNB

nb_classifier = MultinomialNB()
# fit() returns the fitted estimator itself, so both names refer to one model.
spam_detect_model = nb_classifier.fit(x_train, y_train)

In [40]:
# Predict labels for the held-out TF-IDF test features.

y_pred_lem = spam_detect_model.predict(x_test)

In [41]:
# Confusion matrix for the TF-IDF model: rows are the true classes, columns
# the predicted classes.

from sklearn.metrics import confusion_matrix

CM = confusion_matrix(y_true=y_test, y_pred=y_pred_lem)
CM

array([[955,   0],
       [ 26, 134]])

In [42]:
# Fraction of test messages the TF-IDF model labelled correctly.
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_lem)
print(accuracy)

0.9766816143497757


By using TF-IDF and lemmatization we get about 97.7% accuracy, slightly below the 98.5% obtained with stemming and Bag of Words.