# **SPAM CLASSIFIER**

### **Loading the Dataset**

In [2]:
import pandas as pd
url = 'https://raw.githubusercontent.com/ShresthaSudip/SMS_Spam_Detection_DNN_LSTM_BiLSTM/master/SMSSpamCollection'
messages = pd.read_csv(url, sep ='\t',names=["label", "message"])
#messages = pd.read_csv('spam.csv', sep='\t',names=["label", "message"])

In [3]:
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
messages['message'].loc[2]

"Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"

In [5]:
messages['label'].loc[2]

'spam'

In [6]:
messages.isnull().sum()

label      0
message    0
dtype: int64

### **Text Preprocessing**

In [7]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

wordnet=WordNetLemmatizer()
corpus = []

for i in range(0, len(messages)):
    message = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
    message = message.lower()
    message = message.split()  
    message = [wordnet.lemmatize(word) for word in message if not word in stopwords.words('english')]
    message = ' '.join(message)
    corpus.append(message)

[nltk_data] Downloading package stopwords to /home/amits/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/amits/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/amits/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


### **Feature Extraction**

**Bag of Words**

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=2500)
X = cv.fit_transform(corpus).toarray()
print(X)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [9]:
messages['label']

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: label, Length: 5572, dtype: object

In [10]:
Y = pd.get_dummies(messages['label'])

In [11]:
Y

Unnamed: 0,ham,spam
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0
...,...,...
5567,0,1
5568,1,0
5569,1,0
5570,1,0


In [12]:
Y = Y.iloc[:,1].values

In [13]:
print(Y)

[0 0 1 ... 0 0 0]


### **Modeling**

**Multinomial Naive Bayes**

In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=0)

In [15]:
from sklearn.naive_bayes import MultinomialNB

spam_detect_model = MultinomialNB().fit(X_train, Y_train)

In [16]:
Y_pred = spam_detect_model.predict(X_test)

In [17]:
from sklearn.metrics import accuracy_score

score = accuracy_score(Y_test, Y_pred)
print(score)

0.9829596412556054


In [18]:
from sklearn.metrics import classification_report

print(classification_report(Y_pred, Y_test))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       956
           1       0.94      0.94      0.94       159

    accuracy                           0.98      1115
   macro avg       0.96      0.97      0.97      1115
weighted avg       0.98      0.98      0.98      1115



### **Feature Extraction**

**TF-IDF**

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

tv = TfidfVectorizer(max_features=2500)
X = tv.fit_transform(corpus).toarray()

### **Modeling**

**Multinomial Naive Bayes**

In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=0)

In [21]:
from sklearn.naive_bayes import MultinomialNB

spam_detect_model = MultinomialNB().fit(X_train, Y_train)

In [22]:
Y_pred = spam_detect_model.predict(X_test)

In [23]:
from sklearn.metrics import accuracy_score

score = accuracy_score(Y_test, Y_pred)
print(score)

0.979372197309417


In [24]:
from sklearn.metrics import classification_report

print(classification_report(Y_pred, Y_test))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       976
           1       0.86      0.99      0.92       139

    accuracy                           0.98      1115
   macro avg       0.93      0.99      0.96      1115
weighted avg       0.98      0.98      0.98      1115



**Random Forest Classifier**

In [25]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=0)

In [26]:
from sklearn.ensemble import RandomForestClassifier

spam_detect_model = RandomForestClassifier().fit(X_train, Y_train)

In [27]:
Y_pred = spam_detect_model.predict(X_test)

In [28]:
from sklearn.metrics import accuracy_score

score = accuracy_score(Y_test, Y_pred)
print(score)

0.9847533632286996


In [29]:
from sklearn.metrics import classification_report

print(classification_report(Y_pred, Y_test))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       972
           1       0.89      1.00      0.94       143

    accuracy                           0.98      1115
   macro avg       0.95      0.99      0.97      1115
weighted avg       0.99      0.98      0.99      1115

