In [1]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:
message = pd.read_csv("SMSSpamCollection", sep="\t", names=['Label','Message'])

In [3]:
message.head()

Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
#Data Cleaning and Preprocessing using stemming

In [5]:
import re
import nltk

In [6]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [7]:
pe = PorterStemmer()
corpus = []

In [8]:
for i in range(len(message)):
    review = re.sub('[^a-zA-Z]',' ',message['Message'][i])
    review = review.lower()
    review = review.split() #convert the sentences into list and break the string with seperator
    
    review = [pe.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review) #concatenate list string into a string
    corpus.append(review)

In [9]:
# Creating the Bag of Words model

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
cv = CountVectorizer(max_features=5000)
X = cv.fit_transform(corpus).toarray()

In [12]:
y = pd.get_dummies(message['Label'])
y = y.iloc[:,1]

In [13]:
# Train Test Split

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [15]:
# Training model using Naive bayes classifier

In [16]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train,y_train)

In [17]:
y_pred = spam_detect_model.predict(X_test)

In [18]:
from sklearn.metrics import confusion_matrix
confusion_m = confusion_matrix(y_test,y_pred)
confusion_m

array([[946,   9],
       [  8, 152]], dtype=int64)

In [19]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.9847533632286996

In [20]:
#Data Cleaning and Preprocessing using lemmatization

In [21]:
Messages = pd.read_csv("SMSSpamCollection", sep="\t", names=['Label','Message'])

In [22]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
Corpus = []

In [23]:
for i in range(len(Messages)):
    reviews = re.sub('[^a-zA-Z]',' ',Messages['Message'][i])
    reviews = reviews.lower()
    reviews = reviews.split() #convert the sentences into list and break the string with seperator
    
    reviews = [lemmatizer.lemmatize(word) for word in reviews if not word in stopwords.words('english')]
    reviews = ' '.join(reviews) #concatenate list string into a string
    Corpus.append(reviews)

In [24]:
# Creating the TF-IDF model

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer(5000)
X1 = cv.fit_transform(Corpus).toarray()

In [26]:
y1 = pd.get_dummies(Messages['Label'])
y1 = y1.iloc[:,1]

In [27]:
# Train Test Split

In [28]:
from sklearn.model_selection import train_test_split
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size = 0.20, random_state = 0)

In [29]:
# Training model using Naive bayes classifier

In [30]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model1 = MultinomialNB().fit(X_train1,y_train1)

In [31]:
y_pred1 = spam_detect_model1.predict(X_test1)

In [32]:
confusion_m1 = confusion_matrix(y_test1,y_pred1)
confusion_m1

array([[955,   0],
       [ 31, 129]], dtype=int64)

In [33]:
accuracy1 = accuracy_score(y_test1,y_pred1)
accuracy1

0.9721973094170404