## Importing the essential libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Importing the dataset

In [2]:
# The dataset file is tab-separated; column names are supplied explicitly
# through `names`, so every row of the file is read as data.
data = pd.read_csv(
    "SMSSpamCollection",
    sep="\t",
    names=["labels", "message"],
)
data.head()

Unnamed: 0,labels,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Report, per column, whether at least one value is missing.
data.isna().any()

labels     False
message    False
dtype: bool

## Cleaning the texts

In [4]:
# cleaning the data
import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Text normalisers: a Porter stemmer and a WordNet lemmatizer.
# NOTE(review): only `wordnet` is referenced by the cells visible here;
# `ps` may be unused - confirm before removing.
ps = PorterStemmer()
wordnet = WordNetLemmatizer()

In [5]:
# Normalise every SMS: replace anything that is not an ASCII letter with a
# space, lower-case, split on whitespace, drop English stop words, lemmatise
# the remaining tokens and re-join them into one string per message.
#
# The stop-word list is fetched once and stored in a set. Calling
# stopwords.words("english") inside the comprehension repeats that call for
# every token and makes each membership test a linear scan of a list.
english_stopwords = set(stopwords.words("english"))
non_letters = re.compile("[^a-zA-Z]")

messages = []
for raw_message in data["message"]:
    tokens = non_letters.sub(" ", raw_message).lower().split()
    kept_tokens = [
        wordnet.lemmatize(token)
        for token in tokens
        if token not in english_stopwords
    ]
    messages.append(" ".join(kept_tokens))

## Bag of Words

In [6]:
# Bag of words: turn the cleaned messages into a dense document-term count
# matrix, with the vocabulary capped by max_features=2500.
# NOTE(review): the vocabulary is fitted on all messages, i.e. before the
# train/test split performed later in the notebook.
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=2500)
term_counts = cv.fit_transform(messages)
x = term_counts.toarray()

## Encoding the data

In [7]:
# Encode the target: build one indicator column per label value and keep the
# second column as a NumPy array.
# NOTE(review): with the ham/spam labels shown in the preview this is
# presumably the "spam" indicator - verify the column order.
label_indicators = pd.get_dummies(data["labels"])
y = label_indicators.iloc[:, 1].to_numpy()

## Splitting the data

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; random_state=0 makes the split
# repeatable between runs.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

## Training the Naive Bayes Model

In [9]:
# Fit a multinomial Naive Bayes classifier on the training split.
from sklearn.naive_bayes import MultinomialNB

# fit() returns the fitted estimator, so construction and training chain.
spam_detect = MultinomialNB().fit(xtrain, ytrain)

## Predicting the results

In [10]:
# Predict a label for every row of the held-out split and display the array.
y_pred = spam_detect.predict(xtest)
y_pred

array([0, 1, 0, ..., 0, 1, 0], dtype=uint8)

## Model Evaluation

In [11]:
# Accuracy of the classifier on the held-out split.
from sklearn.metrics import accuracy_score

# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy gives
# the same value either way, but keeping the documented order avoids wrong
# results if this call is later swapped for an asymmetric metric such as
# precision or recall.
accuracy = accuracy_score(ytest, y_pred)
print(accuracy)

0.9829596412556054
