In [2]:
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt 

**Load Dataset**

In [3]:
dataset = pd.read_csv("spam.csv",encoding='latin-1')[['v1','v2']]
dataset.columns = ['label','messages']
dataset.head(5)

Unnamed: 0,label,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


**Preprocess the data**

In [4]:
dataset.isnull().sum()

label       0
messages    0
dtype: int64

In [61]:
# checking the balacing of the data 
dataset.value_counts(dataset["label"])

label
0    4825
1     747
Name: count, dtype: int64

In [6]:
dataset["label"] = dataset["label"].map({'ham':0,'spam':1})

In [7]:
dataset.head(4)

Unnamed: 0,label,messages
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...


**split the data**

In [17]:
x = dataset["messages"]
y = dataset["label"]

**Using the Tf-IDF vectorizer**

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [21]:
vectorzier = TfidfVectorizer(stop_words="english",max_features=3000)
x_idf = vectorzier.fit_transform(x)

In [22]:
x_idf.shape

(5572, 3000)

In [23]:
y.shape

(5572,)

**Balancing the data**

In [30]:
from imblearn.over_sampling import RandomOverSampler

In [31]:
ros = RandomOverSampler(random_state=42)
x_resampled , y_resampled = ros.fit_resample(x_idf,y)

In [32]:
y_resampled.value_counts()

label
0    4825
1    4825
Name: count, dtype: int64

**Applying the training and testing**

In [36]:
from sklearn.model_selection import train_test_split

In [37]:
x_train , x_test , y_train , y_test = train_test_split(x_idf,y,test_size=0.2,random_state=42)

**Train the model**

In [38]:
from sklearn.naive_bayes import MultinomialNB

In [39]:
nb = MultinomialNB()
nb.fit(x_train,y_train)

**Evaluate the model**

In [40]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

In [57]:
y_pred = nb.predict(x_train)
score = nb.score(x_test,y_test) , nb.score(x_train,y_train)

In [44]:
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [60]:
print("Accuracy for testing data and training data is :", score)

Accuracy for testing data and training data is : (0.9820627802690582, 0.9863136638994839)


**Tru Custom Message**

In [58]:
sample = ["Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's,,,"]
sample_vectorizer = vectorzier.transform(sample)
if nb.predict(sample_vectorizer)[0]==1:
    print("It is a Spam")
else:
    print("Not spam")

It is a Spam
