# Importing Libraries

In [55]:
import pandas as pd
import numpy as np

In [56]:
data=pd.read_csv(r'C:\Users\amany\Desktop\archive datasets\spam.csv')
print(data.shape)
data.head()

(5572, 5)


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


### Removing Unwanted Columns

In [57]:
data=data.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'],axis=1)
data.columns=['label','text']
data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [58]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5572 non-null   object
 1   text    5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


## Text Cleaning

In [59]:
import nltk
import re
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
lem=WordNetLemmatizer()

clean_data=[]
for text in data['text']:
    text=text.lower()
    text=re.sub('[^A-Za-z0-9]',' ',text)
    text=text.split()
    text=[lem.lemmatize(word) for word in text if word not in set(stopwords.words('english'))]
    text=' '.join(text)
    clean_data.append(text)

clean_data=pd.DataFrame(clean_data)
data['text']=clean_data

## Converting category into numbers

In [60]:
data['label'].replace(['ham','spam'],[1,0],inplace=True)

In [61]:
data.head()

Unnamed: 0,label,text
0,1,go jurong point crazy available bugis n great ...
1,1,ok lar joking wif u oni
2,0,free entry 2 wkly comp win fa cup final tkts 2...
3,1,u dun say early hor u c already say
4,1,nah think go usf life around though


In [62]:
data['label'].value_counts()

1    4825
0     747
Name: label, dtype: int64

## Handling Imbalance Data

In [63]:
from sklearn.utils import resample
train_majority = data[data.label==1]
train_minority = data[data.label==0]
train_minority_unsampled=resample(train_minority,
                                 replace=True,
                                 n_samples=len(train_majority),
                                 random_state=123)
new_data=pd.concat([train_minority_unsampled,train_majority])
new_data['label'].value_counts()

0    4825
1    4825
Name: label, dtype: int64

In [64]:
new_data.shape

(9650, 2)

## Applying Bag Of Words

In [65]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer()
train_data=cv.fit_transform(new_data['text'])
print(train_data.shape,test_data.shape)

(9650, 7963) (1, 7963)


## Splitting Data

In [66]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(train_data,new_data['label'],test_size=0.25,random_state=42)
x_train.shape,y_train.shape,x_test.shape,y_test.shape

((7237, 7963), (7237,), (2413, 7963), (2413,))

## Model Building

In [67]:
from sklearn.naive_bayes import MultinomialNB
model=MultinomialNB()
model.fit(x_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

## Evaluating Model

In [68]:
y_pred=model.predict(x_test)

from sklearn.metrics import accuracy_score,f1_score,classification_report,confusion_matrix

print("Accuracy : ",accuracy_score(y_test,y_pred))
print("F1 Score : ",f1_score(y_test,y_pred))
print("Classification Report : \n",classification_report(y_test,y_pred))
print("Confusion Matrix :\n ",confusion_matrix(y_test,y_pred))

Accuracy :  0.9763779527559056
F1 Score :  0.975796178343949
Classification Report : 
               precision    recall  f1-score   support

           0       0.98      0.98      0.98      1237
           1       0.97      0.98      0.98      1176

    accuracy                           0.98      2413
   macro avg       0.98      0.98      0.98      2413
weighted avg       0.98      0.98      0.98      2413

Confusion Matrix :
  [[1207   30]
 [  27 1149]]


## Testing the model

In [72]:
test=['Hey Plz Come and take me home!']
test_data=cv.transform(test)
result=model.predict(test_data)
if result[0]==1:
    print("You got ham sms!")
else:
    print("Dear,it's spam message! Sorry")

You got ham sms!


In [71]:
sms=['Hello, how are you!', 'Win money, win from home.', 'Call me now', 'Hello, Call you tomorrow?']
sms=cv.transform(sms)
result=model.predict(sms)
result
for i in range(len(result)):
    if result[i]==1:
        print("You got ham sms! Congrats")
    else:
        print("Dear,it's spam message! Sorry")

You got ham sms! Congrats
Dear,it's spam message! Sorry
Dear,it's spam message! Sorry
You got ham sms! Congrats


##### As we can our model accuracy is good and giving a good result!