In [1]:
import pandas as pd
import nltk

In [2]:
data = pd.read_csv('SMSSpamCollection',sep='\t',names=['labels','messages'])

### Data cleaning 

In [3]:
data.head()

Unnamed: 0,labels,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
data.isna().sum()

labels      0
messages    0
dtype: int64

In [5]:
data.shape

(5572, 2)

In [6]:
data = data.drop_duplicates()
data.shape

(5169, 2)

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5169 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   labels    5169 non-null   object
 1   messages  5169 non-null   object
dtypes: object(2)
memory usage: 121.1+ KB


In [8]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer

### Performing Lemmatization

In [9]:
wn = WordNetLemmatizer()
corpus = []

In [10]:
import re
for i in data['messages']:
    review = re.sub('^a-zA-Z',' ',i)
    review = review.lower()
    review = review.split()
    words = [wn.lemmatize(word) for word in review if word not in set(stopwords.words('english'))]
    corpus.append(' '.join(words))

### Bag Of Words

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
x_cv = cv.fit_transform(corpus).toarray()

In [12]:
y = pd.get_dummies(data['labels'])
y = y.iloc[:,1].values

### Model Buliding 

In [13]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x_cv,y,test_size=0.2,random_state=45)

In [14]:
from sklearn.naive_bayes import MultinomialNB
model_cv = MultinomialNB()

In [15]:
model_cv.fit(x_train,y_train)

In [16]:
pred = model_cv.predict(x_test)

In [17]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,mean_squared_error

In [18]:
accuracy_score(pred,y_test)

0.9796905222437138

### Confusion Matrix Of Bag Of Words

In [19]:
cfm = confusion_matrix(y_test,pred)
cfm

array([[886,  14],
       [  7, 127]], dtype=int64)

### Classification Report Of Bag Of Words

In [20]:
cfp = classification_report(y_test,pred)
print(cfp)

              precision    recall  f1-score   support

           0       0.99      0.98      0.99       900
           1       0.90      0.95      0.92       134

    accuracy                           0.98      1034
   macro avg       0.95      0.97      0.96      1034
weighted avg       0.98      0.98      0.98      1034



### TF - IDF

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer()
x_tf = tf.fit_transform(corpus).toarray()

In [22]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x_cv,y,test_size=0.2,random_state=45)

In [23]:
from sklearn.naive_bayes import MultinomialNB
model_tf = MultinomialNB()

In [24]:
model_tf.fit(x_train,y_train)

In [25]:
pred = model_tf.predict(x_test)

In [26]:
accuracy_score(pred,y_test)

0.9796905222437138

### Confusion Matrix of TF-IDF

In [27]:
confusion_matrix(y_test,pred)

array([[886,  14],
       [  7, 127]], dtype=int64)

### Classification Report of TF-IDF

In [28]:
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99       900
           1       0.90      0.95      0.92       134

    accuracy                           0.98      1034
   macro avg       0.95      0.97      0.96      1034
weighted avg       0.98      0.98      0.98      1034

