In [1]:
import pandas as pd


In [2]:
messages=pd.read_csv('spam.csv',
                    sep='\t',names=["label","message"])

In [3]:
messages

Unnamed: 0,label,message
0,Type,Message
1,ham,"Go until jurong point, crazy.. Available only ..."
2,ham,Ok lar... Joking wif u oni...
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...
4,ham,U dun say so early hor... U c already then say...
...,...,...
652,ham,Anything lor. Juz both of us lor.
653,ham,Get me out of this dump heap. My mom decided t...
654,ham,Ok lor... Sony ericsson salesman... I ask shuh...
655,ham,Ard 6 like dat lor.


### First we preprocess the data and store it in a corpus

In [4]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
snowball=SnowballStemmer('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KIIT0001\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
import re
corpus = []

stemmer = SnowballStemmer("english")
stop_words = set(stopwords.words('english'))

for msg in messages['message']:
    review = re.sub('[^a-zA-Z]', ' ', msg)
    review = review.lower()
    review = review.split()
    review = [stemmer.stem(word) for word in review if word not in stop_words]
    review = ' '.join(review)
    corpus.append(review)

corpus

['messag',
 'go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri wkli comp win fa cup final tkts st may text fa receiv entri question std txt rate c appli',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl week word back like fun still tb ok xxx std chgs send rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press copi friend callertun',
 'winner valu network custom select receivea prize reward claim call claim code kl valid hour',
 'mobil month u r entitl updat latest colour mobil camera free call mobil updat co free',
 'gonna home soon want talk stuff anymor tonight k cri enough today',
 'six chanc win cash pound txt csh send cost p day day tsandc appli repli hl info',
 'urgent week free membership prize jackpot txt word claim c www dbuk net lccltd pobox ldnw rw',
 'final match head to

### Apply BAG OF WORDS

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=5000,ngram_range=(1,3))
x=cv.fit_transform(corpus).toarray()

In [10]:
cv.vocabulary_

{'messag': 588,
 'go': 369,
 'point': 1816,
 'crazi': 210,
 'avail': 47,
 'great': 392,
 'world': 4722,
 'la': 503,
 'got': 388,
 'wat': 4305,
 'point crazi': 1819,
 'world la': 4725,
 'point crazi avail': 1820,
 'world la buffet': 4726,
 'ok': 1288,
 'lar': 510,
 'joke': 480,
 'wif': 4528,
 'oni': 1406,
 'ok lar': 1297,
 'wif oni': 4531,
 'ok lar joke': 1298,
 'free': 337,
 'entri': 290,
 'wkli': 4641,
 'comp': 185,
 'win': 4559,
 'fa': 314,
 'final': 321,
 'st': 3236,
 'may': 578,
 'text': 3493,
 'receiv': 2154,
 'question': 2020,
 'std': 3279,
 'txt': 3834,
 'rate': 2072,
 'appli': 28,
 'free entri': 340,
 'wkli comp': 4642,
 'win fa': 4564,
 'st may': 3238,
 'receiv entri': 2155,
 'question std': 2023,
 'std txt': 3282,
 'txt rate': 3855,
 'rate appli': 2073,
 'wkli comp win': 4645,
 'win fa cup': 4565,
 'st may text': 3239,
 'receiv entri question': 2156,
 'question std txt': 2024,
 'std txt rate': 3283,
 'txt rate appli': 3856,
 'dun': 269,
 'say': 2490,
 'earli': 271,
 'alreadi'

### Creating the output feature

In [17]:
import pandas as pd
y=pd.get_dummies(messages['label'])
y

Unnamed: 0,Type,ham,spam
0,True,False,False
1,False,True,False
2,False,True,False
3,False,False,True
4,False,True,False
...,...,...,...
652,False,True,False
653,False,True,False
654,False,True,False
655,False,True,False


In [18]:
y=y.iloc[:,1].values

In [19]:
y

array([False,  True,  True, False,  True,  True, False,  True,  True,
       False, False,  True, False, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True, False,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True, False,
        True,  True, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False,  True,  True,  True, False,  True,  True,  True, False,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True, False,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

## Training the ML model


### Train Test Split

In [20]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

### Using Naive Bayes Theorem


In [22]:
from sklearn.naive_bayes import MultinomialNB
classifier=MultinomialNB()
spam_detect_model=classifier.fit(x_train,y_train)


In [23]:
spam_detect_model.predict(x_test)

array([ True,  True,  True,  True, False,  True,  True,  True,  True,
       False,  True,  True,  True, False, False,  True,  True,  True,
       False, False,  True,  True,  True, False, False, False,  True,
       False,  True,  True,  True,  True, False,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True, False,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True, False,
        True,  True,  True,  True,  True,  True,  True,  True, False,
        True,  True,

### Measuring Accuracy

In [24]:
from sklearn.metrics import accuracy_score,classification_report

print(accuracy_score(y_test,spam_detect_model.predict(x_test)))
print(classification_report(y_test,spam_detect_model.predict(x_test)))

0.9090909090909091
              precision    recall  f1-score   support

       False       0.57      0.87      0.68        15
        True       0.98      0.91      0.95       117

    accuracy                           0.91       132
   macro avg       0.77      0.89      0.82       132
weighted avg       0.93      0.91      0.92       132



# But here we have trained the model based on the data after getting preprprocessed by BOW method
#
# But in real life we should impliment the BOW after the train test split
#
# So we need to use the train test split before applying BOW


Performing a train-test split before applying the Bag of Words (BoW) model is crucial for several reasons:

Preventing Data Leakage: The primary purpose of splitting the data is to avoid data leakage. If you apply BoW to the entire dataset before splitting, the model has access to information from the test set during training, which can lead to overly optimistic performance metrics. This is akin to studying the exam questions beforehand rather than preparing based on the syllabus (training data).

Generalization: By splitting the data first, the model learns to generalize better. The training set should represent the data with which the model will learn, while the test set is meant for evaluating how well the model can perform on unseen data.

Preprocessing: It’s essential to preprocess and clean the text data before the train-test split. However, once you split the data, any further transformations such as BoW should be applied only to the training set and then used to transform the test set. This ensures that the model does not learn any patterns from the test set.

In summary, performing the train-test split before applying the BoW model helps maintain the integrity of the evaluation process and ensures that your model's performance metrics reflect its true capabilities on unseen data.