<h2 align='center'>NLP Tutorial: Text Representation - Bag Of Words (BOW)</h2>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Codebasics/nlp-tutorials/9_bag_of_words/spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df.Category.value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [5]:
df['spam'] = df['Category'].apply(lambda x: 1 if x =='spam' else 0)

In [6]:
df.shape

(5572, 3)

In [7]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


<h3>Train test split</h3>

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)

In [9]:
X_train.shape

(4457,)

In [10]:
X_test.shape

(1115,)

In [11]:
type(X_train)

pandas.core.series.Series

In [12]:
X_train[:4]

3723               I'm in a movie... Collect car oredi...
772     Lol! U drunkard! Just doing my hair at d momen...
2161    No. Its not specialisation. Can work but its s...
5303    I can. But it will tell quite long, cos i have...
Name: Message, dtype: object

In [13]:
type(y_train)

pandas.core.series.Series

In [14]:
y_train[:4]

3723    0
772     0
2161    0
5303    0
Name: spam, dtype: int64

In [15]:
type(X_train.values)

numpy.ndarray

<h3>Create bag of words representation using CountVectorizer</h3>

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<4457x7757 sparse matrix of type '<class 'numpy.int64'>'
	with 59433 stored elements in Compressed Sparse Row format>

In [17]:
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0])

In [18]:
X_train_cv.shape

(4457, 7757)

In [19]:
v.get_feature_names_out()[1771]

'cheesy'

In [20]:
v.vocabulary_

{'in': 3659,
 'movie': 4633,
 'collect': 1902,
 'car': 1655,
 'oredi': 5006,
 'lol': 4211,
 'drunkard': 2489,
 'just': 3895,
 'doing': 2418,
 'my': 4690,
 'hair': 3321,
 'at': 1114,
 'moment': 4589,
 'yeah': 7688,
 'still': 6493,
 'up': 7195,
 'tonight': 6982,
 'wats': 7410,
 'the': 6828,
 'plan': 5254,
 'no': 4820,
 'its': 3785,
 'not': 4852,
 'specialisation': 6373,
 'can': 1637,
 'work': 7601,
 'but': 1582,
 'slave': 6229,
 'labor': 4012,
 'will': 7526,
 'look': 4220,
 'for': 2959,
 'it': 3778,
 'this': 6865,
 'month': 4603,
 'sha': 6066,
 'cos': 2038,
 'shakara': 6071,
 'beggar': 1304,
 'tell': 6766,
 'quite': 5562,
 'long': 4216,
 'haven': 3376,
 'finish': 2876,
 'film': 2863,
 'yet': 7703,
 'yun': 7737,
 'buying': 1590,
 'school': 5955,
 'got': 3224,
 'offer': 4919,
 '2000': 345,
 'plus': 5281,
 'only': 4968,
 'why': 7510,
 'de': 2210,
 'you': 7717,
 'looking': 4224,
 'good': 3208,
 'maybe': 4424,
 'could': 2048,
 'get': 3140,
 'book': 1431,
 'out': 5031,
 'tomo': 6971,
 'then': 

In [21]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0])

In [22]:
np.where(X_train_np[0]!=0)

(array([1655, 1902, 3659, 4633, 5006]),)

In [24]:
X_train[:4][1579]

KeyError: ignored

In [25]:
X_train_np[0][1771]

0

<h3>Train the naive bayes model</h3>

In [26]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [27]:
X_test_cv = v.transform(X_test)

<h3>Evaluate Performance</h3>

In [28]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       965
           1       0.98      0.93      0.96       150

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [29]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1])

<h3>Train the model using sklearn pipeline and reduce number of lines of code</h3>

In [30]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [31]:
clf.fit(X_train, y_train)

In [32]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       965
           1       0.98      0.93      0.96       150

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115

