# Text Representation - Bag Of Words (BOW)

In [1]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [21]:
df.shape

(5572, 3)

In [4]:
df.Category.value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [9]:
# def get_spam_number(x):
#     if x=="spam":
#         return 1
#     return 0

In [6]:
df['Spam'] = df['Category'].apply(lambda x: 1 if x=='spam' else 0)

In [7]:
df.head()

Unnamed: 0,Category,Message,Spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


## Train test split

In [14]:
X = df['Message']
y = df['Spam']

In [15]:
from sklearn.model_selection import train_test_split
X_train , X_test, y_train, y_test = train_test_split(X,y, train_size = 0.8)

In [17]:
len(X_train)

4457

In [18]:
len(X_test)

1115

In [19]:
len(y_train)

4457

In [22]:
len(y_test)

1115

In [23]:
type(X_train)

pandas.core.series.Series

In [27]:
type(y_train)

pandas.core.series.Series

In [26]:
X_train[:4]

5146    Oh unintentionally not bad timing. Great. Fing...
1602    Carlos is taking his sweet time as usual so le...
1188    There's no point hangin on to mr not right if ...
2996    Mm not entirely sure i understood that text bu...
Name: Message, dtype: object

In [28]:
y_train[:4]

5146    0
1602    0
1188    0
2996    0
Name: Spam, dtype: int64

In [29]:
type(X_train.values)

numpy.ndarray

## Create bag of words representation using CountVectorizer

In [48]:
from sklearn.feature_extraction.text import CountVectorizer

x = CountVectorizer()
X_train_cv = x.fit_transform(X_train.values)
X_train_cv 

<4457x7806 sparse matrix of type '<class 'numpy.int64'>'
	with 59620 stored elements in Compressed Sparse Row format>

In [49]:
X_train_cv.toarray()[:3]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [58]:
X_train_cv.shape  #4457 emails and each email is a size of vector 7806

(4457, 7806)

In [51]:
x.get_feature_names_out()[1000:1050]

array(['anti', 'any', 'anybody', 'anymore', 'anyone', 'anyplaces',
       'anythin', 'anything', 'anythingtomorrow', 'anytime', 'anyway',
       'anyways', 'anywhere', 'aom', 'apart', 'apartment', 'apes',
       'apeshit', 'aphex', 'apnt', 'apologetic', 'apologise', 'apologize',
       'app', 'apparently', 'appear', 'applausestore', 'applebees',
       'apples', 'application', 'apply', 'applyed', 'applying',
       'appointment', 'appointments', 'appreciate', 'appreciated',
       'approaches', 'approaching', 'approve', 'approx', 'apps', 'appt',
       'appy', 'april', 'aproach', 'apt', 'aptitude', 'aquarius', 'ar'],
      dtype=object)

In [52]:
dir(x)

['__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_char_ngrams',
 '_char_wb_ngrams',
 '_check_feature_names',
 '_check_n_features',
 '_check_stop_words_consistency',
 '_check_vocabulary',
 '_count_vocab',
 '_get_param_names',
 '_get_tags',
 '_limit_features',
 '_more_tags',
 '_parameter_constraints',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_sort_features',
 '_stop_words_id',
 '_validate_data',
 '_validate_ngram_range',
 '_validate_params',
 '_validate_vocabulary',
 '_warn_for_unused_params',
 '_white_spaces',
 '_word_ngrams',
 'analyzer',
 'binary',
 'build_analyzer',
 'build_preprocessor',
 'bui

In [None]:
x.vocabulary_

In [53]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [54]:
np.where(X_train_np[0]!=0)

(array([ 929, 1210, 2854, 2885, 3176, 3266, 4510, 4862, 4952, 5288, 6868,
        6955, 7063, 7220, 7446, 7580], dtype=int64),)

In [59]:
X_train[:4][5146]



In [60]:
X_train_np[0][929]

1

## Train the naive bayes model 

In [61]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [63]:
X_test_cv = x.transform(X_test)

## Evaluate Performance

In [64]:
from sklearn.metrics import classification_report
y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       959
           1       0.99      0.94      0.96       156

    accuracy                           0.99      1115
   macro avg       0.99      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115



In [66]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = x.transform(emails)
model.predict(emails_count)

array([0, 1], dtype=int64)

## Train the model using sklearn pipeline and reduce number of lines of code

In [67]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [68]:
clf.fit(X_train ,y_train)

In [69]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       959
           1       0.99      0.94      0.96       156

    accuracy                           0.99      1115
   macro avg       0.99      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115

