<h2 align='center'>NLP : Text Representation - Bag Of Words (BOW)</h2>

In [4]:
import pandas as pd
import numpy as np

In [6]:
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
df.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [12]:
df['spam'] = df['Category'].apply(lambda x: 1 if x =='spam' else 0) # new colum for spam is 1 and not spam is 0

In [16]:
df.shape

(5572, 3)

In [18]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


<h3>Train test split</h3>

In [21]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)

In [23]:
X_train.shape

(4457,)

In [25]:
X_test.shape

(1115,)

In [27]:
type(X_train)

pandas.core.series.Series

In [29]:
X_train[:4]

24      Ffffffffff. Alright no way I can meet up with ...
4628    Please call our customer service representativ...
174     Bloody hell, cant believe you forgot my surnam...
68      Did you hear about the new "Divorce Barbie"? I...
Name: Message, dtype: object

In [31]:
type(y_train)

pandas.core.series.Series

In [33]:
y_train[:4]

24      0
4628    1
174     0
68      1
Name: spam, dtype: int64

In [35]:
type(X_train.values)

numpy.ndarray

<h3>Create bag of words representation using CountVectorizer</h3>

In [42]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

X_train_cv = v.fit_transform(X_train.values) # X_train.values --> numpy array
X_train_cv

# creating vector count of each word in the data

<4457x7783 sparse matrix of type '<class 'numpy.int64'>'
	with 59606 stored elements in Compressed Sparse Row format>

In [44]:
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [46]:
X_train_cv.shape # 7783 words in vocabulary

(4457, 7783)

In [52]:
v.get_feature_names_out()[1000:1050]

array(['apologetic', 'apologise', 'apologize', 'app', 'apparently',
       'appeal', 'appear', 'appendix', 'applausestore', 'applebees',
       'apples', 'application', 'apply', 'applyed', 'applying',
       'appointment', 'appointments', 'appreciate', 'appreciated',
       'approaching', 'approved', 'approx', 'apps', 'appt', 'appy',
       'april', 'aproach', 'aptitude', 'aquarius', 'ar', 'arab',
       'arabian', 'arcade', 'archive', 'ard', 'are', 'area', 'aren',
       'arent', 'arestaurant', 'aretaking', 'areyouunique', 'argh',
       'argue', 'arguing', 'argument', 'arguments', 'aries', 'arise',
       'arises'], dtype=object)

In [54]:
dir(v # gives all the methods

['__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_build_request_for_signature',
 '_char_ngrams',
 '_char_wb_ngrams',
 '_check_feature_names',
 '_check_n_features',
 '_check_stop_words_consistency',
 '_check_vocabulary',
 '_count_vocab',
 '_get_default_requests',
 '_get_metadata_request',
 '_get_param_names',
 '_get_tags',
 '_limit_features',
 '_more_tags',
 '_parameter_constraints',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_sort_features',
 '_stop_words_id',
 '_validate_data',
 '_validate_ngram_range',
 '_validate_params',
 '_validate_vocabulary',
 '_warn_for_unused_params',

In [50]:
v.vocabulary_

{'ffffffffff': 2827,
 'alright': 908,
 'no': 4811,
 'way': 7438,
 'can': 1627,
 'meet': 4445,
 'up': 7212,
 'with': 7578,
 'you': 7744,
 'sooner': 6339,
 'please': 5257,
 'call': 1604,
 'our': 5016,
 'customer': 2124,
 'service': 6046,
 'representative': 5732,
 'on': 4945,
 '0800': 48,
 '169': 315,
 '6031': 562,
 'between': 1324,
 '10am': 266,
 '9pm': 725,
 'as': 1073,
 'have': 3377,
 'won': 7609,
 'guaranteed': 3283,
 '1000': 258,
 'cash': 1674,
 'or': 4983,
 '5000': 531,
 'prize': 5432,
 'bloody': 1390,
 'hell': 3416,
 'cant': 1638,
 'believe': 1298,
 'forgot': 2960,
 'my': 4685,
 'surname': 6661,
 'mr': 4634,
 'ill': 3637,
 'give': 3163,
 'clue': 1856,
 'its': 3794,
 'spanish': 6373,
 'and': 943,
 'begins': 1292,
 'did': 2291,
 'hear': 3397,
 'about': 745,
 'the': 6844,
 'new': 4776,
 'divorce': 2359,
 'barbie': 1214,
 'it': 3786,
 'comes': 1904,
 'all': 896,
 'of': 4906,
 'ken': 3940,
 'stuff': 6570,
 'then': 6856,
 'get': 3135,
 'some': 6313,
 'together': 6968,
 'll': 4186,
 'text

In [56]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [60]:
np.where(X_train_np[0]!=0) # where indexes in not zero

(array([ 908, 1627, 2827, 4445, 4811, 6339, 7212, 7438, 7578, 7744],
       dtype=int64),)

In [66]:
X_train[:4][4628]

'Please call our customer service representative on 0800 169 6031 between 10am-9pm as you have WON a guaranteed £1000 cash or £5000 prize!'

In [52]:
X_train_np[0][1771]

1

<h3>Train the naive bayes model</h3>

In [68]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [70]:
X_test_cv = v.transform(X_test)

<h3>Evaluate Performance</h3>

In [73]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       966
           1       0.99      0.93      0.96       149

    accuracy                           0.99      1115
   macro avg       0.99      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115



In [75]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?', # Not spam
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!' # Spam
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1], dtype=int64)

<h3>Train the model using sklearn pipeline and reduce number of lines of code</h3>

In [78]:
# Directly create a pipeline instead of writing all the code.

from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [80]:
clf.fit(X_train, y_train)

In [82]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       966
           1       0.99      0.93      0.96       149

    accuracy                           0.99      1115
   macro avg       0.99      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115

