### Bag Of Words Text Representation
In this technique, a dictionary/vocabulary of words are vectorized through word count and used in ML training for NLP. 
##### Limitations
1. Long vocabulary length (consumes memory & computing resources due to sparse representation->containing a lot of zeros)
2. Doesn't capture meaning of words properly.

In [1]:
import pandas as pd
import numpy as np

In [2]:
spam_url = r"E:\Datasets\NLP and Text\spam.csv"

In [9]:
df = pd.read_csv(spam_url)
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
df.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [14]:
def get_spam_number(x):
    if x == 'spam':
        return 1
    return 0

In [15]:
df['spam'] = df['Category'].apply(lambda x: 1 if x =='spam' else 0)

In [16]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [17]:
df.shape

(5572, 3)

In [18]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size = 0.2)

In [19]:
x_train.shape

(4457,)

In [20]:
x_test.shape

(1115,)

In [21]:
type(x_train)

pandas.core.series.Series

In [26]:
x_train[:4]

4681    That's cool he'll be here all night, lemme kno...
4890    Japanese Proverb: If one Can do it, U too Can ...
1442                           Ya:)going for restaurant..
472     How long has it been since you screamed, princ...
Name: Message, dtype: object

In [27]:
type(y_train)

pandas.core.series.Series

In [28]:
y_train[:4]

4681    0
4890    0
1442    0
472     0
Name: spam, dtype: int64

In [34]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

x_train_cv = v.fit_transform(x_train.values)
x_train_cv

<4457x7697 sparse matrix of type '<class 'numpy.int64'>'
	with 59475 stored elements in Compressed Sparse Row format>

In [36]:
x_train_cv.toarray()[:2]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [37]:
x_train_cv.shape

(4457, 7697)

In [41]:
v.get_feature_names_out().shape

(7697,)

In [42]:
dir(v) # shows all methods applicable

['__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_build_request_for_signature',
 '_char_ngrams',
 '_char_wb_ngrams',
 '_check_feature_names',
 '_check_n_features',
 '_check_stop_words_consistency',
 '_check_vocabulary',
 '_count_vocab',
 '_get_default_requests',
 '_get_metadata_request',
 '_get_param_names',
 '_get_tags',
 '_limit_features',
 '_more_tags',
 '_parameter_constraints',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_sort_features',
 '_stop_words_id',
 '_validate_data',
 '_validate_ngram_range',
 '_validate_params',
 '_validate_vocabulary',
 '_warn_for_unused_params',

In [44]:
v.vocabulary_

{'that': 6780,
 'cool': 2001,
 'he': 3354,
 'll': 4127,
 'be': 1259,
 'here': 3398,
 'all': 898,
 'night': 4755,
 'lemme': 4042,
 'know': 3930,
 'when': 7434,
 'you': 7661,
 're': 5568,
 'around': 1059,
 'japanese': 3776,
 'proverb': 5449,
 'if': 3583,
 'one': 4921,
 'can': 1627,
 'do': 2358,
 'it': 3738,
 'too': 6935,
 'none': 4789,
 'must': 4635,
 'indian': 3646,
 'version': 7233,
 'let': 4053,
 'him': 3420,
 'leave': 4029,
 'and': 950,
 'finally': 2830,
 'kerala': 3890,
 'stop': 6474,
 'doing': 2382,
 'make': 4295,
 'strike': 6498,
 'against': 847,
 'ya': 7622,
 'going': 3159,
 'for': 2920,
 'restaurant': 5725,
 'how': 3505,
 'long': 4158,
 'has': 3332,
 'been': 1280,
 'since': 6156,
 'screamed': 5938,
 'princess': 5394,
 'don': 2390,
 'this': 6820,
 'week': 7394,
 'to': 6899,
 'tirunelvai': 6876,
 'da': 2132,
 'dont': 2394,
 'worry': 7563,
 'day': 2171,
 'very': 7235,
 'big': 1339,
 'lambu': 3972,
 'ji': 3800,
 'vl': 7271,
 'come': 1904,
 'til': 6865,
 'then': 6795,
 'enjoy': 2581,

In [40]:
v.get_feature_names_out()[1000:1050]

array(['apologetic', 'apologise', 'apologize', 'apology', 'app',
       'apparently', 'appeal', 'appear', 'appendix', 'applebees',
       'apples', 'application', 'apply', 'applyed', 'appointment',
       'appointments', 'appreciate', 'appreciated', 'approaches',
       'approaching', 'appropriate', 'approve', 'approved', 'approx',
       'apps', 'appt', 'appy', 'april', 'aproach', 'apt', 'aptitude',
       'aquarius', 'ar', 'arab', 'arabian', 'arcade', 'archive', 'ard',
       'are', 'area', 'aren', 'arent', 'arestaurant', 'aretaking',
       'areyouunique', 'argh', 'argue', 'argument', 'aries', 'arise'],
      dtype=object)

In [47]:
x_train_np = x_train_cv.toarray()
x_train_np[:4][0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [55]:
np.where(x_train_np[0]!=0)

(array([ 898, 1059, 1259, 2001, 3354, 3398, 3930, 4042, 4127, 4755, 5568,
        6780, 7434, 7661], dtype=int64),)

In [56]:
x_train[:4][4681]

"That's cool he'll be here all night, lemme know when you're around"

In [57]:
x_train_np[0][0]

0

In [58]:
from sklearn.naive_bayes import MultinomialNB 

model = MultinomialNB()
model.fit(x_train_cv, y_train)

In [59]:
x_test_cv = v.transform(x_test)

In [60]:
from sklearn.metrics import classification_report

y_pred = model.predict(x_test_cv)

In [61]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99       963
           1       0.96      0.89      0.93       152

    accuracy                           0.98      1115
   macro avg       0.97      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [62]:
emails = [
    'Hey Mohan, can we get together to watch a football tomorrow?',
    'Up to 20% discount on parking, exclusive offer just for you. Dont miss on this reward!'
]

In [63]:
emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1], dtype=int64)

In [64]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [65]:
clf.fit(x_train, y_train)

In [69]:
y_pred = clf.predict(x_test)
print("Results: \n", classification_report(y_test, y_pred))

Results: 
               precision    recall  f1-score   support

           0       0.98      0.99      0.99       963
           1       0.96      0.89      0.93       152

    accuracy                           0.98      1115
   macro avg       0.97      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115

