<a href="https://colab.research.google.com/github/Celaena24/NLP/blob/main/BOW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [186]:
paragraph = """She likes to play outside.
My favorite color is red.
Let’s go to the playground.
Does his sister have a sister?
She goes to school to study."""

**Implementing BOW from scratch**

In [187]:
import numpy as np
import pandas as pd
import re

In [188]:
# getting the corpus ready
corpus = paragraph.split("\n")
corpus = [re.sub('[^a-zA-Z0-9]', ' ', document.lower()) for document in corpus]
corpus

['she likes to play outside ',
 'my favorite color is red ',
 'let s go to the playground ',
 'does his sister have a sister ',
 'she goes to school to study ']

In [189]:
# List of words in the corpus
words = []
for sent in corpus:
  for word in sent.split():
    words.append(word)

In [190]:
# Creating the vocabulary dictionary
vocab = set(words)
vocab

{'a',
 'color',
 'does',
 'favorite',
 'go',
 'goes',
 'have',
 'his',
 'is',
 'let',
 'likes',
 'my',
 'outside',
 'play',
 'playground',
 'red',
 's',
 'school',
 'she',
 'sister',
 'study',
 'the',
 'to'}

In [191]:
# Calulating BOW for each document in the corpus
def calculate_BOW(doc):
  dic = {}
  for word in vocab:
    dic[word] = doc.split().count(word)
  return dic


In [193]:
# List of BOW for each document in the corpus
l = []
for doc in corpus:
  l.append(calculate_BOW(doc))
df = pd.DataFrame(l, index=corpus)
df

Unnamed: 0,let,my,likes,favorite,school,playground,a,does,outside,s,...,sister,play,the,study,to,have,color,his,goes,red
she likes to play outside,0,0,1,0,0,0,0,0,1,0,...,0,1,0,0,1,0,0,0,0,0
my favorite color is red,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
let s go to the playground,1,0,0,0,0,1,0,0,0,1,...,0,0,1,0,1,0,0,0,0,0
does his sister have a sister,0,0,0,0,0,0,1,1,0,0,...,2,0,0,0,0,1,0,1,0,0
she goes to school to study,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,2,0,0,0,1,0


**Implementing BOW using scikit-learn and nltk**


In [194]:
!pip install nltk



In [195]:
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [196]:
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [197]:
corpus = nltk.sent_tokenize(paragraph)
corpus

['She likes to play outside.',
 'My favorite color is red.',
 'Let’s go to the playground.',
 'Does his sister have a sister?',
 'She goes to school to study.']

In [198]:
# lemmatization
lemmatizer = WordNetLemmatizer()
for i in range(len(corpus)):
  corpus[i] = " ".join([lemmatizer.lemmatize(word.lower()) for word in corpus[i].split()])
corpus

['she like to play outside.',
 'my favorite color is red.',
 'let’s go to the playground.',
 'doe his sister have a sister?',
 'she go to school to study.']

In [163]:
cv = CountVectorizer(stop_words='english')
X = cv.fit_transform(corpus)

In [176]:
feat_names = cv.get_feature_names_out()
feat_names

array(['color', 'does', 'favorite', 'goes', 'let', 'likes', 'outside',
       'play', 'playground', 'red', 'school', 'sister'], dtype=object)

In [177]:
doc_array = X.toarray()
doc_array

array([[0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0],
       [1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0]])

In [178]:
frequency_matrix = pd.DataFrame(doc_array, index=corpus, columns=feat_names)
frequency_matrix

Unnamed: 0,color,does,favorite,goes,let,likes,outside,play,playground,red,school,sister
She likes to play outside.,0,0,0,0,0,1,1,1,0,0,0,0
My favorite color is red.,1,0,1,0,0,0,0,0,0,1,0,0
Let’s go to the playground.,0,0,0,0,1,0,0,0,1,0,0,0
Does he have a sister?,0,1,0,0,0,0,0,0,0,0,0,1
She goes to school.,0,0,0,1,0,0,0,0,0,0,1,0
