# BOW example

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

corpus = ['Cats and dogs are not allowed', 'Cats and dogs are antagonistic']
count_vect = CountVectorizer()
X = count_vect.fit_transform(corpus)
X

import pandas as pd
 
df = pd.DataFrame()
df['vocabulary'] = count_vect.get_feature_names()
df['document1 vector'] = X.toarray()[0]
df['document2 vector'] = X.toarray()[1]
df.set_index('vocabulary', inplace=True)
print(df.T)

vocabulary        allowed  and  antagonistic  are  cats  dogs  not
document1 vector        1    1             0    1     1     1    1
document2 vector        0    1             1    1     1     1    0


# TF_IDF Example
tf = (no of times words appear in a doc)/(totoal no of words in the doc)
IDF - IDF measures of importance of word, taking into consideration the frequency of the word through out the corpus
idf = log(total no of docs/no of docs with the word in it)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
 
corpus = ['Cats have four legs',
          'Cats and dogs are antagonistic',
          'He hate dogs']
tfidf = TfidfVectorizer()
vect = tfidf.fit_transform(corpus)
 
df = pd.DataFrame()
df['vocabulary'] = tfidf.get_feature_names()
df['sentence1'] = vect.toarray()[0]
df['sentence2'] = vect.toarray()[1]
df['sentence3'] = vect.toarray()[2]
df.set_index('vocabulary', inplace=True)
print(df.T)

vocabulary       and  antagonistic       are      cats      dogs      four  \
sentence1   0.000000      0.000000  0.000000  0.402040  0.000000  0.528635   
sentence2   0.490479      0.490479  0.490479  0.373022  0.373022  0.000000   
sentence3   0.000000      0.000000  0.000000  0.000000  0.473630  0.000000   

vocabulary      hate      have        he      legs  
sentence1   0.000000  0.528635  0.000000  0.528635  
sentence2   0.000000  0.000000  0.000000  0.000000  
sentence3   0.622766  0.000000  0.622766  0.000000  


# ng-gram example
An N-Gram is a sequence of N-words in a sentence. Here, N is an integer which stands for the number of words in the sequence.

For example, if we put N=1, then it is referred to as a uni-gram. If you put N=2, then it is a bi-gram. If we substitute N=3, then it is a tri-gram.

In [10]:
from nltk import ngrams
sentence = 'I like dancing in the rain'
ngram = ngrams(sentence.split(' '),n=2)
for x in ngram:
    print(x)



('I', 'like')
('like', 'dancing')
('dancing', 'in')
('in', 'the')
('the', 'rain')


In [1]:
# import statments
import numpy
import re

'''
Tokenize each the sentences, example
Input : "John likes to watch movies. Mary likes movies too"
Ouput : "John","likes","to","watch","movies","Mary","likes","movies","too"
'''
def tokenize(sentences):
    words = []
    for sentence in sentences:
        w = word_extraction(sentence)
        words.extend(w)
        
    words = sorted(list(set(words)))
    return words

def word_extraction(sentence):
    ignore = ['a', "the", "is"]
    words = re.sub("[^\w]", " ",  sentence).split()
    cleaned_text = [w.lower() for w in words if w not in ignore]
    return cleaned_text    
    
def generate_bow(allsentences):    
    vocab = tokenize(allsentences)
    print("Word List for Document \n{0} \n".format(vocab));

    for sentence in allsentences:
        words = word_extraction(sentence)
        bag_vector = numpy.zeros(len(vocab))
        for w in words:
            for i,word in enumerate(vocab):
                if word == w: 
                    bag_vector[i] += 1
                    
        print("{0} \n{1}\n".format(sentence,numpy.array(bag_vector)))


allsentences = ["Joe waited for the train", "The train was late", "Mary and Samantha took the bus", 
            "I looked for Mary and Samantha at the bus station", 
            "Mary and Samantha arrived at the bus station early but waited until noon for the bus"]


generate_bow(allsentences)


Word List for Document 
['and', 'arrived', 'at', 'bus', 'but', 'early', 'for', 'i', 'joe', 'late', 'looked', 'mary', 'noon', 'samantha', 'station', 'the', 'took', 'train', 'until', 'waited', 'was'] 

Joe waited for the train 
[0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0.]

The train was late 
[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1.]

Mary and Samantha took the bus 
[1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0.]

I looked for Mary and Samantha at the bus station 
[1. 0. 1. 1. 0. 0. 1. 1. 0. 0. 1. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0.]

Mary and Samantha arrived at the bus station early but waited until noon for the bus 
[1. 1. 1. 2. 1. 1. 1. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 1. 1. 0.]

