In [144]:
from nltk.tokenize import word_tokenize, sent_tokenize
import pandas as pd
import numpy as np
import re
import string

In [145]:
text_data = "India is a large democracy . It is seventh largest country in the world . Also population wise it is the second largest among all the countries ."

## <font color="Orange">Manual</font>

In [146]:
sentences = [sent.lower() for sent in sent_tokenize(text_data)]
words = word_tokenize(text_data)
vocab = list(set([word.lower() for word in words]))

print(sentences)
print(vocab)

['india is a large democracy .', 'it is seventh largest country in the world .', 'also population wise it is the second largest among all the countries .']
['countries', 'world', 'india', 'country', 'seventh', 'also', 'the', 'largest', 'second', 'among', 'a', 'all', 'population', 'democracy', '.', 'wise', 'it', 'in', 'is', 'large']


In [147]:
token_dict = dict()
word_dict = dict()

idx = 0
for w in vocab:
    token_dict[w] = idx
    word_dict[idx] = w
    idx += 1

token_dict

{'countries': 0,
 'world': 1,
 'india': 2,
 'country': 3,
 'seventh': 4,
 'also': 5,
 'the': 6,
 'largest': 7,
 'second': 8,
 'among': 9,
 'a': 10,
 'all': 11,
 'population': 12,
 'democracy': 13,
 '.': 14,
 'wise': 15,
 'it': 16,
 'in': 17,
 'is': 18,
 'large': 19}

Encoding

In [148]:
def encode(dict, line):
    words = word_tokenize(line)
    output = [dict[word] for word in words]

    return output

In [149]:
encoded_lines = [encode(token_dict,line) for line in sentences]

encoded_lines

[[2, 18, 10, 19, 13, 14],
 [16, 18, 4, 7, 3, 17, 6, 1, 14],
 [5, 12, 15, 16, 18, 6, 8, 7, 9, 11, 6, 0, 14]]

Decoding

In [150]:
def decode(dict, line):
    output = [dict[num] for num in line]
    output_string = ' '.join(output)
    
    return output_string

In [151]:
decode(word_dict,[2, 18, 10, 19, 13, 14])

'india is a large democracy .'

## <font color="Orange">One Hot Encoding</font>

In [152]:
from sklearn.preprocessing import OneHotEncoder

In [153]:
one_hot = OneHotEncoder()

level_2_sentences = []
for sent in sentences:
    line = word_tokenize(sent)

    while len(line) < len(vocab):
        line.append(' ')
    
    level_2_sentences.append(line)

one_hot.fit(level_2_sentences)
encoded_text = one_hot.transform(level_2_sentences)

print(encoded_text.toarray())
print(one_hot.get_feature_names_out())

[[0. 1. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0. 1.
  0. 0. 1. 0. 1. 0. 1. 0. 1. 0. 1. 1. 1. 1. 1. 1. 1.]
 [0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0.
  1. 0. 1. 0. 1. 0. 1. 0. 1. 0. 1. 1. 1. 1. 1. 1. 1.]
 [1. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0.
  0. 1. 0. 1. 0. 1. 0. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1.]]
['x0_also' 'x0_india' 'x0_it' 'x1_is' 'x1_population' 'x2_a' 'x2_seventh'
 'x2_wise' 'x3_it' 'x3_large' 'x3_largest' 'x4_country' 'x4_democracy'
 'x4_is' 'x5_.' 'x5_in' 'x5_the' 'x6_ ' 'x6_second' 'x6_the' 'x7_ '
 'x7_largest' 'x7_world' 'x8_ ' 'x8_.' 'x8_among' 'x9_ ' 'x9_all' 'x10_ '
 'x10_the' 'x11_ ' 'x11_countries' 'x12_ ' 'x12_.' 'x13_ ' 'x14_ ' 'x15_ '
 'x16_ ' 'x17_ ' 'x18_ ' 'x19_ ']


## <font color="Orange">Bag of Words</font>

In [154]:
print(sentences)
print(vocab)

['india is a large democracy .', 'it is seventh largest country in the world .', 'also population wise it is the second largest among all the countries .']
['countries', 'world', 'india', 'country', 'seventh', 'also', 'the', 'largest', 'second', 'among', 'a', 'all', 'population', 'democracy', '.', 'wise', 'it', 'in', 'is', 'large']


In [157]:
import string

In [160]:
feature_vectors = []

for sent in sentences:
    vector = [0]*len(vocab)
    for word in word_tokenize(sent):
        if word not in string.punctuation:
            vector[vocab.index(word)] = len(re.findall(r'\b'+word+r'\b',sent))
        else:
            pattern = rf"\{word}"
            vector[vocab.index(word)] = len(re.findall(pattern,sent))
    
    feature_vectors.append(vector)

print(vocab)
print(feature_vectors)

['countries', 'world', 'india', 'country', 'seventh', 'also', 'the', 'largest', 'second', 'among', 'a', 'all', 'population', 'democracy', '.', 'wise', 'it', 'in', 'is', 'large']
[[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1], [0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0], [1, 0, 0, 0, 0, 1, 2, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0]]


In [161]:
pd.DataFrame(feature_vectors, columns=vocab)

Unnamed: 0,countries,world,india,country,seventh,also,the,largest,second,among,a,all,population,democracy,.,wise,it,in,is,large
0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,1,1
1,0,1,0,1,1,0,1,1,0,0,0,0,0,0,1,0,1,1,1,0
2,1,0,0,0,0,1,2,1,1,1,0,1,1,0,1,1,1,0,1,0


## <font color="Orange">Count Vectorizer</font>

In [162]:
from sklearn.feature_extraction.text import CountVectorizer

In [167]:
cov = CountVectorizer()
text_transform = cov.fit_transform(sentences)

print(text_transform.toarray())
print(cov.get_feature_names_out())

[[0 0 0 0 0 1 0 1 1 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 1 0 1 1 0 1 0 0 1 1 0 1]
 [1 1 1 1 0 0 0 0 1 1 0 1 1 1 0 2 1 0]]
['all' 'also' 'among' 'countries' 'country' 'democracy' 'in' 'india' 'is'
 'it' 'large' 'largest' 'population' 'second' 'seventh' 'the' 'wise'
 'world']


In [168]:
pd.DataFrame(text_transform.toarray(),columns=cov.get_feature_names_out())

Unnamed: 0,all,also,among,countries,country,democracy,in,india,is,it,large,largest,population,second,seventh,the,wise,world
0,0,0,0,0,0,1,0,1,1,0,1,0,0,0,0,0,0,0
1,0,0,0,0,1,0,1,0,1,1,0,1,0,0,1,1,0,1
2,1,1,1,1,0,0,0,0,1,1,0,1,1,1,0,2,1,0


removing the stop words

In [169]:
cov = CountVectorizer(stop_words='english')
text_transform = cov.fit_transform(sentences)

print(text_transform.toarray())
print(cov.get_feature_names_out())

[[0 0 1 1 1 0 0 0 0 0 0]
 [0 1 0 0 0 1 0 0 1 0 1]
 [1 0 0 0 0 1 1 1 0 1 0]]
['countries' 'country' 'democracy' 'india' 'large' 'largest' 'population'
 'second' 'seventh' 'wise' 'world']


In [170]:
pd.DataFrame(text_transform.toarray(),columns=cov.get_feature_names_out())

Unnamed: 0,countries,country,democracy,india,large,largest,population,second,seventh,wise,world
0,0,0,1,1,1,0,0,0,0,0,0
1,0,1,0,0,0,1,0,0,1,0,1
2,1,0,0,0,0,1,1,1,0,1,0


## <font color="Orange">Character level Encoding</font>

In [171]:
cov = CountVectorizer(analyzer='char')
text_transform = cov.fit_transform(sentences)

print(text_transform.toarray())
print(cov.get_feature_names_out())

[[ 5  1  4  2  2  2  1  0  3  1  1  1  1  0  2  1  0  0  0  0  1]
 [ 8  1  1  1  1  4  1  2  3  2  0  3  2  0  3  3  5  1  1  1  1]
 [12  1  5  2  1  6  2  2  5  5  1  4  6  2  2  6  6  2  0  1  0]]
[' ' '.' 'a' 'c' 'd' 'e' 'g' 'h' 'i' 'l' 'm' 'n' 'o' 'p' 'r' 's' 't' 'u'
 'v' 'w' 'y']


In [172]:
pd.DataFrame(text_transform.toarray(),columns=cov.get_feature_names_out())

Unnamed: 0,Unnamed: 1,.,a,c,d,e,g,h,i,l,...,n,o,p,r,s,t,u,v,w,y
0,5,1,4,2,2,2,1,0,3,1,...,1,1,0,2,1,0,0,0,0,1
1,8,1,1,1,1,4,1,2,3,2,...,3,2,0,3,3,5,1,1,1,1
2,12,1,5,2,1,6,2,2,5,5,...,4,6,2,2,6,6,2,0,1,0
