# Count Vectorizer with Character Level Encoding

In [4]:
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
text_data = ["This is a sample sentence", "Another example sentence", "One more text sentence"]

vec = CountVectorizer(analyzer = 'char')#we will see how many times that character appears
encoded_data = vec.fit_transform(text_data)

print(vec.vocabulary_)
print(encoded_data.toarray())

df = DataFrame(encoded_data.toarray() , columns = vec.vocabulary_)

df

{'t': 13, 'h': 4, 'i': 5, 's': 12, ' ': 0, 'a': 1, 'm': 7, 'p': 10, 'l': 6, 'e': 3, 'n': 8, 'c': 2, 'o': 9, 'r': 11, 'x': 14}
[[4 2 1 4 1 2 1 1 2 0 1 0 4 2 0]
 [2 2 1 6 1 0 1 1 3 1 1 1 1 2 1]
 [3 0 1 6 0 0 0 1 3 2 0 1 1 3 1]]


Unnamed: 0,t,h,i,s,Unnamed: 5,a,m,p,l,e,n,c,o,r,x
0,4,2,1,4,1,2,1,1,2,0,1,0,4,2,0
1,2,2,1,6,1,0,1,1,3,1,1,1,1,2,1
2,3,0,1,6,0,0,0,1,3,2,0,1,1,3,1


# Building Code from Scratch

In [28]:
text_data = ["This is a sample sentence", "Another example sentence", "One more text sentence", "this"]

characters = list(set(''.join(text_data)))
print(len(characters))



18


In [29]:
chr_to_int = {}
for indx,i in enumerate(characters):
    chr_to_int[i] = indx
chr_to_int

{'x': 0,
 'i': 1,
 'e': 2,
 'c': 3,
 'p': 4,
 'l': 5,
 'n': 6,
 'm': 7,
 'O': 8,
 ' ': 9,
 'h': 10,
 'a': 11,
 'o': 12,
 'A': 13,
 'T': 14,
 'r': 15,
 's': 16,
 't': 17}

In [30]:
# we can do the exact same thing in one line 
chr_to_int = {i : indx for indx, i in enumerate(characters)}
chr_to_int

{'x': 0,
 'i': 1,
 'e': 2,
 'c': 3,
 'p': 4,
 'l': 5,
 'n': 6,
 'm': 7,
 'O': 8,
 ' ': 9,
 'h': 10,
 'a': 11,
 'o': 12,
 'A': 13,
 'T': 14,
 'r': 15,
 's': 16,
 't': 17}

In [37]:
encoded_data = []
for sent in text_data:
    sent_encoded = []
    for char in sent:
        sent_encoded.append(chr_to_int[char])
    encoded_data.append(sent_encoded)
print(encoded_data)

[[14, 10, 1, 16, 9, 1, 16, 9, 11, 9, 16, 11, 7, 4, 5, 2, 9, 16, 2, 6, 17, 2, 6, 3, 2], [13, 6, 12, 17, 10, 2, 15, 9, 2, 0, 11, 7, 4, 5, 2, 9, 16, 2, 6, 17, 2, 6, 3, 2], [8, 6, 2, 9, 7, 12, 15, 2, 9, 17, 2, 0, 17, 9, 16, 2, 6, 17, 2, 6, 3, 2], [17, 10, 1, 16]]


In [40]:
# we will see how we will write the above one in list comprehension
encode_data = [[chr_to_int[char] for char in sent]for sent in text_data]
print(encode_data)

[[14, 10, 1, 16, 9, 1, 16, 9, 11, 9, 16, 11, 7, 4, 5, 2, 9, 16, 2, 6, 17, 2, 6, 3, 2], [13, 6, 12, 17, 10, 2, 15, 9, 2, 0, 11, 7, 4, 5, 2, 9, 16, 2, 6, 17, 2, 6, 3, 2], [8, 6, 2, 9, 7, 12, 15, 2, 9, 17, 2, 0, 17, 9, 16, 2, 6, 17, 2, 6, 3, 2], [17, 10, 1, 16]]


# This is way to encode and decode data which computer understands

In [42]:
text_data = ["This is a sample sentence", "Another example sentence", "One more text sentence", "this"]

characters = list(set(''.join(text_data)))
print(len(characters))

chr_to_int = {i    : indx for indx, i in enumerate(characters)}
int_to_chr = {indx : i for indx, i in enumerate(characters)}

encode_data = [[chr_to_int[char] for char in sent]for sent in text_data]
print(encode_data)

decode_data = [''.join([int_to_chr[char] for char in sent]) for sent in encode_data]
print(decode_data)

18
[[14, 10, 1, 16, 9, 1, 16, 9, 11, 9, 16, 11, 7, 4, 5, 2, 9, 16, 2, 6, 17, 2, 6, 3, 2], [13, 6, 12, 17, 10, 2, 15, 9, 2, 0, 11, 7, 4, 5, 2, 9, 16, 2, 6, 17, 2, 6, 3, 2], [8, 6, 2, 9, 7, 12, 15, 2, 9, 17, 2, 0, 17, 9, 16, 2, 6, 17, 2, 6, 3, 2], [17, 10, 1, 16]]
['This is a sample sentence', 'Another example sentence', 'One more text sentence', 'this']
