# Building a vocabulary
Using word tokenization to build a vocabulary.


In [1]:
sentence = """Thomas Jefferson began building monticello at the age of 26."""
sentence.split()

['Thomas',
 'Jefferson',
 'began',
 'building',
 'monticello',
 'at',
 'the',
 'age',
 'of',
 '26.']

In [2]:
str.split(sentence)

['Thomas',
 'Jefferson',
 'began',
 'building',
 'monticello',
 'at',
 'the',
 'age',
 'of',
 '26.']

In [13]:
# Tokenization of the string, building a vocabulary and transformation into csv format
import numpy as np
token_sequence = str.split(sentence)
vocab = sorted(set(token_sequence))
",".join(vocab)

'26.,Jefferson,Thomas,age,at,began,building,monticello,of,the'

In [17]:
token_sequence

['Thomas',
 'Jefferson',
 'began',
 'building',
 'monticello',
 'at',
 'the',
 'age',
 'of',
 '26.']

In [18]:
vocab

['26.',
 'Jefferson',
 'Thomas',
 'age',
 'at',
 'began',
 'building',
 'monticello',
 'of',
 'the']

In [16]:
# One hot encoding of the tokenized string
num_tokens = len(token_sequence)
vocab_size = len(vocab)
print("Number of tokens: {}".format(num_tokens))
print("Vocabulary-Size: {}".format(vocab_size))
one_hot_vectors = np.zeros((num_tokens, vocab_size), int)

# Go over 
for i, word in enumerate(token_sequence):
    one_hot_vectors[i, vocab.index(word)] = 1

Number of tokens: 10
Vocabulary-Size: 10


In [15]:
one_hot_vectors

array([[0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [19]:
# Concatenate the vocabulary
' '.join(vocab)

'26. Jefferson Thomas age at began building monticello of the'

In [20]:
import pandas as pd

In [23]:
pd.DataFrame(one_hot_vectors, columns = vocab)

Unnamed: 0,26.,Jefferson,Thomas,age,at,began,building,monticello,of,the
0,0,0,1,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,1,0,0
5,0,0,0,0,1,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,1
7,0,0,0,1,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,1,0
9,1,0,0,0,0,0,0,0,0,0


In [24]:
# Bag of words from a sentence
sentence_bow = {}
for token in sentence.split():
    sentence_bow[token] = 1

sorted(sentence_bow.items())

[('26.', 1),
 ('Jefferson', 1),
 ('Thomas', 1),
 ('age', 1),
 ('at', 1),
 ('began', 1),
 ('building', 1),
 ('monticello', 1),
 ('of', 1),
 ('the', 1)]

In [25]:
df = pd.DataFrame(pd.Series(dict([(token, 1) for token in sentence.split()])), columns=['sent']).T

In [26]:
df

Unnamed: 0,Thomas,Jefferson,began,building,monticello,at,the,age,of,26.
sent,1,1,1,1,1,1,1,1,1,1


In [28]:
# Bigger example, creation for a bag-of-words (bow) for specific sentences
sentences = """Thomas Jefferson began building Monticello at the age of 26. \n"""
sentences += """Construction was done mostly by local masons and carpenters.\n"""
sentences += """He moved into the South Pavilion n 1770.\n"""
sentences += """Turning Monticello into a neoclassical masterpiece was Jefferson's obsession."""

# Usually better to use .splitlines()
corpus = {}
for i, sent in enumerate(sentences.split("\n")):
    corpus['sent_{}'.format(i)] = dict((tok, 1) for tok in sent.split())

df = pd.DataFrame.from_records(corpus).fillna(0).astype(int).T
df[df.columns[:20]]

Unnamed: 0,Thomas,Jefferson,began,building,Monticello,at,the,age,of,26.,Construction,was,done,mostly,by,local,masons,and,carpenters.,He
sent_0,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0
sent_1,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,0
sent_2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
sent_3,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [29]:
# All tokens
df.columns

Index(['Thomas', 'Jefferson', 'began', 'building', 'Monticello', 'at', 'the',
       'age', 'of', '26.', 'Construction', 'was', 'done', 'mostly', 'by',
       'local', 'masons', 'and', 'carpenters.', 'He', 'moved', 'into', 'South',
       'Pavilion', 'n', '1770.', 'Turning', 'a', 'neoclassical', 'masterpiece',
       'Jefferson's', 'obsession.'],
      dtype='object')

In [32]:
from collections import Counter

In [33]:
Counter(sentences.split())

Counter({'Thomas': 1,
         'Jefferson': 1,
         'began': 1,
         'building': 1,
         'Monticello': 2,
         'at': 1,
         'the': 2,
         'age': 1,
         'of': 1,
         '26.': 1,
         'Construction': 1,
         'was': 2,
         'done': 1,
         'mostly': 1,
         'by': 1,
         'local': 1,
         'masons': 1,
         'and': 1,
         'carpenters.': 1,
         'He': 1,
         'moved': 1,
         'into': 2,
         'South': 1,
         'Pavilion': 1,
         'n': 1,
         '1770.': 1,
         'Turning': 1,
         'a': 1,
         'neoclassical': 1,
         'masterpiece': 1,
         "Jefferson's": 1,
         'obsession.': 1})

In [38]:
# First vector-space modell (VSM)
df = df.T
df

Unnamed: 0,sent_0,sent_1,sent_2,sent_3
Thomas,1,0,0,0
Jefferson,1,0,0,0
began,1,0,0,0
building,1,0,0,0
Monticello,1,0,0,1
at,1,0,0,0
the,1,0,1,0
age,1,0,0,0
of,1,0,0,0
26.,1,0,0,0


In [44]:
# Calculating the distance between the sentences to get an understanding of how alike they are
# explicitly: print("sent_0 <> sent_1: {}".format(df.sent_0.dot(df.sent_1)))

# Compare distances between each sentence
cols = df.columns
for outter_indx in range(len(cols)):
    
    for inner_indx in range(outter_indx+1, len(cols)):
            first_column = cols[outter_indx]
            second_column = cols[inner_indx]
            print("sent_{0} <> sent_{1}: {2}".format(first_column, second_column, df[first_column].dot(df[second_column])))
    

sent_sent_0 <> sent_sent_1: 0
sent_sent_0 <> sent_sent_2: 1
sent_sent_0 <> sent_sent_3: 1
sent_sent_1 <> sent_sent_2: 0
sent_sent_1 <> sent_sent_3: 1
sent_sent_2 <> sent_sent_3: 1
