In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [3]:
# Tiny toy corpus used to build the vocabulary.
sentences = [
    "I love my dog",
    "I love my cat",
    "you love my cat!",
]

# num_words caps how many of the most frequent words are used when texts are
# later converted to sequences; this corpus has far fewer distinct words than
# 100, so the cap has no effect here.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)

# word_index maps every word seen during fitting to an integer id. Words are
# lower-cased and punctuation is stripped ('cat!' is indexed as 'cat').
word_index = tokenizer.word_index
print(word_index)

{'love': 1, 'my': 2, 'i': 3, 'cat': 4, 'dog': 5, 'you': 6}


In [6]:
# Same toy corpus as before, plus one longer sentence so the resulting
# sequences have different lengths.
sentences = [
    "I love my dog",
    "I love my cat",
    "you love my cat!",
    "Do you think my dog is amazing?",
]

# Re-fit a fresh tokenizer so the vocabulary includes the new sentence.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# Replace every word by its id from word_index; one list of ids per sentence.
sequences = tokenizer.texts_to_sequences(sentences)

print(word_index)
print(sequences)

{'my': 1, 'love': 2, 'i': 3, 'dog': 4, 'cat': 5, 'you': 6, 'do': 7, 'think': 8, 'is': 9, 'amazing': 10}
[[3, 2, 1, 4], [3, 2, 1, 5], [6, 2, 1, 5], [7, 6, 8, 1, 4, 9, 10]]


In [7]:
# Sentences containing words ('really', 'loves', 'manatee') that the tokenizer
# fitted in the previous cell has never seen.
test_data = [
    "i really love my dog",
    "my dog loves my manatee",
]

# Without an OOV token, unknown words are simply left out of the sequences.
test_seq = tokenizer.texts_to_sequences(test_data)
print(test_seq)

[[3, 2, 1, 4], [1, 4, 1]]


Words that were not seen during fitting are not recognized: they are silently dropped from the resulting sequences.

In [9]:
# Reserve a dedicated out-of-vocabulary token: unknown words are now encoded
# with the id of '<OOV>' instead of being dropped, so sequence length is kept.
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")

# Relies on `sentences` and `test_data` defined in earlier cells.
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
test_seq = tokenizer.texts_to_sequences(test_data)

print(word_index)
print(test_seq)

{'<OOV>': 1, 'my': 2, 'love': 3, 'i': 4, 'dog': 5, 'cat': 6, 'you': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}
[[4, 1, 3, 2, 5], [2, 5, 1, 2, 1]]


## Padding
Padding brings all sequences to the same length, so that they can be stored together in a single rectangular array.


In [10]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [15]:
# Toy corpus with sentences of different lengths.
sentences = [
    "I love my dog",
    "I love my cat",
    "you love my cat!",
    "Do you think my dog is amazing?",
]

# Tokenizer with an out-of-vocabulary token; num_words caps how many of the
# most frequent words are used when converting texts to sequences.
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(sentences)

# padding='post' appends the zeros after the words instead of in front of them.
# No maxlen is given, so every row is padded to the length of the longest
# sequence; passing maxlen would additionally cut longer sentences.
padded = pad_sequences(sequences, padding="post")

print(word_index)
print(sequences)
print(padded)

{'<OOV>': 1, 'my': 2, 'love': 3, 'i': 4, 'dog': 5, 'cat': 6, 'you': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}
[[4, 3, 2, 5], [4, 3, 2, 6], [7, 3, 2, 6], [8, 7, 9, 2, 5, 10, 11]]
[[ 4  3  2  5  0  0  0]
 [ 4  3  2  6  0  0  0]
 [ 7  3  2  6  0  0  0]
 [ 8  7  9  2  5 10 11]]


## Repeating the code from the notebook presentation

In [None]:
#import tensorflow as tf
from tensorflow import keras
#import ... Tokenizer
#import ..... pad_sequences

In [41]:
# Toy corpus with sentences of different lengths.
sentences = [
    "I love my dog",
    "I love my cat",
    "you love my cat!",
    "Do you think my dog is amazing?",
]

# Fit the vocabulary, reserving '<OOV>' for words not seen during fitting.
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# Encode the sentences, then pad them to a fixed length of 7. With the default
# padding mode the zeros are placed in front of the words.
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, maxlen=7)

print("\nWord Index = ", word_index)
print("\nSequences = ", sequences)
print("\nPadded Sequences = ", padded)




Word Index =  {'<OOV>': 1, 'my': 2, 'love': 3, 'i': 4, 'dog': 5, 'cat': 6, 'you': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}

Sequences =  [[4, 3, 2, 5], [4, 3, 2, 6], [7, 3, 2, 6], [8, 7, 9, 2, 5, 10, 11]]

Padded Sequences =  [[ 0  0  0  4  3  2  5]
 [ 0  0  0  4  3  2  6]
 [ 0  0  0  7  3  2  6]
 [ 8  7  9  2  5 10 11]]


In [43]:
# Sentences with words the tokenizer from the previous cell was not fitted on.
test_data = [
    "i really love my dog",
    "my dog loves my manatee",
]

# Unknown words are encoded with the id of the '<OOV>' token.
test_seq = tokenizer.texts_to_sequences(test_data)

# maxlen=8 is longer than both test sequences, so each row gets leading zeros.
# Note that this overwrites `padded` from the previous cell.
padded = pad_sequences(test_seq, maxlen=8)

print(test_seq)
print(padded)


[[4, 1, 3, 2, 5], [2, 5, 1, 2, 1]]
[[0 0 0 4 1 3 2 5]
 [0 0 0 2 5 1 2 1]]


## Sarcasm database processing


In [59]:
import json


# Load the sarcasm headlines dataset. The encoding is given explicitly because
# JSON text is UTF-8, while open() would otherwise fall back to the
# platform-dependent default encoding and could fail or mis-decode non-ASCII
# characters in the headlines.
with open('sarcasm_data/sarcasm.json', 'r', encoding='utf-8') as f:
    datastore = json.load(f)

# Split the records into three parallel lists, keeping the file order.
sentences = []
labels = []
urls = []
for item in datastore:
    sentences.append(item['headline'])
    labels.append(item['is_sarcastic'])
    urls.append(item['article_link'])

# No num_words limit here: the vocabulary is built from every headline, with
# '<OOV>' reserved for words that were not seen during fitting.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print(len(word_index))  # vocabulary size

# Encode every headline and pad with trailing zeros (padding='post') up to the
# length of the longest headline.
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, padding='post')

# Show one example next to its encoded form, then the shape of the whole
# padded array (number of headlines, padded length).
print(sentences[0])
print(padded[0])
print(padded.shape)

29657
former versace store clerk sues over secret 'black code' for minority shoppers
[  308 15115   679  3337  2298    48   382  2576 15116     6  2577  8434
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]
(26709, 40)
