In [4]:
import numpy as np
import pandas as pd

In [5]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Embedding,LSTM,Bidirectional
from tensorflow.keras.utils import to_categorical

In [6]:
# Load the cleaned captions from a local pickle file.
# NOTE(review): later cells call data.values() and slice each value, so this is
# presumably a dict mapping some id to a list of caption strings — confirm.
# Security: unpickling can execute arbitrary code; only load pickles you trust.
data = pd.read_pickle('desc_clean.pickle')

In [24]:
# Flatten the caption collection into one list, keeping at most the first two
# captions of every entry.
text = [
    caption
    for caption_list in data.values()
    for caption in caption_list[:2]
]
len(text)

16182

In [25]:
# Fit a word-level vocabulary on the selected captions.
# No oov_token is passed here.
# NOTE(review): without one, Keras is expected to skip words it has not seen when
# encoding new text — keep that in mind for inference prompts.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text)

In [26]:
vocab_size = len(tokenizer.word_index) + 1    # add 1 for the reserved padding index 0 (word indices start at 1; no oov_token is configured)
print(vocab_size)

5890


In [27]:
# Encode every caption as a list of integer word indices using the fitted tokenizer.
text_seq = tokenizer.texts_to_sequences(text)

In [28]:
# Preview the first ten encoded captions.
text_seq[:10]

[[36, 1, 76, 154, 5, 115, 45, 425, 8, 392, 1, 22, 3423, 535],
 [13, 291, 58, 209, 121],
 [10, 6, 4, 783, 6, 15, 370],
 [10, 6, 4, 1816, 228, 6, 32, 9, 158, 89, 3, 2, 144],
 [37, 13, 150, 1, 569, 95, 1, 43, 8, 536, 1157, 9, 53, 210, 1, 1026],
 [37, 13, 5, 49, 1, 43, 8, 55, 536, 1157],
 [7, 551, 3, 151, 27, 23, 6, 95, 46, 116],
 [7, 551, 3, 2, 151, 17, 670, 12, 6, 5, 1082, 1331],
 [7, 1, 22, 81, 96, 1438, 18, 169],
 [7, 371, 22, 81, 96, 4, 284]]

In [29]:
# Expand each encoded caption into all of its leading prefixes of length >= 2,
# e.g. [a, b, c] -> [a, b], [a, b, c]. The last token of each prefix is the
# word to predict from the tokens before it.
input_seq = [
    tokens[:end]
    for tokens in text_seq
    for end in range(2, len(tokens) + 1)
]
        

In [30]:
# Preview the first 13 prefixes (in the output shown, all come from the first caption).
input_seq[:13]

[[36, 1],
 [36, 1, 76],
 [36, 1, 76, 154],
 [36, 1, 76, 154, 5],
 [36, 1, 76, 154, 5, 115],
 [36, 1, 76, 154, 5, 115, 45],
 [36, 1, 76, 154, 5, 115, 45, 425],
 [36, 1, 76, 154, 5, 115, 45, 425, 8],
 [36, 1, 76, 154, 5, 115, 45, 425, 8, 392],
 [36, 1, 76, 154, 5, 115, 45, 425, 8, 392, 1],
 [36, 1, 76, 154, 5, 115, 45, 425, 8, 392, 1, 22],
 [36, 1, 76, 154, 5, 115, 45, 425, 8, 392, 1, 22, 3423],
 [36, 1, 76, 154, 5, 115, 45, 425, 8, 392, 1, 22, 3423, 535]]

In [31]:
# Length of the longest prefix; every sequence is padded to this width later.
max_length = max(map(len, input_seq))
print(max_length)

29


In [32]:
def create_seq(text, tokenizer, max_len=None, num_classes=None):
    """Turn raw captions into (inputs, one-hot targets) for next-word training.

    Each caption is encoded with ``tokenizer`` and expanded into every leading
    prefix of length >= 2. Prefixes are left-padded to ``max_len``; the last
    column becomes the target word and the remaining columns the model input.

    Args:
        text: Iterable of caption strings.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
        max_len: Padded prefix width. Defaults to the notebook-level
            ``max_length`` when omitted.
        num_classes: Width of the one-hot target. Defaults to the
            notebook-level ``vocab_size`` when omitted.

    Returns:
        Tuple ``(x, y)``: ``x`` has ``max_len - 1`` columns of word indices,
        ``y`` is the one-hot encoding of the final token of each prefix.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if max_len is None:
        max_len = max_length
    if num_classes is None:
        num_classes = vocab_size

    prefixes = [
        tokens[:end]
        for tokens in tokenizer.texts_to_sequences(text)
        for end in range(2, len(tokens) + 1)
    ]

    # Pre-padding keeps the target word in the last column of every row.
    padded = pad_sequences(prefixes, padding='pre', maxlen=max_len)

    x = padded[:, :-1]
    y = to_categorical(padded[:, -1], num_classes=num_classes)

    return x, y

In [33]:
def data_generator(texts, tokenzier, batch_size=32):
    """Endlessly yield training batches built from slices of ``texts``.

    Captions are taken ``batch_size`` at a time and passed to ``create_seq``;
    after the last slice the generator starts over from the beginning. Note
    that a batch holds every prefix of ``batch_size`` captions, so the number
    of rows per batch varies.

    Args:
        texts: Sequence of caption strings (must support ``len`` and slicing).
        tokenzier: Fitted tokenizer forwarded to ``create_seq``. The misspelled
            name is kept so existing keyword callers do not break.
        batch_size: Number of captions per yielded batch.

    Yields:
        Whatever ``create_seq`` returns for each slice of captions.

    Raises:
        ValueError: If ``batch_size`` is not positive or ``texts`` is empty;
            either case would otherwise loop forever without yielding.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    if len(texts) == 0:
        raise ValueError("texts must contain at least one caption")

    while True:
        for start in range(0, len(texts), batch_size):
            yield create_seq(texts[start:start + batch_size], tokenzier)

Model

In [34]:
# Next-word predictor: embed the (max_length - 1) context tokens, summarise them
# with a bidirectional LSTM, then score every vocabulary entry with a softmax.
model = Sequential([
    Embedding(vocab_size, 64, input_length=max_length-1),
    Bidirectional(LSTM(20)),
    Dense(vocab_size, activation='softmax'),
])

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 28, 64)            376960    
                                                                 
 bidirectional_1 (Bidirectio  (None, 40)               13600     
 nal)                                                            
                                                                 
 dense_1 (Dense)             (None, 5890)              241490    
                                                                 
Total params: 632,050
Trainable params: 632,050
Non-trainable params: 0
_________________________________________________________________


In [35]:
# One-hot targets from create_seq pair with categorical cross-entropy.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

In [61]:
# data_generator yields one batch per 32 *captions* (each batch holds all the
# prefixes of those captions), so one full pass over the data is
# ceil(len(text) / 32) batches. The previous len(input_seq)/32 counted prefixes
# instead, was a float, and made every "epoch" loop over the data several times.
captions_per_batch = 32  # must match the batch size used inside data_generator
steps_per_epoch = -(-len(text) // captions_per_batch)  # integer ceiling division

gen = data_generator(text, tokenizer)
model.fit(gen, epochs=100, steps_per_epoch=steps_per_epoch)

Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x7f4d39cb0d50>

In [93]:
# Greedy text generation: repeatedly feed the sentence so far to the model and
# append the most probable next word.
sentence = 'football'
next_words = 15

for _ in range(next_words):
    # Encode and left-pad the current sentence to the model's input width.
    sent = tokenizer.texts_to_sequences([sentence])
    sent = pad_sequences(sent, padding='pre', maxlen=max_length-1)
    predicted = int(np.argmax(model.predict(sent)))

    # index_word is the tokenizer's reverse map (index -> word); a direct lookup
    # replaces scanning the whole word_index on every step.
    output_word = tokenizer.index_word.get(predicted, "")
    if not output_word:
        # Index 0 is padding and maps to no word. The input would not change on
        # the next step, so the same prediction would repeat — stop here.
        break

    sentence += " " + output_word

print(sentence)

football player in blue jersey is challenging the goal in soccer ball while other players are


In [None]:
# (Scratch cell left empty: the stray `ss` here was an undefined name and would
# raise NameError on Restart & Run All.)