# Query completion

In [None]:
import json
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.optimizers import Adam

In [None]:
# Read the Cranfield queries and split them into parallel lists of ids and
# query texts.
# Forward slashes keep the relative path valid on both Windows and POSIX (the
# previous backslash form also contained the invalid escape sequence "\c"),
# and the context manager closes the file handle once parsing is done.
with open("./cranfield/cran_queries.json", "r", encoding="utf-8") as queries_file:
    queries_json = json.load(queries_file)
query_ids = [item["query number"] for item in queries_json]
queries = [item["query"] for item in queries_json]


In [None]:
# Merge every query into one text and split it into sentences on '.'.
# Queries are joined with a space so the last word of one query is never fused
# with the first word of the next when a query has no trailing period.
# Whitespace-only fragments produced by the split are dropped.
query_merged = ' '.join(queries)
queries_sent = [
    sentence.strip()
    for sentence in query_merged.split('.')
    if sentence.strip()
]
queries_sent

## Using a neural network model

In [None]:
# Fit a word-level tokenizer on the query sentences; this builds the
# word -> integer index vocabulary used by every later cell.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(queries_sent)
# Show the learned vocabulary mapping for inspection.
print(tokenizer.word_index)

In [None]:
# Build the training prefixes: for each sentence, every leading slice of its
# token ids with at least two tokens (the last token of a slice is the label).
input_sequences = []
for sentence in queries_sent:
    token_ids = tokenizer.texts_to_sequences([sentence])[0]
    input_sequences.extend(
        token_ids[:end] for end in range(2, len(token_ids) + 1)
    )

# Left-pad every prefix to the length of the longest one so they stack into
# a single 2-D array.
max_seq_len = max(len(prefix) for prefix in input_sequences)
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_seq_len, padding='pre')
)

# Predictors are all columns but the last; the last column is the label.
xs = input_sequences[:, :-1]
labels = input_sequences[:, -1]

# One class per vocabulary entry plus one extra slot, then one-hot encode
# the labels for the categorical cross-entropy loss.
total_words = len(tokenizer.word_index) + 1
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)
    

In [None]:
# Next-word classifier: 16-dimensional embeddings -> bidirectional LSTM with
# 20 units per direction -> softmax over the whole vocabulary.
model = Sequential([
    Embedding(total_words, 16, input_length=max_seq_len - 1),
    Bidirectional(LSTM(20)),
    Dense(total_words, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Train on the (prefix, next word) pairs for 200 epochs, logging progress,
# and keep the returned training history.
history = model.fit(
    xs,
    ys,
    epochs=200,
    verbose=1,
)

In [None]:
# Invert the tokenizer vocabulary (index -> word) so that a predicted class
# index can be turned back into its word.
reverse_word_index = {
    index: word for word, index in tokenizer.word_index.items()
}
reverse_word_index

In [None]:
def complete_query(incomplete_query, next_n_words=1):
    """Extend a partial query with words predicted by the trained model.

    The text is tokenized, left-padded to the model's input length, and the
    most probable next word is appended; this is repeated ``next_n_words``
    times, feeding each prediction back in as context.

    Args:
        incomplete_query: Seed text to complete.
        next_n_words: Maximum number of words to append.

    Returns:
        The seed text followed by the predicted words, separated by spaces.
        Fewer than ``next_n_words`` words are appended if the model predicts
        an index that has no word (such as the padding index 0).
    """
    completed = incomplete_query

    for _ in range(next_n_words):
        token_ids = tokenizer.texts_to_sequences([completed])[0]
        padded = pad_sequences([token_ids], maxlen=max_seq_len - 1, padding='pre')

        # ``Sequential.predict_classes`` was removed in TensorFlow 2.6; take
        # the arg-max over the softmax output instead.
        probabilities = model.predict(padded, verbose=0)
        predicted_index = int(np.argmax(probabilities, axis=-1)[0])

        # Stop rather than raise KeyError when the predicted index has no
        # vocabulary word.
        out_word = reverse_word_index.get(predicted_index)
        if out_word is None:
            break
        completed += " " + out_word

    return completed

In [None]:
# Predict a single next word for the seed query.
complete_query("experimental studies of creep", next_n_words=1)

In [None]:
# Predict the next ten words for the same seed query.
complete_query("experimental studies of creep", next_n_words=10)