# Generate Keywords Ground Truth
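This notebook generates the ground-truth dataset of search keywords: for each user input (a question from the SQuAD v2.0 training CSV), it extracts candidate keywords with YAKE and records the first candidate as the search keyword.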

In [None]:
from multiprocessing import Process

import const
import numpy
import pandas as pd
import traceback
import yake

In [None]:
# Load the SQuAD v2.0 training CSV from the configured datasets folder.
squad_csv_path = const.DATASETS_FOLDER + "squad-train-v2.0.csv"
squad_ds = pd.read_csv(squad_csv_path)

In [None]:
def _extract_batch_keywords(questions):
    """Return, for each question, its YAKE keyword strings in YAKE's order.

    This is the unit of work sent to worker processes, so it only takes and
    returns plain lists (cheap to pickle) and never touches the DataFrame.
    """
    # n=16 is YAKE's maximum n-gram size, i.e. a keyword may span 16 words.
    kw_extr = yake.KeywordExtractor(n=16)
    return [
        [item[0] for item in kw_extr.extract_keywords(question)]
        for question in questions
    ]


def generate_keywords(squad_ds, start_row, end_row, batch_size = 64):
    """Fill in YAKE keywords for rows ``start_row`` .. ``end_row - 1``.

    For every row in the range, the "question" cell is run through YAKE.
    The first keyword is stored in the "search keyword" column and the
    string form of the full keyword list in "possible_keywords". Rows for
    which YAKE finds nothing are reported on stdout; their "search keyword"
    cell is left unset and "possible_keywords" becomes "[]".

    A range of at most ``batch_size`` rows is processed in the current
    process. A larger range is cut into ``batch_size``-row batches that are
    extracted in a process pool. Child processes work on copies of their
    arguments, so the workers only *return* keywords and every write to
    ``squad_ds`` happens here, in the calling process.

    NOTE: the pool pickles the worker by reference, so child processes must
    be able to resolve it (this works with the "fork" start method when the
    function is defined in a notebook cell).

    Args:
        squad_ds: DataFrame with a "question" column, indexed by the integer
            row labels used in the range. It is modified in place.
        start_row: First row label to process (inclusive).
        end_row: Row label at which to stop (exclusive).
        batch_size: Maximum number of rows handled by one extraction task.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 and there are rows
            to process.
    """
    rows = range(start_row, end_row)
    if not rows:
        return
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1, got " + str(batch_size))

    batches = [
        rows[offset:offset + batch_size]
        for offset in range(0, len(rows), batch_size)
    ]
    question_batches = [
        [squad_ds.loc[row, "question"] for row in batch] for batch in batches
    ]

    if len(batches) == 1:
        # Small range: not worth paying for process start-up.
        keyword_batches = [_extract_batch_keywords(question_batches[0])]
    else:
        # Imported locally so this cell does not depend on the import cell.
        from concurrent.futures import ProcessPoolExecutor

        # map() yields results in submission order, so they line up with
        # `batches` below.
        with ProcessPoolExecutor() as pool:
            keyword_batches = list(
                pool.map(_extract_batch_keywords, question_batches)
            )

    for batch, questions, keyword_lists in zip(
        batches, question_batches, keyword_batches
    ):
        for row, question, keywords in zip(batch, questions, keyword_lists):
            if keywords:
                squad_ds.loc[row, "search keyword"] = keywords[0]
            else:
                print(f"No keywords extracted at row {row} (question: {question})")
            squad_ds.loc[row, "possible_keywords"] = str(keywords)

In [None]:
# Preview the first rows of the loaded dataset before keywords are generated.
squad_ds.head()

In [None]:
# Generate keywords for every row of the dataset, 512 rows per batch.
end = squad_ds.shape[0]
generate_keywords(squad_ds, start_row=0, end_row=end, batch_size=512)

In [None]:
# Preview the first rows again to inspect the frame after the generate_keywords call.
squad_ds.head()