# Generate Keywords Ground Truth
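This notebook generates the ground-truth dataset of search keywords: for each question in the SQuAD 2.0 training set (used here as the user input), it extracts candidate keywords with YAKE.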

In [1]:
from multiprocessing import *

import const
import ctypes
import numpy
import pandas as pd
import traceback
import yake

In [2]:
# Load the SQuAD 2.0 training CSV from the folder configured in `const.DATASETS_FOLDER`.
temp_squad_ds = pd.read_csv(
    filepath_or_buffer=const.DATASETS_FOLDER + "squad-train-v2.0.csv"
)

In [3]:
# Wrap the loaded DataFrame in a multiprocessing `Value` typed as a generic
# Python object (`ctypes.py_object`).
# NOTE(review): within this notebook `k` is only printed in the next cell and is
# not passed to `generate_keywords` -- looks like a leftover experiment; confirm
# before relying on it.
# NOTE(review): `py_object` holds a reference to an in-process Python object, so
# this presumably does not make the frame usable from another process -- verify.
k = Value(ctypes.py_object)
k.value = temp_squad_ds

In [4]:
# Show the object stored in `k` as plain text to check that it reads back.
print(k.value)

        Unnamed: 0     source    topic  \
0                0  Squad 2.0  Beyoncé   
1                1  Squad 2.0  Beyoncé   
2                2  Squad 2.0  Beyoncé   
3                3  Squad 2.0  Beyoncé   
4                4  Squad 2.0  Beyoncé   
...            ...        ...      ...   
130314      130314  Squad 2.0   Matter   
130315      130315  Squad 2.0   Matter   
130316      130316  Squad 2.0   Matter   
130317      130317  Squad 2.0   Matter   
130318      130318  Squad 2.0   Matter   

                                                paragraph  \
0       Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...   
1       Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...   
2       Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...   
3       Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...   
4       Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...   
...                                                   ...   
130314  The term "matter" is used throughout physics i...   
130315 

In [5]:
def generate_keywords(squad_ds_ns, start_row, end_row, batch_size = 64):
    """Extract YAKE keywords from the questions in rows [start_row, end_row).

    The frame is read from ``squad_ds_ns.df`` and the result is written back to
    the same attribute. For every processed row the "question" text is passed
    to a YAKE extractor (``n=16``); the first keyword returned is stored in the
    "keyword" column and the string form of the list of all returned keywords
    in "possible_keywords". Rows for which nothing is extracted are reported
    on stdout and their "keyword" cell is not assigned.

    Ranges longer than ``batch_size`` are halved; each half is processed
    recursively in its own ``Process`` and the two results are concatenated.

    Args:
        squad_ds_ns: object exposing the DataFrame as ``.df`` (the notebook
            passes a ``Manager().Namespace()`` proxy).
        start_row: first row label to process (inclusive).
        end_row: row label to stop at (exclusive).
        batch_size: largest range handled in-process without splitting.

    Raises:
        ValueError: if a range would have to be split while ``batch_size`` is
            below 1 (the halving could never reach a processable batch).
        RuntimeError: if a worker process exits with a non-zero exit code.

    Notes:
        * Rows are addressed by label through ``.loc``, so the frame is
          assumed to carry sorted integer labels equal to the row numbers
          (as a freshly loaded CSV does) -- TODO confirm for other inputs.
        * When the range is split, ``squad_ds_ns.df`` ends up holding only
          rows [start_row, end_row); when the range fits in a single batch
          the whole frame is kept.
    """
    row_count = end_row - start_row

    if row_count <= batch_size:
        # A Namespace proxy hands back a copy of the frame, so edit it locally
        # and publish it once at the end.
        frame = squad_ds_ns.df
        kw_extr = yake.KeywordExtractor(n=16)
        for row in range(start_row, end_row):
            question = frame.loc[row, "question"]
            keywords = kw_extr.extract_keywords(question)
            if keywords:
                # Each item is indexed as (keyword, ...); keep the first one returned.
                frame.loc[row, "keyword"] = keywords[0][0]
            else:
                print("No keywords extracted at row " + str(row) + " (question: " + question + ")")
            frame.loc[row, "possible_keywords"] = str([item[0] for item in keywords])
        squad_ds_ns.df = frame
        return

    if batch_size < 1:
        # Halving a non-empty range can never get down to "<= 0 rows", so the
        # recursion would spawn processes without end.
        raise ValueError("batch_size must be at least 1, got " + str(batch_size))

    mid_row = start_row + row_count // 2

    # Fetch the frame through the proxy once; every access pickles a full copy.
    frame = squad_ds_ns.df

    # One manager serves both halves and is shut down when the block exits,
    # so no manager server process outlives this call.
    with Manager() as mgr:
        left_squad_ns = mgr.Namespace()
        right_squad_ns = mgr.Namespace()

        # Slice by label (``.loc`` includes the end label) so each worker gets
        # exactly the rows it is asked to process, whatever ``start_row`` is.
        left_squad_ns.df = frame.loc[start_row:mid_row - 1].copy()
        right_squad_ns.df = frame.loc[mid_row:end_row - 1].copy()

        workers = [
            Process(target=generate_keywords, args=(left_squad_ns, start_row, mid_row, batch_size)),
            Process(target=generate_keywords, args=(right_squad_ns, mid_row, end_row, batch_size)),
        ]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()

        # A crashed worker leaves its half unprocessed; surface that instead
        # of silently merging rows without keywords.
        bad_exit_codes = [worker.exitcode for worker in workers if worker.exitcode != 0]
        if bad_exit_codes:
            raise RuntimeError(
                "keyword worker(s) for rows " + str(start_row) + "-" + str(end_row)
                + " failed with exit code(s) " + str(bad_exit_codes)
            )

        # Read both halves back before the manager goes away.
        merged = pd.concat([left_squad_ns.df, right_squad_ns.df])

    squad_ds_ns.df = merged

In [6]:
# Preview the first five rows of the loaded frame.
temp_squad_ds.head(n=5)

Unnamed: 0.1,Unnamed: 0,source,topic,paragraph,question,question_id,is_impossible,answers,expanded_answers
0,0,Squad 2.0,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,56be85543aeaaa14008c9063,False,in the late 1990s,in the late 1990s as lead singer of R&B girl-g...
1,1,Squad 2.0,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,56be85543aeaaa14008c9065,False,singing and dancing,in the late 1990s as lead singer of R&B girl-g...
2,2,Squad 2.0,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,56be85543aeaaa14008c9066,False,2003,in the late 1990s as lead singer of R&B girl-g...
3,3,Squad 2.0,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,56bf6b0f3aeaaa14008c9601,False,"Houston, Texas",in the late 1990s as lead singer of R&B girl-g...
4,4,Squad 2.0,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,56bf6b0f3aeaaa14008c9602,False,late 1990s,in the late 1990s as lead singer of R&B girl-g...


In [7]:
# Total number of rows in the loaded frame.
# NOTE(review): `end` is not used by the call below, which stops at row 64 --
# confirm whether this cell is a trial run on a small slice.
end = len(temp_squad_ds)

# Expose the frame through a manager-backed namespace for `generate_keywords`.
mgr = Manager()
squad_ns = mgr.Namespace()
squad_ns.df = temp_squad_ds

# Extract keywords for rows 0-63, splitting the work into batches of at most 4 rows.
generate_keywords(squad_ns, start_row=0, end_row=64, batch_size=4)

# Read the processed frame back out of the namespace.
squad_ds = squad_ns.df