# Generate Keywords Ground Truth
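This notebook builds the ground-truth dataset of search keywords for user questions: for each question in the dataset it generates a keyword (with YAKE or Zephyr), formats a chat-style training string, and saves the result as a CSV file.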

In [10]:
from multiprocessing import *
from transformers import pipeline
from datetime import datetime as dt

import const
import ctypes
import numpy
import pandas as pd
import torch
import traceback
import yake

In [11]:
# Selects the keyword generator used by the dispatch cell further down:
# "yake" or "zephyr"; any other value makes that cell raise ValueError.
KEYWORD_GEN_MODEL = "yake"

In [12]:
# Load "squad_ds_keyword_train_OLD.csv" from const.DATASETS_FOLDER into `temp_df`.
# NOTE(review): the keyword-generation cells below read `temp_ds`, which no visible
# cell defines — confirm where `temp_ds` is supposed to be loaded.
temp_df = pd.read_csv(const.DATASETS_FOLDER + "squad_ds_keyword_train_OLD.csv")

In [4]:
# Store the DataFrame in a multiprocessing `Value` of type ctypes.py_object.
# NOTE(review): `temp_ds` is not defined by any cell visible above, and `k` is only
# printed in the next cell — the keyword generation below uses a Manager namespace
# instead. Confirm whether this cell is still needed.
k = Value(ctypes.py_object)
k.value = temp_ds

In [5]:
# Print the object held by the shared Value (the DataFrame assigned to k.value).
print(k.value)

       Unnamed: 0     source    topic  \
0               0  Squad 2.0  Normans   
1               1  Squad 2.0  Normans   
2               2  Squad 2.0  Normans   
3               3  Squad 2.0  Normans   
4               4  Squad 2.0  Normans   
...           ...        ...      ...   
26227       26227  Squad 2.0    Force   
26228       26228  Squad 2.0    Force   
26229       26229  Squad 2.0    Force   
26230       26230  Squad 2.0    Force   
26231       26231  Squad 2.0    Force   

                                               paragraph  \
0      The Normans (Norman: Nourmands; French: Norman...   
1      The Normans (Norman: Nourmands; French: Norman...   
2      The Normans (Norman: Nourmands; French: Norman...   
3      The Normans (Norman: Nourmands; French: Norman...   
4      The Normans (Norman: Nourmands; French: Norman...   
...                                                  ...   
26227  The pound-force has a metric counterpart, less...   
26228  The pound-force has 

In [6]:
def generate_keywords(ds_ns, start_row, end_row, batch_size = 64):
    """Extract YAKE keywords for rows start_row .. end_row - 1 of ``ds_ns.df``.

    The work is split recursively: a range of at most ``batch_size`` rows is
    processed directly; a larger range is halved and each half is handled by
    a child process that runs this same function on its own copy of the rows.

    PARAMETERS
    ds_ns - Object with a ``df`` attribute holding a DataFrame (the caller
        passes a multiprocessing Manager namespace). ``df`` must contain
        exactly the rows labelled start_row .. end_row - 1, in order, and a
        "question" column. ``ds_ns.df`` is replaced with the processed frame.
    start_row - Index label of the first row to process.
    end_row - Index label one past the last row to process.
    batch_size - Largest number of rows processed without splitting.

    RESULT (written to ``ds_ns.df``)
    "keyword" - Top-ranked keyword; left unset (NaN) when none is extracted.
    "possible_keywords" - ``str()`` of the list of all extracted keywords.

    RAISES
    ValueError - if the range must be split but ``batch_size`` is below 1
        (splitting could never terminate).
    RuntimeError - if a child process exits with a non-zero exit code.
    """
    #print("start: " + str(start_row) + ", end: " + str(end_row))
    # Fetch the frame once: every attribute access on a Manager namespace
    # transfers the whole DataFrame between processes.
    temp_ds = ds_ns.df
    row_len = end_row - start_row

    if row_len <= batch_size:
        kw_extr = yake.KeywordExtractor(n=16)
        for row in range(start_row, end_row):
            question = temp_ds.loc[row, "question"]
            keywords = kw_extr.extract_keywords(question)
            if keywords:
                # Each result is a tuple whose first element is the keyword text.
                temp_ds.loc[row, "keyword"] = keywords[0][0]
            else:
                print("No keywords extracted at row " + str(row) + " (question: " + question + ")")
            temp_ds.loc[row, "possible_keywords"] = str([item[0] for item in keywords])
        # Write back explicitly: mutating the local copy does not update the namespace.
        ds_ns.df = temp_ds
        return

    if batch_size < 1:
        raise ValueError("batch_size must be at least 1, got " + str(batch_size))

    half = row_len // 2
    split_row = start_row + half

    # One Manager serves both halves and is shut down when the block exits,
    # so its server process does not outlive this call.
    with Manager() as mgr:
        left_ns = mgr.Namespace()
        right_ns = mgr.Namespace()

        # Positional slices: the first `half` rows carry labels
        # start_row .. split_row - 1, the remainder split_row .. end_row - 1.
        left_ns.df = temp_ds[0:half].copy()
        right_ns.df = temp_ds[half:row_len].copy()

        workers = [
            Process(target=generate_keywords, args=(left_ns, start_row, split_row, batch_size)),
            Process(target=generate_keywords, args=(right_ns, split_row, end_row, batch_size)),
        ]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()

        # A crashed worker leaves its namespace holding the unprocessed copy;
        # fail loudly instead of merging rows that silently lack keywords.
        failed = [worker.exitcode for worker in workers if worker.exitcode != 0]
        if failed:
            raise RuntimeError(
                "Keyword worker failed for rows " + str(start_row) + "-" + str(end_row)
                + " (exit codes: " + str(failed) + ")")

        # Read the results before the Manager shuts down.
        ds_ns.df = pd.concat([left_ns.df, right_ns.df])

In [7]:
def generate_keywords_by_zephyr(model, ds):
    """Generate one keyword per question with a chat text-generation model.

    For every row a prompt is built from the row's question, rendered with
    the model's chat template and passed to the model; the text that follows
    the assistant marker in the generated output is stored as the keyword.
    A single-line progress message is printed for each row.

    PARAMETERS
    model - Callable text-generation pipeline. It must expose
        ``tokenizer.apply_chat_template`` and return a list whose first
        item is a dict with a 'generated_text' entry.
    ds - DataFrame with a "question" column whose index labels are
        0 .. len(ds) - 1. It is modified in place: the "keyword" column
        receives the generated keyword for each row.

    Rows whose output has no assistant marker, or only whitespace after
    it, are reported and left without a keyword.
    """
    assistant_marker = "<|assistant|>\n"
    # The instruction part of the prompt is the same for every row.
    prompt_prefix = ("Hello Zephyr. I am creating a dataset that contains questions about topics. " +
       "The questions may be asked by the user, thus it is a feature in the dataset I am creating. " +
       "Keywords are the dataset's label, which are keywords that can " +
       "be searched online to answer the user's questions. Please generate a kewyord that answers the following question in " +
       "quotation marks: \"")

    questions_processed = 0
    samples_len = len(ds)
    start_dt = dt.now()
    end_dt = None
    for row in range(samples_len):
        # Scale before truncating; int(a / b) * 100 would stay at 0 for the whole run.
        progress = int(questions_processed / samples_len * 100)
        progr_msg = ("Generating keywords... " + str(progress) + "% ("
            + "samples: " + str(questions_processed + 1) + "/" + str(samples_len))
        if end_dt is None:
            progr_msg += ", Elapsed time: 00:00.00)"
        else:
            elapsed_time = end_dt - start_dt
            avg_time = elapsed_time / questions_processed
            samples_remaining = samples_len - questions_processed
            time_remaining = avg_time * (samples_remaining)
            progr_msg += ", Elapsed time: " + str(elapsed_time) + ", "
            progr_msg += "Time remaining: " + str(time_remaining) + ")"
        # Trailing spaces plus carriage return overwrite the previous progress line.
        print(progr_msg, end="                                                           \r")

        question = ds.loc[row, "question"]
        content_msg = prompt_prefix + question + "\""
        msg = [{"role": "user", "content": content_msg}]
        # Use the pipeline passed in rather than a notebook-level global.
        prompt = model.tokenizer.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
        model_output = model(prompt, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
        generated_text = model_output[0]['generated_text']

        # The generated text is expected to contain the prompt followed by the reply;
        # the reply is the segment right after the assistant marker.
        parts = generated_text.split(assistant_marker)
        keyword = parts[1] if len(parts) > 1 else ""
        if not keyword.strip():
            print("No keywords extracted at row " + str(row) + " (question: " + question + ")")
        else:
            ds.loc[row, "keyword"] = keyword
        end_dt = dt.now()
        questions_processed += 1

In [8]:
# Preview the first rows of the frame that the keyword generation cell reads.
temp_ds.head()

Unnamed: 0.1,Unnamed: 0,source,topic,paragraph,question,question_id,is_impossible,answers,expanded_answers
0,0,Squad 2.0,Normans,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,56ddde6b9a695914005b9628,False,France,"and and Norway who, under their leader Rollo, ..."
1,1,Squad 2.0,Normans,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,56ddde6b9a695914005b9628,False,France,"and and Norway who, under their leader Rollo, ..."
2,2,Squad 2.0,Normans,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,56ddde6b9a695914005b9628,False,France,"and and Norway who, under their leader Rollo, ..."
3,3,Squad 2.0,Normans,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,56ddde6b9a695914005b9628,False,France,"and and Norway who, under their leader Rollo, ..."
4,4,Squad 2.0,Normans,The Normans (Norman: Nourmands; French: Norman...,When were the Normans in Normandy?,56ddde6b9a695914005b9629,False,10th and 11th centuries,"and and Norway who, under their leader Rollo, ..."


In [9]:
# Run the keyword generator selected by KEYWORD_GEN_MODEL on `temp_ds` and
# leave the labelled frame in `temp_df`, which the following cells consume.
if KEYWORD_GEN_MODEL == "yake":
    end = len(temp_ds)
    # The Manager hosts the namespace shared with the worker processes; the
    # `with` block shuts its server process down once the result is read back.
    with Manager() as mgr:
        ns = mgr.Namespace()
        ns.df = temp_ds
        generate_keywords(ns, 0, end, batch_size=1024)
        temp_df = ns.df
elif KEYWORD_GEN_MODEL == "zephyr":
    zephyr = pipeline("text-generation", model="HuggingFaceH4/zephyr-7b-alpha",
                 torch_dtype = torch.bfloat16, device_map="auto")
    generate_keywords_by_zephyr(zephyr, temp_ds)
    # The Zephyr generator writes into `temp_ds` in place; expose it under the
    # name used downstream so later cells do not operate on a stale frame.
    temp_df = temp_ds
else:
    raise ValueError("Unknown KEYWORD_GEN_MODEL: " + repr(KEYWORD_GEN_MODEL))

No keywords extracted at row 5766 (question: What must be at 28°C?)
No keywords extracted at row 9937 (question:  what is Internet2)
No keywords extracted at row 9938 (question:  what is Internet2)
No keywords extracted at row 9939 (question:  what is Internet2)
No keywords extracted at row 9954 (question: Internet2 became what in 2007? )
No keywords extracted at row 4939 (question: What is CO? )
No keywords extracted at row 4942 (question: What is Fe2O?)
No keywords extracted at row 5792 (question: What is O-R-O?)
No keywords extracted at row 5793 (question: What is R-OC-R?)
No keywords extracted at row 5794 (question: What is R-OOC-R?)
No keywords extracted at row 5116 (question: What is the name for O3 most often used?)
No keywords extracted at row 5117 (question: What is the name for O3 most often used?)
No keywords extracted at row 5118 (question: What is the name for O3 most often used?)
No keywords extracted at row 5119 (question: What is the name for O3 most often used?)
No key

In [13]:
# Drop every row that has a missing value in any column (dropna with default arguments).
# This removes, for example, rows for which generate_keywords left "keyword" unset.
# NOTE(review): missing values in unrelated columns also cause a row to be dropped —
# confirm that is intended.
temp_df = temp_df.dropna()

In [4]:
def create_label(input_row):
    """This function creates the 'text' label
        column value for one dataset row.

        PARAMETERS
        input_row - Row (e.g. a pandas Series or dict)
            with a 'question' and a 'keyword' entry.

        RETURNS
        The chat-formatted training string: system
        instruction, the user's question and the
        keyword as the assistant's reply. Returns
        None (after printing both values) when the
        question or the keyword is not a string, so
        that no truncated label is produced.

        RAISES
        KeyError - if 'question' or 'keyword' is
            missing from input_row.
    """
    question = input_row['question']
    keyword = input_row['keyword']

    # Missing values (NaN) and other non-strings cannot be concatenated into the prompt.
    if not isinstance(question, str) or not isinstance(keyword, str):
        print(question)
        print(keyword)
        return None

    prompt = ("<|system|>\nYou are a chatbot " +
              "that assists in providing " +
              "information to the user. " +
              "Please generate a keyword " +
              "from the user's question.</s>\n<|user|>\n")
    prompt += question
    prompt += "</s>\n<|assistant|>\n"
    prompt += keyword
    return prompt

In [14]:
# Display the frame after rows with missing values were dropped.
temp_df

Unnamed: 0.1,Unnamed: 0,source,topic,paragraph,question,question_id,is_impossible,answers,expanded_answers,keyword,possible_keywords,is_searchable,text
0,0,Squad 2.0,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,56be85543aeaaa14008c9063,False,in the late 1990s,in the late 1990s as lead singer of R&B girl-g...,Beyonce start becoming popular,"['Beyonce start becoming popular', 'Beyonce st...",True,<|system|>\nYou are a chatbot that assists in ...
1,1,Squad 2.0,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,56be85543aeaaa14008c9065,False,singing and dancing,in the late 1990s as lead singer of R&B girl-g...,areas did Beyonce compete in when she was growing,['areas did Beyonce compete in when she was gr...,True,<|system|>\nYou are a chatbot that assists in ...
2,2,Squad 2.0,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,56be85543aeaaa14008c9066,False,2003,in the late 1990s as lead singer of R&B girl-g...,Year Beyonce leave Destiny Child and become a ...,['Beyonce leave Destiny Child and become a sol...,True,<|system|>\nYou are a chatbot that assists in ...
3,3,Squad 2.0,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,56bf6b0f3aeaaa14008c9601,False,"Houston, Texas",in the late 1990s as lead singer of R&B girl-g...,city and state did Beyonce grow,"['city and state did Beyonce grow', 'city and ...",True,<|system|>\nYou are a chatbot that assists in ...
4,4,Squad 2.0,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,56bf6b0f3aeaaa14008c9602,False,late 1990s,in the late 1990s as lead singer of R&B girl-g...,decade did Beyonce become famous,"['decade did Beyonce become famous', 'Beyonce ...",True,<|system|>\nYou are a chatbot that assists in ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
130315,130315,Squad 2.0,Matter,"The term ""matter"" is used throughout physics i...",Physics has broadly agreed on the definition o...,5a7e070b70df9f001a875439,True,matter,"ter and antimatter, normal matter has been ref...",Physics has broadly agreed on the definition,['Physics has broadly agreed on the definition...,True,<|system|>\nYou are a chatbot that assists in ...
130316,130316,Squad 2.0,Matter,"The term ""matter"" is used throughout physics i...",Who coined the term partonic matter?,5a7e070b70df9f001a87543a,True,Alfvén,"ter and antimatter, normal matter has been ref...",coined the term partonic matter,"['coined the term partonic matter', 'term part...",True,<|system|>\nYou are a chatbot that assists in ...
130317,130317,Squad 2.0,Matter,"The term ""matter"" is used throughout physics i...",What is another name for anti-matter?,5a7e070b70df9f001a87543b,True,Gk. common matter,"ter and antimatter, normal matter has been ref...",anti-matter,['anti-matter'],True,<|system|>\nYou are a chatbot that assists in ...
130318,130318,Squad 2.0,Matter,"The term ""matter"" is used throughout physics i...",Matter usually does not need to be used in con...,5a7e070b70df9f001a87543c,True,a specifying modifier,"ter and antimatter, normal matter has been ref...",Matter usually does not need to be used in con...,['Matter usually does not need to be used in c...,True,<|system|>\nYou are a chatbot that assists in ...


In [15]:
# Build one chat-formatted training string per row from its question and keyword.
temp_df_1 = temp_df[["question", "keyword"]].apply(create_label, axis=1)

In [16]:
# Preview the first generated training strings.
temp_df_1.head()

0    <|system|>\nYou are a chatbot that assists in ...
1    <|system|>\nYou are a chatbot that assists in ...
2    <|system|>\nYou are a chatbot that assists in ...
3    <|system|>\nYou are a chatbot that assists in ...
4    <|system|>\nYou are a chatbot that assists in ...
dtype: object

In [17]:
# Attach the generated training strings as the "text" column.
temp_df['text'] = temp_df_1

In [18]:
# Save the labelled dataset. The row index is not written: saving it adds an
# extra unnamed column on every save/load round trip (the loaded frames above
# already show "Unnamed: 0" and "Unnamed: 0.1").
temp_df.to_csv(const.DATASETS_FOLDER + "squad_ds_keyword_train.csv", index=False)