In [1]:
import os
# Expose only physical GPU 6 to this process; inside the notebook it is then
# addressed as "cuda:0" (see DEVICE in the next cell).
os.environ["CUDA_VISIBLE_DEVICES"] = "6"

In [2]:
from tqdm.auto import tqdm
from typing import List, Tuple
import re
DEVICE = 'cuda:0'
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# !GITHUB_ACTIONS=true pip install auto-gptq

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from data.dataset import GraphDataset, DataAugmentationLevel, NodeType, DialogNode, Question

KeyboardInterrupt: 

In [None]:
# Hugging Face hub id of the quantized checkpoint and the (currently unused,
# see the commented-out kwarg in the model-loading cell) weight file basename.
model_name_or_path = "TheBloke/upstage-llama-30b-instruct-2048-GPTQ"
model_basename = "gptq_model-4bit--1g"

# Passed through to AutoGPTQ when the model is loaded.
use_triton = False

# Fast tokenizer, cached in the shared weights directory.
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    cache_dir="/mount/arbeitsdaten/asr-2/vaethdk/resources/weights/",
    use_fast=True,
)


In [None]:
# Load the quantized checkpoint from the local weights directory.
# The target device comes from the notebook-wide DEVICE constant instead of a
# second hard-coded "cuda:0" literal, so the device is configured in one place.
model = AutoGPTQForCausalLM.from_quantized(
    "/mount/arbeitsdaten/asr-2/vaethdk/resources/weights/TheBloke--upstage-llama-30b-instruct-2048-GPTQ",
    # model_basename=model_basename,
    # revision="gptq-4bit-32g-actorder_True",
    use_safetensors=True,
    trust_remote_code=False,
    device=DEVICE,
    use_triton=use_triton,
    quantize_config=None,
)

CUDA extension not installed.
The safetensors archive passed at /mount/arbeitsdaten/asr-2/vaethdk/resources/weights/TheBloke--upstage-llama-30b-instruct-2048-GPTQ/gptq_model-4bit-32g.safetensors does not contain metadata. Make sure to save your model with the `save_pretrained` method. Defaulting to 'pt' metadata.
skip module injection for FusedLlamaMLPForQuantizedModel not support integrate without triton yet.


### System:
{System}

### User:
{User}

### Assistant:
{Assistant}

In [7]:
# System turn: instructions shared by every request.
system = """You are a helpful assistant creating a list of FAQ-style questions from given facts.
Only generate questions that can be answered by the given facts, without any external knowledge.
Remove some information, especially nouns and named entities, between generated questions.
Use casual language.
Order the generated paraphrases in a numbered list."""
# User turn: a single hand-written example fact.
user = 'Generate 10 FAQ-style questions from the fact: "In the US, you are entitled to 30$ per day, minus any free meals which you choose to decline."'


# Assemble the prompt line by line following the model's
# "### System / ### User / ### Assistant" template (leading newline included).
prompt = "\n".join([
    "",
    "### System:",
    system,
    "",
    "### User:",
    user,
    "",
    "### Assistant:",
])

set_seed(42)
input_ids = tokenizer(prompt, return_tensors='pt').input_ids.cuda()
output = model.generate(inputs=input_ids, temperature=0.7, max_new_tokens=512)
# The decoded sequence contains the prompt followed by the generated list.
print(tokenizer.decode(output[0]))


<s> 
### System:
You are a helpful assistant creating a list of FAQ-style questions from given facts.
Only generate questions that can be answered by the given facts, without any external knowledge.
Remove some information, especially nouns and named entities, between generated questions.
Use casual language.
Order the generated paraphrases in a numbered list.

### User:
Generate 10 FAQ-style questions from the fact: "In the US, you are entitled to 30$ per day, minus any free meals which you choose to decline."

### Assistant:
1. What is the daily allowance for meals in the US?
2. Can you receive more than 30$ per day for meals in the US?
3. Are you allowed to decline free meals in the US?
4. How does declining free meals affect your daily meal allowance?
5. Is there a specific amount you can receive for meals in the US?
6. Can you receive less than 30$ per day for meals in the US?
7. How does the daily meal allowance work in the US?
8. Is there a limit to the number of meals you can d

## Generate Question Synonyms

In [7]:
def generate_prompt(system: str, user: str) -> str:
    """
    Build the prompt in the model's "### System / ### User / ### Assistant" format.

    The template lines are emitted without leading indentation so that the
    result is identical to the hand-written prompt used in the smoke-test cell.
    (An indented triple-quoted f-string would prepend four spaces to every
    template line and to the first line of `system` / `user`.)

    Args:
        system: text of the system turn (may span multiple lines)
        user: text of the user turn

    Returns:
        The prompt, starting with a newline and ending with "### Assistant:".
    """
    return (
        "\n"
        "### System:\n"
        f"{system}\n"
        "\n"
        "### User:\n"
        f"{user}\n"
        "\n"
        "### Assistant:"
    )

def generate_output(prompt: str, temperature: float = 0.7, max_new_tokens: int = 512) -> str:
    """
    Run the notebook's global `model` on `prompt` and return the decoded text.

    Args:
        prompt: full prompt text (see `generate_prompt`)
        temperature: forwarded to `model.generate`
        max_new_tokens: upper bound on the number of generated tokens

    Returns:
        The decoded first output sequence as a string. As the smoke-test output
        earlier in the notebook shows, it contains the prompt followed by the
        generated continuation (special tokens are not stripped).
    """
    input_ids = tokenizer(prompt, return_tensors='pt').input_ids.cuda()
    # NOTE(review): in Hugging Face `generate`, `temperature` only takes effect
    # when sampling is enabled - confirm that the model's generation config
    # sets do_sample=True, otherwise decoding is greedy regardless of this value.
    output = model.generate(inputs=input_ids, temperature=temperature, max_new_tokens=max_new_tokens)
    return tokenizer.decode(output[0])


In [8]:
# Load the English training graph and answers without data augmentation.
# NOTE(review): the third positional argument presumably toggles synonym usage
# (the loader logs "synonyms: False") - confirm against GraphDataset's signature.
human_data_train = GraphDataset('en/train_graph.json', 'en/train_answers.json', False, augmentation=DataAugmentationLevel.NONE)

- not using synonyms
===== Dataset Statistics =====
- files:  resources/en/train_graph.json resources/en/train_answers.json
- synonyms: False
- depth: 20  - degree: 13
- answers: 73
- questions: 279
- loaded original data: True
- loaded generated data: False


In [9]:
# Remove every existing question from the dataset (global question list,
# key index and the per-node lists), then verify that none are left.
# This mutates `human_data_train` in place, so re-load the dataset if the
# original human questions are needed again.
for node in human_data_train.node_list:
    for question in node.questions:
        human_data_train.question_list.remove(question)
        del human_data_train.questions_by_key[question.key]
    # the per-node list is emptied only after its questions were unregistered above
    node.questions.clear()
assert len(human_data_train.question_list) == 0
assert len(human_data_train.questions_by_key) == 0

In [19]:
def parse_output(original_question: str, prompt: str, output: str, num_paraphrases: int) -> List[str]:
    """
    Extract the numbered list items ("1. ...", "2. ...", ...) from a model output.

    Args:
        original_question: text the list was generated for (only used in log messages)
        prompt: the prompt that was sent to the model; everything up to and
                including it is removed from `output` before parsing
        output: decoded model output (prompt + generated continuation)
        num_paraphrases: highest list number to look for

    Returns:
        The found list items in ascending order, each still carrying its
        "<number>." prefix, with "</s>" and surrounding whitespace removed.
        Numbers that cannot be found are reported via print and skipped.
    """
    questions = []

    # Remove the prompt. Decoding may prepend tokens such as "<s> ", so locate
    # the prompt inside the output instead of assuming it starts at index 0;
    # fall back to length-based slicing if it cannot be found verbatim.
    prompt_idx = output.find(prompt)
    if prompt_idx != -1:
        cleaned = output[prompt_idx + len(prompt):]
    else:
        cleaned = output[len(prompt):]

    def find_marker(text: str, number: int) -> int:
        # "<number>." that is not part of a longer number ("11.") or a decimal ("1.5")
        match = re.search(rf"(?<!\d){number}\.(?!\d)", text)
        return match.start() if match else -1

    if find_marker(cleaned, 1) == -1:
        print("NO LIST FOR QUESTION", original_question)
        return questions

    for i in range(1, num_paraphrases + 1):
        start_idx = find_marker(cleaned, i)  # find i. line
        if start_idx == -1:
            print(f" - NO {i}. CANDIDATE FOR QUESTION", original_question)
            continue

        end_idx = cleaned.find("\n", start_idx)  # read until line end
        if end_idx == -1:
            # The last item present might not have a line break, even if the
            # model produced fewer items than requested.
            end_idx = len(cleaned)

        questions.append(cleaned[start_idx:end_idx].replace("</s>", "").strip())
        cleaned = cleaned[end_idx:]  # remove i. line
    return questions

# DATA GENERATION V1: BASELINE PROMPT

In [11]:
from data.dataset import NodeType, Question
import time

set_seed(42)

# System turn for the V1 (baseline) prompt.
system = """You are a helpful assistant creating a list of FAQ-style questions from given facts.
Only generate questions that can be answered by the given facts, without any external knowledge.
Remove some information, especially nouns and named entities, between generated questions.
Use casual language.
Order the generated paraphrases in a numbered list."""

def user(answer_text: str, num_paraphrases: int) -> str:
    """Build the user turn asking for `num_paraphrases` questions about `answer_text`."""
    return f'Generate {num_paraphrases} FAQ-style questions from the fact: "{answer_text}"'

NUM_QUESTIONS = 10
TEMPERATURE = 0.7
MAX_NEW_TOKENS = 1024
# maps unique question key -> {"dialog_node_key", "key", "text"}
generated_data = {}

for node in tqdm(human_data_train.nodes_by_type[NodeType.INFO]):
    prompt = generate_prompt(system=system, user=user(node.text, NUM_QUESTIONS))
    gen = generate_output(prompt=prompt, temperature=TEMPERATURE, max_new_tokens=MAX_NEW_TOKENS)
    candidates = parse_output(original_question=node.text, prompt=prompt, output=gen, num_paraphrases=NUM_QUESTIONS)
    for candidate in candidates:
        key = str(time.time()).replace(".", "")
        # Consecutive iterations of this tight loop can read the same timestamp;
        # re-read the clock until the key is unused so that no candidate
        # silently overwrites a previous one.
        while key in generated_data:
            key = str(time.time()).replace(".", "")
        generated_data[key] = {
            "dialog_node_key": node.key,
            "key": key,
            "text": candidate,
        }

  0%|          | 0/80 [00:00<?, ?it/s]

100%|██████████| 80/80 [9:40:49<00:00, 435.62s/it]  


In [12]:
import json

# Post-process the generated questions and write them to disk.
cleaned_data = {}
for key, entry in generated_data.items():
    node = human_data_train.nodes_by_key[entry['dialog_node_key']]
    # Work on a copy so that `generated_data` is not modified and the cell can be re-run.
    cleaned_entry = dict(entry)
    # Strip only the leading enumeration ("3. "); a blanket replace of "3."
    # would also delete numbers inside the question text (e.g. "3.5" -> "5").
    cleaned_entry['text'] = re.sub(r"^\s*\d+\.\s*", "", entry['text']).strip()
    cleaned_entry["node_text"] = node.text
    cleaned_entry["node_type"] = node.node_type.value
    cleaned_data[key] = cleaned_entry

with open("resources/en/generated/train_questions_v1.json", "w") as f:
    json.dump(cleaned_data, f)

# DATA GENERATION V2: SHORTER QUESTIONS


In [13]:
from data.dataset import NodeType, Question
import time

set_seed(42)

# System turn for the V2 prompt: same as V1 plus a preference for short questions.
system = """You are a helpful assistant creating a list of FAQ-style questions from given facts.
Only generate questions that can be answered by the given facts, without any external knowledge.
Remove some information, especially nouns and named entities, between generated questions.
Use casual language.
Prefer short questions.
Order the generated questions in a numbered list."""

def user(answer_text: str, num_paraphrases: int) -> str:
    """Build the user turn asking for `num_paraphrases` short questions about `answer_text`."""
    return f'Generate {num_paraphrases} short FAQ-style questions from the fact: "{answer_text}"'

NUM_QUESTIONS = 10
TEMPERATURE = 0.7
MAX_NEW_TOKENS = 1024
# maps unique question key -> {"dialog_node_key", "key", "text"}
generated_data = {}

for node in tqdm(human_data_train.nodes_by_type[NodeType.INFO]):
    prompt = generate_prompt(system=system, user=user(node.text, NUM_QUESTIONS))
    gen = generate_output(prompt=prompt, temperature=TEMPERATURE, max_new_tokens=MAX_NEW_TOKENS)
    candidates = parse_output(original_question=node.text, prompt=prompt, output=gen, num_paraphrases=NUM_QUESTIONS)
    for candidate in candidates:
        key = str(time.time()).replace(".", "")
        # Consecutive iterations of this tight loop can read the same timestamp;
        # re-read the clock until the key is unused so that no candidate
        # silently overwrites a previous one.
        while key in generated_data:
            key = str(time.time()).replace(".", "")
        generated_data[key] = {
            "dialog_node_key": node.key,
            "key": key,
            "text": candidate,
        }

100%|██████████| 80/80 [11:34:35<00:00, 520.95s/it]  


In [14]:
import json

# Post-process the generated questions and write them to disk.
cleaned_data = {}
for key, entry in generated_data.items():
    node = human_data_train.nodes_by_key[entry['dialog_node_key']]
    # Work on a copy so that `generated_data` is not modified and the cell can be re-run.
    cleaned_entry = dict(entry)
    # Strip only the leading enumeration ("3. "); a blanket replace of "3."
    # would also delete numbers inside the question text (e.g. "3.5" -> "5").
    cleaned_entry['text'] = re.sub(r"^\s*\d+\.\s*", "", entry['text']).strip()
    cleaned_entry["node_text"] = node.text
    cleaned_entry["node_type"] = node.node_type.value
    cleaned_data[key] = cleaned_entry

with open("resources/en/generated/train_questions_v2.json", "w") as f:
    json.dump(cleaned_data, f)

# DATA GENERATION V3: SHORTER QUESTIONS + SPLIT NODE CONTEXT

1. Try to generate shorter questions (-> change prompt)
2. Try to generate more diverse questions
    1. Detect relevant sentences in node text via NER tool (also detects time, quantities, ...)
    2. Generate questions for whole node context, then for only relevant sentences / sub-sentences of node
    3. Choose the number of questions to generate depending on the number of extracted named entities?

In [8]:
# !pip install stanza

Collecting stanza
  Downloading stanza-1.5.0-py3-none-any.whl (802 kB)
[K     |████████████████████████████████| 802 kB 14.3 MB/s eta 0:00:01
Collecting emoji
  Downloading emoji-2.7.0.tar.gz (361 kB)
[K     |████████████████████████████████| 361 kB 119.2 MB/s eta 0:00:01
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h    Preparing wheel metadata ... [?25ldone
Building wheels for collected packages: emoji
  Building wheel for emoji (PEP 517) ... [?25ldone
[?25h  Created wheel for emoji: filename=emoji-2.7.0-py2.py3-none-any.whl size=356563 sha256=d6fa52fd4eea49ae84298a10e2c77bf642c90e75abcdd4db626c8b0309a7e8f2
  Stored in directory: /home/users2/vaethdk/.cache/pip/wheels/41/11/48/5df0b9727d5669c9174a141134f10304d1d78a3b89a4676f3d
Successfully built emoji
Installing collected packages: emoji, stanza
Successfully installed emoji-2.7.0 stanza-1.5.0
You should consider upgrading via the '/fs/scratch/users/vaethdk/a

In [10]:
import stanza

In [11]:
# Number of CUDA devices visible to this process.
torch.cuda.device_count()

1

In [10]:
# stanza.download(lang="en", model_dir=".models/")

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json: 216kB [00:00, 59.2MB/s]                    
2023-08-04 16:21:45 INFO: Downloading default packages for language: en (English) ...
Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.5.0/models/default.zip: 100%|██████████| 594M/594M [00:05<00:00, 115MB/s]  
2023-08-04 16:21:56 INFO: Finished downloading models and saved to .models/.


In [12]:
# English stanza pipeline with tokenization and named-entity recognition only.
# Runs on the notebook-wide DEVICE instead of a separately hard-coded device string.
nlp = stanza.Pipeline('en', processors='tokenize,ner', device=DEVICE)

2023-08-11 11:18:46 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json: 216kB [00:00, 49.5MB/s]                    
2023-08-11 11:18:49 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| ner       | ontonotes |

2023-08-11 11:18:49 INFO: Using device: cuda:0
2023-08-11 11:18:49 INFO: Loading: tokenize
2023-08-11 11:18:52 INFO: Loading: ner
2023-08-11 11:18:52 INFO: Done loading processors!


In [16]:
# Smoke-test the NER pipeline. `doc` was not defined anywhere in the notebook
# (it only existed as leftover kernel state); the sentence below reproduces the
# character offsets shown in the recorded output (PERSON 0-12, GPE 25-31).
doc = nlp("Barack Obama was born in Hawaii.")
print(doc.ents)

[{
  "text": "Barack Obama",
  "type": "PERSON",
  "start_char": 0,
  "end_char": 12
}, {
  "text": "Hawaii",
  "type": "GPE",
  "start_char": 25,
  "end_char": 31
}]


In [13]:
from statistics import mean

# Count how many INFO nodes mention at least one named entity and collect
# the entity count of each such node.
nodes_with_ner = 0
nodes_without_ner = 0
avg_node_ner = []  # entity counts of the nodes that contain entities

for node in tqdm(human_data_train.nodes_by_type[NodeType.INFO]):
    context = nlp(node.text)
    if len(context.ents) == 0:
        nodes_without_ner += 1
        continue
    nodes_with_ner += 1
    avg_node_ner.append(len(context.ents))

for label, value in (
    ("TOTAL INFO NODES", len(human_data_train.nodes_by_type[NodeType.INFO])),
    ("NODES WITH NER", nodes_with_ner),
    ("NODES WITHOUT NER", nodes_without_ner),
):
    print(label, value)
# evaluated last, as in the original statement order
print("AVG NER PER NODE WITH NER", mean(avg_node_ner))

100%|██████████| 80/80 [00:04<00:00, 16.02it/s]

TOTAL INFO NODES 80
NODES WITH NER 33
NODES WITHOUT NER 47
AVG NER PER NODE WITH NER 2.090909090909091





In [14]:
# Approximate the number of sentences per INFO node by counting periods.
avg_node_sentence_length = []

for node in human_data_train.nodes_by_type[NodeType.INFO]:
    avg_node_sentence_length += [node.text.count(".")]

print("MAX #SENTENCES PER NODE", max(avg_node_sentence_length))
print("AVG #SENTENCES PER NODE", mean(avg_node_sentence_length))

MAX #SENTENCES PER NODE 6
AVG #SENTENCES PER NODE 1.8


In [17]:
def extract_ner_sentences(node: DialogNode) -> List[Tuple[str, str]]:
    """
    Extract all sentences from the node text that mention named entities.

    Sentence boundaries are found by scanning outwards from each entity until
    a terminator character (".", "!" or "?") or the start / end of the text is
    reached. This is a character-level heuristic: abbreviations or decimal
    numbers containing "." also end a sentence.

    Args:
        node: dialog node whose `text` is analysed with the global `nlp` pipeline

    Returns:
        One tuple per detected entity (a sentence with several entities appears
        several times), each containing
            1. the text of the entity
            2. the sentence containing that entity, including its terminator
               and without surrounding whitespace
    """
    sentence_terminators = ".!?"
    text = node.text
    results = []
    for entity in nlp(text).ents:
        start_idx = entity.start_char
        end_idx = entity.end_char
        # expand start index to the beginning of the sentence
        while start_idx > 0 and text[start_idx - 1] not in sentence_terminators:
            start_idx -= 1
        # expand end index until the last included character is a terminator
        while end_idx < len(text) and text[end_idx - 1] not in sentence_terminators:
            end_idx += 1
        # the scan stops right after the previous terminator, so the raw slice
        # starts with the whitespace separating the sentences - strip it
        results.append((entity.text, text[start_idx:end_idx].strip()))
    return results

In [None]:
# Look for the first INFO node with more than one (entity, sentence) pair.
# `node` and `results` intentionally stay bound to that node after the loop.
for node in human_data_train.nodes_by_type[NodeType.INFO]:
    results = extract_ner_sentences(node)
    if len(results) <= 1:
        continue
    print(results)
    break



In [None]:
# Show the text of `node` as left bound by the most recently executed loop.
print(node.text)



In [None]:
from data.dataset import NodeType, Question
import time

# System turn for the V3 experiments: diverse, short questions.
system = """You are a helpful assistant creating a list of diverse FAQ-style questions from given facts.
Only generate questions that can be answered by the given facts, without any external knowledge.
Use casual language.
Prefer short questions.
Order the generated paraphrases in a numbered list."""

def user(answer_text: str, num_paraphrases: int) -> str:
    """Build the user turn asking for `num_paraphrases` short, diverse questions about `answer_text`."""
    return f'Generate {num_paraphrases} short and diverse FAQ-style questions from the fact: "{answer_text}"'


NUM_QUESTIONS, TEMPERATURE, MAX_NEW_TOKENS = 3, 0.7, 1024
# maps entity text -> list of questions generated from the sentence containing it
generated_data = {}


# Generate questions for each entity-bearing sentence of the current `node`.
for entity, sentence in tqdm(extract_ner_sentences(node)):
    prompt = generate_prompt(system=system, user=user(sentence, NUM_QUESTIONS))
    gen = generate_output(prompt=prompt, temperature=TEMPERATURE, max_new_tokens=MAX_NEW_TOKENS)
    candidates = parse_output(
        original_question=node.text,
        prompt=prompt,
        output=gen,
        num_paraphrases=NUM_QUESTIONS,
    )
    generated_data[entity] = candidates


100%|██████████| 3/3 [03:52<00:00, 77.40s/it]


In [None]:
import pprint
# Pretty-print `generated_data` as filled by the preceding cell.
pprint.pprint(generated_data)

              'and restrictions?',
              '2. Where can I find the latest information on travel advisories '
              'related to COVID-19?',
              '3. Which organizations should I refer to for understanding the '
              'current travel restrictions due to COVID-19?'],
 'Department 4 (Administrative Department': ['1. Who can grant authorization '
                                             'in extreme cases?',
                                             "2. What department's leadership "
                                             'can authorize in extreme '
                                             'situations?',
                                             "3. Which department's leadership "
                                             'has the power to authorize in '
                                             'extreme cases?'],
         'restrictions?',
         '2. Where can I find the latest information on travel advisories '
         'related to

In [15]:
from data.dataset import NodeType, Question
import time

# System turn for the V3 experiments: diverse, short questions.
system = """You are a helpful assistant creating a list of diverse FAQ-style questions from given facts.
Only generate questions that can be answered by the given facts, without any external knowledge.
Use casual language.
Prefer short questions.
Order the generated paraphrases in a numbered list."""

def user(answer_text: str, num_paraphrases: int) -> str:
    """Build the user turn asking for `num_paraphrases` short, diverse questions about `answer_text`."""
    return f'Generate {num_paraphrases} short and diverse FAQ-style questions from the fact: "{answer_text}"'


NUM_QUESTIONS, TEMPERATURE, MAX_NEW_TOKENS = 3, 0.7, 1024

# Same experiment as for the single sentences, but with the whole node text as context.
prompt = generate_prompt(system=system, user=user(node.text, NUM_QUESTIONS))
gen = generate_output(prompt=prompt, temperature=TEMPERATURE, max_new_tokens=MAX_NEW_TOKENS)
candidates = parse_output(
    original_question=node.text,
    prompt=prompt,
    output=gen,
    num_paraphrases=NUM_QUESTIONS,
)
pprint.pprint(candidates)

KeyboardInterrupt: 

In [20]:
# System turn for V3: diverse, short questions.
system = """You are a helpful assistant creating a list of diverse FAQ-style questions from given facts.
Only generate questions that can be answered by the given facts, without any external knowledge.
Use casual language.
Prefer short questions.
Order the generated paraphrases in a numbered list."""

def user(answer_text: str, num_paraphrases: int) -> str:
    """Build the user turn asking for `num_paraphrases` short, diverse questions about `answer_text`."""
    return f'Generate {num_paraphrases} short and diverse FAQ-style questions from the fact: "{answer_text}"'

def user_ner(answer_text: str, ner: str, num_paraphrases: int) -> str:
    """Like `user`, but additionally requires the questions to be about the entity `ner`."""
    return f'Generate {num_paraphrases} short and diverse FAQ-style questions about the entity "{ner}" from the fact: "{answer_text}"'


NUM_QUESTIONS = 10
NUM_QUESTIONS_PER_SENTENCE = 3
TEMPERATURE = 0.7
MAX_NEW_TOKENS = 1024
# maps unique question key -> question entry
generated_data = {}
# keys handed out so far (entries are only added to `generated_data` per node)
used_keys = set()

def _unique_key() -> str:
    """Return a timestamp-based key that has not been handed out before."""
    key = str(time.time()).replace(".", "")
    # consecutive calls can read the same timestamp; wait for the clock to advance
    while key in used_keys:
        key = str(time.time()).replace(".", "")
    used_keys.add(key)
    return key

def _strip_enumeration(candidate: str) -> str:
    """Remove a leading list number ("3. ") without touching numbers inside the question."""
    return re.sub(r"^\s*\d+\.\s*", "", candidate).strip()

set_seed(42)

for node in tqdm(human_data_train.nodes_by_type[NodeType.INFO]):
    # use dict indexed by generated text to filter out duplicates
    all_generations = {}

    # extract NERs
    named_entities = extract_ner_sentences(node)

    # Generate questions with NER sentences only, make asking about NER a requirement
    for entity, sentence in named_entities:
        # use the sentence containing the entity (not the whole node text) as fact
        prompt = generate_prompt(system=system, user=user_ner(sentence.strip(), entity, NUM_QUESTIONS_PER_SENTENCE))
        gen = generate_output(prompt=prompt, temperature=TEMPERATURE, max_new_tokens=MAX_NEW_TOKENS)
        candidates = parse_output(original_question=node.text, prompt=prompt, output=gen, num_paraphrases=NUM_QUESTIONS_PER_SENTENCE)
        for candidate in candidates:
            cleaned_candidate = _strip_enumeration(candidate)
            all_generations[cleaned_candidate] = {
                "context": "ner",
                "entity": entity,
                "dialog_node_key": node.key,
                "key": _unique_key(),
                "text": cleaned_candidate
            }

    # Generate questions with whole context
    num_node_level_questions = max(NUM_QUESTIONS_PER_SENTENCE, NUM_QUESTIONS - len(named_entities) * NUM_QUESTIONS_PER_SENTENCE)
    prompt = generate_prompt(system=system, user=user(node.text, num_node_level_questions))
    gen = generate_output(prompt=prompt, temperature=TEMPERATURE, max_new_tokens=MAX_NEW_TOKENS)
    # parse as many list items as were requested in the prompt
    candidates = parse_output(original_question=node.text, prompt=prompt, output=gen, num_paraphrases=num_node_level_questions)
    for candidate in candidates:
        cleaned_candidate = _strip_enumeration(candidate)
        all_generations[cleaned_candidate] = {
            "context": "node",
            "dialog_node_key": node.key,
            "key": _unique_key(),
            "text": cleaned_candidate
        }

    # add filtered questions to generated dataset
    for entry in all_generations.values():
        generated_data[entry["key"]] = entry

100%|██████████| 80/80 [12:37:21<00:00, 568.01s/it]  


In [21]:
import json

# Post-process the generated questions and write them to disk.
cleaned_data = {}
for key, entry in generated_data.items():
    node = human_data_train.nodes_by_key[entry['dialog_node_key']]
    # Work on a copy so that `generated_data` is not modified and the cell can be re-run.
    cleaned_entry = dict(entry)
    # Strip only a leading enumeration ("3. "); a blanket replace of "3."
    # would also delete numbers inside the question text (e.g. "3.5" -> "5").
    cleaned_entry['text'] = re.sub(r"^\s*\d+\.\s*", "", entry['text']).strip()
    cleaned_entry["node_text"] = node.text
    cleaned_entry["node_type"] = node.node_type.value
    cleaned_data[key] = cleaned_entry

with open("resources/en/generated/train_questions_v3.json", "w") as f:
    json.dump(cleaned_data, f)