In [1]:
import os
# Restrict this process to GPU index 5. This is set before torch is imported below;
# the later cells then report a single visible device and address it as 'cuda:0'.
os.environ["CUDA_VISIBLE_DEVICES"] = "5"

In [25]:
from tqdm.auto import tqdm
from typing import List
import re

In [2]:
import torch

# Device string used for tensors created in this notebook.
DEVICE = 'cuda:0'

# Allocate a tiny throwaway tensor on DEVICE; this fails immediately if the GPU is not usable.
a = torch.zeros((1, 1), device=DEVICE)

In [3]:
# Sanity check: number of CUDA devices visible to this process (displayed as the cell output).
torch.cuda.device_count()

1

In [5]:
# !GITHUB_ACTIONS=true pip install auto-gptq

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Identifier of the quantized checkpoint and the weight-file basename.
model_name_or_path = "TheBloke/upstage-llama-30b-instruct-2048-GPTQ"
model_basename = "gptq_model-4bit--1g"

# Flag forwarded to the model loader's `use_triton` argument.
use_triton = False

# Load the fast tokenizer, caching its files in the shared weights directory.
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    use_fast=True,
    cache_dir="/mount/arbeitsdaten/asr-2/vaethdk/resources/weights/",
)


In [6]:
# Load the quantized model from the local weights directory onto DEVICE (defined above).
# `model_basename` and `revision` are intentionally not passed; the loader log printed
# below shows which safetensors file in this directory is picked up.
model = AutoGPTQForCausalLM.from_quantized(
    "/mount/arbeitsdaten/asr-2/vaethdk/resources/weights/TheBloke--upstage-llama-30b-instruct-2048-GPTQ",
    use_safetensors=True,
    trust_remote_code=False,
    device=DEVICE,  # same value as before ('cuda:0'), but no longer duplicated as a literal
    use_triton=use_triton,
    quantize_config=None,
)

CUDA extension not installed.
The safetensors archive passed at /mount/arbeitsdaten/asr-2/vaethdk/resources/weights/TheBloke--upstage-llama-30b-instruct-2048-GPTQ/gptq_model-4bit-32g.safetensors does not contain metadata. Make sure to save your model with the `save_pretrained` method. Defaulting to 'pt' metadata.
skip module injection for FusedLlamaMLPForQuantizedModel not support integrate without triton yet.


### System:
{System}

### User:
{User}

### Assistant:
{Assistant}

In [30]:
# System instruction: how the assistant should turn a fact into FAQ-style questions.
system = """You are a helpful assistant creating a list of FAQ-style questions from given facts.
Only generate questions that can be answered by the given facts, without any external knowledge.
Remove some information, especially nouns and named entities, between generated questions.
Use casual language.
Order the generated paraphrases in a numbered list."""
# User turn: the concrete fact to generate questions for.
user = 'Generate 10 FAQ-style questions from the fact: "In the US, you are entitled to 30$ per day, minus any free meals which you choose to decline."'


# Fill the "### System / ### User / ### Assistant" template; generation continues after "### Assistant:".
prompt = f"""
### System:
{system}

### User:
{user}

### Assistant:"""

set_seed(42)
input_ids = tokenizer(prompt, return_tensors='pt').input_ids.cuda()
# do_sample=True is required for `temperature` (and the seed) to influence the result;
# without it generation is greedy and the temperature setting is ignored.
output = model.generate(inputs=input_ids, do_sample=True, temperature=0.7, max_new_tokens=512)
print(tokenizer.decode(output[0]))


<s> 
### System:
You are a helpful assistant creating a list of FAQ-style questions from given facts.
Only generate questions that can be answered by the given facts, without any external knowledge.
Remove information from the generated questions: countries, cities, named entities and other nouns.
Use casual language.
Order the generated paraphrases in a numbered list.

### User:
Generate 10 FAQ-style questions from the fact: "In the US, you are entitled to 30$ per day, minus any free meals which you choose to decline."

### Assistant:
1. What is the daily allowance for meals in the US?
2. Can you receive more than 30$ per day for meals in the US?
3. Are you allowed to decline free meals in the US?
4. How does declining free meals affect your daily meal allowance in the US?
5. Is there a specific amount you can receive for meals in the US?
6. Can you receive less than 30$ per day for meals in the US?
7. How does the daily meal allowance work in the US?
8. Is there a limit to the number

## Generate Question Synonyms

In [31]:
from data.dataset import GraphDataset
import torch

In [32]:
def generate_prompt(system: str, user: str) -> str:
    """Fill the "### System / ### User / ### Assistant" prompt template.

    The template lines are emitted without any leading indentation, so the result has
    the same layout as a prompt written at column 0 (a leading newline, the three
    section headers at the start of their lines, and a trailing "### Assistant:").

    Args:
        system: Text placed under the "### System:" header.
        user: Text placed under the "### User:" header.

    Returns:
        The assembled prompt, ending with "### Assistant:" so that the model
        continues with the assistant turn.
    """
    # Built from explicit pieces rather than an indented triple-quoted string: the
    # function-body indentation would otherwise become part of the prompt text.
    return (
        "\n"
        "### System:\n"
        f"{system}\n"
        "\n"
        "### User:\n"
        f"{user}\n"
        "\n"
        "### Assistant:"
    )

def generate_output(prompt: str, temperature: float = 0.7, max_new_tokens: int = 512, do_sample: bool = True) -> str:
    """Run the notebook-level `model` on `prompt` and return the decoded text.

    Args:
        prompt: Full prompt text to tokenize and feed to the model.
        temperature: Sampling temperature; only forwarded when `do_sample` is True.
        max_new_tokens: Maximum number of tokens to generate.
        do_sample: Whether to sample. Defaults to True because `temperature` has no
            effect under greedy decoding; pass False for deterministic greedy output.

    Returns:
        The decoded first generated sequence as a string (`tokenizer.decode` output,
        which contains the prompt followed by the continuation).
    """
    input_ids = tokenizer(prompt, return_tensors='pt').input_ids.cuda()
    if do_sample:
        sampling_args = {"do_sample": True, "temperature": temperature}
    else:
        # temperature is meaningless for greedy decoding, so it is not passed along
        sampling_args = {"do_sample": False}
    output = model.generate(inputs=input_ids, max_new_tokens=max_new_tokens, **sampling_args)
    return tokenizer.decode(output[0])


In [33]:
# Load the training graph together with its answers file.
# NOTE(review): the third positional argument (False) presumably disables synonyms —
# the log printed below reports "synonyms: False"; confirm against GraphDataset.
train = GraphDataset('resources/en/train_graph.json', 'resources/en/train_answers.json', False)

- not using synonyms
===== Dataset Statistics =====
- files:  resources/en/train_graph.json resources/en/train_answers.json
- synonyms: False
- depth: 20  - degree: 13
- answers: 73
- questions: 279


In [34]:
# Remove every existing question from the loaded dataset (this is a removal step, not
# just a check): each node's questions are dropped from the dataset-wide list and the
# key index, then the node's own question list is emptied. Mutates `train` in place.
for node in train.node_list:
    for question in node.questions:
        # drop the question from the global list and from the key lookup
        train.question_list.remove(question)
        del train.questions_by_key[question.key]
    # empty the node's own list only after its questions were unregistered above
    node.questions.clear()
# sanity checks: no question may remain in either container
assert len(train.question_list) == 0
assert len(train.questions_by_key) == 0

In [35]:
def parse_output(original_question: str, prompt: str, output: str, num_paraphrases: int) -> List[str]:
    """Extract the numbered list items ("1. ...", "2. ...") from a decoded generation.

    Args:
        original_question: Source text the prompt was built from; only used in the
            diagnostic messages printed when items are missing.
        prompt: The prompt that was sent to the model; it is stripped from `output`.
        output: Decoded model output (prompt followed by the continuation, possibly
            preceded by a start token such as "<s> ").
        num_paraphrases: Highest list number to look for.

    Returns:
        The list items found, in order, each still carrying its "i." prefix, with any
        "</s>" end marker and surrounding whitespace removed. Empty if no "1." exists.
    """
    questions = []

    # Remove the echoed prompt. The decoded text may start with a start-of-sequence
    # token, so locate the prompt instead of assuming it begins at index 0; fall back
    # to plain length-based slicing if it cannot be found verbatim.
    prompt_start = output.find(prompt)
    if prompt_start == -1:
        cleaned = output[len(prompt):]
    else:
        cleaned = output[prompt_start + len(prompt):]

    if "1." not in cleaned:
        print("NO LIST FOR QUESTION", original_question)
        return questions

    for i in range(1, num_paraphrases+1):
        marker = f"{i}."
        if marker not in cleaned:
            print(f" - NO {i}. CANDIDATE FOR QUESTION", original_question)
            continue

        start_idx = cleaned.find(marker) # find i. line
        end_idx = cleaned.find("\n", start_idx) # read until line end
        if i == num_paraphrases and end_idx == -1:
            # last line might not have line break
            end_idx = len(cleaned)
        if start_idx == -1 or end_idx == -1:
            print(f" - INDEX PROBLEM FOR {i}. CANDIDATE: ({start_idx}, {end_idx})")
            continue
        # parse answer; str.replace needs the replacement argument (empty string here)
        questions.append(cleaned[start_idx:end_idx].replace("</s>", "").strip())

        cleaned = cleaned[end_idx:] # remove i. line
    return questions

In [36]:
from data.dataset import NodeType, Question
import time

# System instruction for the question-generation prompt, one sentence per line.
system = "\n".join([
    "You are a helpful assistant creating a list of FAQ-style questions from given facts.",
    "Only generate questions that can be answered by the given facts, without any external knowledge.",
    "Remove some information, especially nouns and named entities, between generated questions.",
    "Use casual language.",
    "Order the generated paraphrases in a numbered list.",
])

def user(answer_text: str, num_paraphrases: int) -> str:
    """Build the user turn asking for `num_paraphrases` questions about `answer_text`.

    The fact is embedded verbatim between double quotes.
    """
    template = 'Generate {} FAQ-style questions from the fact: "{}"'
    return template.format(num_paraphrases, answer_text)

# Generation settings used for every node.
NUM_QUESTIONS = 10
TEMPERATURE = 0.7
MAX_NEW_TOKENS = 1024
# Maps a generated key to {"dialog_node_key", "key", "text"} for each parsed candidate.
generated_data = {}

for node in tqdm(train.node_list):
    # questions are only generated for INFO and QUESTION nodes
    if node.node_type not in (NodeType.INFO, NodeType.QUESTION):
        continue
    prompt = generate_prompt(system=system, user=user(node.text, NUM_QUESTIONS))
    gen = generate_output(prompt=prompt, temperature=TEMPERATURE, max_new_tokens=MAX_NEW_TOKENS)
    candidates = parse_output(original_question=node.text, prompt=prompt, output=gen, num_paraphrases=NUM_QUESTIONS)
    for candidate in candidates:
        # Timestamp-based key. This inner loop is fast enough that two consecutive
        # timestamps can be identical, which would silently overwrite an earlier
        # candidate — so re-sample until the key is unused.
        key = str(time.time()).replace(".", "")
        while key in generated_data:
            key = str(time.time()).replace(".", "")
        generated_data[key] = {
            "dialog_node_key": node.key,
            "key": key,
            "text": candidate,
        }

 15%|█▍        | 18/123 [56:29<4:41:16, 160.72s/it]

 - INDEX PROBLEM FOR 3. CANDIDATE: (1, -1)
 - NO 4. CANDIDATE FOR QUESTION What would you like to know about? Information Before the start of a Business trip An Emergency during your Business trip Information about the Country risk categories
 - NO 5. CANDIDATE FOR QUESTION What would you like to know about? Information Before the start of a Business trip An Emergency during your Business trip Information about the Country risk categories
 - NO 6. CANDIDATE FOR QUESTION What would you like to know about? Information Before the start of a Business trip An Emergency during your Business trip Information about the Country risk categories
 - NO 7. CANDIDATE FOR QUESTION What would you like to know about? Information Before the start of a Business trip An Emergency during your Business trip Information about the Country risk categories
 - NO 8. CANDIDATE FOR QUESTION What would you like to know about? Information Before the start of a Business trip An Emergency during your Business trip Inf

100%|██████████| 123/123 [7:02:31<00:00, 206.11s/it]  


In [44]:
import json

# Post-process the generated candidates: strip the list numbering, attach the source
# node's text and type, and write everything to a JSON file.
cleaned_data = {}
for key, entry in generated_data.items():
    node = train.nodes_by_key[entry['dialog_node_key']]
    # Work on a copy so generated_data keeps the raw text and this cell can be re-run safely.
    cleaned_entry = dict(entry)
    # Remove only the leading enumeration prefix (e.g. "3. "). Replacing every "i."
    # occurrence would also damage numbers inside the question itself (e.g. "1.5").
    cleaned_entry['text'] = re.sub(r"^\s*\d+\.\s*", "", cleaned_entry['text']).strip()
    cleaned_entry["node_text"] = node.text
    cleaned_entry["node_type"] = node.node_type.value
    cleaned_data[key] = cleaned_entry

output_path = "resources/en/generated//train_questions.json"
# Make sure the target directory exists so the results of the long generation run are not lost.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w") as f:
    json.dump(cleaned_data, f)