In [8]:
import json
import os
import re

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, GemmaTokenizer

# Report the CUDA environment. The per-device queries are only made when CUDA
# is actually available, so this cell reports "False" on a CPU-only machine
# instead of failing inside the device lookups.
cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    print(torch.cuda.current_device())
    print(torch.cuda.get_device_name(0))
print(torch.cuda.device_count())

True
0
NVIDIA GeForce GTX 1070
1


In [3]:
# Model checkpoint to load from the Hugging Face Hub.
model_id = "instructlab/merlinite-7b-lab"

# 4-bit NF4 quantization with bfloat16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Never embed an access token in the notebook: read it from the environment.
# When HF_TOKEN is unset this is None and no explicit token is passed.
hf_token = os.environ.get("HF_TOKEN")

tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map={"": 0},  # place the whole model on device 0
    token=hf_token,
)

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 3/3 [00:08<00:00,  2.85s/it]


In [15]:
def generate_batch(json_template, max_new_tokens=350):
    """Ask the model for five rephrased variations of a question/answer record.

    Args:
        json_template: The seed record to rephrase. A string is inserted into
            the prompt as-is; any other value (e.g. a dict with "question" and
            "answer" keys) is serialized with ``json.dumps`` so the model is
            shown valid JSON rather than a Python repr with single quotes.
        max_new_tokens: Upper bound on the number of tokens to generate.
            Defaults to 350.

    Returns:
        str: The first sequence returned by ``model.generate``, decoded with
        special tokens skipped.
    """
    sys_prompt = "You are an AI language model developed by IBM Research. You are a cautious assistant. You carefully follow instructions. You are helpful and harmless and you follow ethical guidelines and promote positive behavior."

    if isinstance(json_template, str):
        template_text = json_template
    else:
        template_text = json.dumps(json_template)

    # Each role marker is followed by a newline, and the prompt ends with the
    # assistant marker so the model continues as the assistant.
    prompt = (
        f'<|system|>\n{sys_prompt}\n'
        f'<|user|>\nGenerate a JSON template array with five variation of {template_text} as different questions and answers with the same information but rephrased in the same json template but all on the same line\n'
        f'<|assistant|>\n'
    )

    # Put the inputs on whatever device the model was loaded onto.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return output_text

In [13]:
def parse_json(batch):
    """Extract the first valid JSON array embedded in a block of text.

    Every "[" in ``batch`` is tried in order as a possible start of a JSON
    array. Decoding is done by the JSON decoder itself rather than by matching
    up to the next "]", so nested arrays and brackets inside string values are
    handled, and bracketed text that is not JSON is skipped.

    Args:
        batch: Text that may contain a JSON array somewhere inside it.

    Returns:
        The decoded list for the first "[" position that decodes as JSON, or
        the string "Something went wrong!" when no position does.
    """
    decoder = json.JSONDecoder()

    start = batch.find("[")
    while start != -1:
        try:
            # raw_decode ignores whatever follows the decoded value.
            parsed_json, _end = decoder.raw_decode(batch, start)
        except json.JSONDecodeError:
            # Not JSON at this bracket; move on to the next candidate.
            start = batch.find("[", start + 1)
            continue
        return parsed_json

    return "Something went wrong!"

In [17]:
# Seed question/answer pair from which the synthetic variations are generated
question = "What is a workbench in OpenShift AI?"
answer = (
    "A workbench represents an environment instance for development and "
    "experimentation in Red Hat OpenShift AI, allowing you to select a "
    "notebook image for your data science activities."
)

# Record handed to the generator as the first template
json_template = dict(question=question, answer=answer)

In [18]:
%%time
# Start from the hand-written record; each round appends the parsed batch.
variations = [json_template]

for i in range(10):
    # Generate new batch based on the last result in the array
    batch = generate_batch(variations[-1])

    # Extract and parse json. A decode error ends the loop instead of aborting
    # the cell, so everything collected so far is still written out below.
    try:
        parsed_batch = parse_json(batch)
    except json.JSONDecodeError as err:
        print(f"Could not decode model output as JSON: {err}")
        break

    if parsed_batch == "Something went wrong!":
        print(parsed_batch)
        break

    # Add newly generated batch to the array
    variations.extend(parsed_batch)

# Persist whatever was collected, including after an early stop.
output_file = "parsed_json_array.json"
with open(output_file, "w", encoding="utf-8") as outfile:
    json.dump(variations, outfile, indent=2)

CPU times: user 6min 56s, sys: 1min 11s, total: 8min 7s
Wall time: 8min 7s
