In [2]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m102.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m72.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m103.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=413a1269b32a6c089f348c055cd8cdd388d50be4c50d9ab4f2a8ac581bbdbccf
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [4]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 17 10:35:03 2023

@author: wuyuan
"""

import os
import json
import random
import re
import string
import tqdm
import argparse
import numpy as np
import pandas as pd
from multiprocessing import Pool
from functools import partial
from rouge_score import rouge_scorer
#from gpt3_api import make_requests as make_gpt3_requests
from accelerate import Accelerator

import os
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

random.seed(42)

# Set some config
# The model that you want to train from the Hugging Face hub
model_name = "NousResearch/Llama-2-7b-chat-hf"

# The instruction dataset to use
dataset_name = "fathyshalab/clinic-work"

# Fine-tuned model name
new_model = "llama-2-7b-miniguanaco"

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X update steps
save_steps = 0

# Log every X update steps
logging_steps = 25

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0
device_map = {"": 0}

# Load tokenizer and model with QLoRA configuration
# Resolve the dtype name stored in bnb_4bit_compute_dtype (a string such as "float16")
# to the attribute of the same name on the torch module.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Quantization settings, built from the bitsandbytes constants defined above and
# handed to from_pretrained() through `quantization_config`.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Load base model
# `device_map` is the {"": 0} mapping defined above (whole model on GPU 0).
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
# NOTE(review): presumably disables the key/value cache because the model was meant to be
# fine-tuned; the visible cells only run generation — confirm this is still wanted.
model.config.use_cache = False
# NOTE(review): presumably selects the non-tensor-parallel code path — confirm.
model.config.pretraining_tp = 1

# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training



Downloading (…)lve/main/config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

In [5]:
# bfloat16 hint: only evaluated when the 4-bit base model is set to compute in float16
if use_4bit and compute_dtype == torch.float16:
    # get_device_capability() yields a (major, minor) pair; only the major part is compared
    capability_major = torch.cuda.get_device_capability()[0]
    if capability_major >= 8:
        banner = "=" * 80
        print(banner)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print(banner)


In [6]:
# Ignore warnings
logging.set_verbosity(logging.CRITICAL)

In [7]:
# Smoke test of the loaded base model: wrap one prompt in the [INST] ... [/INST]
# markers and print the raw pipeline result.
prompt = "How can renew my contract"
# Build a text-generation pipeline around the already loaded `model` and `tokenizer`.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
# The recorded output shows a list holding one dict whose "generated_text" contains
# the prompt followed by the model's completion.
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result)



[{'generated_text': "<s>[INST] How can renew my contract [/INST]  To renew your contract, you will typically need to follow these steps:\n nobody knows the renewal process for your contract, you should contact your employer or HR department for guidance. They can provide you with the necessary forms and information to complete the renewal process.\n\nHere are some general steps that may be involved in renewing a contract:\n\n1. Review the contract: Before you start the renewal process, review your current contract to understand the terms and conditions of your agreement. This will help you identify any changes you want to make to your contract.\n2. Gather required documents: Depending on your employer's requirements, you may need to provide certain documents to renew your contract. These documents may include your ID, proof of address, and any other relevant paperwork.\n3. Complete the renewal form: Your employer may provide you with a renewal form that you"}]


In [1]:
# Same smoke test with a Traditional Chinese prompt.
# NOTE(review): the recorded run of this cell (execution count 1) ended in NameError,
# which suggests it was executed before the cell that defines `pipeline`, `model`
# and `tokenizer`; run the setup cell first.
prompt = "請用繁體中文回答我以下的問題 : KOBE是誰"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result)

NameError: ignored

In [None]:
# Generate instruction

In [53]:

def encode_prompt(prompt_instructions, classification=False):
    """Build a numbered few-shot prompt from a list of instructions.

    Args:
        prompt_instructions: Instruction strings to list in the prompt. Runs of
            whitespace in each are collapsed to one space, the text is stripped,
            and trailing colons are removed.
        classification: When true, use the classification-task header instead
            of the generic one.

    Returns:
        The header, one ``"<k>. <instruction>"`` line per instruction, and a
        final ``"<n+1>."`` marker left open for the model to continue.
    """
    header = (
        "Come up with a series of classification tasks. Try to specify the possible output labels when possible.\n"
        if classification
        else "Come up with a series of tasks:\n"
    )
    numbered_lines = []
    for position, raw_text in enumerate(prompt_instructions, start=1):
        cleaned = re.sub(r"\s+", " ", raw_text).strip().rstrip(":")
        numbered_lines.append(f"{position}. {cleaned}\n")
    open_slot = f"{len(prompt_instructions) + 1}."
    return header + "".join(numbered_lines) + open_slot


def sample_machine_instructions(machine_instructions, similarities, n):
    """Draw up to ``n`` distinct instructions at random from the machine-generated pool.

    Args:
        machine_instructions: Pool to sample from.
        similarities: Accepted for interface compatibility; not used.
        n: Requested sample size; capped at the size of the pool.

    Returns:
        A new list with ``min(n, len(machine_instructions))`` sampled items.
    """
    pool_size = len(machine_instructions)
    draw_count = pool_size if pool_size < n else n
    return random.sample(machine_instructions, draw_count)


def find_word_in_string(w, s):
    """Search ``s`` for ``w`` as a whole word, ignoring case.

    ``w`` is inserted into the regular expression as-is (it is not escaped).

    Returns:
        The match object of the first occurrence, or ``None`` if there is none.
    """
    whole_word_pattern = r'\b({0})\b'.format(w)
    return re.search(whole_word_pattern, s, flags=re.IGNORECASE)


def post_process_gpt3_response(response):
    """Extract candidate instructions from one text-generation result.

    Args:
        response: Result of one generation call, indexed as
            ``response[0]["generated_text"]``. ``None`` or an empty
            sequence yields an empty list.

    Returns:
        A list of cleaned instruction strings, one per line of the generated
        text that survives the filters below. Each kept line has its
        whitespace collapsed and is passed through ``str.capitalize()``.
    """
    if not response:
        return []
    # Read the text from the argument itself; the function must not depend on
    # a notebook-level variable that happens to hold the latest result.
    raw_instructions = response[0]["generated_text"].split("\n")

    # Keywords that point at tasks a text-only language model cannot perform.
    # Compiled once per call into a single whole-word, case-insensitive pattern.
    unsuitable_keywords = [
        "image", "images", "graph", "graphs", "picture", "pictures",
        "file", "files", "map", "maps", "draw", "plot", "go to",
    ]
    unsuitable_pattern = re.compile(
        r"\b(?:{0})\b".format("|".join(unsuitable_keywords)), flags=re.IGNORECASE
    )

    instructions = []
    for inst in raw_instructions:
        inst = re.sub(r"\s+", " ", inst).strip()
        inst = inst.capitalize()
        if inst == "":
            continue
        # filter out too short or too long instructions
        word_count = len(inst.split())
        if word_count <= 3 or word_count > 150:
            continue
        # filter based on keywords that are not suitable for language models.
        if unsuitable_pattern.search(inst):
            continue
        # The model tends to add "write a program" to existing instructions, which
        # produces many such instructions, and it is confusing whether the model
        # should write a program or output the result directly, so drop them.
        # Note this is not a comprehensive filter for all programming instructions.
        if inst.startswith("Write a program"):
            continue
        # filter those starting with punctuation
        if inst[0] in string.punctuation:
            continue
        # filter those starting with a non-ASCII character
        if not inst[0].isascii():
            continue
        instructions.append(inst)
    return instructions


def parse_args():
    """Parse the options of the instruction-generation stage.

    Returns:
        argparse.Namespace with ``batch_dir``, ``seed_tasks_path``,
        ``num_instructions_to_generate``, ``use_clf_seed_tasks_only``,
        ``engine``, ``num_prompt_instructions``, ``request_batch_size``,
        ``api_key`` and ``organization`` (plus ``f``, see below).

    The API key is never hard-coded: when ``--api_key`` is not given it is
    taken from the ``OPENAI_API_KEY`` environment variable, and is ``None``
    if that variable is unset.
    """
    parser = argparse.ArgumentParser()
    # Accept and ignore a `-f <value>` option — presumably the connection-file
    # argument a Jupyter kernel is started with; confirm.
    parser.add_argument('-f')
    parser.add_argument(
        "--batch_dir",
        type=str,
        #required=True,
        default=r"/content/drive/MyDrive/llama_generations",
        help="The directory where the batch is stored.",
    )
    parser.add_argument(
        "--seed_tasks_path",
        type=str,
        #required=True,
        default=r"/content/drive/MyDrive/seed_tasks.jsonl",
        help="The path to the human written data.",
    )
    parser.add_argument(
        "--num_instructions_to_generate",
        type=int,
        default=100,
        help="The number of instructions to generate.",
    )
    parser.add_argument(
        "--use_clf_seed_tasks_only",
        action="store_true",
        help="If specified, we will only use the classification seed tasks to prompt new instructions. This will lead to more classification instructions.",
    )
    parser.add_argument(
        "--engine",
        type=str,
        default="davinci",
        help="The engine to use."
    )
    parser.add_argument(
        "--num_prompt_instructions",
        type=int,
        default=8,
        help="The number of instructions to use in the prompt."
    )
    parser.add_argument(
        "--request_batch_size",
        type=int,
        default=5,
        help="The number of requests to send to GPT3 at a time."
    )
    parser.add_argument(
        "--api_key",
        # Secrets must not live in the notebook; fall back to the environment.
        default=os.environ.get("OPENAI_API_KEY"),
        type=str,
        help="The API key to use. If not specified, the key will be read from the environment variable OPENAI_API_KEY."
    )
    parser.add_argument(
        "--organization",
        type=str,
        help="The organization to use. If not specified, the default organization id will be used."
    )
    return parser.parse_args()


if __name__ == "__main__":
    # Instruction-generation driver.
    #   1. Load the human-written seed tasks and any instructions generated by
    #      an earlier run (the output file is opened in append mode, so a run
    #      resumes where the previous one stopped).
    #   2. Until enough instructions exist, build `request_batch_size` prompts
    #      from sampled instructions, run them through the local
    #      text-generation pipeline and post-process the generated text.
    #   3. Keep a candidate only if its ROUGE-L F-measure against every known
    #      instruction is at most 0.7, and append it to the JSONL output.
    args = parse_args()
    # Read the seed tasks inside a context manager so the handle is closed.
    with open(args.seed_tasks_path, "r") as seed_file:
        seed_tasks = [json.loads(l) for l in seed_file]
    if args.use_clf_seed_tasks_only:
        seed_tasks = [t for t in seed_tasks if t["is_classification"]]
    seed_instructions = [t["instruction"] for t in seed_tasks]
    print(f"Loaded {len(seed_instructions)} human-written seed instructions")

    os.makedirs(args.batch_dir, exist_ok=True)
    instructions_path = os.path.join(args.batch_dir, "machine_generated_instructions.jsonl")
    request_idx = 0
    # load the LM-generated instructions
    machine_instructions = []
    if os.path.exists(instructions_path):
        with open(instructions_path, "r") as fin:
            for line in fin:
                instruction_info = json.loads(line)
                machine_instructions.append(instruction_info["instruction"])
                # continue numbering after the last stored request
                request_idx = instruction_info["request_idx"] + 1
        print(f"Loaded {len(machine_instructions)} machine-generated instructions")

    # similarities = {}
    scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=False)

    # now let's generate new instructions!
    progress_bar = tqdm.tqdm(total=args.num_instructions_to_generate)
    if machine_instructions:
        progress_bar.update(len(machine_instructions))

    # The pipeline wraps the same model and tokenizer for every batch, so it
    # is built once instead of once per loop iteration.
    pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)

    with open(instructions_path, "a") as fout:
        while len(machine_instructions) < args.num_instructions_to_generate:
            batch_inputs = []
            for _ in range(args.request_batch_size):
                # sample machine instructions from the pool
                prompt_instructions = sample_machine_instructions(
                    machine_instructions,
                    similarities=None,
                    n=2)
                # fill the rest of the prompt with human instructions
                prompt_instructions += random.sample(seed_instructions, args.num_prompt_instructions - len(prompt_instructions))
                random.shuffle(prompt_instructions)
                prompt = encode_prompt(prompt_instructions, classification=args.use_clf_seed_tasks_only)
                batch_inputs.append(prompt)
            results = []
            for prompt in batch_inputs:
                result = pipe(f"<s>[INST] {prompt} [/INST]")
                results.append(result)
            instructions = []
            all_metadata = []
            for result in results:
                new_instructions = post_process_gpt3_response(result)
                instructions += new_instructions
                # every instruction keeps the raw generation it came from
                all_metadata += [result] * len(new_instructions)

            for inst, metadata in zip(instructions, all_metadata):
                # Snapshot of everything known so far; used both for scoring
                # and for naming the most similar instructions.
                all_instructions = seed_instructions + machine_instructions
                with Pool(4) as p:
                    rouge_scores = p.map(partial(scorer.score, inst), all_instructions)
                rouge_scores = [score["rougeL"].fmeasure for score in rouge_scores]
                # discard candidates that are too close to a known instruction
                if max(rouge_scores) > 0.7:
                    continue
                # the ten highest-scoring known instructions, best first
                most_similar_instructions = {
                        all_instructions[i] : rouge_scores[i] for i in np.argsort(rouge_scores)[-10:][::-1]
                    }
                machine_instructions.append(inst)
                fout.write(json.dumps({
                    "instruction": inst,
                    "most_similar": most_similar_instructions,
                    "avg_similarity_score": float(np.mean(rouge_scores)),
                    "metadata": metadata,
                    "request_idx": request_idx
                }) + "\n")
                progress_bar.update(1)
            request_idx += 1
    progress_bar.close()


Loaded 175 human-written seed instructions
Loaded 0 machine-generated instructions



100%|██████████| 100/100 [49:15<00:00, 29.55s/it]


KeyboardInterrupt: ignored

In [None]:
#identify_cif_or_not

In [57]:
# Prompt template for the "is this a classification task?" stage.
# NOTE(review): the recorded run of this cell ended in ModuleNotFoundError, so the
# `template` package was not importable in that environment; it has to be made
# available before this cell and the classification cell can run.
from template.clf_task_template import template_1
# Registry of prompt prefixes keyed by template name; the classification cell
# looks an entry up with `templates[args.template]`.
templates = {
    "template_1": template_1
}

ModuleNotFoundError: ignored

In [None]:
def parse_args():
    """Parse the options of the classification-labelling stage.

    Returns:
        argparse.Namespace with ``batch_dir``, ``num_instructions``,
        ``template``, ``engine``, ``request_batch_size``, ``api_key`` and
        ``organization`` (plus ``f``, see below).

    The API key is never hard-coded: when ``--api_key`` is not given it is
    taken from the ``OPENAI_API_KEY`` environment variable, and is ``None``
    if that variable is unset.
    """
    parser = argparse.ArgumentParser()
    # Accept and ignore a `-f <value>` option, as the instruction-generation
    # parser in this notebook does — presumably the connection-file argument
    # a Jupyter kernel is started with; confirm.
    parser.add_argument('-f')
    # parser.add_argument("--template", type=str, default="template_1", help="Which template to use.")
    parser.add_argument(
        "--batch_dir",
        type=str,
        #required=True,
        # NOTE(review): machine-specific absolute path; pass --batch_dir when
        # running anywhere else.
        default=r"C:\Users\User\iCloudDrive\實習資料\EZtable\ALpaca\self-instruct\data\gpt3_generations",
        help="The directory where the batch is stored.",
    )
    parser.add_argument(
        "--num_instructions",
        type=int,
        help="if specified, only generate instance input for this many instructions",
    )
    parser.add_argument(
        "--template",
        type=str,
        default="template_1",
        help="Which template to use. Currently only `template_1` is supported.",
    )
    parser.add_argument(
        "--engine",
        type=str,
        default="davinci",
        help="The engine to use."
    )
    parser.add_argument(
        "--request_batch_size",
        type=int,
        default=5,
        help="The number of requests to send in a batch."
    )
    parser.add_argument(
        "--api_key",
        # Secrets must not live in the notebook; fall back to the environment.
        default=os.environ.get("OPENAI_API_KEY"),
        type=str,
        help="The API key to use. If not specified, the key will be read from the environment variable `OPENAI_API_KEY`."
    )
    parser.add_argument(
        "--organization",
        type=str,
        help="The organization to use. If not specified, the default organization id will be used."
    )
    return parser.parse_args()

In [None]:
# Classification-labelling stage: for every machine-generated instruction, ask
# the completion backend "Is it classification?" and store the answer as JSONL.
# Instructions already present in a previous output file are reused when the
# whole batch is cached.
# NOTE(review): `make_gpt3_requests` is called below, but its import is
# commented out at the top of the visible file — it must be provided before
# uncached batches can be processed.
args = parse_args()

with open(os.path.join(args.batch_dir, "machine_generated_instructions.jsonl")) as fin:
    lines = fin.readlines()
    if args.num_instructions is not None:
        lines = lines[:args.num_instructions]

output_path = os.path.join(args.batch_dir, f"is_clf_or_not_{args.engine}_{args.template}.jsonl")
existing_requests = {}
if os.path.exists(output_path):
    with open(output_path) as fin:
        for line in tqdm.tqdm(fin):
            try:
                data = json.loads(line)
                existing_requests[data["instruction"]] = data
            except (ValueError, KeyError, TypeError):
                # Best effort: skip lines that are not valid records, without
                # also swallowing KeyboardInterrupt or unrelated errors.
                continue
    print(f"Loaded {len(existing_requests)} existing requests")

# Fields written for every record, in this order (plain dicts keep insertion order).
output_keys = ["instruction", "is_classification"]

progress_bar = tqdm.tqdm(total=len(lines))
with open(output_path, "w") as fout:
    for batch_idx in range(0, len(lines), args.request_batch_size):
        batch = [json.loads(line) for line in lines[batch_idx: batch_idx + args.request_batch_size]]
        if all(d["instruction"] in existing_requests for d in batch):
            # whole batch already answered: copy the cached records
            for d in batch:
                cached = existing_requests[d["instruction"]]
                record = {k: cached[k] for k in output_keys}
                fout.write(json.dumps(record, ensure_ascii=False) + "\n")
        else:
            # prefix = compose_prompt_prefix(human_written_tasks, batch[0]["instruction"], 8, 2)
            prefix = templates[args.template]
            prompts = [prefix + " " + d["instruction"].strip() + "\n" + "Is it classification?" for d in batch]
            results = make_gpt3_requests(
                engine=args.engine,
                prompts=prompts,
                max_tokens=3,
                temperature=0,
                top_p=0,
                frequency_penalty=0,
                presence_penalty=0,
                stop_sequences=["\n", "Task"],
                logprobs=1,
                n=1,
                best_of=1,
                api_key=args.api_key,
                organization=args.organization)
            for i, task in enumerate(batch):
                response = results[i]["response"]
                # a missing response is stored as an empty answer
                answer = response["choices"][0]["text"] if response is not None else ""
                record = {
                    "instruction": task["instruction"],
                    "is_classification": answer,
                }
                fout.write(json.dumps(record, ensure_ascii=False) + "\n")
        progress_bar.update(len(batch))

In [39]:
#Generate instances

In [None]:
def parse_args():
    """Parse the options of the instance-generation stage.

    Returns:
        argparse.Namespace with ``batch_dir``, ``input_file``,
        ``output_file``, ``num_instructions``, ``max_instances_to_generate``,
        ``generation_tasks_only``, ``classification_tasks_only``, ``engine``,
        ``request_batch_size``, ``api_key`` and ``organization`` (plus ``f``,
        see below).

    The API key is never hard-coded: when ``--api_key`` is not given it is
    taken from the ``OPENAI_API_KEY`` environment variable, and is ``None``
    if that variable is unset.
    """
    parser = argparse.ArgumentParser()
    # Accept and ignore a `-f <value>` option, as the instruction-generation
    # parser in this notebook does — presumably the connection-file argument
    # a Jupyter kernel is started with; confirm.
    parser.add_argument('-f')
    parser.add_argument(
        "--batch_dir",
        type=str,
        #required=True,
        # NOTE(review): machine-specific absolute path; pass --batch_dir when
        # running anywhere else.
        default=r"C:\Users\User\iCloudDrive\實習資料\EZtable\ALpaca\self-instruct\data\gpt3_generations",
        help="The directory where the batch is stored.",
    )
    parser.add_argument(
        "--input_file",
        type=str,
        default="machine_generated_instructions.jsonl"
    )
    parser.add_argument(
        "--output_file",
        type=str,
        default="machine_generated_instances.jsonl",
    )
    parser.add_argument(
        "--num_instructions",
        type=int,
        help="if specified, only generate instance input for this many instructions",
    )
    parser.add_argument(
        "--max_instances_to_generate",
        type=int,
        default=5,
        help="The max number of instances to generate for each instruction.",
    )
    parser.add_argument(
        "--generation_tasks_only",
        action="store_true",
        help="If specified, only do for generation tasks.",
    )
    parser.add_argument(
        "--classification_tasks_only",
        action="store_true",
        help="If specified, only do for classification tasks.",
    )
    parser.add_argument(
        "--engine",
        type=str,
        default="davinci",
        help="The engine to use."
    )
    parser.add_argument(
        "--request_batch_size",
        type=int,
        default=5,
        help="The number of requests to send in a batch."
    )
    parser.add_argument(
        "--api_key",
        # Secrets must not live in the notebook; fall back to the environment.
        default=os.environ.get("OPENAI_API_KEY"),
        type=str,
        help="The API key to use. If not specified, the key will be read from the environment variable OPENAI_API_KEY."
    )
    parser.add_argument(
        "--organization",
        type=str,
        help="The organization to use. If not specified, the default organization id will be used."
    )
    return parser.parse_args()



In [None]:
# Instance-generation stage: for every instruction, prompt the completion
# backend for example instances and store the raw text as JSONL. Instructions
# already present in a previous output file are reused when the whole batch is
# cached.
# NOTE(review): `make_gpt3_requests`, `output_first_template_for_clf` and
# `input_first_template_for_gen` are used below but are not defined or imported
# in the visible file — they must be provided before uncached batches can be
# processed.
args = parse_args()

with open(os.path.join(args.batch_dir, args.input_file)) as fin:
    lines = fin.readlines()
    if args.num_instructions is not None:
        lines = lines[:args.num_instructions]
    tasks = []
    for line in lines:
        data = json.loads(line)
        # keep the generation metadata under a stage-specific key
        if "metadata" in data:
            data["instruction_metadata"] = data["metadata"]
            del data["metadata"]
        tasks.append(data)

# instruction text -> True when the labelling stage answered "yes"
task_clf_types = {}
with open(os.path.join(args.batch_dir, "is_clf_or_not_davinci_template_1.jsonl")) as fin:
    for line in fin:
        data = json.loads(line)
        task_clf_types[data["instruction"]] = data["is_classification"].strip() in ["Yes", "yes", "YES"]

if args.classification_tasks_only:
    tasks = [task for task in tasks if task_clf_types[task["instruction"]]]

if args.generation_tasks_only:
    tasks = [task for task in tasks if not task_clf_types[task["instruction"]]]

output_path = os.path.join(args.batch_dir, args.output_file)
existing_requests = {}
if os.path.exists(output_path):
    with open(output_path) as fin:
        for line in tqdm.tqdm(fin):
            try:
                data = json.loads(line)
                existing_requests[data["instruction"]] = data
            except (ValueError, KeyError, TypeError):
                # Best effort: skip lines that are not valid records, without
                # also swallowing KeyboardInterrupt or unrelated errors.
                continue
    print(f"Loaded {len(existing_requests)} existing requests")

# Fields written for every record, in this order (plain dicts keep insertion order).
output_keys = [
    "instruction", "raw_instances", "instance_metadata", "instruction_metadata",
    "most_similar", "avg_similarity_score",
]

progress_bar = tqdm.tqdm(total=len(tasks))
with open(output_path, "w") as fout:
    for batch_idx in range(0, len(tasks), args.request_batch_size):
        batch = tasks[batch_idx: batch_idx + args.request_batch_size]
        if all(d["instruction"] in existing_requests for d in batch):
            # whole batch already answered: copy the cached records
            for d in batch:
                cached = existing_requests[d["instruction"]]
                record = {k: cached[k] for k in output_keys}
                fout.write(json.dumps(record, ensure_ascii=False) + "\n")
        else:
            prompts = []
            for task in batch:
                # classification tasks and generation tasks use different templates
                if task_clf_types[task["instruction"]]:
                    template = output_first_template_for_clf
                else:
                    template = input_first_template_for_gen
                prompts.append(template + " " + task["instruction"].strip() + "\n")
            results = make_gpt3_requests(
                engine=args.engine,
                prompts=prompts,
                # because the clf template is longer, we need to decrease the max_tokens
                max_tokens=300 if any(task_clf_types[task["instruction"]] for task in batch) else 350,
                temperature=0,
                top_p=0,
                frequency_penalty=0,
                presence_penalty=1.5,
                stop_sequences=[f"Example {args.max_instances_to_generate + 1}", "Task:"],
                logprobs=1,
                n=1,
                best_of=1,
                api_key=args.api_key,
                organization=args.organization)
            for i in range(len(batch)):
                data = batch[i]
                data["instance_metadata"] = results[i]
                response = results[i]["response"]
                # a missing response is stored as empty instance text
                data["raw_instances"] = response["choices"][0]["text"] if response is not None else ""
                record = {k: data[k] for k in output_keys}
                fout.write(json.dumps(record, ensure_ascii=False) + "\n")
        progress_bar.update(len(batch))
