In [None]:
import os

from openai import OpenAI

# Read the key from the OPENAI_API_KEY environment variable so the real secret
# never has to be typed into (or committed with) the notebook. The placeholder
# is kept as the fallback when the variable is not set.
client = OpenAI(
    api_key=os.environ.get('OPENAI_API_KEY', 'INSERT API KEY HERE'),
)

# Content of the single user message sent below; fill in before running.
text = ""

# One chat completion request with `text` as the only message.
completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": text,
        }
    ],
    max_tokens=1024,
    model="gpt-3.5-turbo",
)

In [None]:
# Display the text of the first returned choice with all newline characters removed.
completion.choices[0].message.content.replace('\n', '')

In [None]:
import pickle

# Load the pickled dataset into `data`.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so only
# run this on a file that comes from a trusted source.
with open('data_with_imgs.pkl', 'rb') as file:
    data = pickle.load(file)

In [None]:
# Print the entry at index 9 of the loaded data to inspect one record.
print(data[9])

In [None]:
import openai
import random
import time

def retry_with_exponential_backoff(
    func,
    initial_delay: float = 1,
    exponential_base: float = 2,
    jitter: bool = True,
    max_retries: int = 10,
    errors: tuple = (openai.RateLimitError,),
):
    """Retry a function with exponential backoff.

    Args:
        func: Callable to wrap.
        initial_delay: Starting value of the delay, in seconds. The delay is
            multiplied before the first sleep, so the first wait is already
            longer than this value.
        exponential_base: Factor the delay is multiplied by after each failure.
        jitter: When true, the factor is additionally scaled by a random value
            in [1, 2) so retries do not all fire at the same moment.
        max_retries: Number of retries allowed before giving up.
        errors: Exception types that trigger a retry.

    Returns:
        A wrapper that calls ``func`` with the same arguments and returns its
        result, retrying on the exception types listed in ``errors``.

    Raises:
        Exception: When ``func`` has failed with one of ``errors`` more than
            ``max_retries`` times. The last caught error is chained as the
            cause so the original failure stays visible in the traceback.
        Any exception not listed in ``errors`` propagates unchanged.
    """
    import functools  # local import so the cell does not depend on another import cell

    @functools.wraps(func)  # keep the wrapped function's name and docstring
    def wrapper(*args, **kwargs):
        num_retries = 0
        delay = initial_delay

        # Loop until a call succeeds, the retry budget is spent, or an
        # exception outside `errors` escapes.
        while True:
            try:
                return func(*args, **kwargs)

            except errors as e:
                num_retries += 1

                if num_retries > max_retries:
                    raise Exception(
                        f"Maximum number of retries ({max_retries}) exceeded."
                    ) from e

                # Grow the delay (optionally with jitter), then wait.
                delay *= exponential_base * (1 + jitter * random.random())
                time.sleep(delay)

    return wrapper

In [None]:
@retry_with_exponential_backoff
def perform_openai_call(prompt):
    """Send ``prompt`` as a single user message and return the reply on one line.

    Args:
        prompt: Text used as the content of the user message.

    Returns:
        The content of the first returned choice with every newline character
        removed.
    """
    completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                # Send the argument, not the notebook-level `text` variable.
                "content": prompt,
            }
        ],
        max_tokens=1024,
        model="gpt-3.5-turbo",
    )
    return completion.choices[0].message.content.replace('\n', '')

def create_prompt(a, b, typ):
    """Build the prompt asking the model to merge two report texts.

    Args:
        a: First text to merge.
        b: Second text to merge.
        typ: Label interpolated into the prompt for the kind of text being
            merged (this notebook passes 'impression' or 'finding').

    Returns:
        One string made of four newline-terminated segments: the task
        description, the two labelled input texts, and the output rules.
    """
    task_description = f'I am running an experiment when I am making data augmentation on pairs of radiology images and corresponding findings. I am merging the images and I need you to merge the text ({typ} 1 and {typ} 2). The merged text will describe this merged image and needs to include all the information from both texts but include the most specific fact if there is ambiguity. For example, if one text says no abnormalities, but another describes abnormalities, then describe the abnormalities in the merged text.\n'
    first_input = f'{typ} 1: {a}\n'
    second_input = f'{typ} 2: {b}\n'
    output_rules = f'DO THIS TASK CAREFULLY WITHOUT FORGETTING ANY DETAILS. KEEP THE LENGTH SIMILAR TO THE LONGEST {typ}s. NOW BELOW THIS TEXT, OUTPUT THE MERGED FINDINGS IN THE SAME FORMAT (MATCHING CASING IF NEEDED) WITHOUT ADDING NEW LINES, WITHOUT A PREFIX LIKE \'MERGED {typ}s:\', AND WITHOUT OTHER SPECIAL FORMATTING:\n'
    return ''.join((task_description, first_input, second_input, output_rules))

def process_item(d):
    """Fill in the merged 'impression' and 'findings' entries of ``d`` in place.

    For each section, when both the ``*_a`` and ``*_b`` texts are truthy they
    are merged through ``perform_openai_call``; otherwise the merged entry is
    ``a or b`` (the first truthy text, or the ``*_b`` value when both are falsy).

    Args:
        d: Mapping with the keys 'impression_a', 'impression_b', 'findings_a'
            and 'findings_b'.

    Returns:
        The same mapping ``d``, with 'impression' and 'findings' set.
    """
    # (key of text a, key of text b, key of merged text, label used in the prompt)
    sections = (
        ('impression_a', 'impression_b', 'impression', 'impression'),
        ('findings_a', 'findings_b', 'findings', 'finding'),
    )
    for key_a, key_b, merged_key, label in sections:
        text_a, text_b = d[key_a], d[key_b]
        if not (text_a and text_b):
            # Nothing to merge: keep whichever side is present.
            d[merged_key] = text_a or text_b
            continue
        d[merged_key] = perform_openai_call(create_prompt(text_a, text_b, label))
    return d

In [None]:
# Prompts collected by process_item_fake, in the order they were generated.
prompts = []

def process_item_fake(d):
    """Dry-run counterpart of the merge step: collect prompts instead of calling the API.

    When both texts of a section are truthy, the prompt for that section is
    appended to the module-level ``prompts`` list and the merged entry of ``d``
    is set to ``None``. Otherwise the merged entry is ``a or b``.

    Args:
        d: Mapping with the keys 'impression_a', 'impression_b', 'findings_a'
            and 'findings_b'.

    Returns:
        The same mapping ``d``, with 'impression' and 'findings' set.
    """
    # Impression
    impression_a, impression_b = d['impression_a'], d['impression_b']
    if impression_a and impression_b:
        prompts.append(create_prompt(impression_a, impression_b, 'impression'))
        d['impression'] = None  # placeholder: the dry run produces no merged text
    else:
        d['impression'] = impression_a or impression_b

    # Finding
    findings_a, findings_b = d['findings_a'], d['findings_b']
    if findings_a and findings_b:
        prompts.append(create_prompt(findings_a, findings_b, 'finding'))
        d['findings'] = None  # placeholder: the dry run produces no merged text
    else:
        d['findings'] = findings_a or findings_b

    return d

In [None]:
from tqdm.notebook import tqdm
# Dry run over every item: process_item_fake appends the prompts it builds to
# `prompts` and rewrites each item's 'impression'/'findings' entries in place.
for item in tqdm(data):
    process_item_fake(item)

In [None]:
# Display the first collected prompt as a spot check.
prompts[0]

In [None]:
import json

# Serialize one chat-completion request per prompt as a JSON string.
# Example of a request line:
#{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "gpt-3.5-turbo-0125", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
batch_jsons = []
for idx, prompt in enumerate(prompts):
    request = {
        'custom_id': str(idx),  # position of the prompt in `prompts`
        'method': 'POST',
        'url': '/v1/chat/completions',
        'body': {
            'model': 'gpt-3.5-turbo',
            'messages': [{"role": "user", "content": prompt}],
            'max_tokens': 1024,
        },
    }
    batch_jsons.append(json.dumps(request))

In [None]:
# Display the second serialized request line as a spot check.
batch_jsons[1]

In [None]:
# Write the request lines to openai_batch1.jsonl, openai_batch2.jsonl, ...
# with at most `batch_size` lines per file.
batch_size = 10000
total_lines = len(batch_jsons)
number_of_files = -(-total_lines // batch_size)  # ceiling division

for i in range(number_of_files):
    start_index = i * batch_size
    end_index = min(start_index + batch_size, total_lines)
    file_name = f'openai_batch{i+1}.jsonl'
    with open(file_name, 'w') as file:
        # One request per line.
        file.writelines(f'{line}\n' for line in batch_jsons[start_index:end_index])

In [None]:
# Sanity check of the counts. The third value adds the lengths of the two slices
# of batch_jsons split at index 50000, so it always equals len(batch_jsons).
len(prompts), len(batch_jsons), len(batch_jsons[:50000]) + len(batch_jsons[50000:])

In [None]:
# Upload every openai_batch{i}.jsonl file and collect the ids of the uploaded files.
batch_file_ids = []
for i in range(1, number_of_files + 1):
    # Open through a context manager so the file handle is closed once the
    # upload call has returned (the original left every handle open).
    with open(f"openai_batch{i}.jsonl", "rb") as batch_fh:
        batch_input_file = client.files.create(
            file=batch_fh,
            purpose="batch"
        )
    batch_file_ids.append(batch_input_file.id)
batch_file_ids

In [None]:
# !!!!
# GO TO THE OPENAI DASHBOARD TO SUBMIT THESE BATCH REQUESTS. They will be processed within 24 hours. Then save the responses as files batch1.jsonl through batch10.jsonl.
# !!!!

In [None]:
from tqdm.notebook import tqdm
import json
# Map each request's integer custom_id to the text of its first completion choice,
# read from the saved batch output files batch1.jsonl ... batch10.jsonl.
responses = {}

for i in tqdm(range(1, 10+1)):
    # Decode as UTF-8 explicitly rather than with the platform default encoding,
    # so non-ASCII characters in the replies are read correctly everywhere.
    with open(f'batch{i}.jsonl', 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                continue  # skip blank lines instead of failing in json.loads
            d = json.loads(line)
            responses[int(d['custom_id'])] = d['response']['body']['choices'][0]['message']['content']

# Spot check: first response and the number of responses loaded.
responses[0], len(responses)

In [None]:
# Index of the next entry of `responses` to consume; advanced by process_item_offline.
call_idx = 0

def process_item_offline(d):
    """Fill in the merged entries of ``d`` from the preloaded ``responses``.

    For each section (impression first, then findings), when both the ``*_a``
    and ``*_b`` texts are truthy the merged entry is ``responses[call_idx]`` and
    the module-level ``call_idx`` is advanced by one. Otherwise the merged entry
    is ``a or b`` and the counter is left untouched.

    Args:
        d: Mapping with the keys 'impression_a', 'impression_b', 'findings_a'
            and 'findings_b'.

    Returns:
        The same mapping ``d``, with 'impression' and 'findings' set.
    """
    global call_idx
    for key_a, key_b, merged_key in (
        ('impression_a', 'impression_b', 'impression'),
        ('findings_a', 'findings_b', 'findings'),
    ):
        if d[key_a] and d[key_b]:
            d[merged_key] = responses[call_idx]
            call_idx += 1
        else:
            d[merged_key] = d[key_a] or d[key_b]
    return d

In [None]:
# Apply the loaded responses to every item, in order. process_item_offline
# advances the global `call_idx` as it consumes responses, and `call_idx` is only
# reset where it is defined, so re-running this loop alone continues from the
# old counter value.
for item in tqdm(data):
    process_item_offline(item)

In [None]:
# Display the number of items in `data`.
len(data)

In [None]:
from torch.utils.data import Dataset

class MixgenDataset(Dataset):
    """Dataset that exposes an in-memory, indexable collection as-is."""

    def __init__(self, input_data):
        """Store ``input_data``; it must support ``len()`` and indexing."""
        self.data = input_data

    def __len__(self) -> int:
        """Return the number of stored items."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the stored item at position ``idx`` without any transformation."""
        return self.data[idx]

In [None]:
import pickle
# Split `data` into four consecutive slices; the fourth slice also takes the
# items left over by the integer division.
partition_size = len(data) // 4

data1 = data[:partition_size]
data2 = data[partition_size:2*partition_size]
data3 = data[2*partition_size:3*partition_size]
data4 = data[3*partition_size:]

# Pickle each slice into its own file.
for pkl_name, part in (
    ('mixgen1.pkl', data1),
    ('mixgen2.pkl', data2),
    ('mixgen3.pkl', data3),
    ('mixgen4.pkl', data4),
):
    with open(pkl_name, 'wb') as file:
        pickle.dump(part, file)
