In [None]:
!pip install --upgrade openai

Collecting openai
  Downloading openai-1.35.13-py3-none-any.whl (328 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/328.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━[0m [32m266.2/328.5 kB[0m [31m7.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m328.5/328.5 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m931.1 kB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.

In [None]:
import os
import re

import pandas as pd
from openai import OpenAI
from tenacity import retry, stop_after_attempt, wait_fixed

# Take the API key from the OPENAI_API_KEY environment variable instead of
# embedding it in the notebook, so the secret is never saved or shared with
# the file. If the variable is not set, client creation fails immediately
# rather than every request failing later with an authentication error.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [None]:
# Persona sent to GPT as the system prompt.
system_message = "You are a drug consultant."

# Model name and sampling settings passed along with each request;
# temperature, top_p and both penalties are all set to zero.
gpt_model_params = dict(
    model="gpt-4o",
    temperature=0,
    top_p=0,
    presence_penalty=0,
    frequency_penalty=0,
)

# How many medical entities go into one user message. Batching keeps each
# reply within GPT's response limits.
batch_size = 140

In [None]:
# Load the cleaned list of medical entities produced by
# extract_medical_entities.ipynb and display it for a quick visual check.
med_ents = pd.read_excel(io='med_ents_detailed_TM.xlsx')
med_ents

In [None]:
def create_batches(data, batch_size):
    """Split a dict into consecutive sub-dicts of at most ``batch_size`` items.

    Args:
        data: Mapping to split; its key order is preserved.
        batch_size: Maximum number of entries per batch.

    Returns:
        A list of dicts. Every batch except possibly the last holds exactly
        ``batch_size`` entries; an empty ``data`` yields an empty list.
    """
    ordered_keys = list(data.keys())
    return [
        {name: data[name] for name in ordered_keys[start:start + batch_size]}
        for start in range(0, len(ordered_keys), batch_size)
    ]

In [None]:
def create_json(df):
    """Map each row's position to its value in the ``'med_ents'`` column.

    Rows are numbered by position (0, 1, 2, ...) rather than looked up by
    index label, so the function also works on frames whose index is not the
    default ``0..n-1`` range (for example after filtering or sorting).

    Args:
        df: DataFrame containing a ``'med_ents'`` column.

    Returns:
        dict: ``{position: value}`` in row order; empty for an empty frame.

    Raises:
        KeyError: If ``df`` has no ``'med_ents'`` column.
    """
    return dict(enumerate(df['med_ents']))

In [None]:
def create_user_message(batch):
    """Build the user prompt for one batch of terms.

    The prompt consists of fixed instructions followed by the batch values,
    one per line, numbered from 0 within the batch (the batch's own keys are
    not used for numbering).

    Args:
        batch: Dict whose values are the terms to list, in order.

    Returns:
        str: The instruction text plus one ``"<n>. <term>"`` line per value.
    """
    instructions = """You receive a list of various terms, but you need only drug trade names on it. Your task for them is to specify their:
- manufacturer
- international non-proprietary names
- indications (diseases only)
- group of diseases, which includes the diseases from the indications for the use of drugs
If there is no trade name in the list, response 'None'.

The format of the answer should be as follows:
0. manufacturer : trade name : international non-proprietary name (INN) : indications : group of diseases
1. manufacturer : trade name : international non-proprietary name (INN) : indications : group of diseases

List of terms:"""
    numbered_terms = [
        "{0}. {1}".format(position, term)
        for position, term in enumerate(batch.values())
    ]
    return "\n".join([instructions, *numbered_terms])

In [None]:
# reraise=True makes tenacity raise the exception from the last attempt itself
# instead of wrapping it in RetryError, so whoever logs the failure sees the
# real cause (rate limit, authentication error, ...).
@retry(stop=stop_after_attempt(3), wait=wait_fixed(300), reraise=True)
def completion_with_backoff(**kwargs):
    """Request a chat completion, retrying when the call raises.

    Any exception triggers a retry; at most three attempts are made, with a
    fixed 300-second wait between attempts.

    Args:
        **kwargs: Passed unchanged to ``client.chat.completions.create``
            (e.g. ``messages``, ``model``, ``temperature``).

    Returns:
        The ``content`` of the first choice's message.

    Raises:
        Exception: Whatever the final failed attempt raised.
    """
    return client.chat.completions.create(**kwargs).choices[0].message.content

In [None]:
def compile_portfolio(JSON_segments, batch_size, system_message):
    """Send the terms to GPT in batches and collect the answer lines.

    The model settings are taken from the module-level ``gpt_model_params``.
    A batch whose request or parsing fails is reported on stdout and skipped,
    so the remaining batches are still processed.

    Args:
        JSON_segments: Dict of terms to classify (values are the terms).
        batch_size: Maximum number of terms per request.
        system_message: System prompt sent with every request.

    Returns:
        list of str: One entry per non-blank response line, with leading
        whitespace and a leading ``"<number>. "`` prefix removed. Lines from
        failed batches are absent.
    """
    numbering = re.compile(r'^[0-9]+\. ')
    full_portfolio = []
    batches = create_batches(JSON_segments, batch_size)
    for batch_num, batch in enumerate(batches, start=1):
        message = [
            {"role": "system", "content": system_message},
            {"role": "user", "content": create_user_message(batch)}
        ]
        # Short progress line instead of dumping the whole prompt each time.
        print(f"Requesting batch {batch_num}/{len(batches)} ({len(batch)} terms)")
        try:
            portfolio_part = completion_with_backoff(messages=message, **gpt_model_params)
            if portfolio_part is None:
                raise ValueError("model returned no text content")
            # splitlines() copes with "\r\n" line endings; blank and
            # whitespace-only lines are dropped, and leading indentation is
            # removed so the numbering prefix can be recognised below.
            full_portfolio.extend(
                line.lstrip()
                for line in portfolio_part.splitlines()
                if line.strip()
            )
        except Exception as e:
            # Deliberate best effort: report the failed batch and carry on.
            print('Batch error!' + str(batch_num), e)
    return [numbering.sub('', line) for line in full_portfolio]

In [None]:
# Convert the entity table into a position -> term mapping, then query GPT
# batch by batch and keep every answer line that comes back.
JSON_input = create_json(df=med_ents)
output = compile_portfolio(
    JSON_segments=JSON_input,
    batch_size=batch_size,
    system_message=system_message,
)

In [None]:
# Keep only answers that contain exactly four " : " separators (i.e. five
# fields) and break each one into its fields; everything else is discarded.
clean_list_of_responses = [
    response.split(' : ')
    for response in output
    if response.count(' : ') == 4
]

In [None]:
# Compile a portfolio with the necessary columns and remove duplicates based
# on the Trade name column. drop_duplicates returns a new frame instead of
# modifying the original, so the result is assigned to drug_portfolio;
# otherwise the duplicates would still be present when the frame is exported.
drug_portfolio = (
    pd.DataFrame(
        clean_list_of_responses,
        columns=["Manufacturer", "Trade name", "INN", "Indications", "Group of diseases"],
    )
    .drop_duplicates(subset=["Trade name"])
    .reset_index(drop=True)
)
drug_portfolio

In [None]:
# Save the portfolio as an Excel workbook, leaving out the index column.
drug_portfolio.to_excel(excel_writer='drug_portfolio.xlsx', index=False)