In [None]:
import pandas as pd

In [None]:
# Load the drug portfolio generated by compile_drug_portfolio.ipynb.
# The cells below read its 'INN', 'Trade name' and 'Group of diseases' columns.
# The bare expression on the last line makes the notebook render a preview of the frame.
portfolio = pd.read_excel('drug_portfolio.xlsx')
portfolio

Unnamed: 0,Manufacturer,Trade name,INN,Indications,Group of diseases
0,Bayer,Naproxen,naproxen,"Pain, inflammation (arthritis, gout, ankylosin...",musculoskeletal and connective tissue diseases
1,Sanofi,Hydroxychloroquine,hydroxychloroquine,"Malaria, Rheumatoid arthritis, Lupus erythemat...","infectious and parasitic diseases, musculoskel..."
2,Roche,Carvedilol,carvedilol,"Hypertension, Heart failure, Angina",cardiovascular diseases
3,Eli Lilly,Methadone,methadone,"Opioid dependence, Pain management","mental and behavioural disorders, nervous syst..."
4,Bristol-Myers Squibb,Amlodipine,amlodipine,"Hypertension, Angina",cardiovascular diseases
...,...,...,...,...,...
1582,Sage Therapeutics,Zuranolone,zuranolone,Major depressive disorder,mental and behavioural disorders
1583,Gilead Sciences,Zydelig,idelalisib,"Chronic lymphocytic leukemia, Follicular B-cel...",cancer
1584,ADC Therapeutics,Zynlonta,loncastuximab tesirine,Diffuse large B-cell lymphoma,cancer
1585,Bluebird bio,Zynteglo,betibeglogene autotemcel,Beta thalassemia,hematological diseases


In [None]:
# Load the detailed translation memory augmented with medical entities by extract_medical_entities.ipynb.
# Each row contains a source segment, a target segment, a client name, an order number, and extracted medical entities.
# The matching cells below read the 'Medical entities' column of this frame.
df = pd.read_excel('detailed_TM_with_ents.xlsx')
df

In [None]:
def get_key(d, value):
    """Return the first key of ``d`` whose associated value contains ``value``.

    Entries are examined in the dictionary's iteration order and containment
    is tested with ``in``. ``None`` is returned when no entry matches.
    """
    matching_keys = (key for key, members in d.items() if value in members)
    return next(matching_keys, None)

In [None]:
# The presence of both international nonproprietary names (INNs) and trade names of drugs helps identify the group of diseases mentioned in a segment.
# Since there can be multiple trade names for the same INN, trade names and disease groups are grouped by INN.
# Two dictionaries are compiled:
# - find_TN_for_INN: INN -> list of the trade names found for it.
# - find_group_of_diseases_for_INN: INN -> list of the 'Group of diseases' cells found for it.
# A portfolio row may list several INNs separated by "/"; its trade name and disease
# group are then registered under every one of those INNs.

find_TN_for_INN = {}
find_group_of_diseases_for_INN = {}

# Walk the three columns in lockstep instead of addressing rows through a hand-maintained
# counter, so the pairing stays correct whatever the portfolio's index looks like.
portfolio_rows = zip(portfolio['INN'], portfolio['Trade name'], portfolio['Group of diseases'])

for inn_cell, trade_name, group_of_diseases in portfolio_rows:
  if not isinstance(inn_cell, str):
    # An empty Excel cell is read as NaN (a float): there is no INN to register.
    continue
  # Strip surrounding whitespace so that "a / b" and "a/b" produce the same keys.
  component_inns = [part.strip() for part in inn_cell.split('/')]
  for component_inn in component_inns:
    if not component_inn:
      # An empty key would later match every segment (the empty string is a substring of any text).
      continue
    find_TN_for_INN.setdefault(component_inn, []).append(trade_name)
    find_group_of_diseases_for_INN.setdefault(component_inn, []).append(group_of_diseases)

In [None]:
# For some INNs a single cell names multiple groups of diseases, separated by commas.
# Lower-case and split every cell, then store each INN's groups as a de-duplicated list.
for key, values in find_group_of_diseases_for_INN.items():
  unique_groups = set()
  for value in values:
    if not isinstance(value, str):
      # An empty Excel cell is read as NaN (a float) and carries no group name.
      continue
    # Split on the bare comma and strip, so "a,b" and "a, b" are treated alike.
    for group in value.lower().split(','):
      group = group.strip()
      if group:
        unique_groups.add(group)
  # Sort so that two INNs with the same groups always get identical lists;
  # the iteration order of a set is arbitrary and would otherwise make equal
  # group combinations look different when they are compared or counted later.
  find_group_of_diseases_for_INN[key] = sorted(unique_groups)

In [None]:
# Spot check of get_key: look up the INN whose trade-name list contains 'Valproic Acid'.
# Note that this binds the notebook-level name `inn`.
inn = get_key(find_TN_for_INN, 'Valproic Acid')
inn

'valproic acid'

In [None]:
# Tag every segment whose medical entities mention an INN: record the INN itself,
# the list of its trade names, and the list of its groups of diseases.
# The three result columns start out as empty strings.
for new_column in ('INN', 'Trade name', 'Group of diseases'):
  df[new_column] = ''

# The comparison is case-insensitive, so lower-case the entity text once up front.
entities_lowercased = [str(entities).lower() for entities in df['Medical entities']]

for inn_name in find_TN_for_INN:
  inn_lowercased = inn_name.lower()
  # Rows are addressed by their running number, starting from 0 for every INN.
  for row_number, entities in enumerate(entities_lowercased):
    if inn_lowercased not in entities:
      continue
    # When several INNs occur in one segment, the one processed last is kept.
    df.at[row_number, 'INN'] = inn_name
    df.at[row_number, 'Trade name'] = find_TN_for_INN[inn_name]
    df.at[row_number, 'Group of diseases'] = find_group_of_diseases_for_INN[inn_name]

In [None]:
# For each segment containing a trade name, add its INN, trade names and groups of diseases to the corresponding columns.
# Trade names are matched case-sensitively against the text of the segment's medical entities.
entity_texts = [str(segment_ents) for segment_ents in df['Medical entities']]

for values in find_TN_for_INN.values():
  for value in values:
    if not isinstance(value, str) or not value:
      # A missing trade name (NaN) cannot be searched for, and an empty one
      # would match every segment.
      continue
    # The INN does not depend on the row, so resolve it once per trade name.
    inn = get_key(find_TN_for_INN, value)
    # enumerate restarts the row number at 0 for EVERY trade name. A counter reset
    # only once per INN would keep growing across that INN's trade names and point
    # past the end of the table from the second trade name onwards.
    for i, segment_ents in enumerate(entity_texts):
      if value in segment_ents:
        df.at[i, 'INN'] = inn
        df.at[i, 'Trade name'] = find_TN_for_INN[inn]
        df.at[i, 'Group of diseases'] = find_group_of_diseases_for_INN[inn]

In [None]:
# Save the augmented detailed translation memory with the added 'INN', 'Trade name' and 'Group of diseases' columns.
# This file can be used to compile a dataset for fine-tuning based on the prevalence of different disease groups.
# to_excel is called with its defaults, so the row index is written as the first column of the sheet.
df.to_excel('detailed_TM_with_INN_TN_diseases_full.xlsx')

In [None]:
# Determine which disease groups are most prevalent in the translation memory.
#
# A 'Group of diseases' cell can hold a list of group names, an empty string
# (segment without a recognised drug) or a missing value. Lists cannot be hashed,
# so they are converted to their string form before grouping. Empty and missing
# cells are left out of the statistics instead of being counted as a group.
group_labels = df['Group of diseases'].map(
    lambda groups: str(groups) if isinstance(groups, (list, tuple)) else groups
)
has_group = group_labels.notna() & (group_labels.astype(str).str.strip() != '')

# Segments that have a disease group, with the group expressed as a string label.
filtered_df = df[has_group].assign(**{'Group of diseases': group_labels[has_group]})
groups_of_diseases_in_TM = list(filtered_df['Group of diseases'].unique())

# One pass over the data: count segments per label. sort=False keeps the labels in
# order of first appearance, so the resulting 0..n-1 index reflects that order
# before the rows are re-sorted by frequency.
statistics_on_diseases_in_TM = (
    filtered_df.groupby('Group of diseases', sort=False)
    .size()
    .reset_index(name='Number of segments')
    .sort_values(by=['Number of segments'], ascending=False)
)
statistics_on_diseases_in_TM

Unnamed: 0,Group of diseases,Number of segments
11,['cardiovascular diseases'],6922
15,['cancer'],6186
185,['muscular diseases'],5478
1,"['respiratory diseases', 'sleep disorders']",5363
61,['blood and lymphatic system diseases'],3544
...,...,...
218,['urological cancers'],1
252,"['cancer', 'eye and adnexa diseases']",1
217,['immunological diseases'],1
254,"['sexual health and reproductive disorders', '...",1


In [None]:
# Save the per-group segment counts to an Excel workbook.
statistics_on_diseases_in_TM.to_excel('statistics_on_diseases_in_detailed_TM.xlsx')