# Compare EC orphan drugs with all EC registered active drugs

Source of data: 

1. Active: https://ec.europa.eu/health/documents/community-register/html/reg_od_act.htm?sort=n
2. Withdrawn, suspended, expired, not renewed: https://ec.europa.eu/health/documents/community-register/html/reg_hum_nact.htm?sort=a
3. Rejected: https://ec.europa.eu/health/documents/community-register/html/reg_hum_refus.htm

## Import data

### EC all drugs

In [58]:
import os

import numpy as np
import pandas as pd

# Data files are read from the `data` folder inside the parent of the
# current working directory.
path = os.path.dirname(os.getcwd())

# Load the EC register of all drugs. EU numbers are stored as strings and
# brand names are upper-cased (they are compared against upper-cased EMA
# medicine names further down).
data_all_drugs = pd.read_csv(path + '/data/ec_all_drugs.csv').assign(**{
    'EU #': lambda frame: frame['EU #'].astype(str),
    'Brand name': lambda frame: frame['Brand name'].str.upper()})
data_all_drugs

Unnamed: 0,EU #,Brand name,Marketing Authorisation Holder,Status,INN,Initial Decision date
0,EU/1/14/944,ABASAGLAR,Eli Lilly Nederland B.V.,Active,,
1,EU/1/04/276,ABILIFY,Otsuka Pharmaceutical Netherlands B.V.,Active,,
2,EU/1/13/882,ABILIFY MAINTENA,Otsuka Pharmaceutical Netherlands B.V.,Active,,
3,EU/1/07/428,ABRAXANE,Bristol-Myers Squibb Pharma EEIG,Active,,
4,EU/1/07/412,ABSEAMED,Medice Arzneimittel Pütter GmbH & Co KG,Active,,
...,...,...,...,...,...,...
1548,,WINFURAN,Toray International U.K. Limited,Rejected,nalfurafine,21 Mar 2014
1549,,XELJANZ (REFUSAL),Pfizer Limited,Rejected,tofacitinib,08 Nov 2013
1550,,YONDELIS,Pharma Mar S.A.,Rejected,Trabectedin,07 Sep 2004
1551,,ZEFTERA,Janssen-Cilag International NV,Rejected,Ceftobiprole medocaril,16 Sep 2010


### EC orphan drug designations

In [59]:
# Data files are read from the `data` folder inside the parent of the
# current working directory.
path = os.path.dirname(os.getcwd())

# Row index 2 (zero-based) of the CSV is used as the header row.
od_csv = path + '/data/ec_orphan_designations.csv'
data_od_raw = pd.read_csv(od_csv, header=2)

# Store the EU numbers as strings.
data_od_raw = data_od_raw.astype({'EU #': str})
data_od_raw

Unnamed: 0,EU #,Product,Indication,Sponsor,Designation date,Tradename - EU product # - Implemented on
0,EU/3/20/2351,Adeno-associated virus serotype 5 containing t...,Treatment of RDH12 mutation associated retinal...,MeiraGTx B.V.,19 Oct 2020,-
1,EU/3/20/2350,"Poly(oxy-1,2-ethanediyl), alpha-hydro-omega-me...",Treatment of hypoparathyroidism,Ascendis Pharma Bone Diseases A/S,19 Oct 2020,-
2,EU/3/20/2349,Miglustat,Treatment of neuronal ceroid lipofuscinosis,Theranexus S.A.S.,19 Oct 2020,-
3,EU/3/20/2348,"Poly(oxy-1,2-ethanediyl), alpha-(carboxymethyl...",Treatment of homocystinuria,Aeglea Biotherapeutics UK Limited,19 Oct 2020,-
4,EU/3/20/2347,Trehalose,Treatment of neuronal ceroid lipofuscinosis,Theranexus S.A.S.,19 Oct 2020,-
...,...,...,...,...,...,...
1735,EU/3/01/034,Gusperimus trihydrochloride,Treatment of Wegener’s granulomatosis,Nordic Group B.V.,29 Mar 2001,-
1736,EU/3/01/028,Inolimomab,Treatment of Graft versus Host Disease,Elsalys Biotech SA,05 Mar 2001,-
1737,EU/3/01/026,L-Lysine-N-acetyl-L-cysteinate,Treatment of cystic fibrosis,LABORATOIRES SMB SA,14 Feb 2001,-
1738,EU/3/00/013,Ethyl Eicosopentaenoate,Treatment of Huntington's disease,Amarin Neuroscience Limited,29 Dec 2000,-


## Only keep orphan drug designations between 2001 and 2019

In [60]:
# Year of each designation, taken from the parsed 'Designation date' column.
years = pd.to_datetime(data_od_raw['Designation date']).dt.year

# Blank out every row outside 2001-2019, then drop all rows containing a
# missing value (this also removes in-range rows that already had one).
in_study_period = (years > 2000) & (years < 2020)
data_od = data_od_raw.where(in_study_period).dropna()

# Store the EU numbers as strings.
data_od = data_od.astype({'EU #': str})
data_od

Unnamed: 0,EU #,Product,Indication,Sponsor,Designation date,Tradename - EU product # - Implemented on
109,EU/3/19/2235,Synthetic double-stranded siRNA oligonucleotid...,Treatment of congenital alpha-1 antitrypsin de...,Dicerna Ireland Limited,16 Dec 2019,-
110,EU/3/19/2234,Pamrevlumab,Treatment of Duchenne muscular dystrophy,Voisin Consulting S.A.R.L.,16 Dec 2019,-
111,EU/3/19/2233,Navitoclax,Treatment of myelofibrosis,AbbVie Deutschland GmbH & Co. KG,16 Dec 2019,-
112,EU/3/19/2232,Lactobacillus plantarum,Treatment of amyotrophic lateral sclerosis,MDC RegAffairs GmbH,16 Dec 2019,-
113,EU/3/19/2231,H-Leu-Pro-Pro-Leu-Pro-Tyr-Pro-OH,Treatment of amyotrophic lateral sclerosis,AdRes EU B.V.,16 Dec 2019,-
...,...,...,...,...,...,...
1733,EU/3/01/044,Human Alpha1-Proteinase Inhibitor (respiratory...,Treatment of emphysema secondary to congenital...,CSL Behring GmbH,09 Jul 2001,-
1734,EU/3/01/038,Retroviral gamma-c cDNA containing vector,Treatment of Severe Combined Immunodeficiency ...,GENOPOIETIC S.A.S.,30 May 2001,-
1735,EU/3/01/034,Gusperimus trihydrochloride,Treatment of Wegener’s granulomatosis,Nordic Group B.V.,29 Mar 2001,-
1736,EU/3/01/028,Inolimomab,Treatment of Graft versus Host Disease,Elsalys Biotech SA,05 Mar 2001,-


## 1. How many ODs can be identified by EU # in the all drugs registry

In [61]:
import plotly.colors
import plotly.graph_objects as go

# Get all EU # in all drug registry
eu_numbers_all_drugs = data_all_drugs['EU #'].unique()

# Get all EU # in OD drug registry
eu_numbers_od_drugs = data_od['EU #'].unique()

# Check whether OD drug number can be found in the all drugs registry
# (vectorised membership test; one boolean per unique OD EU number)
is_found = pd.Series(eu_numbers_od_drugs).isin(
    eu_numbers_all_drugs).to_numpy()

# Create dataframe with one row per unique OD EU number and its match status.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
matched_data = pd.concat([
    pd.DataFrame({
        'EU #': eu_numbers_od_drugs[is_found],
        'Match status': 'In all drugs registry'}),
    pd.DataFrame({
        'EU #': eu_numbers_od_drugs[~is_found],
        'Match status': 'Not in all drugs registry'})])

# Create figure
fig = go.Figure()
colours = plotly.colors.qualitative.Plotly

# Number of OD EU numbers per match status (the two pie slices)
n_in_all_drugs = len(
    matched_data[matched_data['Match status'] == 'In all drugs registry'])
n_not_in_all_drugs = len(
    matched_data[matched_data['Match status'] == 'Not in all drugs registry'])

fig.add_trace(go.Pie(
    labels=['In all drugs registry', 'Not in all drugs registry'],
    values=[n_in_all_drugs, n_not_in_all_drugs],
    sort=False,
    pull=[0, 0.2],
    marker=dict(
        colors=colours[:2],
        line=dict(color='#000000', width=1))))
fig.show()

The EC all drugs registry does not include any OD applications

## 2. How many OD drugs are repurposed drugs?

We check how many of the OD drug products (i.e. the international non-proprietary name which identifies the active ingredient of the medicine (INN)) are identical to those of drugs in the EC all drugs registry.

TODO:

1. Get INNs of EC drugs by cross-referencing EC drugs with EMA dataset.
2. Then check how many EC ODs products are in EC all drugs.

Procedure:

- Filter NaNs
- Capitalise all product names / INNs
- Check whether OD product is identical to an INN in all drugs dataset

### 2.1 Get INN / product names from EMA dataset

The EC dataset does not keep track of the INN for all drugs, so we would like to match drugs by EU # to get the INN from the EMA dataset. But **EU #** is not available in the EMA all drugs registry. Alternatively, we attempt to match drugs based on Brand name.

In [62]:
# Data files are read from the `data` folder inside the parent of the
# current working directory.
path = os.path.dirname(os.getcwd())

# Row index 8 (zero-based) of the sheet is used as the header row.
ema_xlsx = path + '/data/ema_all_drugs.xlsx'
data_ema = pd.read_excel(ema_xlsx, header=8)

# Upper-case the medicine names so they can be compared with the
# upper-cased EC brand names.
data_ema = data_ema.assign(**{
    'Medicine name': data_ema['Medicine name'].str.upper()})
data_ema

Unnamed: 0,Category,Medicine name,Therapeutic area,International non-proprietary name (INN) / common name,Active substance,Product number,Patient safety,Authorisation status,ATC code,Additional monitoring,...,Vet pharmacotherapeutic group,Date of opinion,Decision date,Revision number,Condition / indication,Species,ATCvet code,First published,Revision date,URL
0,Veterinary,FRONTPRO (PREVIOUSLY KNOWN AS AFOXOLANER MERIAL),,afoxolaner,afoxolaner,EMEA/V/C/005126,no,Authorised,,no,...,Ectoparasiticides for systemic use,2019-03-21 01:00:00,2020-11-06 01:00:00,3.0,Treatment of flea (Ctenocephalides felis and C...,Dogs,QP53BE01,2019-06-07 17:00:00,2020-11-12 18:18:00,https://www.ema.europa.eu/en/medicines/veterin...
1,Human,CHOLIB,Dyslipidemias,"fenofibrate, simvastatin","fenofibrate, simvastatin",EMEA/H/C/002559,no,Authorised,C10BA04,no,...,,2013-06-27 00:00:00,2020-10-23 00:00:00,12.0,Cholib is indicated as adjunctive therapy to d...,,,2018-08-20 00:00:00,2020-11-12 17:50:00,https://www.ema.europa.eu/en/medicines/human/E...
2,Human,REPAGLINIDE KRKA,"Diabetes Mellitus, Type 2",repaglinide,repaglinide,EMEA/H/C/001066,no,Authorised,A10BX02,no,...,,2009-07-23 00:00:00,2020-10-28 01:00:00,6.0,Repaglinide is indicated in patients with type...,,,2017-10-27 00:00:00,2020-11-12 17:30:00,https://www.ema.europa.eu/en/medicines/human/E...
3,Human,LIPROLOG,Diabetes Mellitus,insulin lispro,insulin lispro,EMEA/H/C/000393,no,Authorised,"A10AB04, A10AD04",no,...,,2001-04-26 00:00:00,2020-09-04 00:00:00,28.0,For the treatment of adults and children with ...,,,2017-10-23 00:00:00,2020-11-12 16:54:00,https://www.ema.europa.eu/en/medicines/human/E...
4,Human,HEXACIMA,"Hepatitis B, Tetanus, Immunization, Meningitis...","diphtheria, tetanus, pertussis (acellular, com...","diphtheria toxoid / tetanus toxoid, two-compon...",EMEA/H/C/002702,no,Authorised,J07CA09,no,...,,2013-02-21 01:00:00,2020-09-24 00:00:00,21.0,Hexacima (DTaP-IPV-HB-Hib) is indicated for pr...,,,2018-01-08 12:30:00,2020-11-12 16:42:00,https://www.ema.europa.eu/en/medicines/human/E...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1730,Human,LIPROLOG,Diabetes Mellitus,insulin lispro,insulin lispro,EMEA/H/C/000143,no,Withdrawn,A10AB04,no,...,,NaT,2001-02-19 01:00:00,0.0,For the treatment of patients with diabetes me...,,,2001-02-19 01:00:00,2001-08-01 00:00:00,https://www.ema.europa.eu/en/medicines/human/E...
1731,Human,ECHOGEN,Echocardiography,dodecafluoropentane,dodecafluoropentane,EMEA/H/C/000149,no,Withdrawn,V08DA,no,...,,NaT,2001-01-22 01:00:00,0.0,EchoGen is a transpulmonary echocardiographic ...,,,2001-01-22 01:00:00,2001-05-22 00:00:00,https://www.ema.europa.eu/en/medicines/human/E...
1732,Human,ECOKINASE,Myocardial Infarction,reteplase,reteplase,EMEA/H/C/000106,no,Withdrawn,B01AD07,no,...,,NaT,1999-07-30 00:00:00,0.0,Thrombolytic therapy of acute myocardial infar...,,,1999-07-30 00:00:00,2000-12-12 01:00:00,https://www.ema.europa.eu/en/medicines/human/E...
1733,Human,PRIMAVAX,"Hepatitis B, Tetanus, Immunization, Diphtheria","diphtheria, tetanus and hepatitis B vaccine, a...","diphtheria toxoid purified, hepatitis B, recom...",EMEA/H/C/000156,no,Withdrawn,J07CA,no,...,,NaT,2000-07-27 00:00:00,0.0,This vaccine is indicated for active immunizat...,,,2000-07-27 00:00:00,2000-12-04 01:00:00,https://www.ema.europa.eu/en/medicines/human/E...


#### Check whether some brand names occur multiple times

In [63]:
# Collect, in order of first appearance, every brand name that labels more
# than one row of the EC all drugs registry.
brand_column = data_all_drugs['Brand name']
non_unique_brands = [
    name for name in brand_column.unique()
    if (brand_column == name).sum() > 1]

non_unique_brands

['BUDESONIDE/FORMOTEROL TEVA PHARMA B.V.',
 'CIMZIA',
 'CLOPIDOGREL RATIOPHARM',
 'LIPROLOG',
 'THYMANAX',
 'VALDOXAN',
 'YONDELIS',
 'IONSYS']

We manually checked: each non-unique brand appears only twice, one of which is 'active' the other is either 'not active' or 'rejected'. So it's likely that the same brand name labels the exact same drug and the multiple matches correspond to repeated applications or renewals.

### Match unique EC brands with EMA dataset and thereby find product number

For now: If the EC brand name is contained in the EMA brand name, we match. In the worst-case scenario, this includes more active substances in the resulting dataset than there truly are. But on the other hand, those active substances may also belong to non-OD drugs according to the EMA database (possibly explicitly exclude OD drugs in EMA database).

In [64]:
# One output row per (EC brand, EMA medicine name) pair where the EC brand
# name is a substring of the EMA name; EC brands without any such pair get a
# single placeholder row marked 'no'.
brands = data_all_drugs['Brand name'].unique()
ema_brands = data_ema['Medicine name'].unique()

match_rows = []
for ec_name in brands:
    # All EMA medicine names that contain the EC brand name
    hits = [ema_name for ema_name in ema_brands if ec_name in ema_name]

    if not hits:
        match_rows.append((ec_name, 'none', 'no', 'none'))
        continue

    for ema_name in hits:
        # Active substance of the first EMA row carrying this medicine name.
        # NOTE(review): the value stored under 'INN' comes from the EMA
        # 'Active substance' column - confirm this is intended.
        substance = data_ema.loc[
            data_ema['Medicine name'] == ema_name,
            'Active substance'].iloc[0]
        match_rows.append((ec_name, ema_name, 'yes', substance))

matched_drugs = pd.DataFrame(
    match_rows,
    columns=['Brand name', 'EMA brand name', 'is identified', 'INN'])
matched_drugs

Unnamed: 0,Brand name,EMA brand name,is identified,INN
0,ABASAGLAR,ABASAGLAR (PREVIOUSLY ABASRIA),yes,insulin glargine
1,ABILIFY,ABILIFY,yes,aripiprazole
2,ABILIFY,ABILIFY MAINTENA,yes,aripiprazole
3,ABILIFY MAINTENA,ABILIFY MAINTENA,yes,aripiprazole
4,ABRAXANE,ABRAXANE,yes,paclitaxel
...,...,...,...,...
1614,VYNFINIT,none,no,none
1615,WINFURAN,none,no,none
1616,XELJANZ (REFUSAL),none,no,none
1617,ZEFTERA,ZEFTERA (PREVIOUSLY ZEVTERA),yes,ceftobiprole medocaril


### Check unmatched drugs

Some EC brand names could not be found. For EC brands of the form "name 1/name 2" or "name 1 (name 2)" the above matching method might miss the drugs, because the complete EC brand name has to be contained in the EMA brand name. We check those candidates manually below.

In [101]:
# Show the EC brands for which no EMA medicine name was found.
mask = matched_drugs['is identified'].eq('no')
matched_drugs.loc[mask]

Unnamed: 0,Brand name,EMA brand name,is identified,INN
12,ADAKVEO,none,no,none
49,AMLODIPINE/VALSARTAN MYLAN,none,no,none
63,ARIKAYCE LIPOSOMAL,none,no,none
135,BROPAIR SPIROMAX,none,no,none
143,BYFAVO,none,no,none
...,...,...,...,...
1608,SEROSTIM,none,no,none
1613,VIVIQ,none,no,none
1614,VYNFINIT,none,no,none
1615,WINFURAN,none,no,none


Filter all names which are just one word

In [107]:
# Split the unmatched EC brand names into multi-part names ("critical": they
# contain a space, slash, opening bracket or hyphen) and plain single-word
# names ("uncritical").
# The unmatched rows are selected here explicitly rather than through a
# `mask` variable left over from another cell: `mask` is reassigned in many
# cells, so relying on it makes the result depend on execution order.
separators = (' ', '/', '(', '-')
unmatched_brands = matched_drugs.loc[
    matched_drugs['is identified'] == 'no', 'Brand name']

critical_brand_names = []
uncritical_brand_names = []
for brand in unmatched_brands:
    if any(separator in brand for separator in separators):
        critical_brand_names.append(brand)
    else:
        uncritical_brand_names.append(brand)

print('Critical drugs: ', len(critical_brand_names), '\n')
print(critical_brand_names)
print('\n')
print('Uncritical drugs: ', len(uncritical_brand_names), '\n')
print(uncritical_brand_names)

Critical drugs:  36 

['AMLODIPINE/VALSARTAN MYLAN', 'ARIKAYCE LIPOSOMAL', 'BROPAIR SPIROMAX', 'CLOPIDOGREL TAW PHARMA', 'CLOPIDOGREL/ACETYLSALICYLIC ACID MYLAN', 'COVID-19 VACCINE JANSSEN', 'COVID-19 VACCINE MODERNA', 'EMTRICITABINE /TENOFOVIR DISOPROXIL KRKA D.D.', 'EXPAREL LIPOSOMAL', 'HEPLISAV B', 'IRBESARTAN / HYDROCHLOROTHIAZIDE TEVA', 'IRBESARTAN HCT ZENTIVA', 'LAMIVUDINE / ZIDOVUDINE TEVA', 'LENALIDOMIDE KRKA', 'LENALIDOMIDE KRKA D.D.', 'LENALIDOMIDE KRKA D.D. NOVO MESTO', 'LENALIDOMIDE MYLAN', 'OBILTOXAXIMAB SFL', 'RIBAVIRIN TEVA PHARMA BV', 'RIVAROXABAN ACCORD', 'SEFFALAIR SPIROMAX', 'SUNITINIB ACCORD', 'THIOTEPA RIEMSER', 'TRIXEO AEROSPHERE', 'CLOPIDOGREL / ACETYLSALICYLIC ACID TEVA', 'HUMALOG-HUMAJECT', 'HUMALOG-PEN', 'IRBESARTAN HCT BMS', 'RIVASTIGMINE 3M HEALTH CARE LTD.', 'TECNEMAB-K-1', 'TENECTEPLASE BOEHRINGER INGELHEIM PHARMA GMBH & CO. KG', 'VALDYN (EX KUDEQ)', 'MYLOTARG REFUSAL', 'RAMELTEON TAKEDA GLOBAL RESEARCH AND DEVELOPMENT CENTRE (EUROPE) LTD', 'RAXONE REFUSAL

### Try to match critical brands that are obvious to parse

In [114]:
# Parse critical name manually
parse_dict = {
    'AMLODIPINE/VALSARTAN MYLAN': ['AMLODIPINE', 'VALSARTAN MYLAN'],
    'CLOPIDOGREL/ACETYLSALICYLIC ACID MYLAN': ['CLOPIDOGREL', 'ACETYLSALICYLIC ACID MYLAN'],
    'EMTRICITABINE /TENOFOVIR DISOPROXIL KRKA D.D.': ['EMTRICITABINE', 'TENOFOVIR DISOPROXIL KRKA D.D.'],
    'IRBESARTAN / HYDROCHLOROTHIAZIDE TEVA': ['IRBESARTAN', 'HYDROCHLOROTHIAZIDE TEVA'],
    'LAMIVUDINE / ZIDOVUDINE TEVA': ['LAMIVUDINE', 'ZIDOVUDINE TEVA'],
    'CLOPIDOGREL / ACETYLSALICYLIC ACID TEVA': ['CLOPIDOGREL', 'ACETYLSALICYLIC ACID TEVA']}

# Find parsed names in EMA dataset
ema_brands = data_ema['Medicine name'].unique()
for brand, parsed_brands in parse_dict.items():
    matched_brands = []
    inn = []
    for parsed_brand in parsed_brands:
        for ema_brand in ema_brands:
            # Check whether ec brand is subset of ema brand
            if parsed_brand in ema_brand:
                matched_brands.append(ema_brand)

                # Mask for product
                mask = data_ema['Medicine name'] == ema_brand
                product = data_ema[mask]['Active substance'].iloc[0]
                inn.append(product)

                # is_matched = True
    
    # Make sure that parsed drugs are not matched to different drugs
    if len(matched_brands) == 2:
        assert matched_brands[0] == matched_brands[1]

    # Update if one of the parsed drugs was matched
    if len(matched_brands) > 0:
        mask = matched_drugs['Brand name'] == brand
        matched_drugs.loc[mask, 'EMA brand name'] = matched_brands[0]
        matched_drugs.loc[mask, 'is identified'] = 'yes'
        matched_drugs.loc[mask, 'INN'] = inn[0]

In [115]:
# Show the rows of the six manually parsed combination-product brands.
parsed_brand_names = [
    'AMLODIPINE/VALSARTAN MYLAN',
    'CLOPIDOGREL/ACETYLSALICYLIC ACID MYLAN',
    'EMTRICITABINE /TENOFOVIR DISOPROXIL KRKA D.D.',
    'IRBESARTAN / HYDROCHLOROTHIAZIDE TEVA',
    'LAMIVUDINE / ZIDOVUDINE TEVA',
    'CLOPIDOGREL / ACETYLSALICYLIC ACID TEVA']
mask = matched_drugs['Brand name'].isin(parsed_brand_names)
matched_drugs.loc[mask]

Unnamed: 0,Brand name,EMA brand name,is identified,INN
49,AMLODIPINE/VALSARTAN MYLAN,AMLODIPINE / VALSARTAN MYLAN,yes,"Amlodipine besilate, valsartan"
193,CLOPIDOGREL/ACETYLSALICYLIC ACID MYLAN,CLOPIDOGREL / ACETYLSALICYLIC ACID MYLAN,yes,"acetylsalicylic acid, clopidogrel hydrogen sul..."
308,EMTRICITABINE /TENOFOVIR DISOPROXIL KRKA D.D.,EFAVIRENZ/EMTRICITABINE/TENOFOVIR DISOPROXIL M...,yes,"efavirenz, emtricitabine, tenofovir disoproxil..."
504,IRBESARTAN / HYDROCHLOROTHIAZIDE TEVA,IRBESARTAN HYDROCHLOROTHIAZIDE ZENTIVA (PREVIO...,yes,"irbesartan, hydrochlorothiazide"
567,LAMIVUDINE / ZIDOVUDINE TEVA,LAMIVUDINE TEVA,yes,lamivudine
1292,CLOPIDOGREL / ACETYLSALICYLIC ACID TEVA,CLOPIDOGREL / ACETYLSALICYLIC ACID MYLAN,yes,"acetylsalicylic acid, clopidogrel hydrogen sul..."


### Check double matches

In [116]:
# EC brands that occupy more than one row of matched_drugs, i.e. brands that
# were matched to several EMA medicine names.
brands = matched_drugs['Brand name'].unique()
ec_brand_column = matched_drugs['Brand name']
double_matches = [
    name for name in brands
    if (ec_brand_column == name).sum() > 1]

len(double_matches)

58

## Match EC OD drugs based on product 

### 1. Identical match

### Check INN of EC all drugs (mostly single words, 2 words, or even a list of compounds?)

In [122]:
# Sort the INNs of the matched EC drugs into "critical" ones (more than one
# whitespace character, or containing a comma or slash) and "uncritical"
# ones (everything else).
critical_inns, uncritical_inns = [], []
inns = matched_drugs['INN'].unique()
for name in inns:
    n_spaces = sum(1 for character in name if character.isspace())
    looks_compound = n_spaces > 1 or ',' in name or '/' in name
    target = critical_inns if looks_compound else uncritical_inns
    target.append(name)

print('Critical INNs: ', len(critical_inns), '\n')
print(critical_inns)
print('\n')
print('Uncritical INNs: ', len(uncritical_inns), '\n')
print(uncritical_inns)

isartan, amlodipine', 'cobicistat on silicon dioxide', 'indacaterol, Glycopyrronium bromide', 'Glycopyrronium bromide, indacaterol maleate', 'meropenem trihydrate, vaborbactam', 'vibrio cholerae, strain cvd 103-hgr, live', 'Diphtheria toxoid, tetanus toxoid, Bordetella pertussis antigens: pertussis toxoid, filamentous haemagglutinin, pertactin, fimbriae Types 2 and 3, hepatitis B surface antigen produced in yeast cells, poliovirus (inactivated): type 1 (Mahoney), type 2 (MEF-1), type 3 (Saukett) produced in Vero cells/ Haemophilus influenzae type b polysaccharide (polyribosylribitol phosphate) conjugated to meningococcal protein.', 'mixture of polynuclear iron(III)-oxyhydroxide, sucrose and starches', 'patiromer sorbitex calcium', 'tenofovir alafenamide fumarate', 'Ombitasvir, paritaprevir, ritonavir', 'recombinant human n-acetylgalactosamine-6-sulfatase (rhgalns)', 'alogliptin benzoate, metformin hydrochloride', 'tenofovir disoproxil fumarate', 'canagliflozin, metformin hydrochloride'

### Check INN of EC OD drugs (mostly single words, 2 words, or even a list of compounds?)

In [123]:
# Sort the OD product names into "critical" ones (more than one whitespace
# character, or containing a comma or slash) and "uncritical" ones
# (everything else).
critical_inns, uncritical_inns = [], []
inns = data_od['Product'].unique()
for product_name in inns:
    whitespace_total = len([c for c in product_name if c.isspace()])
    if whitespace_total <= 1 and ',' not in product_name \
            and '/' not in product_name:
        uncritical_inns.append(product_name)
    else:
        critical_inns.append(product_name)

print('Critical INNs: ', len(critical_inns), '\n')
print(critical_inns)
print('\n')
print('Uncritical INNs: ', len(uncritical_inns), '\n')
print(uncritical_inns)

inant human hepatitis C monoclonal antibody against C4 region of E1', 'Mercaptopurine (oral liquid)', 'Methotrexate (oral liquid)', '4-ethoxy-2-(piperazin-1-yl)-7-(pyridin-4-yl)-5H-pyrimido[5,4-b]indol', 'Adenovirus associated viral vector serotype 4 containing the human RPE65 gene', '4-Amino-1-[5-O-[(2R,4S)-2-oxido-4-(4-pyridinyl)-1,3,2-dioxaphosphorinan-2-yl]-ß-D-arabinofuranosyl]-2(1H)-pyrimidinone', 'Alginate oligosaccharide (G-block) fragment', 'Human coagulation factor X', 'L-threo-3,4-dihydroxyphenylserine', 'Pyridoxalated haemoglobin polyoxyethylene', '1-{3-[3-(4-chlorophenyl)propoxy]propyl}piperidine, hydrochloride', 'Recombinant fusion protein consisting of human coagulation factor IX attached to the Fc domain of human IgG1', 'R-1-[2,3-dihydro-2-oxo-1-pivaloylmethyl-5-(2-pyridyl)-1 H -1,4-benzodiazepin-3-yl]-3-(3-methylaminophenyl)urea', 'Autologous CD34+ cells transfected with lentiviral vector containing the human arylsulfatase A cDNA', 'Antisense Oligonucleotide (TATCCGGAG

It's not obvious how to parse most of the INNs, except for deleting additional information such as "(oral use)", "(rectal use)", etc.

### 1. Identical match based on "raw" INNs

In [129]:
# One output row per (OD product, all-drugs INN) pair where the upper-cased
# OD product name is a substring of the upper-cased INN; OD products without
# any such pair get a single placeholder row marked 'no'.
od_inns = data_od['Product'].str.upper()
inn_upper = matched_drugs['INN'].str.upper()
all_drugs_inns = inn_upper.unique()

od_match_rows = []
for od_name in od_inns:
    # All INNs of the all-drugs table that contain the OD product name
    hits = [candidate for candidate in all_drugs_inns if od_name in candidate]

    if not hits:
        od_match_rows.append((od_name, 'none', 'no', 'none'))
        continue

    for candidate in hits:
        # Brand name of the first matched_drugs row carrying this INN
        first_brand = matched_drugs.loc[
            inn_upper == candidate, 'Brand name'].iloc[0]
        od_match_rows.append((od_name, candidate, 'yes', first_brand))

matched_ods = pd.DataFrame(
    od_match_rows,
    columns=['OD INN', 'All drug INN', 'is identified', 'All drug brand name'])
matched_ods

Unnamed: 0,OD INN,All drug INN,is identified,All drug brand name
0,SYNTHETIC DOUBLE-STRANDED SIRNA OLIGONUCLEOTID...,none,no,none
1,PAMREVLUMAB,none,no,none
2,NAVITOCLAX,none,no,none
3,LACTOBACILLUS PLANTARUM,none,no,none
4,H-LEU-PRO-PRO-LEU-PRO-TYR-PRO-OH,none,no,none
...,...,...,...,...
1698,HUMAN ALPHA1-PROTEINASE INHIBITOR (RESPIRATORY...,none,no,none
1699,RETROVIRAL GAMMA-C CDNA CONTAINING VECTOR,none,no,none
1700,GUSPERIMUS TRIHYDROCHLORIDE,none,no,none
1701,INOLIMOMAB,none,no,none


In [130]:
# Show only the OD products that were matched to an all-drugs INN.
mask = matched_ods['is identified'].eq('yes')
matched_ods.loc[mask]

Unnamed: 0,OD INN,All drug INN,is identified,All drug brand name
18,PROPRANOLOL HYDROCHLORIDE,PROPRANOLOL HYDROCHLORIDE,yes,HEMANGIOL
20,PACLITAXEL,PACLITAXEL,yes,ABRAXANE
24,BESILESOMAB,BESILESOMAB,yes,SCINTIMUN
47,TEMOZOLOMIDE,TEMOZOLOMIDE,yes,TEMODAL
58,REGORAFENIB,REGORAFENIB,yes,STIVARGA
...,...,...,...,...
1667,DECITABINE,DECITABINE,yes,DACOGEN
1671,CHOLIC ACID,CHENODEOXYCHOLIC ACID,yes,CHENODEOXYCHOLIC ACID LEADIANT
1672,CHOLIC ACID,OBETICHOLIC ACID,yes,OCALIVA
1673,CHOLIC ACID,CHOLIC ACID,yes,ORPHACOL


There are some ODs that are falsely matched to multiple drugs, e.g. CHOLIC ACID is matched to OBETICHOLIC ACID. Those will have to be filtered out manually.

1.1 Filter multiple matches

In [132]:
# Identified OD product names that occupy more than one row of matched_ods.
od_inn_column = matched_ods['OD INN']
inns = od_inn_column[matched_ods['is identified'].eq('yes')].unique()
double_matches = [
    name for name in inns
    if od_inn_column.eq(name).sum() > 1]

len(double_matches)

55

How many of those 55 double matches are falsely matched multiple times? Keep only those double matches that are matched to different All drug INNs.

In [136]:
# Keep the rows of every identified OD product that was matched to more than
# one distinct all-drugs INN. Rows stay grouped per OD product, in order of
# the product's first appearance.
match_columns = [
    'OD INN', 'All drug INN', 'is identified', 'All drug brand name']
mask = matched_ods['is identified'] == 'yes'
inns = matched_ods[mask]['OD INN'].unique()

conflicting_frames = []
for inn in inns:
    temp = matched_ods[matched_ods['OD INN'] == inn]
    # More than one distinct INN implies more than one row
    if len(temp['All drug INN'].unique()) > 1:
        conflicting_frames.append(temp)

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0,
# and pd.concat raises on an empty list, hence the explicit empty frame.
if conflicting_frames:
    double_matches = pd.concat(conflicting_frames)
else:
    double_matches = pd.DataFrame(columns=match_columns)

double_matches

Unnamed: 0,OD INN,All drug INN,is identified,All drug brand name
61,RASAGILINE,RASAGILINE,yes,AZILECT
62,RASAGILINE,RASAGILINE TARTRATE,yes,RASAGILINE MYLAN
259,SIROLIMUS,SIROLIMUS,yes,RAPAMUNE
260,SIROLIMUS,TEMSIROLIMUS,yes,TORISEL
319,SIROLIMUS,SIROLIMUS,yes,RAPAMUNE
...,...,...,...,...
1640,ACETYLSALICYLIC ACID,"ACETYLSALICYLIC ACID, CLOPIDOGREL HYDROGEN SUL...",yes,CLOPIDOGREL/ACETYLSALICYLIC ACID MYLAN
1641,ACETYLSALICYLIC ACID,"CLOPIDOGREL, ACETYLSALICYLIC ACID",yes,DUOPLAVIN
1671,CHOLIC ACID,CHENODEOXYCHOLIC ACID,yes,CHENODEOXYCHOLIC ACID LEADIANT
1672,CHOLIC ACID,OBETICHOLIC ACID,yes,OCALIVA


In [100]:
# Show every matched EC drug whose INN is exactly 'paclitaxel'.
mask = matched_drugs['INN'].eq('paclitaxel')
matched_drugs.loc[mask]

Unnamed: 0,Brand name,EMA brand name,is identified,INN
4,ABRAXANE,ABRAXANE,yes,paclitaxel
55,APEALEA,APEALEA,yes,paclitaxel
783,PAZENIR,PAZENIR,yes,paclitaxel
1432,PAXENE,PAXENE,yes,paclitaxel
