In [1]:
import numpy as np
import pandas as pd
from ydata_profiling import ProfileReport

import util

In [2]:
# Load the project's data configuration through the local `util` helper.
# NOTE(review): later cells index it with file-name keys such as
# 'raw_synonyms_file_name', so it presumably behaves like a dict — confirm in util.
data_config = util.get_data_config()

In [3]:
# Read the raw synonyms table from the path stored in the data config,
# then show its (rows, columns) shape as a first sanity check.
df = pd.read_csv(filepath_or_buffer=data_config['raw_synonyms_file_name'])
df.shape

(23369, 6)

In [4]:
# Preview the first ten rows to get a feel for the columns and their values.
df.head(10)

Unnamed: 0,syn_id,id,name,preferred_name,parent_id,lname
0,23310,5391.0,sacituzumab,,,sacituzumab
1,23311,5391.0,sacituzumab govitecan,1.0,,sacituzumab govitecan
2,23312,5391.0,sacituzumab govitecan-hziy,,,sacituzumab govitecan-hziy
3,23313,5391.0,trodelvy,,,trodelvy
4,23314,5391.0,IMMU-132,,,immu-132
5,23315,5391.0,TROP-2-SN-38,,,trop-2-sn-38
6,23316,5391.0,hRS7-SN-38-ADC,,,hrs7-sn-38-adc
7,23317,5392.0,capmatinib,1.0,,capmatinib
8,23318,5392.0,capmatinib hydrochloride,,,capmatinib hydrochloride
9,23319,5392.0,capmatinib hydrochloride hydrate,,,capmatinib hydrochloride hydrate


# Data quality checks

In [5]:
# Summary statistics for every column, numeric and non-numeric alike.
df.describe(include='all')

Unnamed: 0,syn_id,id,name,preferred_name,parent_id,lname
count,23369.0,23236.0,23369,5025.0,226.0,23369
unique,,,23369,,,23369
top,,,sacituzumab,,,sacituzumab
freq,,,1,,,1
mean,12120.703453,2705.518506,,0.999602,118.420354,
std,7117.0632,1671.680913,,0.019948,71.019655,
min,3.0,4.0,,0.0,1.0,
25%,6023.0,1280.75,,1.0,58.25,
50%,11983.0,2493.0,,1.0,114.5,
75%,17934.0,4230.0,,1.0,170.25,


In [6]:
# Build a profiling report for the raw synonyms table.
# NOTE(review): correlations=None presumably switches the correlation section off —
# confirm against the ydata-profiling documentation.
profile = ProfileReport(df, title='Profiling Report', correlations=None)

In [7]:
# Render the profiling report as interactive widgets inside the notebook.
profile.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

  annotation = ("{:" + self.fmt + "}").format(val)
(using `df.profile_report(missing_diagrams={"Heatmap": False}`)
If this is problematic for your use case, please report this as an issue:
https://github.com/ydataai/ydata-profiling/issues
(include the error message: 'could not convert string to float: '--'')


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

## Check id and parent_id

In [8]:
# Rows that carry a parent_id value.
df[df['parent_id'].notna()]

Unnamed: 0,syn_id,id,name,preferred_name,parent_id,lname
68,23378,5401.0,BMS-626529,,230.0,bms-626529
69,23379,5401.0,BMS 626529,,230.0,bms 626529
310,23720,5448.0,serdexmethylphenidate,1.0,231.0,serdexmethylphenidate
476,23151,5377.0,N4-Hydroxycytidine,,229.0,n4-hydroxycytidine
477,23152,5377.0,Beta-D-N4-hydroxycytidine,,229.0,beta-d-n4-hydroxycytidine
...,...,...,...,...,...,...
22545,24416,,laidlomycin,1.0,236.0,laidlomycin
22546,24417,,BRN 1675244,,236.0,brn 1675244
22547,24418,,EINECS 260-095-1,,236.0,einecs 260-095-1
23071,25105,,carotegrast,1.0,237.0,carotegrast


In [9]:
# Look up the row whose name is exactly 'clobetasone butyrate'.
df.loc[df['name'].eq('clobetasone butyrate')]

Unnamed: 0,syn_id,id,name,preferred_name,parent_id,lname
7516,4562,3911.0,clobetasone butyrate,1.0,,clobetasone butyrate


Notes: rows with missing `id` will be dropped. 
`tamatinib` gives 0 results in [https://drugcentral.org/?q=tamatinib&approval=](https://drugcentral.org/?q=tamatinib&approval=)

`clobetasone` gives 1 result: [clobetasone butyrate](https://drugcentral.org/?q=clobetasone&approval=)
 This is not the same: `clobetasone` is not listed among the synonyms of [clobetasone butyrate](https://drugcentral.org/drugcard/3911).
 

In [10]:
# All synonym rows that belong to id 217.
df.loc[df['id'].eq(217.0)]

Unnamed: 0,syn_id,id,name,preferred_name,parent_id,lname
7017,4046,217.0,anecortave acetate,,,anecortave acetate
16343,13593,217.0,anecortave,1.0,,anecortave
19700,17011,217.0,retaane,,,retaane


In [11]:
# All synonym rows that point at parent_id 225.
df.loc[df['parent_id'].eq(225.0)]

Unnamed: 0,syn_id,id,name,preferred_name,parent_id,lname
576,22536,,tamatinib,1.0,225.0,tamatinib
577,22537,,R406,,225.0,r406
578,22538,,R 406,,225.0,r 406


In [12]:
# How many rows have a name identical to lname, and how many differ?
df['name'].eq(df['lname']).value_counts()

True     19151
False     4218
Name: count, dtype: int64

In [13]:
# Rows where name and lname are not the same string.
df[df['name'].ne(df['lname'])]

Unnamed: 0,syn_id,id,name,preferred_name,parent_id,lname
4,23314,5391.0,IMMU-132,,,immu-132
5,23315,5391.0,TROP-2-SN-38,,,trop-2-sn-38
6,23316,5391.0,hRS7-SN-38-ADC,,,hrs7-sn-38-adc
11,23321,5392.0,NVP-INC280-NX,,,nvp-inc280-nx
12,23322,5392.0,INC280,,,inc280
...,...,...,...,...,...,...
23337,25458,5739.0,ARC1905,,,arc1905
23341,25462,5740.0,REGN3918,,,regn3918
23345,25466,5741.0,PF-06863135,,,pf-06863135
23346,25467,5741.0,RN613,,,rn613


In [14]:
# Distribution of the preferred_name flag among rows where name differs from lname.
df.loc[df['name'].ne(df['lname']), 'preferred_name'].value_counts()

preferred_name
1.0    86
0.0     1
Name: count, dtype: int64

In [15]:
# Keep an untouched copy of the full table, then retain only rows that have an id.
original_df = df.copy()
df = df[df['id'].notna()]
df.shape, original_df.shape

((23236, 6), (23369, 6))

## Check preferred_name

In [16]:
# Which values does the preferred_name flag take, and how often?
df['preferred_name'].value_counts()

preferred_name
1.0    4992
0.0       2
Name: count, dtype: int64

In [17]:
# Temporary frame: the rows whose preferred_name flag is explicitly 0.
_df = df.loc[df['preferred_name'].eq(0.0)]
_df

Unnamed: 0,syn_id,id,name,preferred_name,parent_id,lname
725,22694,5313.0,ALN-TTR02,0.0,,aln-ttr02
16253,13500,1549.0,arginine hydrochloride,0.0,,arginine hydrochloride


In [18]:
# ids of the synonym groups that contain a row flagged preferred_name == 0.
ids = set(_df['id'])
# Show every synonym of those groups, ordered by id. `isin` does the membership
# test in one vectorised call and avoids a lambda whose argument shadows the
# builtin `id`.
df[df['id'].isin(ids)].sort_values(by='id')

Unnamed: 0,syn_id,id,name,preferred_name,parent_id,lname
4989,1971,1549.0,l-arginine,,,l-arginine
8238,5319,1549.0,L-Arg,,,l-arg
11131,8279,1549.0,arginine,1.0,,arginine
16253,13500,1549.0,arginine hydrochloride,0.0,,arginine hydrochloride
20852,18180,1549.0,arginine HCl,,,arginine hcl
724,22693,5313.0,patisiran,1.0,,patisiran
725,22694,5313.0,ALN-TTR02,0.0,,aln-ttr02
726,22696,5313.0,ALN-18328,,,aln-18328
22365,24142,5313.0,patisiran sodium,,,patisiran sodium
22366,24143,5313.0,onpattro,,,onpattro


In [19]:
# Remove the temporary helpers used for the preferred_name == 0 inspection.
del ids, _df

A `preferred_name` value of 1. marks the preferred name of a synonym group; NaN or 0. means the row is not the preferred name.

`arginine hydrochloride` is in the synonyms of [arginine](https://drugcentral.org/drugcard/1549?q=arginine)
`ALN-TTR02` is in the synonyms of [patisiran](https://drugcentral.org/drugcard/5313?q=patisiran)

In [20]:
# Per id, add up the preferred_name flags (missing flags do not contribute to the sum),
# then tabulate how many ids end up with each total.
check_df = (
    df.groupby('id')['preferred_name']
    .sum()
    .to_frame(name='count_preferred_name')
)
check_df.value_counts()

count_preferred_name
1.0                     4992
0.0                        3
Name: count, dtype: int64

There are 3 synonym groups that don't have a preferred name; they will be deleted.

In [21]:
# Remove every row whose id has a preferred_name total of 0 in check_df.
# The table is temporarily indexed by id so the offending ids can be dropped
# by label, then id is turned back into a regular column.
df = (
    df.set_index('id')
    .drop(index=check_df.loc[check_df['count_preferred_name'].eq(0)].index)
    .reset_index()
)

In [22]:
# The per-id preferred_name totals are no longer needed.
del check_df

## Other checks

In [23]:
# Every remaining row must have its own syn_id and its own name.
# Columns are accessed with brackets rather than as attributes so the check
# cannot accidentally pick up a DataFrame attribute, and each assertion
# carries a message so a failure names the offending column.
assert df['syn_id'].is_unique, 'syn_id values are not unique'
assert df['name'].is_unique, 'name values are not unique'

In [24]:
# lname is unique as well, so whenever it differs from name it is kept as an additional synonym.
assert df['lname'].nunique(dropna=False) == len(df)

# Group synonyms

In [25]:
# One row per preferred name: keep id and name for rows flagged preferred_name == 1
# and relabel the name column as 'preferred_name'.
df_preferred_names = (
    df.loc[df['preferred_name'].eq(1), ['id', 'name']]
    .rename(columns={'name': 'preferred_name'})
)
df_preferred_names.shape

(4992, 2)

In [26]:
# Preview the id -> preferred name lookup table.
df_preferred_names.head()

Unnamed: 0,id,preferred_name
1,5391.0,sacituzumab govitecan
7,5392.0,capmatinib
14,5393.0,selpercatinib
18,5394.0,ripretinib
22,5395.0,fluoroestradiol F 18


In [27]:
# Attach the group's preferred name to every synonym row via the shared id.
# merge() already returns a new frame, so no defensive copies are needed.
# validate='many_to_one' makes pandas raise if any id had more than one
# preferred name, instead of silently duplicating synonym rows.
df_processed = df[['id', 'name', 'lname']].merge(
    df_preferred_names[['id', 'preferred_name']],
    on='id',
    validate='many_to_one',
)
df_processed.head()

Unnamed: 0,id,name,lname,preferred_name
0,5391.0,sacituzumab,sacituzumab,sacituzumab govitecan
1,5391.0,sacituzumab govitecan,sacituzumab govitecan,sacituzumab govitecan
2,5391.0,sacituzumab govitecan-hziy,sacituzumab govitecan-hziy,sacituzumab govitecan
3,5391.0,trodelvy,trodelvy,sacituzumab govitecan
4,5391.0,IMMU-132,immu-132,sacituzumab govitecan


In [28]:
def _combine_synonyms(names_column, lnames_column):
    """Return a 1-D object array holding, per row, the sorted unique union of both arrays.

    The result array is allocated up front with dtype=object so that each cell
    holds one array regardless of the individual array lengths.
    """
    combined = np.empty(len(names_column), dtype=object)
    for position, (names, lnames) in enumerate(zip(names_column, lnames_column)):
        combined[position] = np.unique(np.concatenate([names, lnames], axis=0))
    return combined


# Collapse the table to one row per id: the unique names, the unique lnames
# and the (shared) preferred name of the group.
synonyms = df_processed.groupby('id').agg({
    'name': np.unique,
    'lname': np.unique,
    'preferred_name': lambda g: g.iloc[0],
})
# Merge names and lnames into a single synonym array per id. This is done
# without DataFrame.apply(axis=1): apply expands array results into columns
# when all arrays happen to have the same length, which would break the
# single-column assignment.
synonyms['synonyms'] = _combine_synonyms(synonyms['name'], synonyms['lname'])
synonyms = synonyms.drop(columns=['name', 'lname'])
synonyms.head()

Unnamed: 0_level_0,preferred_name,synonyms
id,Unnamed: 1_level_1,Unnamed: 2_level_1
4.0,levobupivacaine,"[chirocain, levobupivacaine, levobupivacaine H..."
5.0,(S)-nicardipine,"[(-)-Nicardipine, (-)-nicardipine, (S)-nicardi..."
6.0,(S)-nitrendipine,"[(-)-Nitrendipine, (-)-nitrendipine, (S)-nitre..."
13.0,levdobutamine,"[LY206243, levdobutamine, levdobutamine lactob..."
21.0,aminopterin,"[4-aminofolic acid, Aminofolic acid, 4-, amino..."


In [29]:
# The intermediate frames are no longer needed once `synonyms` has been built.
del df_processed, df_preferred_names

# Save

In [30]:
# Write the grouped synonyms table to the parquet path named in the data config.
synonyms.to_parquet(path=data_config['processed_synonyms_file_name'])

In [31]:
# Spot check: find the group whose synonym array contains 'anakinra'.
synonyms[synonyms['synonyms'].apply(lambda names: 'anakinra' in names)]

Unnamed: 0_level_0,preferred_name,synonyms
id,Unnamed: 1_level_1,Unnamed: 2_level_1
5017.0,anakinra,"[anakinra, kineret]"
