In [1]:
import re
import pandas as pd
import numpy as np
import os
from copy import copy
from sklearn.model_selection import train_test_split

In [2]:
from tqdm import tqdm
tqdm.pandas()

import sys
sys.path.append(f'../../')

from src.features.parallelize import apply_parallel 

In [3]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [55]:
def concat_and_explode_syn(df):
    df['term'] = df.apply(
        #lambda row: [row['STR']] + [i for i in list(set(eval(row['SNMS'])))[:3] if i.lower() != row['STR'].lower()], axis=1)
        lambda row: [row['STR']] + [i for i in list(set(eval(row['SNMS'])))[:3] if i.lower() != row['STR'].lower()], axis=1)
    return df

def concat_and_explode(df):
    df['term'] = df.apply(
        lambda row: [row['STR']], axis=1)
    return df

In [56]:
mdr_codes_to_exp = pd.read_csv('../../data/interim/meddra_codes_terms_synonims.csv')
mdr_codes_to_exp = mdr_codes_to_exp[['STR', 'SNMS', 'CODE']]
mdr_codes_to_exp = apply_parallel(mdr_codes_to_exp, concat_and_explode)
mdr_codes_to_exp = mdr_codes_to_exp.explode('term')[['term', 'CODE']].rename(columns={'CODE': 'code'})
mdr_codes_to_exp

Unnamed: 0,term,code
0,Ventilation pneumonitis,10000001
1,11-beta-hydroxylase deficiency,10000002
2,11-oxysteroid activity incr,10000003
3,11-oxysteroid activity increased,10000004
4,17 ketosteroids urine,10000005
...,...,...
84208,Infective pneumonia (SMQ),20000231
84209,Dehydration (SMQ),20000232
84210,['Hypokalaemia (SMQ)' 'Hypokalemia (SMQ)'],20000233
84211,Sepsis (SMQ),20000234


In [57]:
mdr_codes_to_exp = pd.read_csv('../../data/interim/meddra_codes_terms_synonims.csv')
mdr_codes_to_exp = mdr_codes_to_exp[['STR', 'SNMS', 'CODE']]
mdr_codes_to_exp = apply_parallel(mdr_codes_to_exp, concat_and_explode_syn)
mdr_codes_to_exp = mdr_codes_to_exp.explode('term')[['term', 'CODE']].rename(columns={'CODE': 'code'})
mdr_codes_to_exp

Unnamed: 0,term,code
0,Ventilation pneumonitis,10000001
0,Sauna takers lung,10000001
0,Humidifier AND/OR air conditioning pneumonitis,10000001
0,Humidifier lung,10000001
1,11-beta-hydroxylase deficiency,10000002
...,...,...
84208,Infective pneumonia (SMQ),20000231
84209,Dehydration (SMQ),20000232
84210,['Hypokalaemia (SMQ)' 'Hypokalemia (SMQ)'],20000233
84211,Sepsis (SMQ),20000234


In [58]:
import re

def clearing(x):
    x = re.sub('\[.*\]', '', x)
    x = re.sub('\(.*\)', '', x)
    x = re.sub('-.*-', '', x)
    x = x.replace('%', 'percent')
    x = x.replace(':', '')
    for abb in ['NOS', 'HOH', 'URS', 'AAOOR']:
        x = x.replace(abb, '')
    x = x.strip()
    return x

mdr_codes_to_exp['term'] = mdr_codes_to_exp['term'].apply(lambda x: clearing(x))
mdr_codes_to_exp = mdr_codes_to_exp.drop_duplicates()
mdr_codes_to_exp = mdr_codes_to_exp[mdr_codes_to_exp['term'] != '']
mdr_codes_to_exp

Unnamed: 0,term,code
0,Ventilation pneumonitis,10000001
0,Sauna takers lung,10000001
0,Humidifier AND/OR air conditioning pneumonitis,10000001
0,Humidifier lung,10000001
1,11hydroxylase deficiency,10000002
...,...,...
84207,['Non-haematological tumours of unspecified ma...,20000230
84208,Infective pneumonia,20000231
84209,Dehydration,20000232
84211,Sepsis,20000234


In [59]:
all_used_codes_cadec.shape, all_used_codes_psytar.shape, all_used_codes_smm4h21.shape, all_used_codes_smm4h17.shape

((786,), (617,), (543,), (725,))

In [60]:
used_codes = np.concatenate([
    all_used_codes_cadec, all_used_codes_psytar, all_used_codes_smm4h21, all_used_codes_smm4h17
]) #.shape
used_codes.shape

(2671,)

In [61]:
np.unique(used_codes).shape

(1451,)

In [62]:
mdr_codes_to_exp = mdr_codes_to_exp[mdr_codes_to_exp['code'].isin(np.unique(used_codes))]

# SMM4H 2021

In [63]:
smm4h21_spans_train = pd.read_csv('../../data/external/smm4h_2021/SMM4H_2021_train_spans.tsv', sep='\t', header=None)
smm4h21_tweets_train = pd.read_csv('../../data/external/smm4h_2021/SMM4H_2021_train_tweets.tsv', sep='\t', header=None)
smm4h21_train = smm4h21_spans_train.merge(smm4h21_tweets_train, on=0)

In [64]:
smm4h21_spans_val = pd.read_csv('../../data/external/smm4h_2021/SMM4H_2021_val_spans.tsv', sep='\t', header=None)
smm4h21_tweets_val = pd.read_csv('../../data/external/smm4h_2021/SMM4H_2021_val_tweets.tsv', sep='\t', header=None)
smm4h21_val = smm4h21_spans_val.merge(smm4h21_tweets_val, on=0)

In [65]:
mdr_codes = pd.read_csv('../../data/interim/meddra_codes_terms_synonims.csv')
mdr_cod_to_norm = mdr_codes[['CODE', 'STR', 'SNMS']]

In [66]:
# TRAIN
smm4h21_train = smm4h21_train.rename(columns={
    4: 'term', '1_y': 'text',
    2: 'start', 3: 'end',
    5: 'code'
})

smm4h21_train = smm4h21_train[['term', 'start', 'end', 'text', 'code']]
smm4h21_train = smm4h21_train[smm4h21_train['code'].str.isdigit()]
smm4h21_train['code'] = smm4h21_train['code'].astype(int)


smm4h21_train = pd.merge(smm4h21_train, mdr_cod_to_norm, left_on='code', right_on='CODE', how='left')
smm4h21_train = smm4h21_train.drop(columns=['CODE'])


# VALIDATION
smm4h21_val = smm4h21_val.rename(columns={
    4: 'term', '1_y': 'text',
    2: 'start', 3: 'end',
    5: 'code'
})

smm4h21_val = smm4h21_val[['term', 'start', 'end', 'text', 'code']]
smm4h21_val = smm4h21_val[smm4h21_val['code'].str.isdigit()]
smm4h21_val['code'] = smm4h21_val['code'].astype(int)
smm4h21_val = pd.merge(smm4h21_val, mdr_cod_to_norm, left_on='code', right_on='CODE', how='left')
smm4h21_val = smm4h21_val.drop(columns=['CODE'])

In [67]:
smm4h21_train.head(5)

Unnamed: 0,term,start,end,text,code,STR,SNMS
0,allergies,28,37,"do you have any medication allergies? ""asthma!...",10013661,Drug allergy,"['Drug allergy', 'Drug allergy', 'Drug allergy..."
1,HURT YOUR Liver,31,46,"@ashleylvivian if #avelox has hurt your liver,...",10024668,Liver damage,"['Liver damage', 'Liver damage', 'Liver damage..."
2,AD,48,50,"apparently, baclofen greatly exacerbates the ""...",10003731,Attention deficit disorder,"['Attention deficit disorder', 'Attention defi..."
3,focus,88,93,"apparently, baclofen greatly exacerbates the ""...",10003738,Attention impaired,[]
4,died,11,15,pt of mine died from cipro rt @ciproispoison: ...,10011906,Death,"['Death (finding)', 'Death (finding)', 'Death'..."


In [68]:
smm4h21_val.head(5)

Unnamed: 0,term,start,end,text,code,STR,SNMS
0,nerves,119,125,@crohnietweets i found the humira to fix all m...,10029177,Nerve damage,"['Nerve injury', 'Nerve injury', 'Nerve injury..."
1,muscle spasms,126,139,@crohnietweets i found the humira to fix all m...,10028334,Muscle spasms,"['Spasm', 'Spasm, NOS', 'Muscle spasm', 'Muscl..."
2,gaining,61,68,@jennabear32819 have to go to a doc now to see...,10047896,Weight gain,"['Weight gain', 'Weight gain', 'Increased body..."
3,gain like 50 pounds,91,110,@jennabear32819 have to go to a doc now to see...,10047896,Weight gain,"['Weight gain', 'Weight gain', 'Increased body..."
4,frontal headache,118,134,06.30 day 14 Rivaroxaban diary. Thanks to para...,10019211,Headache,"['Headache', 'Headache', 'Headache', 'Headache..."


In [69]:
smm4h21_train_ex = pd.concat([smm4h21_train, mdr_codes_to_exp])
smm4h21_train_ex

Unnamed: 0,term,start,end,text,code,STR,SNMS
0,allergies,28.0,37.0,"do you have any medication allergies? ""asthma!...",10013661,Drug allergy,"['Drug allergy', 'Drug allergy', 'Drug allergy..."
1,HURT YOUR Liver,31.0,46.0,"@ashleylvivian if #avelox has hurt your liver,...",10024668,Liver damage,"['Liver damage', 'Liver damage', 'Liver damage..."
2,AD,48.0,50.0,"apparently, baclofen greatly exacerbates the ""...",10003731,Attention deficit disorder,"['Attention deficit disorder', 'Attention defi..."
3,focus,88.0,93.0,"apparently, baclofen greatly exacerbates the ""...",10003738,Attention impaired,[]
4,died,11.0,15.0,pt of mine died from cipro rt @ciproispoison: ...,10011906,Death,"['Death (finding)', 'Death (finding)', 'Death'..."
...,...,...,...,...,...,...,...
82009,Spinal stenosis of unspecified region,,,,10082214,,
83081,Tabaquism,,,,10083286,,
83081,Tobacco dependence,,,,10083286,,
83081,Compulsive tobacco user syndrome,,,,10083286,,


In [70]:
smm4h21_train

Unnamed: 0,term,start,end,text,code,STR,SNMS
0,allergies,28,37,"do you have any medication allergies? ""asthma!...",10013661,Drug allergy,"['Drug allergy', 'Drug allergy', 'Drug allergy..."
1,HURT YOUR Liver,31,46,"@ashleylvivian if #avelox has hurt your liver,...",10024668,Liver damage,"['Liver damage', 'Liver damage', 'Liver damage..."
2,AD,48,50,"apparently, baclofen greatly exacerbates the ""...",10003731,Attention deficit disorder,"['Attention deficit disorder', 'Attention defi..."
3,focus,88,93,"apparently, baclofen greatly exacerbates the ""...",10003738,Attention impaired,[]
4,died,11,15,pt of mine died from cipro rt @ciproispoison: ...,10011906,Death,"['Death (finding)', 'Death (finding)', 'Death'..."
...,...,...,...,...,...,...,...
1707,orgasm,48,54,do you think anyone has ever managed to have a...,10021574,Inability to orgasm,"['Anorgasmia', 'Frigidity proper', 'Orgasm inc..."
1708,never have another orgasm,91,116,@verlieren thank you! it didn't even fucking w...,10021574,Inability to orgasm,"['Anorgasmia', 'Frigidity proper', 'Orgasm inc..."
1709,coma,65,69,i just found out that some dude from my friend...,10041349,Somnolence,"['Somnolence', 'Somnolence', 'Somnolence', 'So..."
1710,gain so much weight,72,91,that zyprexa really makes your vocal chords ma...,10047986,Wilms tumour,"['Nephroblastoma', 'Nephroblastoma', 'Nephrobl..."


In [71]:
#smm4h21_train, smm4h21_test = train_test_split(smm4h21, test_size=0.2)

smm4h21_train_ex.to_csv('../../data/interim/smm4h21/train_ex.csv', index=False)
smm4h21_train.to_csv('../../data/interim/smm4h21/train.csv', index=False)
smm4h21_val.to_csv('../../data/interim/smm4h21/test.csv', index=False)

In [72]:
smm4h21_train.shape, smm4h21_train_ex.shape

((1712, 7), (5303, 7))

In [73]:
all_used_codes_smm4h21 = np.concatenate([smm4h21_train['code'].unique(), smm4h21_val['code'].unique()])
all_used_codes_smm4h21.shape

(543,)

# SMM4H 2017

## text files

In [74]:
train_datasets = []
test_datasets = []

path = '../../data/external/smm4h_2017/text_files_direct/'
for file in os.listdir(path):
    print(file)
    df = pd.read_csv(path + file, sep='\t', header=None)
    if 'train' in file:
        train_datasets.append(df)
    elif 'eval' in file:
        test_datasets.append(df)
        
smm4h17_train = pd.concat(train_datasets).rename
smm4h17_test  = pd.concat(test_datasets)

task_3_normalization_evaluation.txt
task_3_normalization_training1.txt
task_3_normalization_training2.txt
task_3_normalization_training3.txt
task_3_normalization_training4.txt


## KFU's smm4h17

In [75]:
smm4h17_spans_train = pd.read_csv('../../data/external/smm4h_2017/smm4h_kfu/processed_train/0.concept', 
                                  sep='|', header=None).dropna(axis=1)[[7, 9]]
smm4h17_spans_test = pd.read_csv('../../data/external/smm4h_2017/smm4h_kfu/processed_test/0.concept', 
                                  sep='|', header=None).dropna(axis=1)[[7, 9]]

In [76]:
mdr_codes = pd.read_csv('../../data/interim/meddra_codes_terms_synonims.csv')
mdr_cod_to_norm = mdr_codes[['CODE', 'STR', 'SNMS']]

In [77]:
smm4h17_spans_train = smm4h17_spans_train.rename(columns={7: 'term', 9: 'code'})
smm4h17_spans_test = smm4h17_spans_test.rename(columns={7: 'term', 9: 'code'})

#smm4h17_spans_train = smm4h17_spans_train[smm4h17_spans_train['code'].str.isdigit()]
smm4h17_spans_test = smm4h17_spans_test[smm4h17_spans_test['code'].str.isdigit()]

smm4h17_spans_train['code'] = smm4h17_spans_train['code'].astype(int)
smm4h17_spans_test['code'] = smm4h17_spans_test['code'].astype(int)

smm4h17_spans_train = pd.merge(smm4h17_spans_train, mdr_cod_to_norm, left_on='code', right_on='CODE', how='left')
smm4h17_spans_train = smm4h17_spans_train.drop(columns=['CODE'])
smm4h17_spans_train

smm4h17_spans_test = pd.merge(smm4h17_spans_test, mdr_cod_to_norm, left_on='code', right_on='CODE', how='left')
smm4h17_spans_test = smm4h17_spans_test.drop(columns=['CODE'])
smm4h17_spans_test

Unnamed: 0,term,code,STR,SNMS
0,sleepier,10041349,Somnolence,"['Somnolence', 'Somnolence', 'Somnolence', 'So..."
1,dreamt colors,10000125,Abnormal dreams,"['Dream disorder', 'Abnormal dreams', 'Abnorma..."
2,zombie,10016322,Feeling abnormal,"['Feeling abnormal', 'Abnormal feeling', 'Abno..."
3,headache,10019211,Headache,"['Headache', 'Headache', 'Headache', 'Headache..."
4,crazy,10061920,Psychotic disorder,"['Psychotic disorder, NOS', 'Psychotic disorde..."
...,...,...,...,...
2494,sleptwalk,10041347,Somnambulism,"['Somnambulism', 'Somnambulism', 'Somnambulism..."
2495,fatigue,10016256,Fatigue,"['Fatigue', 'Fatigue', 'Fatigue', 'Tiredness',..."
2496,headache,10019211,Headache,"['Headache', 'Headache', 'Headache', 'Headache..."
2497,out of it,10041349,Somnolence,"['Somnolence', 'Somnolence', 'Somnolence', 'So..."


In [78]:
smm4h17_spans_train_ex = pd.concat([smm4h17_spans_train, mdr_codes_to_exp])
smm4h17_spans_train_ex

Unnamed: 0,term,code,STR,SNMS
0,addict,10013663,Drug dependence,"['Drug dependence', 'Drug dependence', 'Drug d..."
1,allergic reaction,10020751,Hypersensitivity,"['Hypersensitivity', 'Hypersensitivity', 'Hype..."
2,pre-cutting,10022524,Intentional self-injury,[]
3,withdrawals,10048010,Withdrawal syndrome,[]
4,delirious,10012218,Delirium,"['Delirium', 'Delirium', 'Delirium', 'Delirium..."
...,...,...,...,...
82009,Spinal stenosis of unspecified region,10082214,,
83081,Tabaquism,10083286,,
83081,Tobacco dependence,10083286,,
83081,Compulsive tobacco user syndrome,10083286,,


In [79]:
smm4h17_spans_train_ex.to_csv('../../data/interim/smm4h17/train_ex.csv', index=False)
smm4h17_spans_train.to_csv('../../data/interim/smm4h17/train.csv', index=False)
smm4h17_spans_test.to_csv('../../data/interim/smm4h17/test.csv', index=False)

In [80]:
smm4h17_spans_train.shape, smm4h17_spans_train_ex.shape

((6650, 4), (10241, 4))

In [81]:
all_used_codes_smm4h17 = np.concatenate([smm4h17_spans_train['code'].unique(), smm4h17_spans_test['code'].unique()])
all_used_codes_smm4h17.shape

(725,)

In [82]:
# smm4h17_dict_train = pd.read_csv('../../data/external/smm4h_2017/smm4h_kfu/train_dictionary.txt', 
#                                   sep='|', header=None, error_bad_lines=False)
# smm4h17_dict_test = pd.read_csv('../../data/external/smm4h_2017/smm4h_kfu/test_dictionary.txt', 
#                                   sep='|', header=None, error_bad_lines=False)

# UMLS micro

In [83]:
umls_eng_mdr = pd.read_csv('../../data/external/mrconso_umls/MRCONSO_ENG_MDR.csv')
umls_eng_mdr

Unnamed: 0.1,Unnamed: 0,CUI,LAT,TS,LUI,STT,SUI,ISPREF,AUI,SAUI,SCUI,SDUI,SAB,TTY,CODE,STR,SRL,SUPPRESS,CVF
0,2276,C0000727,ENG,P,L0000727,VCW,S0584932,N,A0639292,,,10000647,MDR,PT,10000647,Acute abdomen,3,N,256.0
1,2282,C0000727,ENG,P,L0000727,VCW,S0584932,N,A25741630,,,10000647,MDR,LLT,10000647,Acute abdomen,3,N,256.0
2,2296,C0000727,ENG,S,L0161339,PF,S1616740,Y,A25720821,,,10000647,MDR,LLT,10042784,Syndrome abdominal acute,3,N,
3,2299,C0000727,ENG,S,L0161339,VCW,S1616739,Y,A25708511,,,10000647,MDR,LLT,10000096,Abdominal syndrome acute,3,N,
4,2334,C0000729,ENG,P,L0000729,VC,S0353650,N,A25716812,,,10000081,MDR,LLT,10000057,Abdominal cramps,3,N,256.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112380,11068329,C5244084,ENG,P,L16308084,PF,S19795290,Y,A31790308,,,10055798,MDR,LLT,10084046,Severe bleeding - GUSTO classification,3,N,
112381,11068331,C5244085,ENG,P,L16308140,VC,S19795001,Y,A31790861,,,10053315,MDR,LLT,10084469,Home isolation,3,N,
112382,11068333,C5244086,ENG,P,L16307956,PF,S19794712,Y,A31789860,,,10084354,MDR,LLT,10084499,COVID-19 rapid POC test,3,N,
112383,11068334,C5244087,ENG,P,L16308134,PF,S19794970,Y,A31790128,,,10063630,MDR,LLT,10083228,Genital scratch,3,N,


In [84]:
code_to_term = umls_eng_mdr[['CODE', 'STR']].groupby('CODE').agg(set)
code_to_term[[len(i)>1 for i in code_to_term['STR']]].to_numpy()
code_to_term

Unnamed: 0_level_0,STR
CODE,Unnamed: 1_level_1
10000001,"{""Ventilation"" pneumonitis}"
10000002,{11-beta-hydroxylase deficiency}
10000003,{11-oxysteroid activity incr}
10000004,{11-oxysteroid activity increased}
10000005,{17 ketosteroids urine}
...,...
20000231,{Infective pneumonia (SMQ)}
20000232,{Dehydration (SMQ)}
20000233,"{Hypokalemia (SMQ), Hypokalaemia (SMQ)}"
20000234,{Sepsis (SMQ)}


# PsyTar

In [85]:
def get_MDR_code_by_CUI(CUI):
    potentials = umls_eng_mdr[umls_eng_mdr['CUI']==CUI]
    lst =  potentials['CODE'].to_list()
    return max(set(lst), key=lst.count) if len(lst) > 0 else None

In [86]:
psytar_text = pd.read_excel('../../data/external/psytar/PsyTAR_dataset.xlsx', sheet_name='Sentence_Labeling')
psytar_text = psytar_text[['drug_id', 'sentences']]
psytar_text = psytar_text.groupby('drug_id').agg(list)
psytar_text = psytar_text.reset_index(inplace=False)
psytar_text['sentences'] = psytar_text['sentences'].apply(lambda x: '<SENT>'.join([str(i) for i in x]))

In [87]:
psytar_sent = pd.read_excel('../../data/external/psytar/PsyTAR_dataset.xlsx', sheet_name='ADR_Identified')
psytar_adr = pd.read_excel('../../data/external/psytar/PsyTAR_dataset.xlsx', sheet_name='ADR_Mapped')

# SENT
psytar_sent['mention'] = psytar_sent.apply(
    lambda x: [x[col] for col in psytar_sent if 'ADR' in col and x[col] is not np.nan], 
    axis=1)
psytar_sent = psytar_sent[[col for col in psytar_sent.columns if "ADR" not in col]]
psytar_sent = psytar_sent[['id', 'drug_id', 'sentence_index', 'sentences']]
psytar_sent = psytar_sent.rename(columns={'sentences': 'sentence'})

# ADR
psytar_adr = psytar_adr[psytar_adr['type']=='ADR'][
    ['drug_id', 'sentence_index', 'ADRs', 'UMLS1']
]

In [88]:
# MERGE
psytar = pd.merge(psytar_adr, psytar_sent,  how='left', 
         left_on=['drug_id','sentence_index'], 
         right_on = ['drug_id','sentence_index'])

psytar = psytar.merge(psytar_text, on='drug_id', how='left')

In [89]:
# get CUI and term positions (may be more)
psytar['CUI'], psytar['term'] = zip(*psytar['UMLS1'].apply(lambda x: x.split('/')[:2] if len(x.split('/')) >= 2 else [x, None]))

# drop bad data
psytar = psytar[~psytar['term'].isna()]
psytar = psytar[psytar['CUI'].str[0]=='C']

# select needed columns
psytar_adrs = psytar[['ADRs', 'term', 'CUI', 'sentence_index', 'sentence', 'sentences']]
psytar_adrs['MDR'] = psytar_adrs['CUI'].apply(lambda x: get_MDR_code_by_CUI(x.strip()))

psytar_adrs = psytar_adrs.drop(columns=['CUI'])
psytar_adrs = psytar_adrs.dropna()
psytar_adrs = psytar_adrs.rename(columns={
    'term': 'norm_form',
    'ADRs': 'term',
    'sentence_index': 'sent_idx',
    'sentence': 'sent',
    'sentences': 'text',
    'MDR': 'code'
})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [90]:
# MAKE SPANS
psytar_adrs['span'] = psytar_adrs.apply(
    lambda x: list(re.finditer(x['term'].lower(), x['text'].lower())), axis=1)

psytar_adrs['start'] = psytar_adrs['span'].apply(lambda x: x[0].span()[0] if len(x) == 1 else None)
psytar_adrs['end'] = psytar_adrs['span'].apply(lambda x: x[0].span()[1] if len(x) == 1 else None)

psytar_adrs = psytar_adrs.drop(columns=['span'])
psytar_adrs = psytar_adrs[
    ['term', 'start', 'end', 'sent_idx', 'text', 'norm_form', 'code']
]

#psytar_adrs = psytar_adrs[psytar_adrs['code'].str.isdigit()]

mdr_codes = pd.read_csv('../../data/interim/meddra_codes_terms_synonims.csv')
mdr_cod_to_norm = mdr_codes[['CODE', 'STR', 'SNMS']]
psytar_adrs = pd.merge(psytar_adrs, mdr_cod_to_norm, left_on='code', right_on='CODE', how='left')
psytar_adrs = psytar_adrs.drop(columns=['CODE'])
psytar_adrs

psytar_adrs

Unnamed: 0,term,start,end,sent_idx,text,norm_form,code,STR,SNMS
0,short-term memory loss,21.0,43.0,1,"extreme weight gain, short-term memory loss, h...",Poor short-term memory,10040602.0,Short-term memory loss,"['Poor short-term memory', 'Short-term memory ..."
1,hair loss,45.0,54.0,1,"extreme weight gain, short-term memory loss, h...",Alopecia,10001760.0,Alopecia,"['Alopecia', 'Alopecia', 'Alopecia', 'Alopecia..."
2,completely destroyed sexually functioning,0.0,41.0,1,COMPLETELY DESTROYED SEXUALLY FUNCTIONING .<SE...,Sexual Dysfunction,10040477.0,Sexual dysfunction,"['Sexual dysfunction', 'Sexual dysfunction', '..."
3,completely destroyed my sexual functioning,144.0,186.0,4,COMPLETELY DESTROYED SEXUALLY FUNCTIONING .<SE...,Sexual Dysfunction,10040477.0,Sexual dysfunction,"['Sexual dysfunction', 'Sexual dysfunction', '..."
4,pssd,,,5,COMPLETELY DESTROYED SEXUALLY FUNCTIONING .<SE...,Sexual Dysfunction,10040477.0,Sexual dysfunction,"['Sexual dysfunction', 'Sexual dysfunction', '..."
...,...,...,...,...,...,...,...,...,...
3985,early on: nausea,,,1,"Stomach problems early on: bloating, nausea, c...",Nausea,10028813.0,Nausea,"['Nausea', 'Nausea', 'Nausea', 'Nausea', 'Naus..."
3986,early on: constipation,,,1,"Stomach problems early on: bloating, nausea, c...",Constipation,10010774.0,Constipation,"['Constipation', 'Constipation', 'Constipation..."
3987,yawning,137.0,144.0,3,"Stomach problems early on: bloating, nausea, c...",Yawning,10048232.0,Yawning,"['Yawning', 'Yawning', 'Yawn', '[D]Yawning', '..."
3988,mild insomnia for the first 3 days,,,1,The only side effects I experienced were mild ...,Sleeplessness,10022437.0,Insomnia,"['Sleeplessness', 'Sleeplessness', 'Sleeplessn..."


In [91]:
psytar_adrs_train, psytar_adrs_test = train_test_split(psytar_adrs, test_size=0.2)

In [92]:
psytar_adrs_train_ex = pd.concat([psytar_adrs_train, mdr_codes_to_exp])
psytar_adrs_train_ex

Unnamed: 0,term,start,end,sent_idx,text,norm_form,code,STR,SNMS
1857,loose stool for 2 weeks,83.0,106.0,1.0,"increased anxiety was the worst thing for me, ...",Loose stool,10024837.0,Loose bowel,"['Loose stool', 'Loose stool', 'Loose stool', ..."
1800,have the jitters,229.0,245.0,4.0,Its really to early to tell for me.<SENT>I had...,Feeling jittery,10016338.0,Feeling jittery,['Feeling jittery']
531,slight schizophrenia,122.0,142.0,3.0,The 1ST time taking the Lexapro .<SENT>Slight ...,Schizophrenia,10039626.0,Schizophrenia,"['Schizophrenia', 'Schizophrenia, NOS', 'Schiz..."
2856,nausea,,,1.0,"Insomnia, dry mouth, constipation, nausea.<SEN...",Nausea,10028813.0,Nausea,"['Nausea', 'Nausea', 'Nausea', 'Nausea', 'Naus..."
2186,agitated,37.0,45.0,1.0,"feel wired, feel homicidal/suicidal, agitated,...",Agitation,10001497.0,Agitation,"['Agitation', 'Increased purposeless goalless ..."
...,...,...,...,...,...,...,...,...,...
82009,Spinal stenosis of unspecified region,,,,,,10082214.0,,
83081,Tabaquism,,,,,,10083286.0,,
83081,Tobacco dependence,,,,,,10083286.0,,
83081,Compulsive tobacco user syndrome,,,,,,10083286.0,,


In [93]:
psytar_adrs_train_ex.to_csv('../../data/interim/psytar/train_ex.csv', index=False)
psytar_adrs_train.to_csv('../../data/interim/psytar/train.csv', index=False)
psytar_adrs_test.to_csv('../../data/interim/psytar/test.csv', index=False)

In [94]:
all_used_codes_psytar = np.concatenate([psytar_adrs_train['code'].unique(), psytar_adrs_test['code'].unique()])
all_used_codes_psytar.shape

(617,)

# CADEC

In [95]:
dfs = []

empty_files = 0
for file in os.listdir('../../data/external/cadec2/cadec/meddra/'):
    try:
        file_name = '.'.join(file.split('.')[:2])
        #print(file_name)
        text = pd.read_csv('../../data/external/cadec2/cadec/text/' + f"{file_name}.txt", sep='\t', header=None)
        adr = pd.read_csv('../../data/external/cadec2/cadec/meddra/' + file, sep='\t', header=None)

    except pd.errors.EmptyDataError as e:
        empty_files += 1
        #print(file, end=' ')
        continue
    
    # подбор предложения
    adr['sent_number'] = adr[0].apply(lambda x: int(x[2:])-1 if x[2:].isdigit() else None)
    text = text.reset_index()
    df = pd.merge(adr, text, how='left', left_on='sent_number', right_on='index')
    df['text'] = '<SENT>'.join(text[0].to_list())
    dfs.append(df)

In [96]:
cadec2 = pd.concat(dfs, axis=0)

cadec2 = cadec2.rename(
    columns={1: 'code_span', 2: 'term', '0_y': 'sent'})[['term', 'code_span', 'sent', 'text']]

cadec2 = cadec2.dropna()

cadec2['code'] = cadec2['code_span'].apply(lambda x: x.split(' ')[0])
cadec2['span'] = cadec2['code_span'].apply(lambda x: x.split(' ')[-2:])
cadec2['span'] = cadec2['span'].apply(lambda x: [i.split(';')[-1] for i in x])

cadec2 = cadec2[cadec2['code'].str.isdigit()]
cadec2['start'] = cadec2['span'].apply(lambda x: x[0])
cadec2['end'] = cadec2['span'].apply(lambda x: x[1])

cadec2 = cadec2[
    ['term', 'start', 'end', 'sent', 'text', 'code']
]

cadec2 = cadec2[cadec2['code'].str.isdigit()]
cadec2

Unnamed: 0,term,start,end,sent,text,code
0,light nausea,126,138,"For the first 8 days of ever taking it, the on...","For the first 8 days of ever taking it, the on...",10028813
1,sharp pain in my stomach,229,253,"After 8 days, the feeling elevated to an annoy...","For the first 8 days of ever taking it, the on...",10033371
2,pain unbearable,329,339,"Then two days later, I had to stop using it be...","For the first 8 days of ever taking it, the on...",10033371
3,stomach pain,395,407,Now I have been off for two days and I still h...,"For the first 8 days of ever taking it, the on...",10042076
4,stomach pain,424,436,Now I have to see my doctor again to see if I ...,"For the first 8 days of ever taking it, the on...",10042076
...,...,...,...,...,...,...
12,pain in back,366,370,I lost 2 months of my life and I still don't r...,Horrific medication - Suffered an acute pancre...,10003993
0,sedation,5,13,Mild sedation.,Mild sedation.<SENT>This is a GREAT drug for m...,10039897
0,nausea,0,6,nausea.,nausea.<SENT>some pain relief.,10028813
0,hurts throat,111,116,My throat still hurts while on it; however the...,Haven't really experienced any side effects th...,10033494


In [97]:
cadec2['code'] = cadec2['code'].astype(int)

In [98]:
mdr_codes = pd.read_csv('../../data/interim/meddra_codes_terms_synonims.csv')
mdr_cod_to_norm = mdr_codes[['CODE', 'STR', 'SNMS']]
cadec2 = pd.merge(cadec2, mdr_cod_to_norm, left_on='code', right_on='CODE', how='left')
cadec2 = cadec2.drop(columns=['CODE'])
cadec2

Unnamed: 0,term,start,end,sent,text,code,STR,SNMS
0,light nausea,126,138,"For the first 8 days of ever taking it, the on...","For the first 8 days of ever taking it, the on...",10028813,Nausea,"['Nausea', 'Nausea', 'Nausea', 'Nausea', 'Naus..."
1,sharp pain in my stomach,229,253,"After 8 days, the feeling elevated to an annoy...","For the first 8 days of ever taking it, the on...",10033371,Pain,"['Pain', 'Pain', 'Pain', 'Pain', 'Pain', 'Pain..."
2,pain unbearable,329,339,"Then two days later, I had to stop using it be...","For the first 8 days of ever taking it, the on...",10033371,Pain,"['Pain', 'Pain', 'Pain', 'Pain', 'Pain', 'Pain..."
3,stomach pain,395,407,Now I have been off for two days and I still h...,"For the first 8 days of ever taking it, the on...",10042076,Stomach ache,"['Stomach ache', 'Stomach ache', 'Belly ache',..."
4,stomach pain,424,436,Now I have to see my doctor again to see if I ...,"For the first 8 days of ever taking it, the on...",10042076,Stomach ache,"['Stomach ache', 'Stomach ache', 'Belly ache',..."
...,...,...,...,...,...,...,...,...
4396,pain in back,366,370,I lost 2 months of my life and I still don't r...,Horrific medication - Suffered an acute pancre...,10003993,Backache,"['Back pain', 'Back pain', 'Back pain', 'Back ..."
4397,sedation,5,13,Mild sedation.,Mild sedation.<SENT>This is a GREAT drug for m...,10039897,Sedation,"['Sedated state', 'Sedated', 'Under sedation',..."
4398,nausea,0,6,nausea.,nausea.<SENT>some pain relief.,10028813,Nausea,"['Nausea', 'Nausea', 'Nausea', 'Nausea', 'Naus..."
4399,hurts throat,111,116,My throat still hurts while on it; however the...,Haven't really experienced any side effects th...,10033494,Pain throat,"['Sore throat', 'Sore throat', 'Sore throat', ..."


In [99]:
cadec2_train, cadec2_test = train_test_split(cadec2, test_size=0.2)

In [100]:
cadec2_train_ex = pd.concat([cadec2_train, mdr_codes_to_exp])
cadec2_train_ex

Unnamed: 0,term,start,end,sent,text,code,STR,SNMS
1074,lethargy,0,8,lethargy and leg soreness.,lethargy and leg soreness.,10024264,Lethargy,"['Lethargy', 'Lethargy', 'Lethargy', 'Lethargi..."
4086,Pain in the shoulders,34,55,A tingling sensation in the back of my neck.,Joint pain in the knees and hips.<SENT>Pain in...,10040617,Shoulder pain,"['Shoulder pain', 'Shoulder pain', 'Shoulder p..."
4123,unable to walk,128,142,I stopped it 8 weeks ago and still have much m...,"Bouts of extreme forgetfulness, extreme muscle...",10049278,Unable to walk,"['Unable to walk (finding)', 'Unable to walk']"
1061,hematuria,30,39,Lipitor caused permanent liver and kidney dama...,"Severe back pain, flank pain, hematuria, kidne...",10018867,['Hematuria' 'Haematuria'],"['Hematuria', 'Hematuria', 'Hematuria', 'Haema..."
2540,Extreme vertigo,33,48,In decreasing order of severity: Extreme verti...,In decreasing order of severity: Extreme verti...,10047340,Vertigo,"['Vertigo', 'Vertigo', 'Vertigo', 'Vertigo, NO..."
...,...,...,...,...,...,...,...,...
82009,Spinal stenosis of unspecified region,,,,,10082214,,
83081,Tabaquism,,,,,10083286,,
83081,Tobacco dependence,,,,,10083286,,
83081,Compulsive tobacco user syndrome,,,,,10083286,,


In [101]:
cadec2_train_ex.to_csv('../../data/interim/cadec/train_ex.csv', index=False)
cadec2_train.to_csv('../../data/interim/cadec/train.csv', index=False)
cadec2_test.to_csv('../../data/interim/cadec/test.csv', index=False)

In [102]:
cadec2_train.shape, cadec2_train_ex.shape

((3520, 8), (7111, 8))

In [103]:
all_used_codes_cadec = np.concatenate([cadec2_train['code'].unique(), cadec2_test['code'].unique()])
all_used_codes_cadec.shape

(781,)

# MRCONSO

In [None]:
mrconso = pd.read_csv('../../data/external/mrconso_umls/MRCONSO_ENG.RRF', sep='|', header=None)

In [None]:
mrconso = mrconso.rename(columns={
    0: "CUI", 
    1: 'LAT',
    2: 'TS',
    3: "LUI",
    4: "STT",
    5: "SUI",
    6: "ISPREF",
    7: "AUI",
    8: "SAUI",
    9: "SCUI",
    10: "SDUI",
    11: "SAB",
    12: "TTY",
    13: "CODE",
    14: "STR",
    15: "SRL",
    16: "SUPPRESS",
    17: "CVF"
})[
    ["CUI", "TS", "SAB", "TTY", "CODE", "STR"]
]

In [None]:
mdr_codes = mrconso[mrconso['SAB']=="MDR"].groupby("CODE").agg(lambda x: x.unique())
snomed =    mrconso[mrconso['SAB']=="SNOMEDCT_US"]

In [None]:
from tqdm import tqdm
tqdm.pandas()

In [None]:
mdr_codes = mdr_codes.reset_index(inplace=False)
mdr_codes['SNMS'] = mdr_codes['CUI'].progress_apply(lambda cui: snomed[snomed['CUI']==cui]['STR'].to_list())

In [None]:
mdr_codes

In [None]:
mdr_codes.to_csv('../../data/interim/meddra_codes_terms_synonims.csv', index=False)