In [1]:
import re
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split

In [2]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# SMM4H 2021

In [3]:
smm4h21_spans = pd.read_csv('../../data/external/smm4h_2021/SMM4H_2021_train_spans.tsv', sep='\t', header=None)
smm4h21_tweets = pd.read_csv('../../data/external/smm4h_2021/SMM4H_2021_train_tweets.tsv', sep='\t', header=None)
smm4h21 = smm4h21_spans.merge(smm4h21_tweets, on=0)

In [4]:
smm4h21 = smm4h21.rename(columns={
    4: 'term', '1_y': 'text',
    2: 'start', 3: 'end',
    5: 'code'
})

smm4h21 = smm4h21[['term', 'start', 'end', 'text', 'code']]
smm4h21 = smm4h21[smm4h21['code'].str.isdigit()]
smm4h21

Unnamed: 0,term,start,end,text,code
0,allergies,28,37,"do you have any medication allergies? ""asthma!...",10013661
1,HURT YOUR Liver,31,46,"@ashleylvivian if #avelox has hurt your liver,...",10024668
2,AD,48,50,"apparently, baclofen greatly exacerbates the ""...",10003731
3,focus,88,93,"apparently, baclofen greatly exacerbates the ""...",10003738
4,died,11,15,pt of mine died from cipro rt @ciproispoison: ...,10011906
...,...,...,...,...,...
1708,orgasm,48,54,do you think anyone has ever managed to have a...,10021574
1709,never have another orgasm,91,116,@verlieren thank you! it didn't even fucking w...,10021574
1710,coma,65,69,i just found out that some dude from my friend...,10041349
1711,gain so much weight,72,91,that zyprexa really makes your vocal chords ma...,10047986


In [5]:
smm4h21_train, smm4h21_test = train_test_split(smm4h21, test_size=0.2)

In [6]:
smm4h21_train.to_csv('../../data/interim/smm4h21/train.csv', index=False)
smm4h21_test.to_csv('../../data/interim/smm4h21/test.csv', index=False)

# SMM4H 2017

## text files

In [7]:
train_datasets = []
test_datasets = []

path = '../../data/external/smm4h_2017/text_files_direct/'
for file in os.listdir(path):
    print(file)
    df = pd.read_csv(path + file, sep='\t', header=None)
    if 'train' in file:
        train_datasets.append(df)
    elif 'eval' in file:
        test_datasets.append(df)
        
smm4h17_train = pd.concat(train_datasets).rename
smm4h17_test  = pd.concat(test_datasets)

task_3_normalization_evaluation.txt
task_3_normalization_training1.txt
task_3_normalization_training2.txt
task_3_normalization_training3.txt
task_3_normalization_training4.txt


In [8]:
smm4h17_train

<bound method DataFrame.rename of           0                  1         2
0     10365             addict  10013663
1     11138  allergic reaction  10020751
2     10810        pre-cutting  10022524
3     10426        withdrawals  10048010
4     10287          delirious  10012218
...     ...                ...       ...
3909  44422           sedation  10039897
3910  41603             zombie  10016322
3911  41754     hallucinations  10019063
3912  43127    not go to sleep  10022437
3913  43299    unable to sleep  10022437

[6650 rows x 3 columns]>

In [9]:
smm4h17_test

Unnamed: 0,0,1,2
0,44675,sleepier,10041349
1,40103,dreamt colors,10000125
2,41585,zombie,10016322
3,41834,headache,10019211
4,46301,crazy,10061920
...,...,...,...
2495,44530,sleptwalk,10041347
2496,41240,fatigue,10016256
2497,41829,headache,10019211
2498,44667,out of it,10041349


## KFU

In [10]:
smm4h17_spans_train = pd.read_csv('../../data/external/smm4h_2017/smm4h_kfu/processed_train/0.concept', 
                                  sep='|', header=None).dropna(axis=1)[[7, 9]]
smm4h17_spans_test = pd.read_csv('../../data/external/smm4h_2017/smm4h_kfu/processed_test/0.concept', 
                                  sep='|', header=None).dropna(axis=1)[[7, 9]]

In [11]:
smm4h17_spans_train = smm4h17_spans_train.rename(columns={7: 'term', 9: 'code'})
smm4h17_spans_test = smm4h17_spans_test.rename(columns={7: 'term', 9: 'code'})

#smm4h17_spans_train = smm4h17_spans_train[smm4h17_spans_train['code'].str.isdigit()]
smm4h17_spans_test = smm4h17_spans_test[smm4h17_spans_test['code'].str.isdigit()]

smm4h17_spans_train.to_csv('../../data/interim/smm4h17/train.csv', index=False)
smm4h17_spans_test.to_csv('../../data/interim/smm4h17/test.csv', index=False)

In [12]:
# smm4h17_dict_train = pd.read_csv('../../data/external/smm4h_2017/smm4h_kfu/train_dictionary.txt', 
#                                   sep='|', header=None, error_bad_lines=False)
# smm4h17_dict_test = pd.read_csv('../../data/external/smm4h_2017/smm4h_kfu/test_dictionary.txt', 
#                                   sep='|', header=None, error_bad_lines=False)

# UMLS micro

In [15]:
umls_eng_mdr = pd.read_csv('../../data/external/mrconso_umls/MRCONSO_ENG_MDR.csv')
umls_eng_mdr

Unnamed: 0.1,Unnamed: 0,CUI,LAT,TS,LUI,STT,SUI,ISPREF,AUI,SAUI,SCUI,SDUI,SAB,TTY,CODE,STR,SRL,SUPPRESS,CVF
0,2276,C0000727,ENG,P,L0000727,VCW,S0584932,N,A0639292,,,10000647,MDR,PT,10000647,Acute abdomen,3,N,256.0
1,2282,C0000727,ENG,P,L0000727,VCW,S0584932,N,A25741630,,,10000647,MDR,LLT,10000647,Acute abdomen,3,N,256.0
2,2296,C0000727,ENG,S,L0161339,PF,S1616740,Y,A25720821,,,10000647,MDR,LLT,10042784,Syndrome abdominal acute,3,N,
3,2299,C0000727,ENG,S,L0161339,VCW,S1616739,Y,A25708511,,,10000647,MDR,LLT,10000096,Abdominal syndrome acute,3,N,
4,2334,C0000729,ENG,P,L0000729,VC,S0353650,N,A25716812,,,10000081,MDR,LLT,10000057,Abdominal cramps,3,N,256.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112380,11068329,C5244084,ENG,P,L16308084,PF,S19795290,Y,A31790308,,,10055798,MDR,LLT,10084046,Severe bleeding - GUSTO classification,3,N,
112381,11068331,C5244085,ENG,P,L16308140,VC,S19795001,Y,A31790861,,,10053315,MDR,LLT,10084469,Home isolation,3,N,
112382,11068333,C5244086,ENG,P,L16307956,PF,S19794712,Y,A31789860,,,10084354,MDR,LLT,10084499,COVID-19 rapid POC test,3,N,
112383,11068334,C5244087,ENG,P,L16308134,PF,S19794970,Y,A31790128,,,10063630,MDR,LLT,10083228,Genital scratch,3,N,


In [16]:
code_to_term = umls_eng_mdr[['CODE', 'STR']].groupby('CODE').agg(set)
code_to_term[[len(i)>1 for i in code_to_term['STR']]].to_numpy()
code_to_term

Unnamed: 0_level_0,STR
CODE,Unnamed: 1_level_1
10000001,"{""Ventilation"" pneumonitis}"
10000002,{11-beta-hydroxylase deficiency}
10000003,{11-oxysteroid activity incr}
10000004,{11-oxysteroid activity increased}
10000005,{17 ketosteroids urine}
...,...
20000231,{Infective pneumonia (SMQ)}
20000232,{Dehydration (SMQ)}
20000233,"{Hypokalemia (SMQ), Hypokalaemia (SMQ)}"
20000234,{Sepsis (SMQ)}


# PsyTar

In [17]:
def get_MDR_code_by_CUI(CUI):
    potentials = umls_eng_mdr[umls_eng_mdr['CUI']==CUI]
    lst =  potentials['CODE'].to_list()
    return max(set(lst), key=lst.count) if len(lst) > 0 else None

In [19]:
psytar_sent = pd.read_excel('../../data/external/psytar/PsyTAR_dataset.xlsx', sheet_name='ADR_Identified')
psytar_adr = pd.read_excel('../../data/external/psytar/PsyTAR_dataset.xlsx', sheet_name='ADR_Mapped')

# SENT
psytar_sent['mention'] = psytar_sent.apply(
    lambda x: [x[col] for col in psytar_sent if 'ADR' in col and x[col] is not np.nan], 
    axis=1)
psytar_sent = psytar_sent[[col for col in psytar_sent.columns if "ADR" not in col]]
psytar_sent = psytar_sent[['id', 'drug_id', 'sentence_index', 'sentences']]
psytar_sent

# ADR
psytar_adr = psytar_adr[psytar_adr['type']=='ADR'][
    ['drug_id', 'sentence_index', 'ADRs', 'UMLS1']
]

In [20]:
# MERGE
psytar = pd.merge(psytar_adr, psytar_sent,  how='left', 
         left_on=['drug_id','sentence_index'], 
         right_on = ['drug_id','sentence_index'])

# get CUI and term positions (may be more)
psytar['CUI'], psytar['term'] = zip(*psytar['UMLS1'].apply(lambda x: x.split('/')[:2] if len(x.split('/')) >= 2 else [x, None]))

# drop bad data
psytar = psytar[~psytar['term'].isna()]
psytar = psytar[psytar['CUI'].str[0]=='C']

# select needed columns
psytar_adrs = psytar[['ADRs', 'term', 'CUI', 'sentences']]
psytar_adrs['MDR'] = psytar_adrs['CUI'].apply(lambda x: get_MDR_code_by_CUI(x.strip()))

psytar_adrs = psytar_adrs.drop(columns=['CUI'])
psytar_adrs = psytar_adrs.dropna()
psytar_adrs = psytar_adrs.rename(columns={
    'term': 'norm_form',
    'ADRs': 'term',
    'sentences': 'text',
    'MDR': 'code'
})

# MAKE SPANS
psytar_adrs['span'] = psytar_adrs.apply(
    lambda x: list(re.finditer(x['term'].lower(), x['text'].lower())), axis=1)

psytar_adrs['start'] = psytar_adrs['span'].apply(lambda x: x[0].span()[0] if len(x) == 1 else None)
psytar_adrs['end'] = psytar_adrs['span'].apply(lambda x: x[0].span()[1] if len(x) == 1 else None)

psytar_adrs = psytar_adrs.drop(columns=['span'])
psytar_adrs = psytar_adrs[
    ['term', 'start', 'end', 'text', 'norm_form', 'code']
]

#psytar_adrs = psytar_adrs[psytar_adrs['code'].str.isdigit()]

psytar_adrs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


Unnamed: 0,term,start,end,text,norm_form,code
1,short-term memory loss,21.0,43.0,"extreme weight gain, short-term memory loss, h...",Poor short-term memory,10040602.0
2,hair loss,45.0,54.0,"extreme weight gain, short-term memory loss, h...",Alopecia,10001760.0
3,completely destroyed sexually functioning,0.0,41.0,COMPLETELY DESTROYED SEXUALLY FUNCTIONING .,Sexual Dysfunction,10040477.0
4,completely destroyed my sexual functioning,33.0,75.0,Just TWO tablets of Lexapro 10mg completely de...,Sexual Dysfunction,10040477.0
5,pssd,12.0,16.0,It's called PSSD: post-SSRI sexual dysfunction.,Sexual Dysfunction,10040477.0
...,...,...,...,...,...,...
4807,early on: nausea,,,"Stomach problems early on: bloating, nausea, c...",Nausea,10028813.0
4808,early on: constipation,,,"Stomach problems early on: bloating, nausea, c...",Constipation,10010774.0
4809,yawning,31.0,38.0,No side effects now accept for yawning.,Yawning,10048232.0
4810,mild insomnia for the first 3 days,,,The only side effects I experienced were mild ...,Sleeplessness,10022437.0


In [21]:
psytar_adrs_train, psytar_adrs_test = train_test_split(psytar_adrs, test_size=0.2)

psytar_adrs_train.to_csv('../../data/interim/psytar/train.csv', index=False)
psytar_adrs_test.to_csv('../../data/interim/psytar/test.csv', index=False)

# CADEC

In [None]:
dfs = []

empty_files = 0
for file in os.listdir('../../data/external/cadec2/cadec/meddra/'):
    try:
        file_name = '.'.join(file.split('.')[:2])
        #print(file_name)
        text = pd.read_csv('../../data/external/cadec2/cadec/text/' + f"{file_name}.txt", sep='\t', header=None)
        adr = pd.read_csv('../../data/external/cadec2/cadec/meddra/' + file, sep='\t', header=None)

    except pd.errors.EmptyDataError as e:
        empty_files += 1
        #print(file, end=' ')
        continue
    
    # подбор предложения
    adr[0] = adr[0].apply(lambda x: int(x[-1])-1 if x[-1].isdigit() else None)
    text = text.reset_index()
    df = pd.merge(adr, text, how='left', left_on=0, right_on='index')
    dfs.append(df)
    
print(f"EMPTY FILES: {empty_files}")
cadec2 = pd.concat(dfs, axis=0)

cadec2 = cadec2.rename(
    columns={1: 'code_span', 2: 'term', '0_y': 'text'})[['term', 'code_span', 'text']]

cadec2 = cadec2.dropna()

cadec2['code'] = cadec2['code_span'].apply(lambda x: x.split(' ')[0])
cadec2['span'] = cadec2['code_span'].apply(lambda x: x.split(' ')[-2:])
cadec2['span'] = cadec2['span'].apply(lambda x: [i.split(';')[-1] for i in x])

cadec2 = cadec2[cadec2['code'].str.isdigit()]
cadec2['start'] = cadec2['span'].apply(lambda x: x[0])
cadec2['end'] = cadec2['span'].apply(lambda x: x[1])

cadec2 = cadec2[
    ['term', 'start', 'end', 'text', 'code']
]

cadec2 = cadec2[cadec2['code'].str.isdigit()]

In [None]:
cadec2

In [None]:
cadec2_train, cadec2_test = train_test_split(cadec2, test_size=0.2)

cadec2_train.to_csv('../../data/interim/cadec/train.csv', index=False)
cadec2_test.to_csv('../../data/interim/cadec/test.csv', index=False)

# MRCONSO

In [85]:
mrconso = pd.read_csv('../../data/external/mrconso_umls/MRCONSO_ENG.RRF', sep='|', header=None)

  interactivity=interactivity, compiler=compiler, result=result)


In [86]:
mrconso = mrconso.rename(columns={
    0: "CUI", 
    1: 'LAT',
    2: 'TS',
    3: "LUI",
    4: "STT",
    5: "SUI",
    6: "ISPREF",
    7: "AUI",
    8: "SAUI",
    9: "SCUI",
    10: "SDUI",
    11: "SAB",
    12: "TTY",
    13: "CODE",
    14: "STR",
    15: "SRL",
    16: "SUPPRESS",
    17: "CVF"
})[
    ["CUI", "TS", "SAB", "TTY", "CODE", "STR"]
]

In [87]:
mdr_codes = mrconso[mrconso['SAB']=="MDR"].groupby("CODE").agg(lambda x: x.unique())
snomed =    mrconso[mrconso['SAB']=="SNOMEDCT_US"]

In [93]:
from tqdm import tqdm
tqdm.pandas()

In [94]:
mdr_codes = mdr_codes.reset_index(inplace=False)
mdr_codes['SNMS'] = mdr_codes['CUI'].progress_apply(lambda cui: snomed[snomed['CUI']==cui]['STR'].to_list())

100%|██████████| 84213/84213 [1:36:26<00:00, 14.55it/s]


In [95]:
mdr_codes

Unnamed: 0,level_0,index,CODE,CUI,TS,SAB,TTY,STR,SNMS
0,0,0,10000001,C0155891,S,MDR,OL,Ventilation pneumonitis,"[Humidifier lung, Ventilation pneumonitis, Air..."
1,1,1,10000002,C0268292,S,MDR,"[PT, LLT]",11-beta-hydroxylase deficiency,[Adrenogenital disorder due to 11-beta-hydroxy...
2,2,2,10000003,C0236132,S,MDR,OL,11-oxysteroid activity incr,[]
3,3,3,10000004,C0236132,P,MDR,LLT,11-oxysteroid activity increased,[]
4,4,4,10000005,C0855638,P,MDR,"[LLT, PT]",17 ketosteroids urine,[]
...,...,...,...,...,...,...,...,...,...
84208,84208,84208,20000231,C4524253,P,MDR,SMQ,Infective pneumonia (SMQ),[]
84209,84209,84209,20000232,C4552564,P,MDR,SMQ,Dehydration (SMQ),[]
84210,84210,84210,20000233,C4761275,P,MDR,"[SMQ, MTH_SMQ]","[Hypokalaemia (SMQ), Hypokalemia (SMQ)]",[]
84211,84211,84211,20000234,C5208327,P,MDR,SMQ,Sepsis (SMQ),[]


In [98]:
mdr_codes.to_csv('../../data/interim/meddra_codes_terms_synonims.csv')