In [43]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split

# SMM4H 2021

In [34]:
smm4h21_spans = pd.read_csv('../../data/external/smm4h_2021/SMM4H_2021_train_spans.tsv', sep='\t', header=None)
smm4h21_tweets = pd.read_csv('../../data/external/smm4h_2021/SMM4H_2021_train_tweets.tsv', sep='\t', header=None)
smm4h21 = smm4h21_spans.merge(smm4h21_tweets, on=0)

In [39]:
smm4h21 = smm4h21.rename(columns={
    4: 'term', '1_y': 'text',
    2: 'start', 3: 'end',
    5: 'code'
})

smm4h21 = smm4h21[['term', 'start', 'end', 'text', 'code']]
smm4h21

Unnamed: 0,term,start,end,text,code
0,allergies,28,37,"do you have any medication allergies? ""asthma!...",10013661
1,HURT YOUR Liver,31,46,"@ashleylvivian if #avelox has hurt your liver,...",10024668
2,AD,48,50,"apparently, baclofen greatly exacerbates the ""...",10003731
3,focus,88,93,"apparently, baclofen greatly exacerbates the ""...",10003738
4,died,11,15,pt of mine died from cipro rt @ciproispoison: ...,10011906
...,...,...,...,...,...
1708,orgasm,48,54,do you think anyone has ever managed to have a...,10021574
1709,never have another orgasm,91,116,@verlieren thank you! it didn't even fucking w...,10021574
1710,coma,65,69,i just found out that some dude from my friend...,10041349
1711,gain so much weight,72,91,that zyprexa really makes your vocal chords ma...,10047986


In [41]:
smm4h21_train, smm4h21_test = train_test_split(smm4h21, test_size=0.2)

In [42]:
smm4h21_train.to_csv('../../data/interim/smm4h21/train.csv')
smm4h21_test.to_csv('../../data/interim/smm4h21/test.csv')

# SMM4H 2017

## text files

In [50]:
train_datasets = []
test_datasets = []
for file in os.listdir('../../data/external/smm4h_2017/'):
    df = pd.read_csv('../../data/external/smm4h_2017/' + file, sep='\t', header=None)
    if 'train' in file:
        train_datasets.append(df)
    elif 'eval' in file:
        test_datasets.append(df)
        
smm4h17_train = pd.concat(train_datasets).rename
smm4h17_test  = pd.concat(test_datasets)

In [52]:
smm4h17_test

Unnamed: 0,0,1,2
0,44675,sleepier,10041349
1,40103,dreamt colors,10000125
2,41585,zombie,10016322
3,41834,headache,10019211
4,46301,crazy,10061920
...,...,...,...
2495,44530,sleptwalk,10041347
2496,41240,fatigue,10016256
2497,41829,headache,10019211
2498,44667,out of it,10041349


## KFU

In [114]:
smm4h17_spans_train = pd.read_csv('../../data/external/smm4h_2017/smm4h_kfu/processed_train/0.concept', 
                                  sep='|', header=None).dropna(axis=1)[[7, 9]]
smm4h17_spans_test = pd.read_csv('../../data/external/smm4h_2017/smm4h_kfu/processed_test/0.concept', 
                                  sep='|', header=None).dropna(axis=1)[[7, 9]]

In [134]:
smm4h17_spans_train = smm4h17_spans_train.rename(columns={7: 'term', 9: 'code'})
smm4h17_spans_test = smm4h17_spans_test.rename(columns={7: 'term', 9: 'code'})
smm4h17_spans_train.to_csv('../../data/interim/smm4h17/train.csv')
smm4h17_spans_test.to_csv('../../data/interim/smm4h17/test.csv')

In [135]:
# smm4h17_dict_train = pd.read_csv('../../data/external/smm4h_2017/smm4h_kfu/train_dictionary.txt', 
#                                   sep='|', header=None, error_bad_lines=False)
# smm4h17_dict_test = pd.read_csv('../../data/external/smm4h_2017/smm4h_kfu/test_dictionary.txt', 
#                                   sep='|', header=None, error_bad_lines=False)

# UMLS micro

In [143]:
umls_eng_mdr = pd.read_csv('../../data/external/MRCONSO_ENG_MDR.csv')

# PsyTar

In [138]:
def get_MDR_code_by_CUI(CUI):
    potentials = umls_eng_mdr[umls_eng_mdr['CUI']==CUI]
    lst =  potentials['CODE'].to_list()
    return max(set(lst), key=lst.count) if len(lst) > 0 else None

In [144]:
psytar = pd.read_excel('../../data/external/PsyTAR_dataset.xlsx', sheet_name='ADR_Mapped')
psytar = psytar[psytar['type']=='ADR'][
    ['ADRs', 'UMLS1', 'SNOMED-CT', 'UMLS2', 'SNOMED-CT.1']
]

# select columns
psytar = psytar[
    ['ADRs', 'UMLS1']
]

# get CUI and term positions (may be more)
psytar['CUI'], psytar['term'] = zip(*psytar['UMLS1'].apply(lambda x: x.split('/')[:2] if len(x.split('/')) >= 2 else [x, None]))

# drop bad data
psytar = psytar[~psytar['term'].isna()]
psytar = psytar[psytar['CUI'].str[0]=='C']

# select needed columns
psytar_adrs = psytar[['ADRs', 'CUI', 'term']]
psytar_adrs['MDR'] = psytar_adrs['CUI'].apply(lambda x: get_MDR_code_by_CUI(x.strip()))

# prepare raw format
psytar_adrs['term'] = psytar_adrs.apply(lambda x: [x['ADRs'], x['term']], axis=1)
psytar_adrs = psytar_adrs[
    ['term', 'MDR']
].explode('term')

psytar_adrs = psytar_adrs.rename(columns={'MDR': 'code'}).dropna()
psytar_adrs['code'] = psytar_adrs['code'].apply(lambda x: int(x))

In [148]:
psytar_adrs_train, psytar_adrs_test = train_test_split(psytar_adrs, test_size=0.2)

psytar_adrs_train.to_csv('../../data/interim/psytar/train.csv')
psytar_adrs_test.to_csv('../../data/interim/psytar/test.csv')

# CADEC 1

In [150]:
files = []
for file in os.listdir('../../data/external/cadec1/MedDRA/'):
    try:
        files.append(pd.read_csv('../../data/external/cadec1/MedDRA/' + file, sep='\t', header=None))
    except pd.errors.EmptyDataError as e:
        #print(file, end=' ')
        pass
        
cadec1 = pd.concat(files, axis=0)
cadec1 = cadec1.rename(
    columns={1: 'code', 2: 'term'}
)[['term', 'code']]
cadec1['code'] = cadec1['code'].apply(lambda x: x.split(' ')[0])
cadec1 = cadec1[(cadec1['code'] != '') & (cadec1['code'].str.isdigit())]
cadec1

Unnamed: 0,term,code
0,severe abdominal pain,10033371
1,bowel/uterine cramping,10000055
2,severe pain,10033371
0,bit drowsy,10013649
1,little blurred vision,10005886
...,...,...
0,sedation,10039897
0,nausea,10028813
0,hurts throat,10033494
0,stiff neck,10042043


In [151]:
cadec1_train, cadec1_test = train_test_split(cadec1, test_size=0.2)

cadec1_train.to_csv('../../data/interim/cadec1/train.csv')
cadec1_test.to_csv('../../data/interim/cadec1/test.csv')

# CADEC 2

In [153]:
files = []
for file in os.listdir('../../data/external/cadec2/cadec/meddra/'):
    try:
        files.append(pd.read_csv('../../data/external/cadec2/cadec/meddra/' + file, sep='\t', header=None))
    except pd.errors.EmptyDataError as e:
        print(file, end=' ')
        pass
        
cadec2 = pd.concat(files, axis=0)
cadec2 = cadec2.rename(
    columns={1: 'code', 2: 'term'}
)[['term', 'code']].dropna()
cadec2['code'] = cadec2['code'].apply(lambda x: x.split(' ')[0])
cadec2 = cadec2[(cadec2['code'] != '') & (cadec2['code'].str.isdigit())]
cadec2

ARTHROTEC.145.ann ARTHROTEC.102.ann ARTHROTEC.106.ann ARTHROTEC.11.ann ARTHROTEC.39.ann ARTHROTEC.40.ann ARTHROTEC.48.ann ARTHROTEC.129.ann ARTHROTEC.5.ann ARTHROTEC.54.ann ARTHROTEC.55.ann ARTHROTEC.136.ann ARTHROTEC.139.ann ARTHROTEC.79.ann ARTHROTEC.84.ann ARTHROTEC.91.ann ARTHROTEC.98.ann CAMBIA.1.ann DICLOFENAC-POTASSIUM.1.ann DICLOFENAC-POTASSIUM.2.ann FLECTOR.1.ann LIPITOR.127.ann LIPITOR.150.ann LIPITOR.16.ann LIPITOR.188.ann LIPITOR.197.ann LIPITOR.21.ann LIPITOR.243.ann LIPITOR.252.ann LIPITOR.267.ann LIPITOR.28.ann LIPITOR.285.ann LIPITOR.296.ann LIPITOR.299.ann LIPITOR.308.ann LIPITOR.313.ann LIPITOR.329.ann LIPITOR.333.ann LIPITOR.351.ann LIPITOR.37.ann LIPITOR.372.ann LIPITOR.373.ann LIPITOR.382.ann LIPITOR.383.ann LIPITOR.4.ann LIPITOR.40.ann LIPITOR.404.ann LIPITOR.409.ann LIPITOR.41.ann LIPITOR.416.ann LIPITOR.424.ann LIPITOR.437.ann LIPITOR.438.ann LIPITOR.444.ann LIPITOR.47.ann LIPITOR.487.ann LIPITOR.489.ann LIPITOR.49.ann LIPITOR.5.ann LIPITOR.510.ann LIPITOR.516.a

Unnamed: 0,term,code
0,severe abdominal pain,10033371
1,bowel/uterine cramping,10000055
2,severe pain,10033371
0,bit drowsy,10013649
1,little blurred vision,10005886
...,...,...
1,joint pains,10003239
2,light headedness(problem keeping balance),10024492
3,Tremors in Right hand,10044565
4,hand writting had deteriorated,10048053


In [154]:
cadec2_train, cadec2_test = train_test_split(cadec2, test_size=0.2)

cadec2_train.to_csv('../../data/interim/cadec2/train.csv')
cadec2_test.to_csv('../../data/interim/cadec2/test.csv')