# ICD10 Datasets Unifier

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Output location for the unified datasets written at the end of the notebook.
write_dir = '../datasets/'
# Location of the individual source files that are read and merged below.
read_dir = '../datasets/single/'

## Read and Transform Datasets

### icd10_3sign_latin_clean_transliterated.csv

In [3]:
# Read only columns 0 and 2 of the file. read_csv consumes the first row as a
# header by default; those names are replaced with the shared ones right below.
file_name = read_dir + 'icd10_3sign_latin_clean_transliterated.csv'
icd10_3sign_latin_df = pd.read_csv(filepath_or_buffer=file_name, usecols=[0, 2])
icd10_3sign_latin_df.columns = ['ICD10', 'Text']
# Preview the first rows.
icd10_3sign_latin_df.head()

Unnamed: 0,ICD10,Text
0,Q82,ф-ксп
1,Q82,г-ксп
2,Q82,б-ксп
3,E75,ет дг
4,Q82,а-ксп


In [4]:
# (rows, columns) of the frame loaded from icd10_3sign_latin_clean_transliterated.csv.
icd10_3sign_latin_df.shape

(30152, 2)

### ICD10_3sign_WD_bg.csv

In [5]:
# Parse ICD10_3sign_WD_bg.csv by hand: every line is split on its LAST comma
# only, presumably because the leading part may itself contain commas.
# NOTE(review): this is not a CSV parser; quoted fields would keep their
# quote characters. Confirm the file contains none.
data = []
file_name = read_dir + 'ICD10_3sign_WD_bg.csv'
with open(file_name, encoding="utf-8") as f:
    split_lines = (raw_line.rstrip().rsplit(',', 1) for raw_line in f)
    # Lines without any comma produce a single piece and are skipped.
    data.extend(pieces for pieces in split_lines if len(pieces) == 2)

In [6]:
# The first parsed row is used as the header and the remaining rows as records.
header_row, record_rows = data[0], data[1:]
icd10_3sign_wd_df = pd.DataFrame(record_rows, columns=header_row)
# Replace the file's own header names with the shared ones, then put the
# columns in the ['ICD10', 'Text'] order used by the other frames.
icd10_3sign_wd_df.columns = ['Text', 'ICD10']
icd10_3sign_wd_df = icd10_3sign_wd_df.loc[:, ['ICD10', 'Text']]
icd10_3sign_wd_df.head()

Unnamed: 0,ICD10,Text
0,A66,Буба
1,A66,Ендемични трепонематози
2,A66,frambesia
3,A66,frambesia tropica
4,A66,frambosie


In [7]:
# (rows, columns) of the frame built from ICD10_3sign_WD_bg.csv (header row excluded).
icd10_3sign_wd_df.shape

(31588, 2)

### ICD10_bg.csv

In [8]:
# The file is read without a header row (header=None), so its first line is
# kept as data and the two column names are supplied explicitly.
file_name = read_dir + 'ICD10_bg.csv'
icd10_bg_df = pd.read_csv(
    file_name,
    names=['ICD10', 'Text'],
    header=None,
)
icd10_bg_df.head()

Unnamed: 0,ICD10,Text
0,A00,Холера
1,A00.0,"Холера, предизвикана от холерен вибрион 01, би..."
2,A00.1,"Холера, предизвикана от холерен вибрион 01, би..."
3,A00.9,"Холера, неуточнена"
4,A01,Тиф и паратиф


In [9]:
# (rows, columns) of the frame loaded from ICD10_bg.csv.
icd10_bg_df.shape

(10971, 2)

### ICD10_golden standard.csv

In [10]:
# Read without a header row (header=None): the first line is kept as data and
# the columns are named explicitly.
file_name = read_dir + 'ICD10_golden standard.csv'
icd10_gs_df = pd.read_csv(
    file_name,
    names=['ICD10', 'Text'],
    header=None,
)
icd10_gs_df.head()

Unnamed: 0,ICD10,Text
0,A08.1,гастроентеропатия
1,A17.1,туберкулом
2,A26.0,еризипел
3,A66.8,латентна левостранна хемипареза
4,A66.8,десностранна латентна хемипареза


In [11]:
# (rows, columns) of the frame loaded from 'ICD10_golden standard.csv'.
icd10_gs_df.shape

(4621, 2)

### ICD10-MSWord.csv

In [12]:
# Read only columns 1 and 2 of the file. read_csv consumes the first row as a
# header by default; those names are replaced with the shared ones right below.
file_name = read_dir + 'ICD10-MSWord.csv'
icd10_msword_df = pd.read_csv(filepath_or_buffer=file_name, usecols=[1, 2])
icd10_msword_df.columns = ['ICD10', 'Text']
# Preview the first rows.
icd10_msword_df.head()

Unnamed: 0,ICD10,Text
0,F44.4,Абазия астазия хистерична
1,Q27.8,"Аберантен вроден артерия периферна ,"
2,Q27.8,"Аберантен вроден вена периферна ,"
3,Q83.8,Аберантен вроден гърда
4,Q89.2,"Аберантен вроден ендокринна жлеза,"


In [13]:
# (rows, columns) of the frame loaded from ICD10-MSWord.csv.
icd10_msword_df.shape

(47354, 2)

## Create and Save Datasets

In [14]:
# Frames to merge, in the order they were loaded above; every one of them
# carries the columns ['ICD10', 'Text'].
dfs = [
    icd10_3sign_latin_df,
    icd10_3sign_wd_df,
    icd10_bg_df,
    icd10_gs_df,
    icd10_msword_df,
]

In [15]:
# Stack all source frames row-wise; ignore_index=True discards the per-source
# row labels and numbers the combined rows from 0.
df = pd.concat(objs=dfs, axis=0, ignore_index=True)
df.head()

Unnamed: 0,ICD10,Text
0,Q82,ф-ксп
1,Q82,г-ксп
2,Q82,б-ксп
3,E75,ет дг
4,Q82,а-ксп


In [16]:
# (rows, columns) of the concatenated frame, duplicates still included.
df.shape

(124686, 2)

In [17]:
# Save the full concatenation (duplicates included) with a header row and
# without the row index.
file_name = write_dir + 'ICD10_all.csv'
df.to_csv(file_name, index=False, header=True, sep=',', encoding='utf-8')

In [18]:
# Shape after dropping rows that repeat an earlier row in every column;
# drop_duplicates() returns a new frame, so `df` itself is left unchanged.
df.drop_duplicates().shape

(121255, 2)

In [19]:
# Save the de-duplicated rows; `df` itself still holds the duplicates.
file_name = write_dir + 'ICD10_all_unique.csv'
df_unique = df.drop_duplicates()
df_unique.to_csv(file_name, index=False, header=True, sep=',', encoding='utf-8')

### 3-Sign Codes

In [20]:
# Reduce every code to its first three characters (e.g. 'A00.1' -> 'A00').
# The vectorised .str accessor replaces the per-row lambda: it gives the same
# result for string codes, and missing values stay missing instead of raising
# TypeError.
#
# NOTE: this overwrites df['ICD10'] in place, so from here on `df` no longer
# holds the full codes. Re-running the cells that write ICD10_all.csv or
# ICD10_all_unique.csv after this one would save truncated codes; use
# Restart & Run All instead.
#
# NOTE(review): the last row of the recorded output shows the code 'В08', whose
# first letter appears to be a Cyrillic 'В' rather than a Latin 'B'. Verify,
# since such look-alikes would be treated as different codes.
df['ICD10'] = df['ICD10'].str[:3]
df.tail()

Unnamed: 0,ICD10,Text
124681,Q97,XXXX синдром жени
124682,Q97,XXXXX синдром жени
124683,Q98,XXXXY синдром
124684,Q98,XXY синдром
124685,В08,Yaba-вирусна инфекция


In [21]:
# Save the frame with 3-sign codes (duplicates included), with a header row
# and without the row index.
file_name = write_dir + 'ICD10_all_3sign.csv'
df.to_csv(file_name, index=False, header=True, sep=',', encoding='utf-8')

In [22]:
# Shape after dropping rows that repeat an earlier (3-sign code, text) pair;
# drop_duplicates() returns a new frame, so `df` itself is left unchanged.
df.drop_duplicates().shape

(118697, 2)

In [23]:
# Save the de-duplicated 3-sign rows; `df` itself still holds the duplicates.
file_name = write_dir + 'ICD10_all_3sign_unique.csv'
df_3sign_unique = df.drop_duplicates()
df_3sign_unique.to_csv(file_name, index=False, header=True, sep=',', encoding='utf-8')