# ICD10 Datasets Unifier

In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
# Master switch: True builds the 3-sign dataset, False the 4-sign one.
# It selects the input directory, the dataset file names and the output file
# names, and decides whether validated codes are cut to their first three
# characters before saving.
IS_CREATING_3SIGN_CODES = True

### Constants and Dataset Names

In [3]:
# A valid code is one capital Latin letter, two digits and an optional
# ".<digit>" suffix (e.g. "A00" or "A00.1").
# Raw string: "\." is a regex escape, not a Python string escape, so a plain
# literal triggers an invalid-escape warning on newer Python versions.
# No capturing group: pandas' `Series.str.contains` emits a "pattern has match
# groups" UserWarning whenever the regex contains one.
CODE_PATTERN = r'^[A-Z][0-9]{2}(?:\.[0-9])?$'

# Output always goes to the shared datasets directory.
WRITE_DIR = '../datasets/'

# Input directory and output file names depend on the code length being built.
if IS_CREATING_3SIGN_CODES:
    READ_DIR = '../datasets/ICD10_Training_data_3sign/'
    INVALID_CODES_FILE_NAME = 'invalid_codes_3sign.txt'
    RESULT_FILE_NAME = 'ICD10_all_3sign.csv'
else:
    READ_DIR = '../datasets/ICD10_Training_data_4sign/'
    INVALID_CODES_FILE_NAME = 'invalid_codes_4sign.txt'
    RESULT_FILE_NAME = 'ICD10_all_4sign.csv'

In [4]:
# Input files of the 3-sign build. The order matters: entries are paired
# positionally with `dataset_keys`.
dataset_names_3sign = [
    "ICD10_bg_3sign.csv",
    "ICD10_golden standard_3sign.csv",
    "ICD10-MSWord_3sign.csv",
    "Wikidata_ICD10_3sign_bg.csv",
    "Wikidata_ICD10_3sign_lat.csv",
    "Wikidata_ICD10_3sign_lat_transliterated.csv",
]

# Input files of the 4-sign build, in the same order as the 3-sign list.
dataset_names_4sign = [
    "ICD10_bg_4sign.csv",
    "ICD10_golden standard_4sign.csv",
    "ICD10-MSWord_4sign.csv",
    "Wikidata_ICD10_4sign_bg.csv",
    "Wikidata_ICD10_4sign_lat.csv",
    "Wikidata_ICD10_4sign_lat_transliterated.csv",
]

In [5]:
# Short aliases for the input files; each key is paired with the file name at
# the same position in the selected file-name list.
dataset_keys = [
    "icd10_bg",
    "icd10_gs",
    "icd10_ms",
    "icd10_wd_bg",
    "icd10_wd_lt",
    "icd10_wd_trl",
]

if IS_CREATING_3SIGN_CODES:
    dataset_names = dataset_names_3sign
else:
    dataset_names = dataset_names_4sign

# alias -> file name
datasets = {key: name for key, name in zip(dataset_keys, dataset_names)}

### Refactor Code Methods

In [6]:
# http://homoglyphs.net/
# Look-alike Cyrillic capital -> the Latin capital it imitates.
_HOMOGLYPH_TABLE = str.maketrans({
    'А': 'A',
    'В': 'B',
    'С': 'C',
    'Е': 'E',
    'Н': 'H',
    'І': 'I',
    'Ј': 'J',
    'К': 'K',
    'М': 'M',
    'О': 'O',
    'Р': 'P',
    'Ѕ': 'S',
    'Т': 'T',
    'Ѵ': 'V',
    'Х': 'X',
    'У': 'Y',
    'Ү': 'Y',
})


def replace_homoglyphs(s):
    """Return `s` with look-alike Cyrillic capitals swapped for Latin capitals.

    Only the capital letters listed in the table are replaced; every other
    character (including lowercase Cyrillic) is left untouched.
    """
    return s.translate(_HOMOGLYPH_TABLE)

In [7]:
def remove_star_end(code):
    """Strip the trailing asterisk from a bare 3-sign code ("A00*" -> "A00").

    Anything that is not exactly letter + two digits + "*" is returned as is.
    """
    has_star_suffix = re.match(r'^([A-Z][0-9]{2}\*)$', code) is not None
    return code[:3] if has_star_suffix else code

In [8]:
def remove_dot_dash_end(code):
    """Strip a trailing ".-" from a 3-sign code ("A00.-" -> "A00").

    Anything that is not exactly letter + two digits + ".-" is returned as is.
    """
    if re.match(r'^([A-Z][0-9]{2}\.-)$', code) is None:
        return code
    return code[:3]

In [9]:
def convert_5sign_to_4sign(code):
    """Cut a code with two digits after the dot down to one ("A00.12" -> "A00.1").

    Anything that is not exactly letter + two digits + "." + two digits is
    returned as is.
    """
    is_5sign = re.match(r'^([A-Z][0-9]{2}\.[0-9]{2})$', code) is not None
    return code[:5] if is_5sign else code

In [10]:
def refactor_code(code):
    """Normalize a single code string.

    The code is upper-cased first, then passed through each clean-up step in
    order: homoglyph replacement, "*" suffix removal, ".-" suffix removal and
    5-sign to 4-sign shortening.
    """
    cleanup_steps = (
        replace_homoglyphs,
        remove_star_end,
        remove_dot_dash_end,
        convert_5sign_to_4sign,
    )
    result = code.upper()
    for step in cleanup_steps:
        result = step(result)
    return result

In [11]:
def refactor_codes(df):
    """Normalize every value of `df['ICD10']` with `refactor_code`.

    The column is overwritten on the frame that was passed in; nothing is
    returned.
    """
    normalized = df['ICD10'].apply(refactor_code)
    df['ICD10'] = normalized

In [12]:
def get_invalid_codes(df, pattern):
    """Return the distinct `ICD10` values of `df` that do not match `pattern`.

    Args:
        df: DataFrame with an `ICD10` column.
        pattern: Regular expression handed to `Series.str.contains`.

    Returns:
        list: Unique non-matching values in order of first appearance. Missing
        values count as non-matching and are reported as well.
    """
    # na=False: a missing code would otherwise put NaN into the mask, and
    # negating such a mask fails.
    is_valid = df['ICD10'].str.contains(pattern, na=False)
    return list(df.loc[~is_valid, 'ICD10'].unique())

In [13]:
def get_valid_df(df, pattern):
    """Return the rows of `df` whose `ICD10` value matches `pattern`.

    Args:
        df: DataFrame with an `ICD10` column.
        pattern: Regular expression handed to `Series.str.contains`.

    Returns:
        DataFrame: The matching rows with their original index labels. Rows
        with a missing code are treated as non-matching and dropped.
    """
    # na=False: a missing code would otherwise put NaN into the mask, which
    # boolean indexing rejects.
    is_valid = df['ICD10'].str.contains(pattern, na=False)
    return df[is_valid]

### Read Datasets

In [14]:
# Read the `icd10_bg` file as headerless and name its two columns here.
file_name = READ_DIR + datasets['icd10_bg']
df_icd10_bg = pd.read_csv(
    file_name,
    header=None,
    names=['ICD10', 'Text'],
)
df_icd10_bg.head()

Unnamed: 0,ICD10,Text
0,A00,Холера
1,A00,"Холера, предизвикана от холерен вибрион 01, би..."
2,A00,"Холера, предизвикана от холерен вибрион 01, би..."
3,A00,"Холера, неуточнена"
4,A01,Тиф и паратиф


In [15]:
# Read the `icd10_gs` file as headerless and name its two columns here.
file_name = READ_DIR + datasets['icd10_gs']
df_icd10_gs = pd.read_csv(
    file_name,
    header=None,
    names=['ICD10', 'Text'],
)
df_icd10_gs.head()

Unnamed: 0,ICD10,Text
0,A08,гастроентеропатия
1,A17,туберкулом
2,A26,еризипел
3,A66,латентна левостранна хемипареза
4,A66,десностранна латентна хемипареза


In [16]:
# Read only the columns at positions 1 and 2 of the `icd10_ms` file. The first
# row of the file is consumed as the header; the column labels are then
# replaced with the names used throughout this notebook.
# NOTE(review): the preview of this frame shows free text under `ICD10` and
# nothing under `Text` - the two selected columns may not be (code, text) in
# that order. Confirm against the file.
file_name = READ_DIR + datasets['icd10_ms']
df_icd10_ms = pd.read_csv(file_name, usecols=[1, 2])
df_icd10_ms.columns = ['ICD10', 'Text']
df_icd10_ms.head()

Unnamed: 0,ICD10,Text
0,Абазия астазия хистерична,
1,"Аберантен вроден артерия периферна ,",
2,"Аберантен вроден вена периферна ,",
3,Аберантен вроден гърда,
4,"Аберантен вроден ендокринна жлеза,",


In [17]:
# The `icd10_wd_bg` file stores the text first and the code second; read it as
# headerless and reorder the columns to (ICD10, Text) like the other frames.
file_name = READ_DIR + datasets['icd10_wd_bg']
df_icd10_wd_bg = pd.read_csv(
    file_name,
    header=None,
    names=['Text', 'ICD10'],
)[['ICD10', 'Text']]
df_icd10_wd_bg.head()

Unnamed: 0,ICD10,Text
0,B40,Бластомикоза и болест на Чикаго
1,B40,Бластомикоза и болест на Гилкристите
2,B40,Бластомикоза и северноамериканска
3,B85,Лайсна зараза и въшки
4,B85,Лека зараза и педикулоза и фтирус


In [18]:
# The `icd10_wd_lt` file stores the text first and the code second; read it as
# headerless and reorder the columns to (ICD10, Text) like the other frames.
file_name = READ_DIR + datasets['icd10_wd_lt']
df_icd10_wd_lt = pd.read_csv(
    file_name,
    header=None,
    names=['Text', 'ICD10'],
)[['ICD10', 'Text']]
df_icd10_wd_lt.head()

Unnamed: 0,ICD10,Text
0,B40,Et Chicago morbo BLASTOMYCOSIS
1,B40,Et BLASTOMYCOSIS morbo Gilchrists
2,B40,Et North American BLASTOMYCOSIS
3,B85,"Pediculus Pediculus humanus quod sit infestatio,"
4,B85,Pediculus humanus pediculosis quod infestatio ...


In [19]:
# Read only the columns at positions 1 and 2 of the headerless `icd10_wd_trl`
# file and name them (ICD10, Text).
file_name = READ_DIR + datasets['icd10_wd_trl']
df_icd10_wd_trl = pd.read_csv(
    file_name,
    usecols=[1, 2],
    header=None,
    names=['ICD10', 'Text'],
)
df_icd10_wd_trl.head()

Unnamed: 0,ICD10,Text
0,B40,﻿ет хикаго морбо бластомикозис
1,B40,ет бластомикозис морбо гилхристс
2,B40,ет норт американ бластомикозис
3,B85,"педикулус педикулус хуманус квод сит инфестацио,"
4,B85,педикулус хуманус педикулозис квод инфестацио ...


### Create and Refactor Dataframe

___Create Dataframe___

In [20]:
# Frames that make up the unified dataset, in concatenation order.
# NOTE(review): `df_icd10_wd_lt` is loaded earlier but is not part of this
# list - confirm the omission is intended.
dfs = [
    df_icd10_bg,
    df_icd10_gs,
    df_icd10_ms,
    df_icd10_wd_bg,
    df_icd10_wd_trl,
]

In [21]:
# Stack all frames vertically; ignore_index=True discards the per-file row
# labels and numbers the combined rows 0..n-1.
df = pd.concat(dfs, ignore_index=True)
df.head()

Unnamed: 0,ICD10,Text
0,A00,Холера
1,A00,"Холера, предизвикана от холерен вибрион 01, би..."
2,A00,"Холера, предизвикана от холерен вибрион 01, би..."
3,A00,"Холера, неуточнена"
4,A01,Тиф и паратиф


In [22]:
# Size of the combined frame before any code normalization or filtering.
df.shape

(149504, 2)

___Refactor Dataframe___

In [23]:
# Normalizes the `ICD10` column of `df` in place; the call returns nothing.
refactor_codes(df)

In [24]:
# Write every distinct code that fails validation to a text file, one per line.
invalid_codes = get_invalid_codes(df, CODE_PATTERN)
file_name = WRITE_DIR + INVALID_CODES_FILE_NAME
with open(file_name, 'w', encoding="utf-8") as f:
    f.writelines("%s\n" % item for item in invalid_codes)

  return func(self, *args, **kwargs)


In [25]:
# Keep only rows whose code matches CODE_PATTERN. The explicit copy makes `df`
# an independent frame, so the column assignment below does not write into a
# filtered selection of the previous frame (pandas flags that with
# SettingWithCopyWarning).
df = get_valid_df(df, CODE_PATTERN).copy()
# Sanity check: after filtering, no invalid code should remain.
if not get_invalid_codes(df, CODE_PATTERN):
    print('Dataframe contains only valid codes!')
if IS_CREATING_3SIGN_CODES:
    # Cut codes such as "A00.1" down to their first three characters ("A00").
    df['ICD10'] = df['ICD10'].str[:3]
    print('Dataframe contains only 3sign codes!')
df.head()

  return func(self, *args, **kwargs)


Dataframe contains only valid codes!
Dataframe contains only 3sign codes!


Unnamed: 0,ICD10,Text
0,A00,Холера
1,A00,"Холера, предизвикана от холерен вибрион 01, би..."
2,A00,"Холера, предизвикана от холерен вибрион 01, би..."
3,A00,"Холера, неуточнена"
4,A01,Тиф и паратиф


In [26]:
# Size after rows with invalid codes have been dropped.
df.shape

(104227, 2)

In [27]:
# Remove rows that are exact duplicates of an earlier row. Reassigning the
# result instead of passing inplace=True avoids mutating a filtered frame in
# place.
df = df.drop_duplicates()
df.shape

(103717, 2)

### Save Dataframe

In [28]:
# Save the unified dataset as a comma-separated UTF-8 file with a header row
# and without the row index.
file_name = WRITE_DIR + RESULT_FILE_NAME
df.to_csv(
    file_name,
    sep=',',
    encoding='utf-8',
    header=True,
    index=False,
)