# ICD10 3 Sign Datasets Unifier

In [1]:
import numpy as np
import pandas as pd
import re

### Constants and Datasets Names

In [2]:
# Regex a normalized code must satisfy: one capital Latin letter followed by
# exactly two digits (e.g. "A00"). It deliberately has no capture group: the
# pattern is only passed to `Series.str.contains`, which emits
# "UserWarning: This pattern has match groups" for patterns containing groups.
CODE_PATTERN = '^[A-Z][0-9]{2}$'

# Directory the source CSV files are read from.
READ_DIR = '../datasets/ICD10_Training_data_3sign/'
# Directory the unified CSV file is written to.
WRITE_DIR = '../datasets/'

# File name of the unified dataset written at the end of the notebook.
RESULT_FILE_NAME = 'ICD10_all_3sign.csv'

In [3]:
# Short dataset key -> CSV file name (the read cells join these with READ_DIR).
datasets = dict(
    icd10_bg="ICD10_bg_3sign.csv",
    icd10_gs="ICD10_golden standard_3sign.csv",
    icd10_ms="ICD10-MSWord_3sign.csv",
    icd10_wd_bg="Wikidata_ICD10_3sign_bg.csv",
    icd10_wd_lt="Wikidata_ICD10_3sign_lat.csv",
    icd10_wd_trl="Wikidata_ICD10_3sign_lat_transliterated.csv",
)

### Refactor Code Methods

In [4]:
# http://homoglyphs.net/
def replace_homoglyphs(s):
    """Swap Cyrillic capital look-alike letters for Latin capital letters.

    Every (Cyrillic, Latin) pair listed below is applied to ``s`` with
    ``str.replace``, in order, so all occurrences are substituted.
    Returns the resulting string.
    """
    lookalike_pairs = (
        ('А', 'A'),
        ('В', 'B'),
        ('С', 'C'),
        ('Е', 'E'),
        ('Н', 'H'),
        ('І', 'I'),
        ('Ј', 'J'),
        ('К', 'K'),
        ('М', 'M'),
        ('О', 'O'),
        ('Р', 'P'),
        ('Ѕ', 'S'),
        ('Т', 'T'),
        ('Ѵ', 'V'),
        ('Х', 'X'),
        ('У', 'Y'),
        ('Ү', 'Y'),
    )
    for cyrillic, latin in lookalike_pairs:
        s = s.replace(cyrillic, latin)
    return s

In [5]:
def replace_omoglyphs(s):
    """Return ``s`` with every Cyrillic capital 'Б' replaced by Latin 'B'."""
    return s.replace('Б', 'B')

In [6]:
def replace_comma_with_dot(code):
    """Return ``code`` with every comma turned into a dot."""
    comma_to_dot = str.maketrans(',', '.')
    return code.translate(comma_to_dot)

In [7]:
def remove_pre_text(code):
    """Drop one leading word that precedes a dotted code such as ``A00.1``.

    The word must consist of Latin/Cyrillic letters followed by whitespace, and
    the remainder must be exactly letter + two digits + dot + digit. Anything
    else is returned unchanged.
    """
    prefixed = re.match(r'^([A-Za-zА-Яа-я]+\s+)[A-Z][0-9]{2}\.[0-9]$', code)
    if prefixed is None:
        return code
    # Group 1 is the leading word plus its whitespace; keep what follows it.
    return code[prefixed.end(1):]

In [8]:
def remove_space(code):
    """Return ``code`` with all space characters (' ') deleted."""
    drop_spaces = str.maketrans('', '', ' ')
    return code.translate(drop_spaces)

In [9]:
def remove_plus_end(code):
    """Remove '+' signs (with any whitespace just before them) from the end of ``code``.

    Only the trailing plus signs are removed; a '+' elsewhere in the string is
    kept. A string that does not end in a non-whitespace character followed by
    optional whitespace and '+' is returned unchanged.
    """
    m = re.match(r'^.*[^\s](\s*\+)$', code)
    while m:
        # Cut out exactly the matched trailing span. Using str.replace with the
        # matched text would also delete every earlier occurrence of it
        # (e.g. the inner '+' of 'A00+B01+').
        code = code[:m.start(1)] + code[m.end(1):]
        m = re.match(r'^.*[^\s](\s*\+)$', code)
    return code

In [10]:
def remove_star_end(code):
    """Turn a code of the form letter + two digits + '*' into its first three characters.

    Any other input is returned unchanged.
    """
    starred = re.match(r'^([A-Z][0-9]{2}\*)$', code) is not None
    return code[:3] if starred else code

In [11]:
def remove_dot_end(code):
    """Turn a code of the form letter + two digits + '.' into its first three characters.

    Any other input is returned unchanged.
    """
    dangling_dot = re.match(r'^([A-Z][0-9]{2}\.)$', code) is not None
    return code[:3] if dangling_dot else code

In [12]:
def remove_dot_dash_end(code):
    """Turn a code of the form letter + two digits + '.-' into its first three characters.

    Any other input is returned unchanged.
    """
    dot_dash = re.match(r'^([A-Z][0-9]{2}\.-)$', code) is not None
    return code[:3] if dot_dash else code

In [13]:
def remove_dot_letter_end(code):
    """Turn a code of the form letter + two digits + '.' + capital letter into its first three characters.

    Any other input is returned unchanged.
    """
    dot_letter = re.match(r'^([A-Z][0-9]{2}\.[A-Z])$', code) is not None
    return code[:3] if dot_letter else code

In [14]:
def convert_no_dot_code_to_dot_code(code):
    """Insert a dot after the third character of an undotted code.

    Applies only to letter + two digits followed by one or two more digits
    (``A001`` -> ``A00.1``, ``A0012`` -> ``A00.12``); other input is returned
    unchanged.
    """
    if not re.match(r'^([A-Z][0-9]{2}[0-9]{1,2})$', code):
        return code
    category, subdivision = code[:3], code[3:]
    return category + '.' + subdivision

In [15]:
def convert_5sign_to_4sign(code):
    """Cut a code with two digits after the dot down to one (``A00.12`` -> ``A00.1``).

    Any other input is returned unchanged.
    """
    two_decimals = re.match(r'^([A-Z][0-9]{2}\.[0-9]{2})$', code) is not None
    return code[:5] if two_decimals else code

In [16]:
def refactor_code(code):
    """Normalize a single raw code string.

    The string is upper-cased and then passed through each cleanup function
    below, in the listed order; the output of one step is the input of the
    next. Returns the final string.
    """
    cleanup_steps = (
        replace_homoglyphs,
        replace_omoglyphs,
        replace_comma_with_dot,
        remove_pre_text,
        remove_space,
        remove_plus_end,
        remove_star_end,
        remove_dot_end,
        remove_dot_dash_end,
        remove_dot_letter_end,
        convert_no_dot_code_to_dot_code,
        convert_5sign_to_4sign,
    )
    code = code.upper()
    for step in cleanup_steps:
        code = step(code)
    return code

In [17]:
def refactor_codes(df):
    """Normalize the 'ICD10' column of ``df`` in place.

    Each value is replaced by ``refactor_code(value)``. The frame is modified
    directly and nothing is returned.
    """
    normalized = df['ICD10'].apply(refactor_code)
    df['ICD10'] = normalized

In [18]:
def get_invalid_codes(df, pattern):
    """Return the distinct 'ICD10' values of ``df`` that do not match ``pattern``.

    Parameters:
        df: DataFrame with an 'ICD10' column of strings.
        pattern: regular expression handed to ``Series.str.contains``.

    Returns:
        A list of the unique non-matching values, in order of first appearance.
        Missing values count as non-matching.
    """
    # na=False keeps the mask strictly boolean; without it a missing value
    # yields NaN in the mask and the `~` / boolean indexing below fails.
    is_valid = df['ICD10'].str.contains(pattern, na=False)
    return list(df.loc[~is_valid, 'ICD10'].unique())

In [19]:
def get_valid_df(df, pattern):
    """Return the rows of ``df`` whose 'ICD10' value matches ``pattern``.

    Parameters:
        df: DataFrame with an 'ICD10' column of strings.
        pattern: regular expression handed to ``Series.str.contains``.

    Returns:
        A new DataFrame with the matching rows only. Rows whose 'ICD10' value
        is missing are dropped.
    """
    # na=False keeps the mask strictly boolean; without it a missing value
    # yields NaN in the mask and boolean indexing raises.
    is_valid = df['ICD10'].str.contains(pattern, na=False)
    return df[is_valid]

### Read Datasets

In [20]:
# Read the 'icd10_bg' file; it is parsed without a header row, so the column
# names are supplied explicitly.
file_name = READ_DIR + datasets['icd10_bg']
df_icd10_bg = pd.read_csv(file_name, names=['ICD10', 'Text'], header=None)
df_icd10_bg.head()

Unnamed: 0,ICD10,Text
0,A00,Холера
1,A00,"Холера, предизвикана от холерен вибрион 01, би..."
2,A00,"Холера, предизвикана от холерен вибрион 01, би..."
3,A00,"Холера, неуточнена"
4,A01,Тиф и паратиф


In [21]:
# Read the 'icd10_gs' file; it is parsed without a header row, so the column
# names are supplied explicitly.
file_name = READ_DIR + datasets['icd10_gs']
df_icd10_gs = pd.read_csv(file_name, names=['ICD10', 'Text'], header=None)
df_icd10_gs.head()

Unnamed: 0,ICD10,Text
0,A08,гастроентеропатия
1,A17,туберкулом
2,A26,еризипел
3,A66,латентна левостранна хемипареза
4,A66,десностранна латентна хемипареза


In [22]:
# Read the 'icd10_ms' file with pandas' default header handling (the first
# line of the file provides the column names).
file_name = READ_DIR + datasets['icd10_ms']
df_icd10_ms = pd.read_csv(file_name)
df_icd10_ms.head()

Unnamed: 0,ICD10,Text
0,F44,Абазия астазия хистерична
1,Q27,"Аберантен вроден артерия периферна ,"
2,Q27,"Аберантен вроден вена периферна ,"
3,Q83,Аберантен вроден гърда
4,Q89,"Аберантен вроден ендокринна жлеза,"


In [23]:
# Read the 'icd10_wd_bg' file, whose columns are parsed as (Text, ICD10), and
# reorder them to (ICD10, Text) like the other frames.
file_name = READ_DIR + datasets['icd10_wd_bg']
df_icd10_wd_bg = pd.read_csv(file_name, names=['Text', 'ICD10'], header=None)[['ICD10', 'Text']]
df_icd10_wd_bg.head()

Unnamed: 0,ICD10,Text
0,B40,Бластомикоза и болест на Чикаго
1,B40,Бластомикоза и болест на Гилкристите
2,B40,Бластомикоза и северноамериканска
3,B85,Лайсна зараза и въшки
4,B85,Лека зараза и педикулоза и фтирус


In [24]:
# Read the 'icd10_wd_lt' file, whose columns are parsed as (Text, ICD10), and
# reorder them to (ICD10, Text) like the other frames.
file_name = READ_DIR + datasets['icd10_wd_lt']
df_icd10_wd_lt = pd.read_csv(file_name, names=['Text', 'ICD10'], header=None)[['ICD10', 'Text']]
df_icd10_wd_lt.head()

Unnamed: 0,ICD10,Text
0,B40,Et Chicago morbo BLASTOMYCOSIS
1,B40,Et BLASTOMYCOSIS morbo Gilchrists
2,B40,Et North American BLASTOMYCOSIS
3,B85,"Pediculus Pediculus humanus quod sit infestatio,"
4,B85,Pediculus humanus pediculosis quod infestatio ...


In [25]:
# Path of the 'icd10_wd_trl' file; the next two cells each read a different
# pair of its columns.
file_name = READ_DIR + datasets['icd10_wd_trl']

In [26]:
# From the 'icd10_wd_trl' file keep columns 1 and 2 and name them
# (ICD10, Text).
df_icd10_wd_trl_bg = pd.read_csv(
    file_name,
    header=None,
    usecols=[1, 2],
    names=['ICD10', 'Text'],
)
df_icd10_wd_trl_bg.head()

Unnamed: 0,ICD10,Text
0,B40,﻿ет хикаго морбо бластомикозис
1,B40,ет бластомикозис морбо гилхристс
2,B40,ет норт американ бластомикозис
3,B85,"педикулус педикулус хуманус квод сит инфестацио,"
4,B85,педикулус хуманус педикулозис квод инфестацио ...


In [27]:
# From the 'icd10_wd_trl' file keep columns 0 and 1, name them (Text, ICD10),
# then reorder to (ICD10, Text) like the other frames.
df_icd10_wd_trl_lt = pd.read_csv(
    file_name,
    header=None,
    usecols=[0, 1],
    names=['Text', 'ICD10'],
)[['ICD10', 'Text']]
df_icd10_wd_trl_lt.head()

Unnamed: 0,ICD10,Text
0,B40,Et Chicago morbo BLASTOMYCOSIS
1,B40,Et BLASTOMYCOSIS morbo Gilchrists
2,B40,Et North American BLASTOMYCOSIS
3,B85,"Pediculus Pediculus humanus quod sit infestatio,"
4,B85,Pediculus humanus pediculosis quod infestatio ...


### Create and Refactor Dataframe

___Create Dataframe___

In [28]:
# All loaded frames, in the order they are stacked into the unified frame.
dfs = [
    df_icd10_bg,
    df_icd10_gs,
    df_icd10_ms,
    df_icd10_wd_bg,
    df_icd10_wd_lt,
    df_icd10_wd_trl_bg,
    df_icd10_wd_trl_lt,
]

In [29]:
# Stack every frame row-wise into one frame with a fresh 0..n-1 index.
df = pd.concat(objs=dfs, ignore_index=True)
df.head()

Unnamed: 0,ICD10,Text
0,A00,Холера
1,A00,"Холера, предизвикана от холерен вибрион 01, би..."
2,A00,"Холера, предизвикана от холерен вибрион 01, би..."
3,A00,"Холера, неуточнена"
4,A01,Тиф и паратиф


In [30]:
# (rows, columns) of the concatenated frame, before any code cleanup.
df.shape

(239340, 2)

___Refactor Dataframe___

In [31]:
# Normalize the 'ICD10' column in place (modifies `df`, returns nothing).
refactor_codes(df)

In [32]:
# Keep only rows whose code matches CODE_PATTERN. The explicit copy makes `df`
# an independent frame, so the column assignment below does not trigger
# pandas' SettingWithCopyWarning (the filter result is a slice of the old frame).
df = get_valid_df(df, CODE_PATTERN).copy()
if len(get_invalid_codes(df, CODE_PATTERN)) == 0:
    print('Dataframe contains only valid codes!')

# Safety net: truncate any code that is still longer than three characters.
if (df['ICD10'].str.len() > 3).any():
    df['ICD10'] = df['ICD10'].str[:3]
    print('Dataframe is filtered to contains only 3sign codes!')

df.head()

  return func(self, *args, **kwargs)


Dataframe contains only valid codes!


Unnamed: 0,ICD10,Text
0,A00,Холера
1,A00,"Холера, предизвикана от холерен вибрион 01, би..."
2,A00,"Холера, предизвикана от холерен вибрион 01, би..."
3,A00,"Холера, неуточнена"
4,A01,Тиф и паратиф


In [33]:
# (rows, columns) after dropping rows with codes that do not match CODE_PATTERN.
df.shape

(238973, 2)

In [34]:
# Drop fully duplicated (ICD10, Text) rows. Reassigning instead of using
# inplace=True avoids mutating a frame that was produced by filtering and
# keeps the cell safe to re-run.
df = df.drop_duplicates()
df.shape

(189756, 2)

***Filter by ICD10_bg_3sign.csv***

In [35]:
# Distinct codes of the unified frame that do not occur in the 'ICD10' column
# of df_icd10_bg.
invalid_codes = df.loc[~df['ICD10'].isin(df_icd10_bg['ICD10']), 'ICD10'].unique()
invalid_codes

array(['M56', 'B12', 'M26', 'B98', 'M58', 'B07', 'J59', 'G65', 'K34',
       'N38', 'N68', 'O54', 'F08', 'F46', 'E36', 'I17', 'C86', 'J09',
       'M04', 'M27', 'U06', 'K64', 'I75', 'G89', 'P84', 'R37', 'E08',
       'G14', 'C28'], dtype=object)

In [36]:
# Number of distinct codes that are missing from df_icd10_bg.
len(invalid_codes)

29

In [37]:
# Keep only the rows whose code occurs in the 'ICD10' column of df_icd10_bg.
df = df.loc[df['ICD10'].isin(df_icd10_bg['ICD10'])]
df.head()

Unnamed: 0,ICD10,Text
0,A00,Холера
1,A00,"Холера, предизвикана от холерен вибрион 01, би..."
2,A00,"Холера, предизвикана от холерен вибрион 01, би..."
3,A00,"Холера, неуточнена"
4,A01,Тиф и паратиф


In [38]:
# (rows, columns) of the final frame, after filtering by the codes of df_icd10_bg.
df.shape

(189254, 2)

### Save Dataframe

In [39]:
# Write the unified frame as a comma-separated UTF-8 CSV with a header row and
# without the index column.
file_name = WRITE_DIR + RESULT_FILE_NAME
df.to_csv(file_name, index=False, header=True, sep=',', encoding='utf-8')