# ICD10 4 Sign Datasets Unifier

In [1]:
import numpy as np
import pandas as pd
import re

### Constants and Datasets Names

In [2]:
CODE_PATTERN = '^([A-Z][0-9]{2}(?:\.[0-9]{1})?)$'

READ_DIR = '../datasets/ICD10_Training_data_4sign/'
WRITE_DIR = '../datasets/'

ICD10_ALL_3SIGN_FILE_NAME = '../datasets/ICD10_all_3sign.csv'
ICD10_BG_3SIGN_FILE_NAME = '../datasets/ICD10_Training_data_3sign/ICD10_bg_3sign.csv'

RESULT_FILE_NAME = 'ICD10_all_4sign.csv'

In [3]:
datasets = {
    "icd10_bg": "ICD10_bg_4sign.csv",
    "icd10_gs": "ICD10_golden standard_4sign.csv",
    "icd10_ms": "ICD10-MSWord_4sign.csv",
    "icd10_wd_bg": "Wikidata_ICD10_4sign_bg.csv",
    "icd10_wd_lt": "Wikidata_ICD10_4sign_lat.csv",
    "icd10_wd_trl": "Wikidata_ICD10_4sign_lat_transliterated.csv"
}

### Refactor Code Methods

In [4]:
# http://homoglyphs.net/
def replace_homoglyphs(s):
    """Replace Homoglyphs: Cyrillic Capital Letters to Latin Capital Letters!"""
    s = s.replace('А', 'A')
    s = s.replace('В', 'B')
    s = s.replace('С', 'C')
    s = s.replace('Е', 'E')
    s = s.replace('Н', 'H')
    s = s.replace('І', 'I')
    s = s.replace('Ј', 'J')
    s = s.replace('К', 'K')
    s = s.replace('М', 'M')
    s = s.replace('О', 'O')
    s = s.replace('Р', 'P')
    s = s.replace('Ѕ', 'S')
    s = s.replace('Т', 'T')
    s = s.replace('Ѵ', 'V')
    s = s.replace('Х', 'X')
    s = s.replace('У', 'Y')
    s = s.replace('Ү', 'Y')
    return s

In [5]:
def replace_omoglyphs(s):
    s = s.replace('Б', 'B')
    return s

In [6]:
def replace_comma_with_dot(code):
    return code.replace(',', '.')

In [7]:
def remove_pre_text(code):
    m = re.match(r'^([A-Za-zА-Яа-я]+\s+)[A-Z][0-9]{2}\.[0-9]$', code)
    if m:
        return code.replace(m.group(1), '')
    return code

In [8]:
def remove_space(code):
    return code.replace(' ', '')

In [9]:
def remove_plus_end(code):
    m = re.match(r'^.*[^\s](\s*\+)$', code)
    if m:
        return code.replace(m.group(1), '')
    return code

In [10]:
def remove_star_end(code):
    if re.match(r'^([A-Z][0-9]{2}\*)$', code):
        return code[0:3]
    return code

In [11]:
def remove_dot_end(code):
    if re.match(r'^([A-Z][0-9]{2}\.)$', code):
        return code[0:3]
    return code

In [12]:
def remove_dot_dash_end(code):
    if re.match(r'^([A-Z][0-9]{2}\.-)$', code):
        return code[0:3]
    return code

In [15]:
def convert_no_dot_code_to_dot_code(code):
    if re.match(r'^([A-Z][0-9]{2}[0-9]{1,2})$', code):
        return code[:3] + '.' + code[3:]
    return code

In [16]:
def remove_dot_letter_end(code):
    if re.match(r'^([A-Z][0-9]{2}\.[A-Z])$', code):
        return code[0:3]
    return code

In [17]:
def convert_5sign_to_4sign(code):
    if re.match(r'^([A-Z][0-9]{2}\.[0-9]{2})$', code):
        return code[0:5]
    return code

In [18]:
def refactor_code(code):
    code = code.upper()
    code = replace_homoglyphs(code)
    code = replace_omoglyphs(code)
    code = replace_comma_with_dot(code)
    code = remove_pre_text(code)
    code = remove_space(code)
    code = remove_plus_end(code)
    code = remove_star_end(code)
    code = remove_dot_end(code)
    code = remove_dot_dash_end(code)
    code = remove_dot_letter_end(code)
    code = convert_no_dot_code_to_dot_code(code)
    code = convert_5sign_to_4sign(code)
    return code

In [19]:
def refactor_codes(df):
    df['ICD10'] = df['ICD10'].apply(lambda x: refactor_code(x))

In [20]:
def get_invalid_codes(df, pattern):
    return list(df[~df['ICD10'].str.contains(pattern)]['ICD10'].unique())

In [21]:
def get_valid_df(df, pattern):
    return df[df['ICD10'].str.contains(pattern)]

In [29]:
def print_stat(df, only_main_stats=True):
    df = df.copy()
    
    file_name = '{0}{1}'.format(READ_DIR, datasets['icd10_bg'])
    df_icd10_bg = pd.read_csv(file_name, header=None, names=['ICD10', 'Text'])

    num_inv = len(get_invalid_codes(df, CODE_PATTERN))
    
    refactor_codes(df)
    num_inv_not_fixable = len(get_invalid_codes(df, CODE_PATTERN))
    
    num_inst = len(df)
    
    df = get_valid_df(df, CODE_PATTERN)
    num_valid_inst = len(df)
    
    df.drop_duplicates(inplace = True)
    num_inst_no_dupl = len(df)
    
    num_unique_codes = len(df['ICD10'].unique())
    
    df_icd10_bg_3sign = pd.read_csv(ICD10_BG_3SIGN_FILE_NAME, header=None, names=['ICD10', 'Text'])
    df_valid_codes = pd.concat([df_icd10_bg, df_icd10_bg_3sign], ignore_index=True)
    df = df[df['ICD10'].isin(df_valid_codes['ICD10'])]
    num_valid = len(df)
    
    num_unique_valid_codes = len(df['ICD10'].unique())
    
    if only_main_stats:
        print(num_inst_no_dupl, num_valid, num_unique_valid_codes)
    else:
        print(num_inv, num_inv_not_fixable, num_inst, num_valid_inst, num_inst_no_dupl, num_unique_codes, num_valid, num_unique_valid_codes)

### Read Datasets

In [23]:
file_name = '{0}{1}'.format(READ_DIR, datasets['icd10_bg'])
df_icd10_bg = pd.read_csv(file_name, header=None, names=['ICD10', 'Text'])
df_icd10_bg.head()

Unnamed: 0,ICD10,Text
0,A00,Холера
1,A00.0,"Холера, предизвикана от холерен вибрион 01, би..."
2,A00.1,"Холера, предизвикана от холерен вибрион 01, би..."
3,A00.9,"Холера, неуточнена"
4,A01,Тиф и паратиф


In [24]:
print_stat(df_icd10_bg)

  return func(self, *args, **kwargs)


93 0 10971 10971 10971 10971 10971 10971


In [26]:
file_name = '{0}{1}'.format(READ_DIR, datasets['icd10_gs'])
df_icd10_gs = pd.read_csv(file_name, header=None, names=['ICD10', 'Text'])
df_icd10_gs.head()

Unnamed: 0,ICD10,Text
0,A08.1,гастроентеропатия
1,A17.1,туберкулом
2,A26.0,еризипел
3,A66.8,латентна левостранна хемипареза
4,A66.8,десностранна латентна хемипареза


In [24]:
print_stat(df_icd10_gs)

  return func(self, *args, **kwargs)


14 0 4621 4621 4571 440 4493 431


In [25]:
file_name = '{0}{1}'.format(READ_DIR, datasets['icd10_ms'])
df_icd10_ms = pd.read_csv(file_name)
df_icd10_ms.head()

Unnamed: 0,ICD10,Text
0,F44.4,Абазия астазия хистерична
1,Q27.8,"Аберантен вроден артерия периферна ,"
2,Q27.8,"Аберантен вроден вена периферна ,"
3,Q83.8,Аберантен вроден гърда
4,Q89.2,"Аберантен вроден ендокринна жлеза,"


In [26]:
print_stat(df_icd10_ms)

  return func(self, *args, **kwargs)


440 0 44987 44987 44984 8330 44010 8043


In [27]:
file_name = '{0}{1}'.format(READ_DIR, datasets['icd10_wd_bg'])
df_icd10_wd_bg = pd.read_csv(file_name, header=None, names=['Text', 'ICD10'])
df_icd10_wd_bg = df_icd10_wd_bg[['ICD10', 'Text']]
df_icd10_wd_bg.head()

Unnamed: 0,ICD10,Text
0,B40.9,Бластомикоза и болест на Чикаго
1,B40,Бластомикоза и болест на Чикаго
2,B40.9,Бластомикоза и болест на Гилкристите
3,B40,Бластомикоза и болест на Гилкристите
4,B40.9,Бластомикоза и северноамериканска


In [28]:
print_stat(df_icd10_wd_bg)

  return func(self, *args, **kwargs)


74 6 50445 50358 50349 3471 48963 3292


In [29]:
file_name = '{0}{1}'.format(READ_DIR, datasets['icd10_wd_lt'])
df_icd10_wd_lt = pd.read_csv(file_name, header=None, names=['Text', 'ICD10'])
df_icd10_wd_lt = df_icd10_wd_lt[['ICD10', 'Text']]
df_icd10_wd_lt.head()

Unnamed: 0,ICD10,Text
0,B40.9,Et Chicago morbo BLASTOMYCOSIS
1,B40,Et Chicago morbo BLASTOMYCOSIS
2,B40.9,Et BLASTOMYCOSIS morbo Gilchrists
3,B40,Et BLASTOMYCOSIS morbo Gilchrists
4,B40.9,Et North American BLASTOMYCOSIS


In [30]:
print_stat(df_icd10_wd_lt)

  return func(self, *args, **kwargs)


63 7 52007 51972 51962 3468 50536 3289


In [31]:
file_name = '{0}{1}'.format(READ_DIR, datasets['icd10_wd_trl'])

In [32]:
df_icd10_wd_trl_bg = pd.read_csv(file_name, usecols=[1,2], header=None, names=['ICD10', 'Text'])
df_icd10_wd_trl_bg.head()

Unnamed: 0,ICD10,Text
0,B40.9,﻿ет хикаго морбо бластомикозис
1,B40,ет хикаго морбо бластомикозис
2,B40.9,ет бластомикозис морбо гилхристс
3,B40,ет бластомикозис морбо гилхристс
4,B40.9,ет норт американ бластомикозис


In [33]:
print_stat(df_icd10_wd_trl_bg)

  return func(self, *args, **kwargs)


63 7 52007 51972 51853 3468 50428 3289


In [34]:
df_icd10_wd_trl_lt = pd.read_csv(file_name, usecols=[0,1], header=None, names=['Text', 'ICD10'])
df_icd10_wd_trl_lt = df_icd10_wd_trl_lt[['ICD10', 'Text']]
df_icd10_wd_trl_lt.head()

Unnamed: 0,ICD10,Text
0,B40.9,Et Chicago morbo BLASTOMYCOSIS
1,B40,Et Chicago morbo BLASTOMYCOSIS
2,B40.9,Et BLASTOMYCOSIS morbo Gilchrists
3,B40,Et BLASTOMYCOSIS morbo Gilchrists
4,B40.9,Et North American BLASTOMYCOSIS


In [35]:
print_stat(df_icd10_wd_trl_lt)

  return func(self, *args, **kwargs)


63 7 52007 51972 51962 3468 50536 3289


### Create and Refactor Dataframe

___Create Dataframe___

In [80]:
dfs = [df_icd10_bg, df_icd10_gs, df_icd10_ms, df_icd10_wd_bg, df_icd10_wd_lt, df_icd10_wd_trl_bg, df_icd10_wd_trl_lt]

In [81]:
df = pd.concat(dfs, ignore_index=True)
df.head()

Unnamed: 0,ICD10,Text
0,A00,Холера
1,A00.0,"Холера, предизвикана от холерен вибрион 01, би..."
2,A00.1,"Холера, предизвикана от холерен вибрион 01, би..."
3,A00.9,"Холера, неуточнена"
4,A01,Тиф и паратиф


In [84]:
refactor_codes(df)
get_invalid_codes(df, CODE_PATTERN)

  return func(self, *args, **kwargs)


['Ø14', 'Г-20', 'ICD10', 'D3A.8', 'T508X', 'G3L8', 'LXXXIVF', '0', 'ALXXXIII.']

In [59]:
print_stat(df)

  return func(self, *args, **kwargs)


640 9 267045 266853 210379 11406 205276 10971


In [60]:
df.shape

(267045, 2)

In [61]:
df2 = [df, pd.read_csv(ICD10_ALL_3SIGN_FILE_NAME)]
df2 = pd.concat(df2, ignore_index=True)
print_stat(df2)

  return func(self, *args, **kwargs)


640 9 456299 456107 383042 11406 377939 10971


___Refactor Dataframe___

In [62]:
refactor_codes(df)

In [63]:
df = get_valid_df(df, CODE_PATTERN)
if len(get_invalid_codes(df, CODE_PATTERN)) == 0:
    print('Dataframe contains only valid codes!')

df.head()

  return func(self, *args, **kwargs)


Dataframe contains only valid codes!


Unnamed: 0,ICD10,Text
0,A00,Холера
1,A00.0,"Холера, предизвикана от холерен вибрион 01, би..."
2,A00.1,"Холера, предизвикана от холерен вибрион 01, би..."
3,A00.9,"Холера, неуточнена"
4,A01,Тиф и паратиф


In [64]:
df.shape

(266853, 2)

___Concatenate 3 Sign Codes to 4 Sign Codes___

In [65]:
df_all_3sign = pd.read_csv(ICD10_ALL_3SIGN_FILE_NAME)
df = pd.concat([df, df_all_3sign], ignore_index=True)
df.head()

Unnamed: 0,ICD10,Text
0,A00,Холера
1,A00.0,"Холера, предизвикана от холерен вибрион 01, би..."
2,A00.1,"Холера, предизвикана от холерен вибрион 01, би..."
3,A00.9,"Холера, неуточнена"
4,A01,Тиф и паратиф


In [66]:
df.shape

(456107, 2)

In [67]:
df.drop_duplicates(inplace = True)
df.shape

(383042, 2)

***Filter by ICD10_bg_3sign.csv and ICD10_bg_4sign.csv***

Get valid codes.

In [68]:
df_icd10_bg_3sign = pd.read_csv(ICD10_BG_3SIGN_FILE_NAME, header=None, names=['ICD10', 'Text'])
df_valid_codes = pd.concat([df_icd10_bg, df_icd10_bg_3sign], ignore_index=True)
df_valid_codes.head()

Unnamed: 0,ICD10,Text
0,A00,Холера
1,A00.0,"Холера, предизвикана от холерен вибрион 01, би..."
2,A00.1,"Холера, предизвикана от холерен вибрион 01, би..."
3,A00.9,"Холера, неуточнена"
4,A01,Тиф и паратиф


In [69]:
valid_codes = df_valid_codes['ICD10'].unique()
len(valid_codes)

11064

Try to transform invalid codes to valid codes.

In [70]:
df_invalid_codes = df[~df['ICD10'].isin(valid_codes)]
df_invalid_codes['ICD10'] = df_invalid_codes['ICD10'].str[0:3]

df_valid_invalid_codes = df_invalid_codes[df_invalid_codes['ICD10'].isin(valid_codes)]
if len(df_valid_invalid_codes) > 0:
    print('{0} valid ICD10 records are generated from invalid ICD10 codes!'.format(len(df_valid_invalid_codes)))
    print('The generated unique valid ICD10 codes are {0}:'.format(len(df_valid_invalid_codes['ICD10'].unique())))
    print(df_valid_invalid_codes['ICD10'].unique())
    df = pd.concat([df, df_valid_invalid_codes], ignore_index=True)
df.head()

4582 valid ICD10 records are generated from invalid ICD10 codes!
The generated unique valid ICD10 codes are 150:
['E07' 'E13' 'E27' 'F10' 'G06' 'N05' 'R05' 'E14' 'Z97' 'Z84' 'N00' 'N01'
 'N02' 'N03' 'N04' 'N06' 'N07' 'R89' 'R87' 'M36' 'I99' 'R86' 'M68' 'B49'
 'F13' 'P08' 'F12' 'F14' 'F11' 'F15' 'F19' 'F16' 'F18' 'E10' 'E11' 'E12'
 'Q35' 'F17' 'H00' 'R45' 'L85' 'K27' 'Q27' 'O02' 'Q14' 'E85' 'R84' 'F54'
 'F05' 'P95' 'N70' 'D18' 'P11' 'B15' 'N62' 'F78' 'R32' 'B22' 'K28' 'K26'
 'K25' 'M45' 'I27' 'E71' 'C93' 'D68' 'E31' 'E78' 'A92' 'C84' 'C85' 'C94'
 'D47' 'B19' 'E88' 'R53' 'C81' 'C82' 'C91' 'F71' 'C80' 'K02' 'G47' 'C88'
 'L89' 'N45' 'N47' 'N36' 'A38' 'G83' 'H46' 'H73' 'N46' 'A24' 'D70' 'K20'
 'K85' 'R80' 'A77' 'C90' 'D89' 'E70' 'I72' 'M79' 'N50' 'H71' 'L74' 'F42'
 'R48' 'A48' 'A07' 'C46' 'J47' 'M67' 'O14' 'Q31' 'R31' 'C96' 'J81' 'N26'
 'H55' 'O43' 'I31' 'R91' 'K22' 'N44' 'F01' 'M31' 'F72' 'G90' 'G98' 'G44'
 'H47' 'B33' 'K52' 'L82' 'M72' 'Q83' 'G25' 'I50' 'I77' 'K55' 'N41' 'N21'
 'C92' 'H15

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,ICD10,Text
0,A00,Холера
1,A00.0,"Холера, предизвикана от холерен вибрион 01, би..."
2,A00.1,"Холера, предизвикана от холерен вибрион 01, би..."
3,A00.9,"Холера, неуточнена"
4,A01,Тиф и паратиф


In [71]:
df.shape

(387624, 2)

In [72]:
df.drop_duplicates(inplace = True)
df.shape

(383042, 2)

> _All generated (transformed) records are duplicates!_

In [73]:
invalid_codes_3sign = df_invalid_codes[~df_invalid_codes['ICD10'].isin(valid_codes)]['ICD10'].unique()
print('{0} unique ICD10 codes cannot be transformed to valid ICD10 codes:'.format(len(invalid_codes_3sign)))
invalid_codes_3sign

29 unique ICD10 codes cannot be transformed to valid ICD10 codes:


array(['M56', 'B12', 'M26', 'B98', 'M58', 'B07', 'J59', 'G65', 'K34',
       'N38', 'N68', 'O54', 'F08', 'F46', 'E36', 'I17', 'C86', 'J09',
       'M04', 'M27', 'U06', 'K64', 'I75', 'G89', 'P84', 'R37', 'E08',
       'G14', 'C28'], dtype=object)

In [74]:
invalid_codes = df[~df['ICD10'].isin(valid_codes)]['ICD10'].unique()
print('Invalid codes without transformations are {0}:'.format(len(invalid_codes)))
invalid_codes

Invalid codes without transformations are 435:


array(['E07.7', 'E13.5', 'E27.6', 'E27.7', 'E13.1', 'E13.4', 'E13.7',
       'F10.3', 'G06.6', 'N05.0', 'R05.2', 'M56.8', 'F10.4', 'F10.6',
       'F10.7', 'F10.2', 'F10.0', 'F10.5', 'E14.4', 'Z97.6', 'Z84.4',
       'E14.5', 'E14.3', 'B12', 'M26.4', 'N00.0', 'N01.0', 'N02.0',
       'N03.0', 'N04.0', 'N06.0', 'N07.0', 'R89.4', 'R89.8', 'R89.6',
       'R87.6', 'R89.7', 'E14.6', 'M36.6', 'I99.0', 'R86.9', 'M68.2',
       'B98.9', 'M58.0', 'B49.0', 'N00.6', 'N01.6', 'N02.6', 'N03.6',
       'N04.6', 'N05.6', 'N06.6', 'N07.6', 'B07', 'F13.2', 'P08.8',
       'F10.1', 'F13.1', 'F12.1', 'F14.1', 'F11.1', 'F15.1', 'F19.1',
       'F16.1', 'F18.1', 'J59.9', 'N00.3', 'N01.3', 'N02.3', 'N03.3',
       'N04.3', 'N05.3', 'N06.3', 'N07.3', 'N00.5', 'N01.5', 'N02.5',
       'N03.5', 'N04.5', 'N05.5', 'N06.5', 'N07.5', 'N00.2', 'N01.2',
       'N02.2', 'N03.2', 'N04.2', 'N05.2', 'N06.2', 'N07.2', 'N00.7',
       'N01.7', 'N02.7', 'N03.7', 'N04.7', 'N05.7', 'N06.7', 'N07.7',
       'N00.4', 'N01.4',

Remove invalid codes

In [75]:
df = df[df['ICD10'].isin(valid_codes)]
df.head()

Unnamed: 0,ICD10,Text
0,A00,Холера
1,A00.0,"Холера, предизвикана от холерен вибрион 01, би..."
2,A00.1,"Холера, предизвикана от холерен вибрион 01, би..."
3,A00.9,"Холера, неуточнена"
4,A01,Тиф и паратиф


In [76]:
df.shape

(377939, 2)

In [77]:
df.drop_duplicates(inplace = True)
df.shape

(377939, 2)

In [79]:
len(df['ICD10'].unique())

10971

### Save Dataframe

In [47]:
# file_name = '{0}{1}'.format(WRITE_DIR, RESULT_FILE_NAME)
# df.to_csv(file_name, sep=',', encoding='utf-8', header=True, index=False)