# Generate cause codes dict for ICD10 condensed codes

In [43]:
import csv

def get_icd10_condensed_cause_code_dict(cause_codes_path=None):
    """Build a lookup from condensed ICD-10 cause code to cause description.

    Reads the cause-code CSV, skips its header line, and maps the first
    column of every remaining row to the third column.

    Args:
        cause_codes_path: Optional path to the cause-code CSV. When omitted,
            the notebook's original hard-coded location of
            ``ICD10_list_101_103_cause_codes.csv`` is used, so existing
            zero-argument calls behave exactly as before.

    Returns:
        dict mapping the first-column value (str) to the third-column value
        (str). If a key appears on several rows, the last row wins.

    Raises:
        OSError: if the file cannot be opened.
        IndexError: if a non-empty row has fewer than three columns.
    """
    if cause_codes_path is None:
        cause_codes_path = "/home/danny/git/who-mortality-data-query-engine/resources" + r"/who-mortality/ICD10_list_101_103_cause_codes.csv"

    cc_dict = {}
    # newline='' lets the csv module handle line endings itself, including
    # newlines embedded in quoted fields.
    with open(cause_codes_path, 'r', encoding='utf-8', newline='') as f:
        csv_reader = csv.reader(f, delimiter=',')
        next(csv_reader, None)  # skip the header row (tolerates an empty file)
        for row in csv_reader:
            if not row:
                # csv.reader yields [] for a blank line; there is nothing to map.
                continue
            cc_dict[row[0]] = row[2]
    return cc_dict

# Generate cause codes dict for ICD10 3 character codes

Annex Table 2 (field "List") states that the value in the List column determines which Table 8 column a row's cause code refers to: 'code' or 'Detailed List Numbers'.

In [44]:
import re
# Captures the leading letter and the two digits that follow it, e.g. "A01".
cause_code_split_regex = re.compile(r"([A-Z]{1})([0-9]{2})")
# Recognises a "start-end" pair of letter+two-digit codes, e.g. "A00-A09".
ranged_cause_code_validity_regex = re.compile(r"[A-Z]{1}[0-9]{2}-[A-Z]{1}[0-9]{2}")


def extract_char_numeral_pair(code_str):
    """Split a cause code into its letter and its two-digit number.

    The pattern is matched at the start of ``code_str`` only, so any
    characters after the first three are ignored.

    Args:
        code_str: Cause code such as ``"A01"``.

    Returns:
        Tuple ``(letter, two_digit_string)``, e.g. ``("A", "01")``.

    Raises:
        ValueError: if ``code_str`` does not begin with a capital letter
            followed by two digits.
    """
    parsed = cause_code_split_regex.match(code_str)
    if parsed is None:
        raise ValueError(f"Unable to extract char and numeral from {code_str}")
    return parsed.groups()

In [45]:
def code_range_str_to_list(cause_code_range_str):
    """Expand an ICD-10 3-character code range into the codes it covers.

    A range looks like ``"A98-B01"``: a capital letter plus two digits, a
    dash, then another capital letter plus two digits. Codes are ordered by
    letter and then by number, and every letter spans 00-99, so
    ``"A98-B01"`` yields ``["A98", "A99", "B00", "B01"]``.

    Args:
        cause_code_range_str: A single code (e.g. ``"A09"``) or a range
            (e.g. ``"A00-A09"``).

    Returns:
        list of str. Input that does not start with a range is returned
        unchanged as a one-element list. For a range, every code from the
        start code to the end code, both inclusive; characters following
        the end code are ignored.

    Raises:
        ValueError: if the string contains more than one ``-``, or if the
            end code sorts before the start code.
    """
    # Matched at the start of the string only, so trailing characters after
    # the end code do not stop the value from being treated as a range.
    range_match = re.match(r"([A-Z])([0-9]{2})-([A-Z])([0-9]{2})", cause_code_range_str)
    if range_match is None:
        return [cause_code_range_str]

    if cause_code_range_str.count('-') != 1:
        raise ValueError(f"Expected exactly one '-' in code range {cause_code_range_str}")

    start_char, start_num, end_char, end_num = range_match.groups()

    # Place every code on one integer axis (letter * 100 + number). Stepping
    # from x99 to the next integer then lands on the next letter's 00 code,
    # and the loop is bounded by construction.
    start_position = ord(start_char) * 100 + int(start_num)
    end_position = ord(end_char) * 100 + int(end_num)
    if end_position < start_position:
        raise ValueError(
            f"Failed to generate list between codes {start_char}{start_num} and "
            f"{end_char}{end_num}: end code precedes start code"
        )

    output_list = []
    for position in range(start_position, end_position + 1):
        char_ordinal, num = divmod(position, 100)
        output_list.append(chr(char_ordinal) + str(num).zfill(2))
    return output_list

In [46]:
def detailed_code_str_to_list(detailed_list_numbers_str):
    """Turn a comma-separated string of codes and code ranges into a flat list.

    All spaces are removed, the string is split on commas, and each piece is
    expanded with ``code_range_str_to_list``. The expanded pieces are
    concatenated in their original order.

    Args:
        detailed_list_numbers_str: e.g. ``"A00-A02, A09"``.

    Returns:
        list of str with every individual code.
    """
    compact_str = detailed_list_numbers_str.replace(' ', '')
    return [
        code
        for code_range_str in compact_str.split(',')
        for code in code_range_str_to_list(code_range_str)
    ]

In [47]:
def get_icd10_3_char_cause_code_dict(cause_codes_path=None):
    """Build a lookup from 3-character ICD-10 code to cause description.

    Reads the cause-code CSV, expands each row's 'Detailed List Numbers'
    value (single codes and ranges) into individual codes, and maps every
    such code to that row's 'Cause'. Rows without a 'Detailed List Numbers'
    value are skipped.

    Args:
        cause_codes_path: Optional path to the cause-code CSV. When omitted,
            the notebook's original hard-coded location of
            ``ICD10_list_101_103_cause_codes.csv`` is used, so existing
            zero-argument calls behave exactly as before.

    Returns:
        dict mapping code (str) to cause. Rows are processed in file order,
        so when several rows cover the same code the last row wins.
    """
    # Imported locally because this function is defined in a cell that runs
    # before the notebook's `import pandas as pd` cell.
    import pandas as pd

    if cause_codes_path is None:
        cause_codes_path = "/home/danny/git/who-mortality-data-query-engine/resources" + r"/who-mortality/ICD10_list_101_103_cause_codes.csv"

    cause_codes_df = pd.read_csv(
        cause_codes_path,
        dtype={'code': int, 'Detailed List Numbers': str, 'Cause': str},
    ).rename(columns={'Detailed List Numbers': 'detailed_codes', 'Cause': 'cause'})

    rows_with_codes = cause_codes_df[cause_codes_df['detailed_codes'].notna()]

    detailed_code_cause_dict = {}
    for detailed_codes_str, cause in zip(rows_with_codes['detailed_codes'], rows_with_codes['cause']):
        for code in detailed_code_str_to_list(detailed_codes_str):
            detailed_code_cause_dict[code] = cause

    return detailed_code_cause_dict

# Generate cause codes for ICD 10 special list for Portugal - data for 2004-2005

In [48]:
def get_icd10_portugal_condensed_cause_code_dict(cause_codes_path=None):
    """Build a lookup from Portugal (List UE1) cause code to cause description.

    Reads the cause-code CSV, skips its header line, and maps the first
    column of every remaining row to the third column.

    Args:
        cause_codes_path: Optional path to the cause-code CSV. When omitted,
            the notebook's original hard-coded location of
            ``ICD10_list_UE1_cause_codes.csv`` is used, so existing
            zero-argument calls behave exactly as before.

    Returns:
        dict mapping the first-column value (str) to the third-column value
        (str). If a key appears on several rows, the last row wins.

    Raises:
        OSError: if the file cannot be opened.
        IndexError: if a non-empty row has fewer than three columns.
    """
    if cause_codes_path is None:
        cause_codes_path = "/home/danny/git/who-mortality-data-query-engine/resources" + r"/who-mortality/ICD10_list_UE1_cause_codes.csv"

    cc_dict = {}
    # newline='' lets the csv module handle line endings itself, including
    # newlines embedded in quoted fields.
    with open(cause_codes_path, 'r', encoding='utf-8', newline='') as f:
        csv_reader = csv.reader(f, delimiter=',')
        next(csv_reader, None)  # skip the header row (tolerates an empty file)
        for row in csv_reader:
            if not row:
                # csv.reader yields [] for a blank line; there is nothing to map.
                continue
            cc_dict[row[0]] = row[2]
    return cc_dict

# Update mortality data cause codes with relevant mapping

### Read in mortality rates data

In [49]:
import pandas as pd

# Location of the raw mortality extract read by this notebook.
who_mortality_rates_name = "~/git/who-mortality-data-query-engine/ingest/Morticd10_part1"

# Only these columns are given an explicit dtype; every other column is left
# for pandas to infer.
mortality_dtypes = {'Country': int, 'Year': int, 'Sex': int, 'Deaths1': int}
mortality_pt1_df = pd.read_csv(who_mortality_rates_name, dtype=mortality_dtypes)
mortality_pt1_df.head()

Unnamed: 0,Country,Admin1,SubDiv,Year,List,Cause,Sex,Frmat,IM_Frmat,Deaths1,...,Deaths21,Deaths22,Deaths23,Deaths24,Deaths25,Deaths26,IM_Deaths1,IM_Deaths2,IM_Deaths3,IM_Deaths4
0,1400,,,2001,101,1000,1,7,8,332,...,95.0,,,,,0.0,8.0,,,
1,1400,,,2001,101,1000,2,7,8,222,...,112.0,,,,,0.0,11.0,,,
2,1400,,,2001,101,1001,1,7,8,24,...,5.0,,,,,0.0,0.0,,,
3,1400,,,2001,101,1001,2,7,8,14,...,6.0,,,,,0.0,0.0,,,
4,1400,,,2001,101,1002,1,7,8,0,...,0.0,,,,,0.0,0.0,,,


### Check which cause code revisions need updating

In [50]:
# Distinct values of the List column, to see which code lists need mapping.
mortality_pt1_df['List'].unique()

array([101, 103, 104, '104', '10M', 'UE1', '103'], dtype=object)

The unique List values above show that some integer list identifiers are stored as strings, e.g. '103' instead of 103, so a comparison against the integer value misses those rows. To rectify this, the List value of such rows must be converted to the appropriate type before mapping.

### Apply condensed cause codes to List=101 rows

In [51]:
# Lookup of condensed cause code -> cause description, applied to the List=101 rows.
icd10_condensed_cause_code_dict = get_icd10_condensed_cause_code_dict()

In [52]:
# Select the List=101 rows once and reuse the mask for both the read and the
# write; codes missing from the dict become NaN.
is_list_101 = mortality_pt1_df['List'] == 101
mortality_pt1_df.loc[is_list_101, 'Cause'] = (
    mortality_pt1_df.loc[is_list_101, 'Cause'].map(icd10_condensed_cause_code_dict)
)
mortality_pt1_df.head()

Unnamed: 0,Country,Admin1,SubDiv,Year,List,Cause,Sex,Frmat,IM_Frmat,Deaths1,...,Deaths21,Deaths22,Deaths23,Deaths24,Deaths25,Deaths26,IM_Deaths1,IM_Deaths2,IM_Deaths3,IM_Deaths4
0,1400,,,2001,101,All causes,1,7,8,332,...,95.0,,,,,0.0,8.0,,,
1,1400,,,2001,101,All causes,2,7,8,222,...,112.0,,,,,0.0,11.0,,,
2,1400,,,2001,101,Certain infectious and parasitic diseases,1,7,8,24,...,5.0,,,,,0.0,0.0,,,
3,1400,,,2001,101,Certain infectious and parasitic diseases,2,7,8,14,...,6.0,,,,,0.0,0.0,,,
4,1400,,,2001,101,Cholera,1,7,8,0,...,0.0,,,,,0.0,0.0,,,


### Apply ICD(revision 3) cause codes to List=103 rows
These are indicated by having a List value of 103.
They are typically in the format of 1 character followed by two numerals, e.g. A01

In [54]:
# Lookup of 3-character ICD-10 code -> cause description, built by expanding
# the 'Detailed List Numbers' ranges of the cause-code file.
icd10_3_char_cause_code_dict = get_icd10_3_char_cause_code_dict()

In [55]:
# Normalise string-typed '103' entries to the integer 103 first, so the mask
# below catches every List=103 row.
mortality_pt1_df.loc[mortality_pt1_df['List'] == '103', 'List'] = 103

is_list_103 = mortality_pt1_df['List'] == 103
mortality_pt1_df.loc[is_list_103, 'Cause'] = (
    mortality_pt1_df.loc[is_list_103, 'Cause'].map(icd10_3_char_cause_code_dict)
)
mortality_pt1_df.loc[is_list_103, 'Cause'].tail()

1388101    All other external causes
1388102    All other external causes
1388103    All other external causes
1388104    All other external causes
1388105    All other external causes
Name: Cause, dtype: object

### Apply ICD10(revision 4) cause codes to List=104 rows
These are indicated by having a List value of 104.
They are typically in the format of 1 character followed by three numerals, e.g. A010
Detailed cause codes for ICD 10 can be viewed here - https://icd.who.int/browse10/2019/en

The additional numeral can be interpreted as a subset of the cause indicated by the previous character and numbers. For example:
A01 = "Typhoid and paratyphoid fevers"
A01.2 = "Paratyphoid fever B" 

For the purposes of this project, this level of granularity is not necessary, so all suffixes will be ignored and each code will resolve to its parent category, i.e. A012 will be converted to "Typhoid and paratyphoid fevers"

In [56]:
# Normalise string-typed '104' entries to the integer 104 before selecting rows.
mortality_pt1_df.loc[mortality_pt1_df['List'] == '104', 'List'] = 104

# Truncate each code to its first three characters (the parent category),
# then look the parent up in the 3-character dict.
is_list_104 = mortality_pt1_df['List'] == 104
mortality_pt1_df.loc[is_list_104, 'Cause'] = (
    mortality_pt1_df.loc[is_list_104, 'Cause']
    .map(lambda cause_code: cause_code[:3])
    .map(icd10_3_char_cause_code_dict)
)

## Apply ICD10(revision M) cause codes to List=10M rows
These are indicated by having a List value of 10M.
They are typically in the format of 1 character followed by three numerals, e.g. A010
Detailed cause codes for ICD 10 can be viewed here - https://icd.who.int/browse10/2019/en

The additional numeral can be interpreted as a subset of the cause indicated by the previous character and numbers. For example:
A01 = "Typhoid and paratyphoid fevers"
A01.2 = "Paratyphoid fever B" 

For the purposes of this project, this level of granularity is not necessary, so all suffixes will be ignored and each code will resolve to its parent category, i.e. A012 will be converted to "Typhoid and paratyphoid fevers"

In [57]:
# Same treatment as the List=104 rows: keep the first three characters of the
# code and map that parent category to its cause.
is_list_10m = mortality_pt1_df['List'] == '10M'
mortality_pt1_df.loc[is_list_10m, 'Cause'] = (
    mortality_pt1_df.loc[is_list_10m, 'Cause']
    .map(lambda cause_code: cause_code[:3])
    .map(icd10_3_char_cause_code_dict)
)

## Apply ICD10(revision UE1) Portugal cause codes to List=UE1 rows

In [58]:
# Distinct raw cause codes used by the List=UE1 rows.
mortality_pt1_df.loc[mortality_pt1_df['List'] == 'UE1', 'Cause'].unique()

array(['CH00', 'CH01', 'UE02', 'UE04', 'UE05', 'CH02', 'UE07', 'UE08',
       'UE09', 'UE10', 'UE11', 'UE12', 'UE13', 'UE14', 'UE15', 'UE16',
       'UE17', 'UE18', 'UE19', 'UE20', 'UE21', 'UE22', 'UE23', 'UE24',
       'CH03', 'CH04', 'UE27', 'CH05', 'UE29', 'CH06', 'CH07', 'CH08',
       'UE32', 'CH09', 'UE34', 'UE35', 'UE36', 'CH10', 'UE39', 'UE40',
       'UE41', 'CH11', 'UE43', 'UE44', 'CH12', 'CH13', 'UE47', 'CH14',
       'UE49', 'CH15', 'CH16', 'CH17', 'UE53', 'UE54', 'CH18', 'UE57',
       'CH20', 'UE59', 'UE60', 'UE61', 'UE62', 'UE63', 'UE64', 'UE65'],
      dtype=object)

The cause codes in the List=UE1 rows are all 4 characters long, which for the Portugal data corresponds to the 'condensed' codes; hence the condensed Portugal cause code dict will be used.

In [59]:
# Lookup of Portugal condensed cause code -> cause description, applied to the List=UE1 rows.
icd10_portugal_condensed_cause_code_dict = get_icd10_portugal_condensed_cause_code_dict()

In [60]:
# Replace the UE1 cause codes with their descriptions; codes missing from the
# dict become NaN.
is_list_ue1 = mortality_pt1_df['List'] == 'UE1'
mortality_pt1_df.loc[is_list_ue1, 'Cause'] = (
    mortality_pt1_df.loc[is_list_ue1, 'Cause'].map(icd10_portugal_condensed_cause_code_dict)
)

### Check what cause codes didn't get replaced

In [61]:
# Any Cause that is still 4 characters or shorter is taken to be a raw code
# that was never replaced; report which Lists such rows belong to.
# NOTE(review): a code that was absent from its lookup dict was turned into
# NaN by map(), and a NaN length never satisfies `<= 4`, so those rows are not
# reported here. Consider also checking mortality_pt1_df['Cause'].isna().
mortality_pt1_df.loc[mortality_pt1_df['Cause'].str.len() <= 4, 'List'].unique()

array([], dtype=object)

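No strings of 4 characters or fewer (the maximum length of a cause code) remain, so every cause code that had an entry in its lookup dict has been replaced by the corresponding cause of death. Note that this check cannot see codes that had no entry: `map` turns those into NaN, and a NaN never satisfies the `<= 4` length comparison, so any such rows would need a separate `isna()` check.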

A quick check of the mapped causes confirms this

In [68]:
# List the distinct Cause values that remain after all of the mappings above.
mortality_pt1_df['Cause'].unique()

array(['All causes', 'Certain infectious and parasitic diseases',
       'Cholera',
       'Diarrhoea and gastroenteritis of presumed infectious origin',
       'Other intestinal infectious diseases', 'Respiratory tuberculosis',
       'Other tuberculosis', 'Plague', 'Tetanus', 'Diphtheria',
       'Whooping cough', 'Meningococcal infection', 'Septicaemia',
       'Infections with a predominantly sexual mode of transmission',
       'Acute poliomyelitis', 'Rabies', 'Yellow fever',
       'Other arthropod-borne viral fevers and viral haemorrhagic fevers',
       'Measles', 'Viral hepatitis',
       'Human immunodeficiency virus [HIV] disease', 'Malaria',
       'Leishmaniasis', 'Trypanosomiasis', 'Schistosomiasis',
       'Remainder of certain infectious and parasitic diseases',
       'Neoplasms', 'Malignant neoplasm of lip, oral cavity and pharynx',
       'Malignant neoplasm of oesophagus',
       'Malignant neoplasm of stomach',
       'Malignant neoplasm of colon, rectum and anus',