In [2]:
'''
@File:   GCAF_Preprocessing.ipynb
@ Author: Asra Aslam
@ Create Time: 2024-09-17 11:26:39
@ Modified time: 2024-09-17 12:09:44
@ License :   (C)Copyright Asra Aslam DynAIRX
@ Description: GCAF_Preprocessing
'''

from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report, recall_score, precision_score, accuracy_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix

from lifelines import KaplanMeierFitter

In [3]:
# Input: Excel workbook that is read below as the Read code -> SNOMED lookup table.
filepath_snomed="mappings_and_baselines/Multimorbidity_Codelist_16.11.2021.xlsx"
# Input: CSV codelist that is read below (columns Read_code, Term, Disease).
filepath_Baseline1_codelist="mappings_and_baselines/Baseline1_codelist.csv"
# Output: the codelist joined with its SnomedCTConceptId column (written in the last cell).
filepath_snomed_readcode="Output/Snomed_Generated_for_Baseline1_codelist.csv"
# Output: the rows of the joined table that contain missing values (written in the last cell).
# NOTE(review): the file name ends in "codelis.csv", which looks like a typo for
# "codelist.csv" -- confirm nothing else reads this exact name before renaming it.
filepath_missing_snomed="Output/Missing_Snomed_in_Mapping_for_Baseline1_codelis.csv"

# Analyzing Lauren's codelist file, which contains Read codes

In [4]:
# Load the Read-code codelist CSV into a DataFrame and display it.
df_Baseline1_codelist = pd.read_csv(filepath_Baseline1_codelist) 
df_Baseline1_codelist

Unnamed: 0,Read_code,Term,Disease
0,7A11000,Emerg repl aneurysm bifurc aorta by anast aort...,Abdominal Aortic Aneurysm
1,7A11211,Y graft of abdominal Aortic aneurysm (emergency),Abdominal Aortic Aneurysm
2,7A13.00,Emergency replacement of aneurysmal segment of...,Abdominal Aortic Aneurysm
3,7A13.11,Emergency repair of aortic aneurysm,Abdominal Aortic Aneurysm
4,7A13411,Tube graft abdominal Aortic aneurysm (emergency),Abdominal Aortic Aneurysm
...,...,...,...
5719,C32y.00,Other disorders of lipoid metabolism,Lipid Disorder
5720,C32yz00,Other disorder of lipoid metabolism NOS,Lipid Disorder
5721,C32z.00,Disorder of lipoid metabolism NOS,Lipid Disorder
5722,Cyu8D00,[X]Other hyperlipidaemia,Lipid Disorder


In [5]:
# Show the column names of the codelist as loaded from the CSV.
df_Baseline1_codelist.columns

Index(['Read_code', 'Term', 'Disease'], dtype='object')

# Analyzing Pieta's file, which maps Read codes to SNOMED CT codes

In [6]:
# Load the first sheet of the mapping workbook. The three identifier columns are
# forced to `str` so pandas keeps them as text instead of inferring a numeric type.
df_snomed = pd.read_excel(
    filepath_snomed,
    sheet_name=0,
    dtype=dict.fromkeys(['SnomedCTConceptId', 'SnomedCTDescriptionId', 'MedCodeId'], str),
)
df_snomed

Unnamed: 0,CleansedReadCode,Disease,MedCodeId,Term,OriginalReadCode,SnomedCTConceptId,SnomedCTDescriptionId
0,7A11000,ABDOMINALAORTICANEURYSM,639641000006113,Emerg repl aneurysm bifurc aorta by anast aort...,7A110,175283003,639641000006113
1,7A11211,ABDOMINALAORTICANEURYSM,349689013,Y graft of abdominal Aortic aneurysm (emergency),7A112-1,233374003,349689013
2,7A13.00,ABDOMINALAORTICANEURYSM,271446014,Emergency replacement of aneurysmal segment of...,7A13,175297006,271446014
3,7A13.11,ABDOMINALAORTICANEURYSM,271447017,Emergency repair of aortic aneurysm,7A13-1,175297006,271447017
4,7A13411,ABDOMINALAORTICANEURYSM,349694013,Tube graft abdominal Aortic aneurysm (emergency),7A134-1,233377005,349694013
...,...,...,...,...,...,...,...
5719,C32y.00,LIPIDDISORDER,293342011,Other disorders of lipoid metabolism,C32y,267431006,32501000006111
5720,C32yz00,LIPIDDISORDER,293353017,Other disorder of lipoid metabolism NOS,C32yz,267431006,32311000006115
5721,C32z.00,LIPIDDISORDER,293354011,Disorder of lipoid metabolism NOS,C32z,267431006,624051000006117
5722,Cyu8D00,LIPIDDISORDER,293821018,[X]Other hyperlipidaemia,Cyu8D,55822004,410871000006119


In [7]:
# Inspect the column dtypes of the mapping table after loading.
df_snomed.dtypes

CleansedReadCode         object
Disease                  object
MedCodeId                object
Term                     object
OriginalReadCode         object
SnomedCTConceptId        object
SnomedCTDescriptionId    object
dtype: object

In [8]:
# Reduce the mapping table to the two columns used for the join:
# the Read code (join key) and the SNOMED CT concept id (value to attach).
df_snomed_readcode = df_snomed.loc[:, ['CleansedReadCode', 'SnomedCTConceptId']]
df_snomed_readcode.head()

Unnamed: 0,CleansedReadCode,SnomedCTConceptId
0,7A11000,175283003
1,7A11211,233374003
2,7A13.00,175297006
3,7A13.11,175297006
4,7A13411,233377005


In [9]:
# Column names of the Read code -> SNOMED lookup (compare with the codelist columns shown next).
df_snomed_readcode.columns

Index(['CleansedReadCode', 'SnomedCTConceptId'], dtype='object')

In [10]:
# Column names of the codelist; its Read-code column is renamed in the following cell.
df_Baseline1_codelist.columns

Index(['Read_code', 'Term', 'Disease'], dtype='object')

In [11]:
# Give the codelist's Read-code column the same name as in the lookup table,
# so both frames can be merged on 'CleansedReadCode'.
df_Baseline1_codelist = df_Baseline1_codelist.rename(
    columns={'Read_code': 'CleansedReadCode'}
)
df_Baseline1_codelist.columns

Index(['CleansedReadCode', 'Term', 'Disease'], dtype='object')

In [12]:
# Preview the codelist after the column rename.
df_Baseline1_codelist.head()

Unnamed: 0,CleansedReadCode,Term,Disease
0,7A11000,Emerg repl aneurysm bifurc aorta by anast aort...,Abdominal Aortic Aneurysm
1,7A11211,Y graft of abdominal Aortic aneurysm (emergency),Abdominal Aortic Aneurysm
2,7A13.00,Emergency replacement of aneurysmal segment of...,Abdominal Aortic Aneurysm
3,7A13.11,Emergency repair of aortic aneurysm,Abdominal Aortic Aneurysm
4,7A13411,Tube graft abdominal Aortic aneurysm (emergency),Abdominal Aortic Aneurysm


In [13]:
# Inner join (the default `how` of merge) of the codelist with the lookup table:
# only codelist rows whose CleansedReadCode also occurs in the lookup are kept.
df_map = df_Baseline1_codelist.merge(
    df_snomed_readcode,
    how='inner',
    on='CleansedReadCode',
)
df_map.head()

Unnamed: 0,CleansedReadCode,Term,Disease,SnomedCTConceptId
0,7A11000,Emerg repl aneurysm bifurc aorta by anast aort...,Abdominal Aortic Aneurysm,175283003
1,7A11211,Y graft of abdominal Aortic aneurysm (emergency),Abdominal Aortic Aneurysm,233374003
2,7A13.00,Emergency replacement of aneurysmal segment of...,Abdominal Aortic Aneurysm,175297006
3,7A13.11,Emergency repair of aortic aneurysm,Abdominal Aortic Aneurysm,175297006
4,7A13411,Tube graft abdominal Aortic aneurysm (emergency),Abdominal Aortic Aneurysm,233377005


In [14]:
# Report (rows, columns) of both inputs and of the merged table.
for shape_msg, shape_df in (
    ("Lauren's Codelist shape: {}", df_Baseline1_codelist),
    ("SNOMED Mapping shape: {}", df_snomed_readcode),
    ("Resultant Mapping Shape: {}", df_map),
):
    print(shape_msg.format(shape_df.shape))

Lauren's Codelist shape: (5724, 3)
SNOMED Mapping shape: (5724, 2)
Resultant Mapping Shape: (6108, 4)


In [15]:
# Count how many codelist Read codes also appear as a key in the lookup table.
# This only tests presence of the key; it says nothing about whether the
# matching lookup row actually carries a SnomedCTConceptId.
(
    df_Baseline1_codelist['CleansedReadCode']
    .isin(df_snomed_readcode['CleansedReadCode'])
    .value_counts()
)

True    5724
Name: CleansedReadCode, dtype: int64

In [16]:
# Left join: every codelist row is kept; rows without a match in the lookup get
# a missing SnomedCTConceptId. A key that repeats in either frame yields one
# output row per matching pair, so the result can have more rows than the inputs.
df_map = df_Baseline1_codelist.merge(
    df_snomed_readcode,
    how='left',
    on='CleansedReadCode',
)

# Shapes of the two inputs and of the merged table.
for shape_msg, shape_df in (
    ("Lauren's Codelist shape: {}", df_Baseline1_codelist),
    ("SNOMED Mapping shape: {}", df_snomed_readcode),
    ("Resultant Mapping Shape: {}", df_map),
):
    print(shape_msg.format(shape_df.shape))

# Number of missing values per column of the merged table.
for column_name in ('SnomedCTConceptId', 'Disease', 'Term', 'CleansedReadCode'):
    missing_count = df_map[column_name].isnull().sum()
    print("There are {} {} missing values in mapping.".format(missing_count, column_name))

df_map.head()

Lauren's Codelist shape: (5724, 3)
SNOMED Mapping shape: (5724, 2)
Resultant Mapping Shape: (6108, 4)
There are 10 SnomedCTConceptId missing values in mapping.
There are 0 Disease missing values in mapping.
There are 10 Term missing values in mapping.
There are 0 CleansedReadCode missing values in mapping.


Unnamed: 0,CleansedReadCode,Term,Disease,SnomedCTConceptId
0,7A11000,Emerg repl aneurysm bifurc aorta by anast aort...,Abdominal Aortic Aneurysm,175283003
1,7A11211,Y graft of abdominal Aortic aneurysm (emergency),Abdominal Aortic Aneurysm,233374003
2,7A13.00,Emergency replacement of aneurysmal segment of...,Abdominal Aortic Aneurysm,175297006
3,7A13.11,Emergency repair of aortic aneurysm,Abdominal Aortic Aneurysm,175297006
4,7A13411,Tube graft abdominal Aortic aneurysm (emergency),Abdominal Aortic Aneurysm,233377005


## Analyze Missing Values and Duplicates

In [17]:
# Remove rows that are identical in every column, keeping the first occurrence.
# The original index labels are preserved (the index is not reset).
df_map = df_map.drop_duplicates(keep='first')

# Shapes of the two inputs and of the de-duplicated mapping.
for shape_msg, shape_df in (
    ("Lauren's Codelist shape: {}", df_Baseline1_codelist),
    ("SNOMED Mapping shape: {}", df_snomed_readcode),
    ("Resultant Mapping Shape: {}", df_map),
):
    print(shape_msg.format(shape_df.shape))

# Number of missing values per column after de-duplication.
for column_name in ('SnomedCTConceptId', 'Disease', 'Term', 'CleansedReadCode'):
    missing_count = df_map[column_name].isnull().sum()
    print("There are {} {} missing values in mapping.".format(missing_count, column_name))

df_map.head()

Lauren's Codelist shape: (5724, 3)
SNOMED Mapping shape: (5724, 2)
Resultant Mapping Shape: (5724, 4)
There are 10 SnomedCTConceptId missing values in mapping.
There are 0 Disease missing values in mapping.
There are 10 Term missing values in mapping.
There are 0 CleansedReadCode missing values in mapping.


Unnamed: 0,CleansedReadCode,Term,Disease,SnomedCTConceptId
0,7A11000,Emerg repl aneurysm bifurc aorta by anast aort...,Abdominal Aortic Aneurysm,175283003
1,7A11211,Y graft of abdominal Aortic aneurysm (emergency),Abdominal Aortic Aneurysm,233374003
2,7A13.00,Emergency replacement of aneurysmal segment of...,Abdominal Aortic Aneurysm,175297006
3,7A13.11,Emergency repair of aortic aneurysm,Abdominal Aortic Aneurysm,175297006
4,7A13411,Tube graft abdominal Aortic aneurysm (emergency),Abdominal Aortic Aneurysm,233377005


In [18]:
# Keep the rows of the mapping in which at least one column is missing.
# NOTE(review): the filter covers every column, not only SnomedCTConceptId --
# confirm that is intended given the variable name.
df_map_missing_snomed = df_map.loc[df_map.isna().any(axis='columns')]
df_map_missing_snomed

Unnamed: 0,CleansedReadCode,Term,Disease,SnomedCTConceptId
1619,na,,Chronic Kidney Disease,
5107,3355,,Dermatitis atopic contact,
5318,1594,,Female genital Prolapse,
5548,ZRq8.00,,Lupus Erythematosus,
5549,ZRq8.11,,Lupus Erythematosus,
5550,ZRq9.00,,Lupus Erythematosus,
5551,ZRq9.11,,Lupus Erythematosus,
5898,1464,,Schizophrenia,
6030,2376,,Scoliosis,
6090,1442,,Lipid Disorder,


# Save SNOMED and missing SNOMED files

In [19]:
# Save
# DataFrame.to_csv does not create missing parent folders, so on a fresh checkout
# without the output directory the write would fail. Create the folders first.
for output_path in (filepath_snomed_readcode, filepath_missing_snomed):
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

# Full codelist -> SNOMED mapping, written without the DataFrame index.
df_map.to_csv(filepath_snomed_readcode, index=False)
# Rows of the mapping that contain missing values, written without the index.
df_map_missing_snomed.to_csv(filepath_missing_snomed, index=False)