In [7]:
#!/usr/bin/env python
# -*-coding:utf-8 -*-
'''
@File    :   condition_specific_functions_for_difficult_diseases.ipynb
@Time    :   2023/08/21 11:55:10
@Author  :   Asra Aslam 
@Version :   1.0
@Contact :   a.aslam2@leeds.ac.uk
@License :   (C)Copyright Asra Aslam DynAIRX
@Desc    :   This file is specifically designed for alcohol conditon, where we need to extract 
             snomed codes from other splitted categories and compare with categories in efi file
             and report the remaining ones
'''

import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
import seaborn as sns

In [8]:
filepaths_source_database_splitted_condition=['output/pulmonary_related_diseases/grouped_database_with_snomed_COPD.csv'] #['output/alcohol_manual_analysis/Alcohol Harmful.csv', 'output/alcohol_manual_analysis/Alcohol High Risk.csv', 'output/alcohol_manual_analysis/Alcohol Low Risk.csv', 'output/alcohol_manual_analysis/Alcohol Previous High Risk.csv', 'output/alcohol_manual_analysis/Alcohol Zero.csv'] #['output/alcohol_manual_analysis/1 Alcoholic Liver Disease.csv', 'output/alcohol_manual_analysis/2 Liver Disease - Viral.csv', 'output/alcohol_manual_analysis/3 Oesophageal varices.csv', 'output/alcohol_manual_analysis/4 Fatty Liver.csv', 'output/alcohol_manual_analysis/5 Alcohol-related Brain Injury.csv', 'output/alcohol_manual_analysis/6 Liver Disease - Other.csv', 'output/alcohol_manual_analysis/7 Autoimmune liver Disease_doubt.csv'] #---Lauren file for grouping----
filepath_target_database_efi_general_condition="output/pulmonary_related_diseases/grouped_database_with_snomed_Chronic Obstructive Pulmonary Disease (COPD).csv" #"output/alcohol_manual_analysis/grouped_database_with_snomed_Alcohol Problem.csv" #"output/alcohol_manual_analysis/efi Liver problems.csv" #---efi file for grouping----
filepath_output_database_remaining_SNOMED="output/pulmonary_related_diseases/Remaining_copd_problems_with_snomed.csv"

In [9]:
df_database_target = pd.read_csv(filepath_target_database_efi_general_condition, dtype={'SNOMEDCT_CONCEPTID': str}) 
df_database_target

Unnamed: 0,SnomedCTConceptId,Term,Disease,Otherinstructions,origin
0,135836000,End stage chronic obstructive airways disease,Chronic Obstructive Pulmonary Disease (COPD),,['Lauren_codelist']
1,13645005,Chronic obstructive airways disease,Chronic Obstructive Pulmonary Disease (COPD),,['Lauren_codelist']
2,13645005,Chronic obstructive airways disease NOS,Chronic Obstructive Pulmonary Disease (COPD),,['Lauren_codelist']
3,13645005,Chronic obstructive pulmonary disease,Chronic Obstructive Pulmonary Disease (COPD),,['Lauren_codelist']
4,13645005,Chronic obstructive pulmonary disease NOS,Chronic Obstructive Pulmonary Disease (COPD),,['Lauren_codelist']
5,13645005,Other specified chronic obstructive airways di...,Chronic Obstructive Pulmonary Disease (COPD),,['Lauren_codelist']
6,13645005,Other specified chronic obstructive pulmonary ...,Chronic Obstructive Pulmonary Disease (COPD),,['Lauren_codelist']
7,13645005,[X]Other specified chronic obstructive pulmona...,Chronic Obstructive Pulmonary Disease (COPD),,['Lauren_codelist']
8,16003001,Giant bullous emphysema,Chronic Obstructive Pulmonary Disease (COPD),,['Lauren_codelist']
9,185086009,Emphysematous bronchitis,Chronic Obstructive Pulmonary Disease (COPD),,['Lauren_codelist']


In [10]:
# rename SNOMEDCT_CONCEPTID, Deficit and Codedescription column in efi list same as LW list
df_database_target.rename(columns={'SNOMEDCT_CONCEPTID':'SnomedCTConceptId'}, inplace=True)
df_database_target.rename(columns={'Codedescription':'Term'}, inplace=True)
df_database_target.rename(columns={'Deficit':'Disease'}, inplace=True)
df_database_target.columns

Index(['SnomedCTConceptId', 'Term', 'Disease', 'Otherinstructions', 'origin'], dtype='object')

In [11]:
df_database_target.dtypes

SnomedCTConceptId      int64
Term                  object
Disease               object
Otherinstructions    float64
origin                object
dtype: object

In [12]:
#merging all input files to get used snomed codes column in one---
df_concatenated = pd.DataFrame(columns=['SnomedCTConceptId', 'Term', 'Disease', 'Otherinstructions', 'origin', 'Comment']) #--just an empty dataframe at first--
for filepath in filepaths_source_database_splitted_condition:
    df_database_source = pd.read_csv(filepath, dtype={'SnomedCTConceptId': str}) 
    print("For file: ", filepath)
    #print(df_database_source)
    print(df_database_source.columns)
    df_concatenated = pd.concat([df_concatenated, df_database_source], ignore_index=True, sort=False)
    print(df_concatenated)

For file:  output/pulmonary_related_diseases/grouped_database_with_snomed_COPD.csv
Index(['SnomedCTConceptId', 'Term', 'Disease', 'Otherinstructions', 'origin'], dtype='object')
     SnomedCTConceptId                                               Term  \
0    10692761000119100  asthma-chronic obstructive pulmonary disease o...   
1      110011000000107  Did not attend chronic obstructive pulmonary d...   
2            135836000                     End stag chron obst airway dis   
3             13645005                   Chronic obstructive lung disease   
4             15081005                           Pulmonary rehabilitation   
..                 ...                                                ...   
103    866901000000103                            Eosinophilic bronchitis   
104           87433001                                          Emphysema   
105    892321000000109  chronic obstruct pulmonary disease management ...   
106           89549007                          Simp

In [13]:
df_concatenated

Unnamed: 0,SnomedCTConceptId,Term,Disease,Otherinstructions,origin,Comment
0,10692761000119100,asthma-chronic obstructive pulmonary disease o...,COPD,,['efi'],
1,110011000000107,Did not attend chronic obstructive pulmonary d...,COPD,,['efi'],
2,135836000,End stag chron obst airway dis,COPD,,['efi'],
3,13645005,Chronic obstructive lung disease,COPD,,['efi'],
4,15081005,Pulmonary rehabilitation,COPD,,['efi'],
...,...,...,...,...,...,...
103,866901000000103,Eosinophilic bronchitis,COPD,,['efi'],
104,87433001,Emphysema,COPD,,['efi'],
105,892321000000109,chronic obstruct pulmonary disease management ...,COPD,,['efi'],
106,89549007,Simple chronic bronchitis,COPD,,['efi'],


In [14]:
#df_concatenated.to_csv('output/alcohol_manual_analysis/concatenated_alcohol_categories.csv') 

In [15]:
df_concatenated.SnomedCTConceptId

0      10692761000119100
1        110011000000107
2              135836000
3               13645005
4               15081005
             ...        
103      866901000000103
104             87433001
105      892321000000109
106             89549007
107      941201000000103
Name: SnomedCTConceptId, Length: 108, dtype: object

In [16]:
df_database_target.SnomedCTConceptId.isin(df_concatenated.SnomedCTConceptId)

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
28    False
29    False
30    False
31    False
32    False
33    False
34    False
35    False
36    False
37    False
38    False
39    False
40    False
41    False
42    False
43    False
44    False
45    False
46    False
47    False
48    False
49    False
50    False
51    False
52    False
Name: SnomedCTConceptId, dtype: bool

In [17]:
df_remaining_rows=df_database_target[~df_database_target.SnomedCTConceptId.isin(df_concatenated.SnomedCTConceptId)]
df_remaining_rows

Unnamed: 0,SnomedCTConceptId,Term,Disease,Otherinstructions,origin
0,135836000,End stage chronic obstructive airways disease,Chronic Obstructive Pulmonary Disease (COPD),,['Lauren_codelist']
1,13645005,Chronic obstructive airways disease,Chronic Obstructive Pulmonary Disease (COPD),,['Lauren_codelist']
2,13645005,Chronic obstructive airways disease NOS,Chronic Obstructive Pulmonary Disease (COPD),,['Lauren_codelist']
3,13645005,Chronic obstructive pulmonary disease,Chronic Obstructive Pulmonary Disease (COPD),,['Lauren_codelist']
4,13645005,Chronic obstructive pulmonary disease NOS,Chronic Obstructive Pulmonary Disease (COPD),,['Lauren_codelist']
5,13645005,Other specified chronic obstructive airways di...,Chronic Obstructive Pulmonary Disease (COPD),,['Lauren_codelist']
6,13645005,Other specified chronic obstructive pulmonary ...,Chronic Obstructive Pulmonary Disease (COPD),,['Lauren_codelist']
7,13645005,[X]Other specified chronic obstructive pulmona...,Chronic Obstructive Pulmonary Disease (COPD),,['Lauren_codelist']
8,16003001,Giant bullous emphysema,Chronic Obstructive Pulmonary Disease (COPD),,['Lauren_codelist']
9,185086009,Emphysematous bronchitis,Chronic Obstructive Pulmonary Disease (COPD),,['Lauren_codelist']


In [18]:
df_remaining_rows.to_csv(filepath_output_database_remaining_SNOMED, index=False) 