In [1]:
from pypdf import PdfReader 
import glob
import os
import pandas as pd
import re
import numpy as np 
import tabula

In [2]:
cwd = os.getcwd()
directory = cwd + '/datasets/old_data/lab/'
pdf_files = glob.glob(os.path.join(directory, '**', '*.pdf'), recursive=True)

In [3]:
df_list = []
for pdf in pdf_files:
    pdf_list = tabula.read_pdf(pdf, pages='all')
    pages = []
    for page in pdf_list:
        page = pd.concat([pd.DataFrame([page.columns], columns=page.columns), page], ignore_index=True)
        page.columns = range(len(page.columns))
        pages.append(page)
    combined_df = pd.concat(pages, ignore_index=True, sort=False)
    for col in combined_df.columns:
        combined_df[col] = combined_df[col].map(lambda x: np.nan if isinstance(x, str) and 'Unnamed: ' in x else x)
    combined_df["patient"] = int(re.search(r'/(\d+)\.pdf$', pdf).group(1))
    df_list.append(combined_df)

df = pd.concat(df_list, ignore_index=True, sort=False)

In [4]:
col_to_move = 'patient'
columns = [col_to_move] + [col for col in df.columns if col != col_to_move]
df = df[columns]

In [5]:
df

Unnamed: 0,patient,0,1,2,3,4,5,6,7,8,9,10
0,3,"Hasta Adı, Soyadı",: VEZIR YASAR,,,,,,,,,
1,3,TC Kimlik,: 31*******92,,,,,,,,,
2,3,"Doğum Tarihi, Cinsiyeti",: 15.05.1963 - Erkek / 60 Yıl,,,,,,,,,
3,3,Protokol / Dosya / İşlem No:,: P2022238881 / Dosya / 22119247 Kurum:,,SGK,,,,,,,
4,3,Rapor Numarası,: 388466.1100.13303183.2023,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
3788,30,,,,,,Onay Tar : 24.10.2023 08:29:13,,,,,
3789,30,Ünite: Hormon,Barkod Trh : 23.10.2023 14:49:34,,Num. Kabul : 23.10.2023 15:39:02,,,,,,,
3790,30,,,,Referans Aralığı,/,ESKİ SONUÇLAR,,,,,
3791,30,TETKİK ADI,Durum Sonuç,,Birim,,,,,,,


In [6]:
for col in df.columns:
    print("--------------- " , col , " -----------------------")
    print(df[col].unique())

---------------  patient  -----------------------
[ 3 23 25 27 22 19 14 13 41 31 29  8 39 40 10 17  2  1 34 21 12 38 37 26
  5  7 32 15 28 16 24 33 20  9 11 18 35  4 36  6 30]
---------------  0  -----------------------
['Hasta Adı, Soyadı' 'TC Kimlik' 'Doğum Tarihi, Cinsiyeti'
 'Protokol / Dosya / İşlem No:' 'Rapor Numarası' nan
 'KVC LAB Laboratuvarı' 'Tetkiki İsteyen:'
 'İhtisas Kardiyoloji-Uz.Dr.FATİH' 'KÖKSAL POL. (KVC Binası)'
 'Ünite: Biyokimya' 'TETKİK ADI' 'LDL K' 'VLDL K' 'AMİLAZ' 'GFR (MDRD)'
 'KALSİYUM (Ca)' 'BİLİRÜBİN-TOTAL' 'BİLİRÜBİN-DİREKT' 'BİLİRÜBİN-İNDİREKT'
 'AKŞ GLUKOZ' 'KREATİNİN (CREA)' 'TRİGLİSERİD' 'T.KOLESTEROL' 'SGPT (ALT)'
 'SGOT(AST)' 'SODYUM' 'POTASYUM (K)' 'KLOR (Cl)' 'Serum İndeksi (Hemoliz)'
 'Ünite: Hematoloji' 'WBC' 'RBC' 'HGB' 'HCT' 'PLT' 'MCV' 'MCH' 'MCHC'
 'RDW-CV' 'RDW-SD' 'MPV' 'PCT' 'PDW' 'NE%' 'LYM%' 'MONO%' 'EOS%' 'BASO%'
 'Free T3' 'Free T4' 'TSH' 'FOLAT' 'ViİTAMİN B12' 'SODYUM (Na)'
 'Serum İndeksi (İkter)' 'NEU#' 'LYM#' 'MONO#' 'EOS#' 'BASO

In [7]:
filter_conditions = (
    (df[0] == 'TC Kimlik') | 
    (df[0] == 'Hasta Adı, Soyadı') |
    (df[0] == 'Rapor Numarası') |
    (df[0] == 'KVC LAB Laboratuvarı') |
    (df[0] == 'Tetkiki İsteyen:') |
    (df[0] == 'İhtisas Kardiyoloji-Uz.Dr.FATİH') |
    (df[0] == 'Ünite: Biyokimya') |
    (df[0] == 'Doğum Tarihi, Cinsiyeti') |
    (df[0] == 'KÖKSAL POL. (KVC Binası)') |
    (df[0] == 'TETKİK ADI') |
    (df[0] == 'Ünite: Hematoloji') |
    (df[0] == 'Hasta Adı, Soyadı : NURAY KABA') |
    (df[0] == 'TC Kimlik : 32*******18') |
    (df[0] == 'Doğum Tarihi, Cinsiyeti : 4.06.1982 - Kadın / 41 Yıl') |
    (df[0] == 'Protokol / Dosya / İşlem No: : P2013167794 / Dosya / 22189782') |
    (df[0] == 'Rapor Numarası : 388466.1100.13303343.2023') |
    (df[0] == 'TIBBİ LABORATUVAR TETKİK SONUÇ RAPORU(123)') |
    (df[0] == 'KARDİYOLOJİ Uzm. Dr. MAHMUT') |
    (df[0] == 'KAPSIZ POL. (KVC Binası) İstem Trh:') |
    (df[0] == 'Ünite: Biyokimya Barkod Trh :') |
    (df[0] == 'TETKİK ADI Durum') |
    (df[0] == 'Ünite: Hematoloji Barkod Trh :') |
    (df[0] == 'TETKİK') |
    (df[0] == 'Ünite: Hormon') |
    (df[0] == 'ANA BİNA LAB Laboratuvarı') |
    (df[0] == 'Protokol / Dosya / İşlem No:')
)  


filtered_df = df[filter_conditions]
filtered_df

Unnamed: 0,patient,0,1,2,3,4,5,6,7,8,9,10
0,3,"Hasta Adı, Soyadı",: VEZIR YASAR,,,,,,,,,
1,3,TC Kimlik,: 31*******92,,,,,,,,,
2,3,"Doğum Tarihi, Cinsiyeti",: 15.05.1963 - Erkek / 60 Yıl,,,,,,,,,
3,3,Protokol / Dosya / İşlem No:,: P2022238881 / Dosya / 22119247 Kurum:,,SGK,,,,,,,
4,3,Rapor Numarası,: 388466.1100.13303183.2023,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
3785,30,Tetkiki İsteyen:,,,,,,,,,,
3786,30,İhtisas Kardiyoloji-Uz.Dr.FATİH,,,,,,,,,,
3787,30,KÖKSAL POL. (KVC Binası),İstem Trh: 23.10.2023 14:49:00,,Num. Alma : 23.10.2023 14:53:17,,,,,,,
3789,30,Ünite: Hormon,Barkod Trh : 23.10.2023 14:49:34,,Num. Kabul : 23.10.2023 15:39:02,,,,,,,


In [8]:
filtered_df.isna().sum()

patient       0
0             0
1           399
2          1135
3          1050
4          1293
5          1305
6          1305
7          1305
8          1305
9          1305
10         1305
dtype: int64

In [9]:
df = df[~filter_conditions]

In [10]:
df

Unnamed: 0,patient,0,1,2,3,4,5,6,7,8,9,10
5,3,,Lab.Ruhsat No:431/1,,,,,,,,,
6,3,,TIBBİ LABORATUVAR TETKİK SONUÇ RAPORU(123),,,,,,,,,
11,3,,,,,,Onay Tar : 18.10.2023 12:14:44,,,,,
13,3,,Referans,Aralığı,,/,ESKİ SONUÇLAR,,,,,
15,3,,Karar,Sınırı,,,1.Sonuç - 2.Sonuç,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
3777,30,,,,,,,,,,,(04.08.2023)
3778,30,BASO#,,,,005,10^3/mL,"0,0 - 0,3",,,"0,04 (04.09.2023)",003
3788,30,,,,,,Onay Tar : 24.10.2023 08:29:13,,,,,
3790,30,,,,Referans Aralığı,/,ESKİ SONUÇLAR,,,,,


In [11]:
df[0].unique()

array([nan, 'LDL K', 'VLDL K', 'AMİLAZ', 'GFR (MDRD)', 'KALSİYUM (Ca)',
       'BİLİRÜBİN-TOTAL', 'BİLİRÜBİN-DİREKT', 'BİLİRÜBİN-İNDİREKT',
       'AKŞ GLUKOZ', 'KREATİNİN (CREA)', 'TRİGLİSERİD', 'T.KOLESTEROL',
       'SGPT (ALT)', 'SGOT(AST)', 'SODYUM', 'POTASYUM (K)', 'KLOR (Cl)',
       'Serum İndeksi (Hemoliz)', 'WBC', 'RBC', 'HGB', 'HCT', 'PLT',
       'MCV', 'MCH', 'MCHC', 'RDW-CV', 'RDW-SD', 'MPV', 'PCT', 'PDW',
       'NE%', 'LYM%', 'MONO%', 'EOS%', 'BASO%', 'Free T3', 'Free T4',
       'TSH', 'FOLAT', 'ViİTAMİN B12', 'SODYUM (Na)',
       'Serum İndeksi (İkter)', 'NEU#', 'LYM#', 'MONO#', 'EOS#', 'BASO#',
       'KALSİYUM', 'LDL-K (KİT)', 'Ünite: HbA1C', 'HDL-K',
       'Serum İndeksi (Lipemi)', 'BUN', 'VLDL', 'ÜRİK ASİT', 'KREATİNİN',
       'POTASYUM'], dtype=object)

In [12]:
allowed_values = [
    'ÜRİK ASİT',
    'LDL K',
    'LDL-K (KİT)',
    'VLDL K',
    'VLDL',
    'GFR (MDRD)',
    'BİLİRÜBİN-İNDİREKT',
    'AKŞ GLUKOZ',
    'BUN',
    'KREATİNİN (CREA)',
    'KREATİNİN',
    'TRİGLİSERİD',
    'T.KOLESTEROL',
    'HDL-K',
    'SGPT (ALT)',
    'SGOT(AST)',
    'AMİLAZ',
    'BİLİRÜBİN-TOTAL',
    'BİLİRÜBİN-DİREKT',
    'KALSİYUM (Ca)',
    'KALSİYUM',
    'SODYUM (Na)',
    'SODYUM',
    'POTASYUM (K)',
    'POTASYUM',
    'KLOR (Cl)',
    'Serum İndeksi (Hemoliz)',
    'Serum İndeksi (İkter)',
    'Serum İndeksi (Lipemi)',
    'WBC',
    'RBC',
    'HGB',
    'HCT',
    'PLT',
    'MCV',
    'MCH',
    'MCHC',
    'RDW-CV',
    'RDW-SD',
    'MPV',
    'PCT',
    'PDW',
    'NE%',
    'LYM%',
    'MONO%',
    'EOS%',
    'BASO%',
    'NEU#',
    'LYM#',
    'MONO#',
    'EOS#',
    'BASO#',
    'Free T3',
    'Free T4',
    'TSH',
    'FOLAT',
    'ViİTAMİN B12'
]

filtered_df = df[df[0].isin(allowed_values)]

print(filtered_df)


      patient              0    1    2      3      4           5           6  \
16          3          LDL K  NaN  [Y]  132,5  mg/dL     0 - 130         NaN   
17          3         VLDL K  NaN  NaN   27,8  mg/dl      0 - 50         NaN   
18          3         AMİLAZ  NaN  NaN     38    U/L    28 - 100         NaN   
19          3     GFR (MDRD)  NaN  NaN   90,8  ml/dk    70 - 180         NaN   
20          3  KALSİYUM (Ca)  NaN  NaN    8,9  mg/dl  8,4 - 10,2         NaN   
...       ...            ...  ...  ...    ...    ...         ...         ...   
3770       30           NEU#  NaN  NaN    NaN   3,51     10^3/mL     1,9 - 8   
3772       30           LYM#  NaN  NaN    NaN   2,28     10^3/mL   0,9 - 2,9   
3774       30          MONO#  NaN  NaN    NaN   0,73     10^3/mL  0,3 - 0,90   
3776       30           EOS#  NaN  NaN    NaN   0,18     10^3/mL   0,0 - 0,5   
3778       30          BASO#  NaN  NaN    NaN   0,05     10^3/mL   0,0 - 0,3   

        7    8                  9    10

In [13]:
filtered_df

Unnamed: 0,patient,0,1,2,3,4,5,6,7,8,9,10
16,3,LDL K,,[Y],1325,mg/dL,0 - 130,,,,,
17,3,VLDL K,,,278,mg/dl,0 - 50,,,,,
18,3,AMİLAZ,,,38,U/L,28 - 100,,,,,
19,3,GFR (MDRD),,,908,ml/dk,70 - 180,,,,,
20,3,KALSİYUM (Ca),,,89,mg/dl,"8,4 - 10,2",,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
3770,30,NEU#,,,,351,10^3/mL,"1,9 - 8",,,"4,76 (04.09.2023)",745
3772,30,LYM#,,,,228,10^3/mL,"0,9 - 2,9",,,"2,19 (04.09.2023)",189
3774,30,MONO#,,,,073,10^3/mL,"0,3 - 0,90",,,"0,82 (04.09.2023)",107
3776,30,EOS#,,,,018,10^3/mL,"0,0 - 0,5",,,"0,06 (04.09.2023)",007


In [14]:
filtered_df.isna().sum()

patient       0
0             0
1           710
2           545
3           901
4           302
5           816
6           996
7          1408
8          1422
9          1058
10         1073
dtype: int64

In [15]:
filtered_df

Unnamed: 0,patient,0,1,2,3,4,5,6,7,8,9,10
16,3,LDL K,,[Y],1325,mg/dL,0 - 130,,,,,
17,3,VLDL K,,,278,mg/dl,0 - 50,,,,,
18,3,AMİLAZ,,,38,U/L,28 - 100,,,,,
19,3,GFR (MDRD),,,908,ml/dk,70 - 180,,,,,
20,3,KALSİYUM (Ca),,,89,mg/dl,"8,4 - 10,2",,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
3770,30,NEU#,,,,351,10^3/mL,"1,9 - 8",,,"4,76 (04.09.2023)",745
3772,30,LYM#,,,,228,10^3/mL,"0,9 - 2,9",,,"2,19 (04.09.2023)",189
3774,30,MONO#,,,,073,10^3/mL,"0,3 - 0,90",,,"0,82 (04.09.2023)",107
3776,30,EOS#,,,,018,10^3/mL,"0,0 - 0,5",,,"0,06 (04.09.2023)",007


In [16]:
filtered_df[filtered_df[0]=='LDL K']

Unnamed: 0,patient,0,1,2,3,4,5,6,7,8,9,10
16,3,LDL K,,[Y],1325,mg/dL,0 - 130,,,,,
371,22,LDL K,,719,mg/dL,0 - 130,"130,4 (16.12.2022) 138,9",,,,,
710,41,LDL K,"[Y] 202,1 mg/dL","0 - 130 174,7 (03.01.2023)",,,,,,,,
815,31,LDL K,,913,mg/dL,0 - 130,,,,,,
914,29,LDL K,119 mg/dL,"0 - 130 123,1 (26.01.2023)",,,,,,,,
1319,17,LDL K,"[Y] 188,5 mg/dL","0 - 130 169,3 (12.10.2023)",,,,,,,,
1600,34,LDL K,,[Y],1388,mg/dL,0 - 130,,,,,
1691,21,LDL K,,,,75,mg/dL,0 - 130,,"64,7 (21.09.2023)",779.0,
1898,38,LDL K,"79,8 mg/dL","0 - 130 145,9 (17.02.2023)",,,,,,,,
2028,37,LDL K,"116,1 mg/dL","0 - 130 109,5 (13.10.2023)",,,,,,,,


In [17]:
filtered_df[filtered_df[0]=='LDL-K (KİT)']

Unnamed: 0,patient,0,1,2,3,4,5,6,7,8,9,10
280,27,LDL-K (KİT),,[D],1224,mg/dL,<130 *[25],,,,,
1128,39,LDL-K (KİT),,[D],121,mg/dL,<130 *[25],,,,,


In [20]:
'LDL-K (KİT)',
'VLDL',
'GFR (MDRD)',
'BİLİRÜBİN-İNDİREKT',
'AKŞ GLUKOZ',
'BUN',
'KREATİNİN (CREA)',
'KREATİNİN',
'TRİGLİSERİD',
'T.KOLESTEROL'

'T.KOLESTEROL'

In [19]:
filtered_df[filtered_df[0]=='VLDL K']

Unnamed: 0,patient,0,1,2,3,4,5,6,7,8,9,10
17,3,VLDL K,,,278,mg/dl,0 - 50,,,,,
180,25,VLDL K,,[Y],626,mg/dl,"0 - 50 36,2 (18.05.2023)",,,,,
262,27,VLDL K,,[Y],1068,mg/dl,0 - 50,,,,,
373,22,VLDL K,,192,mg/dl,0 - 50,46 (16.12.2022) 47 (15.08.2022),,,,,
711,41,VLDL K,"41,8 mg/dl","0 - 50 63,8 (03.01.2023)",,,,,,,,
915,29,VLDL K,"42,6 mg/dl","0 - 50 25,8 (26.01.2023)",,,,,,,,
1112,39,VLDL K,,[Y],1374,mg/dl,"0 - 50 63,4 (21.02.2023)",,,,,
1320,17,VLDL K,"38,2 mg/dl","0 - 50 49,8 (12.10.2023)",,,,,,,,
1601,34,VLDL K,,,33,mg/dl,0 - 50,,,,,
1693,21,VLDL K,,,,308,mg/dl,0 - 50,,"25,8 (21.09.2023)",34 (14.08.2023),
