In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

## Full set

In [25]:
# Load the metadata table for the full set. The rendered frame contains an
# 'Unnamed: 0' column, i.e. the CSV was exported with its index included.
df = pd.read_csv('data/All/df_All.csv')
df

Unnamed: 0,img_index,subject_id,study_id,dataset,race,gender,age,insurance,survival_year,survive
0,0,15529160,50585404,train,ASIAN,M,56.0,Medicaid,-1.0,
1,1,15529160,50585404,train,ASIAN,M,56.0,Medicaid,-1.0,
2,2,15529160,51401638,train,ASIAN,M,56.0,Medicaid,-1.0,
3,3,15529160,51401638,train,ASIAN,M,56.0,Medicaid,-1.0,
4,4,15529160,55049258,train,ASIAN,M,56.0,Medicaid,-1.0,
...,...,...,...,...,...,...,...,...,...,...
194354,194354,19948788,56226482,val,BLACK,F,44.0,Medicaid,-1.0,
194355,194355,19948788,58554921,val,BLACK,F,44.0,Medicaid,-1.0,
194356,194356,19948788,58554921,val,BLACK,F,44.0,Medicaid,-1.0,
194357,194357,19966826,57451836,val,BLACK,F,74.0,Other,-1.0,


In [26]:
# Count distinct subjects in the unfiltered table.
df_fill = df.fillna('NA')
print('Number of subjects', df_fill.subject_id.nunique())

# Cast age to float32 so the numeric cut-off below is applied to a float column.
df = df.astype({'age': 'float32'})
# Keep rows with age strictly below 66 and a recorded insurance type.
df_insu = df[(df['age'] < 66) & (df['insurance'].notna())]
print(df_insu['insurance'].unique())
# Relabel Medicare as Medicaid for this age-restricted cohort. The replacement is
# scoped to the 'insurance' column so that an identical string in any other
# column can never be rewritten by accident.
df_insu = df_insu.replace({'insurance': {'Medicare': 'Medicaid'}})
print(df_insu['insurance'].unique())  # 'Medicare' must no longer be listed here
print('Data size', len(df_insu), '\n')
print(df_insu['insurance'].value_counts())
print(df_insu['insurance'].value_counts(normalize = True))

Number of subjects 44953
['Medicaid' 'Other' 'Medicare']
['Medicaid' 'Other']
Data size 103179 

Other       63521
Medicaid    39658
Name: insurance, dtype: int64
Other       0.615639
Medicaid    0.384361
Name: insurance, dtype: float64


### Split up by subject_id

In [27]:
import random
# set random seed for reproducibility

def split_by_subject_id(df_insu, train_size = 0.60, seed = 123, add_col = False):
    """Randomly partition subjects into train/val/test and report the split.

    The unit of the split is the subject (unique ``subject_id``), not the row,
    so all rows of one subject always fall into the same partition.
    ``train_size`` is the fraction of subjects used for training; the remaining
    subjects are divided equally between validation and test (60/20/20 with the
    default). Sizes are truncated with ``int``, and any leftover subjects go to
    the test partition.

    Parameters
    ----------
    df_insu : pandas.DataFrame
        Frame with at least the columns ``subject_id`` and ``insurance``.
    train_size : float, default 0.60
        Fraction of subjects assigned to the train partition; must be in [0, 1].
    seed : int, default 123
        Passed to ``random.seed`` before shuffling. Note that this reseeds the
        global ``random`` module state.
    add_col : bool, default False
        If True, the partition label ('train' / 'val' / 'test') of every row is
        written to ``df_insu['dataset']`` (in place, overwriting an existing
        column) and the per-partition insurance distribution is printed.
        If False, the split is only printed and the frame is left untouched.

    Returns
    -------
    pandas.DataFrame
        The same ``df_insu`` object that was passed in.

    Raises
    ------
    ValueError
        If ``train_size`` is outside [0, 1].
    """
    if not 0 <= train_size <= 1:
        raise ValueError(f'train_size must be between 0 and 1, got {train_size!r}')

    # Seed before shuffling so the same seed always yields the same partition.
    random.seed(seed)

    # Unique subjects in order of first appearance, then shuffled.
    patient_ids = list(df_insu.subject_id.unique())
    random.shuffle(patient_ids)

    # Remainder after the train fraction is halved between validation and test.
    val_fraction = (1 - train_size) / 2
    n_train = int(train_size * len(patient_ids))
    n_val = int(val_fraction * len(patient_ids))

    # Consecutive slices of the shuffled list; test receives whatever is left.
    train_subjects = patient_ids[:n_train]
    val_subjects = patient_ids[n_train:n_train + n_val]
    test_subjects = patient_ids[n_train + n_val:]

    print("Number of patients in train set:", len(train_subjects))
    print("Number of patients in validation set:", len(val_subjects))
    print("Number of patients in test set:", len(test_subjects))
    assert len(train_subjects) + len(val_subjects) + len(test_subjects) == len(patient_ids)

    # Row-level membership masks, computed once and reused for every report
    # below (and for the label column) instead of re-filtering the frame.
    in_train = df_insu['subject_id'].isin(train_subjects)
    in_val = df_insu['subject_id'].isin(val_subjects)
    in_test = df_insu['subject_id'].isin(test_subjects)

    print('# Train instances', int(in_train.sum()))
    print('# Val instances', int(in_val.sum()))
    print('# Test instances', int(in_test.sum()))

    print(df_insu.loc[in_train, 'insurance'].value_counts(normalize=True))
    print(df_insu.loc[in_val, 'insurance'].value_counts(normalize=True))
    print(df_insu.loc[in_test, 'insurance'].value_counts(normalize=True))

    if add_col:
        # Vectorised labelling: rows that are neither train nor val are test.
        df_insu['dataset'] = np.select([in_train, in_val], ['train', 'val'], default='test')
        print(df_insu['dataset'].value_counts(normalize=True))

        for split_name in ['train', 'val', 'test']:
            print(f'Distribution of insurance in {split_name} set: ')
            print(df_insu[df_insu['dataset'] == split_name]['insurance'].value_counts(normalize=True))
    return df_insu



In [28]:
# NOTE(review): the call uses the default add_col=False, so the function only
# prints the split statistics and returns df_insu unchanged; the 'dataset'
# column displayed below is still the one loaded from the CSV. If the freshly
# drawn subject-level split is meant to be stored, pass add_col=True — confirm.
df_insu = split_by_subject_id(df_insu)
df_insu

Number of patients in train set: 15193
Number of patients in validation set: 5064
Number of patients in test set: 5066
# Train instances 61343
# Val instances 21042
# Test instances 20794
Other       0.619484
Medicaid    0.380516
Name: insurance, dtype: float64
Other       0.595998
Medicaid    0.404002
Name: insurance, dtype: float64
Other       0.62417
Medicaid    0.37583
Name: insurance, dtype: float64


Unnamed: 0,img_index,subject_id,study_id,dataset,race,gender,age,insurance,survival_year,survive
0,0,15529160,50585404,train,ASIAN,M,56.0,Medicaid,-1.00,
1,1,15529160,50585404,train,ASIAN,M,56.0,Medicaid,-1.00,
2,2,15529160,51401638,train,ASIAN,M,56.0,Medicaid,-1.00,
3,3,15529160,51401638,train,ASIAN,M,56.0,Medicaid,-1.00,
4,4,15529160,55049258,train,ASIAN,M,56.0,Medicaid,-1.00,
...,...,...,...,...,...,...,...,...,...,...
194352,194352,19937419,57790204,val,WHITE,M,62.0,Medicaid,0.11,DIE
194353,194353,19948788,54253734,val,BLACK,F,44.0,Medicaid,-1.00,
194354,194354,19948788,56226482,val,BLACK,F,44.0,Medicaid,-1.00,
194355,194355,19948788,58554921,val,BLACK,F,44.0,Medicaid,-1.00,


In [29]:
# Write the filtered frame to the current working directory, without the index.
df_insu.to_csv('df_insu_full_split_subjects_0327.csv', index = False)

## Normal CXR

In [7]:
# Load the metadata table of the "No finding" subset; this rebinds `df`, which
# previously held the full-set table.
df = pd.read_csv('data/No_finding/df_No_finding_03032023.csv')
df

Unnamed: 0,img_index,subject_id,study_id,dataset,race,gender,age,insurance,survival_year
0,0,11135741,52912807,train,ASIAN,M,53.0,Other,-1.00
1,1,13091743,51959936,train,ASIAN,M,85.0,Medicare,0.85
2,2,13091743,53279329,train,ASIAN,M,85.0,Medicare,0.51
3,3,13091743,53691912,train,ASIAN,M,85.0,Medicare,0.20
4,4,13091743,56124505,train,ASIAN,M,85.0,Medicare,0.68
...,...,...,...,...,...,...,...,...,...
59869,59869,19948788,54253734,val,BLACK,F,44.0,Medicaid,-1.00
59870,59870,19948788,56226482,val,BLACK,F,44.0,Medicaid,-1.00
59871,59871,19948788,58554921,val,BLACK,F,44.0,Medicaid,-1.00
59872,59872,19948788,58554921,val,BLACK,F,44.0,Medicaid,-1.00


In [4]:
df_fill = df.fillna('NA')
print(df_fill.subject_id.nunique())
# Number of distinct values per column for each subject, computed once.
distinct_per_subject = df_fill.groupby('subject_id').nunique()
print(distinct_per_subject['insurance'].value_counts(),'\n',
      distinct_per_subject['age'].value_counts())
# Result: every subject has exactly one insurance type and exactly one age value.
# The age therefore corresponds to the subject's entry age rather than the age at
# each study, which is a known issue but not important for now.

29148
1    29148
Name: insurance, dtype: int64 
 1    29148
Name: age, dtype: int64


In [5]:
# Cast age to float32 so the numeric cut-off below is applied to a float column.
df = df.astype({'age': 'float32'})
# Keep rows with age strictly below 66 and a recorded insurance type.
df_insu = df[(df['age'] < 66) & (df['insurance'].notna())]
print(df_insu['insurance'].unique())
# Relabel Medicare as Medicaid for this age-restricted cohort. The replacement is
# scoped to the 'insurance' column so that an identical string in any other
# column can never be rewritten by accident.
df_insu = df_insu.replace({'insurance': {'Medicare': 'Medicaid'}})
print(df_insu['insurance'].unique())  # 'Medicare' must no longer be listed here

print('Data size', len(df_insu), '\n')
print(df_insu['insurance'].value_counts())
print(df_insu['insurance'].value_counts(normalize = True))

['Other' 'Medicare' 'Medicaid']
['Other' 'Medicaid']


In [6]:
# Load the image array stored alongside the "No finding" metadata and show its shape.
img_array = np.load('data/No_finding/X_No_finding_03032023.npy')
img_array.shape

(59874, 256, 256, 3)

In [7]:
# img_index values of the rows kept by the filter; they are used further down
# to index the first axis of img_array.
df_insu.img_index

0            0
11          11
12          12
13          13
16          16
         ...  
59868    59868
59869    59869
59870    59870
59871    59871
59872    59872
Name: img_index, Length: 39251, dtype: int64

In [8]:
# True when img_index is already in non-decreasing order, so selecting images by
# it keeps them aligned with the row order of df_insu.
df_insu.img_index.is_monotonic_increasing

True

In [9]:
# Pick the images belonging to the retained metadata rows (indexing the first axis only).
kept_positions = df_insu.img_index.values
img_array_use = img_array[kept_positions]
img_array_use.shape

(39251, 256, 256, 3)

In [10]:
# Cut the selected images into consecutive chunks of 10,000 along the first
# axis; the fourth chunk takes everything from position 30,000 onwards.
chunk_len = 10000
b1 = img_array_use[0 * chunk_len:1 * chunk_len]
b2 = img_array_use[1 * chunk_len:2 * chunk_len]
b3 = img_array_use[2 * chunk_len:3 * chunk_len]
b4 = img_array_use[3 * chunk_len:]

In [11]:
# Sanity check: the chunk lengths must add up to the number of selected images.
sum(part.shape[0] for part in (b1, b2, b3, b4))

39251

In [16]:
#np.savez_compressed('data/No_finding/batch1_X_No_finding_for_insurance_0226_2023', b1)


In [12]:
# Only the fourth chunk is written by this cell; the saves for chunks 2 and 3 are
# commented out. Note the date suffix differs (0303_2023 here vs 0226_2023 in the
# commented-out file names).
#np.savez_compressed('data/No_finding/batch2_X_No_finding_for_insurance_0226_2023',b2)
#np.savez_compressed('data/No_finding/batch3_X_No_finding_for_insurance_0226_2023',b3)
np.savez_compressed('data/No_finding/batch4_X_No_finding_for_insurance_0303_2023',b4)


In [18]:
# Replace the in-memory df_insu with a metadata file read from disk. The cell
# that wrote this file is not visible here — presumably it holds the
# subject-level split; verify its provenance.
df_insu = pd.read_csv('data/No_finding/metadata_No_finding_for_insurance_split_by_subjects_0306_2023.csv')
df_insu

Unnamed: 0,img_index,subject_id,study_id,race,gender,age,insurance,survival_year,dataset
0,0,11135741,52912807,ASIAN,M,53.0,Other,-1.00,train
1,11,14755254,54276748,WHITE,M,62.0,Medicaid,0.02,train
2,12,14074579,54580125,WHITE,F,65.0,Medicaid,-1.00,train
3,13,14074579,58207593,WHITE,F,65.0,Medicaid,-1.00,train
4,16,16834871,59549398,WHITE,M,33.0,Other,-1.00,test
...,...,...,...,...,...,...,...,...,...
39246,59868,19931923,50154207,BLACK,M,55.0,Other,-1.00,train
39247,59869,19948788,54253734,BLACK,F,44.0,Medicaid,-1.00,test
39248,59870,19948788,56226482,BLACK,F,44.0,Medicaid,-1.00,test
39249,59871,19948788,58554921,BLACK,F,44.0,Medicaid,-1.00,test


In [24]:
# Distinct counts of 'dataset' labels per subject; a result of array([1]) means
# no subject appears in more than one partition.
np.unique(df_insu.groupby('subject_id')['dataset'].nunique())

array([1])

In [20]:
import pandas as pd
from scipy.stats import pearsonr
from sklearn.preprocessing import LabelEncoder

# NOTE: `df` is bound to the same object as df_insu (no copy is made), so the
# encoded columns added below also appear on df_insu.
df = df_insu
# Label-encode race, gender and insurance into integer codes; the single
# encoder object is refit for each column in turn.
le = LabelEncoder()
df['race_encoded'] = le.fit_transform(df['race'])
df['gender_encoded'] = le.fit_transform(df['gender'])
df['insurance_encoded'] = le.fit_transform(df['insurance'])
df

Unnamed: 0,img_index,subject_id,study_id,race,gender,age,insurance,survival_year,dataset,race_encoded,gender_encoded,insurance_encoded
0,0,11135741,52912807,ASIAN,M,53.0,Other,-1.00,train,1,1,1
1,11,14755254,54276748,WHITE,M,62.0,Medicaid,0.02,train,6,1,0
2,12,14074579,54580125,WHITE,F,65.0,Medicaid,-1.00,train,6,0,0
3,13,14074579,58207593,WHITE,F,65.0,Medicaid,-1.00,train,6,0,0
4,16,16834871,59549398,WHITE,M,33.0,Other,-1.00,test,6,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
39246,59868,19931923,50154207,BLACK,M,55.0,Other,-1.00,train,2,1,1
39247,59869,19948788,54253734,BLACK,F,44.0,Medicaid,-1.00,test,2,0,0
39248,59870,19948788,56226482,BLACK,F,44.0,Medicaid,-1.00,test,2,0,0
39249,59871,19948788,58554921,BLACK,F,44.0,Medicaid,-1.00,test,2,0,0


In [22]:
# Pearson correlation of insurance_encoded with, in this order: race_encoded,
# gender_encoded and survival_year. One line is printed per column.
# NOTE(review): race_encoded is an integer code for a multi-category label, so
# the size and sign of its Pearson coefficient depend on the code order —
# consider a categorical association measure instead.
# NOTE(review): survival_year shows many -1.00 entries in the displayed rows;
# presumably a placeholder rather than a real duration — confirm before
# interpreting that coefficient.
for col in ['race_encoded', 'gender_encoded', 'survival_year']:
    corr, _ = pearsonr(df[col], df['insurance_encoded'])
    print('Pearson correlation:', corr)

Pearson correlation: 0.04703425841140067
Pearson correlation: -0.004156771357168227
Pearson correlation: -0.14087208522207792
