# Creating the data splits

In [2]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split

# Location of the source CSV tables.
data_dir = Path("/cxr/tables")

# Per-image metadata and per-study findings, loaded as-is.
meta_df = pd.read_csv(data_dir.joinpath("datathon_cxr_metadata.csv"))
findings_df = pd.read_csv(data_dir.joinpath("datathon_cxr_findings.csv"))

# Join the two tables on the anonymised accession number, which is named
# differently in each table. The join type is pandas' default (inner), so
# rows without a partner in the other table are dropped.
# NOTE(review): a later cell selects `empi_anon_x`, so both tables presumably
# carry an `empi_anon` column that the merge suffixes with _x/_y -- confirm.
merged = meta_df.merge(
    findings_df,
    left_on="AccessionNumber_anon",
    right_on="acc_nbr_anon",
)

In [3]:
# Columns carried forward from the merged table: identifiers, the image path,
# the three finding labels, and the lateral-view flag.
interesting_col = [
    "AccessionNumber_anon",
    "empi_anon_x",
    "ImagePath",
    "Cardiomegaly",
    "Pleural Effusion",
    "Pneumothorax",
    "ImageView_Lateral",
]

In [4]:
# Narrow the merged table to the selected columns, replace every missing
# value with 0 (this applies to all selected columns, not only the labels),
# and restore the plain `empi_anon` name in place of the merge-suffixed one.
merged_s = (
    merged[interesting_col]
    .fillna(0)
    .rename(columns={"empi_anon_x": "empi_anon"})
)

In [5]:
# Keep only rows where none of the three finding labels equals -1 after
# integer conversion.
# NOTE(review): -1 is presumably the "uncertain" label marker -- confirm
# against the findings table documentation.
label_cols = ["Pleural Effusion", "Cardiomegaly", "Pneumothorax"]
no_minus_one = merged_s[label_cols].astype(int).ne(-1).all(axis=1)
merged_s = merged_s.loc[no_minus_one]

# Then keep only rows whose ImageView_Lateral flag is 0.
merged_s = merged_s.loc[merged_s["ImageView_Lateral"].eq(0)]
merged_s

Unnamed: 0,AccessionNumber_anon,empi_anon,ImagePath,Cardiomegaly,Pleural Effusion,Pneumothorax,ImageView_Lateral
0,4565490549548134,98572016,CXR/0x5e016f0/0x10384a1a6e6466/99578940d67f633...,0.0,1.0,0.0,0
2,7705188003700623,32886211,CXR/0x1f5cdc3/0x1b5fd3e1e35f8f/bb25a781da01bad...,0.0,0.0,0.0,0
3,3091395609988214,91465233,CXR/0x573a611/0xafb9ba1318c76/2f448126139142c1...,0.0,0.0,0.0,0
4,3091395609988214,91465233,CXR/0x573a611/0xafb9ba1318c76/5dab76d3569452d3...,0.0,0.0,0.0,0
5,1125463708179293,22775463,CXR/0x15b86a7/0x3ff9a70889b5d/ba1ae65a9fd3918c...,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...
742858,7551421464534325,85742898,CXR/0x51c5532/0x1ad3fa51f15d35/8c3ef0520fa6052...,1.0,0.0,0.0,0
742859,3796372021140806,76798928,CXR/0x493dbd0/0xd7cc7bdc15d46/dc38525ee5ceb795...,0.0,0.0,0.0,0
742860,7073383445409188,48512181,CXR/0x2e43cb5/0x1921346b9fada4/e4e33a451b5d25a...,0.0,0.0,0.0,0
742861,8487610103534545,82479720,CXR/0x4ea8a68/0x1e276fb93693d1/e5e697c213e5b75...,1.0,1.0,0.0,0


In [6]:
# Split at the patient level, not the row level.
# `empi_anon` repeats across rows (the displayed frame shows the same ID on
# several rows), so the IDs are de-duplicated before splitting. Splitting the
# raw column could place one ID in more than one set, and the `isin` filters
# that build the split frames would then put that patient's rows into several
# splits at once (train/val/test leakage).
patient_ids = merged_s["empi_anon"].drop_duplicates()

# Fixed seed so that a fresh "Restart & Run All" reproduces the same splits.
SPLIT_SEED = 42

# 75% of the patient IDs go to train; the remaining 25% form a hold-out pool.
train_id, val_test_id = train_test_split(
    patient_ids, test_size=.25, random_state=SPLIT_SEED
)

# The hold-out pool is divided 98% test / 2% validation.
# NOTE(review): validation ends up with only ~0.5% of all patients -- the
# original proportions are kept unchanged here; confirm this is intended.
test_id, val_id = train_test_split(
    val_test_id, test_size=.02, random_state=SPLIT_SEED
)

In [7]:
# Materialise each split by selecting the rows whose patient ID belongs to
# the corresponding ID set. The frames are disjoint only if the ID sets are.
patient_col = merged_s["empi_anon"]

# Train and validation are capped at the first 100000 / 20000 matching rows
# in frame order (not a random sample); test keeps every matching row.
train_df = merged_s.loc[patient_col.isin(train_id)].iloc[:100000]
val_df = merged_s.loc[patient_col.isin(val_id)].iloc[:20000]
test_df = merged_s.loc[patient_col.isin(test_id)]

In [8]:
# Persist the training split to the shared team directory.
# The row index is written too (pandas' default), so it appears as the first,
# unnamed column of the CSV.
# (A bare `train_df` expression used to precede this call; it was never
# displayed because only a cell's last expression is shown, so it was removed.)
train_df.to_csv("/shared/team6/train_df.csv")

In [9]:
# Persist the validation split to the shared team directory; the row index
# is written as the first, unnamed CSV column (pandas' default).
val_df.to_csv(path_or_buf="/shared/team6/val_df.csv")

In [10]:
# Persist the test split to the shared team directory; the row index is
# written as the first, unnamed CSV column (pandas' default).
test_df.to_csv(path_or_buf="/shared/team6/test_df.csv")