<a href="https://colab.research.google.com/github/louisechilds/ADS2002-Catheter/blob/main/Catheter_file_management.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Catheter file management
This notebook should only be executed once. It copies the image files directly from the RANZCR source into a specifically structured directory on our local drive.

In [1]:
import pandas as pd
import cv2
import os
import numpy as np
from sklearn.model_selection import train_test_split
import random
RS=42#can change

In [2]:
#train.csv: joined on 'StudyInstanceUID' below; its '<catheter> - <placement>' columns are read by clean_dataframe
train=pd.read_csv('train.csv')
#train_annotations.csv: its 'label' column is used below to pick rows per catheter type and placement
train_annots=pd.read_csv('train_annotations.csv')

In [3]:
#ChatGpt: delete all files in nested directory
def remove_files_in_folder(folder_path):
    """Delete every file below ``folder_path`` while leaving the folders in place.

    folder_path: directory to empty (all nested sub-directories are walked).

    Prints a confirmation when done, or a notice when ``folder_path`` is
    missing or is not a directory. Returns None in both cases.
    """
    if not (os.path.exists(folder_path) and os.path.isdir(folder_path)):
        print(f"The folder '{folder_path}' does not exist or is not a directory.")
        return

    for current_dir, _subdirs, filenames in os.walk(folder_path):
        for filename in filenames:
            # only files are removed; the directory skeleton is kept for later copies
            os.remove(os.path.join(current_dir, filename))
    print("All files in the folder and its subdirectories have been removed.")


In [4]:
#set file name
def set_files_on_dataframe(df,tag='StudyInstanceUID'):
  '''
  Add (or overwrite) an 'image_file' column holding the JPEG file name of every row.

  df: target data frame; it is modified in place and also returned
  tag: name of the column holding the image identifier (default StudyInstanceUID)

  A value of ``tag`` that already ends in '.jpg' is used unchanged, any other
  value gets '.jpg' appended. Missing values stay missing.
  '''
  if df.empty:
    # nothing to name; still guarantee that the column exists
    if 'image_file' not in df.columns:
      df['image_file']=''
    return df

  names=df[tag]
  # suffix test (not a substring test), so only a real '.jpg' extension is recognised
  has_extension=names.str.endswith('.jpg',na=False)
  # vectorised assignment: no private pandas API and no dependence on unique index labels
  df['image_file']=names.where(has_extension,names+'.jpg')
  return df
#directory holding the original training images; it is passed as the `origin` that copy_files reads from
#NOTE(review): absolute, machine-specific path - adjust when running on another system
source='/projects/sc73/ranzcr-clip-catheter-line-classification/train'
import shutil
#moves a copy of image files to target directory
def copy_files(destination_directory,source_directory,image_files):
  '''
  Copy image files from source_directory into destination_directory.

  destination_directory: folder receiving the copies (created if it does not exist yet)
  source_directory: folder holding the original images; nothing is removed from it
  image_files: list of file names, relative to source_directory, to copy

  Existing files are never overwritten: when the target name is taken, the copy
  is stored under the first free name of the form '<base>_<n><extension>'
  (n = 1, 2, ...).
  Errors raised by shutil.copy (e.g. FileNotFoundError for a missing source
  file) are propagated to the caller.
  '''
  if destination_directory:
    # make sure the class folder exists so that a fresh directory tree does not crash the copy
    os.makedirs(destination_directory,exist_ok=True)

  for image_file in image_files:
    source_path = os.path.join(source_directory,image_file)
    destination_path = os.path.join(destination_directory,image_file)

    # look for the first unused name instead of overwriting an existing file
    base, extension = os.path.splitext(image_file)
    count = 1
    while os.path.exists(destination_path):
      destination_path = os.path.join(destination_directory, f"{base}_{count}{extension}")
      count += 1

    # destination_path is guaranteed to be free here
    shutil.copy(source_path, destination_path)

The following function performs a train/validation split of the selected rows and copies the corresponding images into the target directories.

In [5]:
def form_trval_dir(origin,train_dir,val_dir,cath_type,sample):
  '''
  Split ``sample`` 80/20 into a training and a validation part and copy the
  images of each part into per-class sub-folders.

  args
  -origin: directory containing the source image files
  -train_dir/val_dir: folders receiving the training / validation images; the
   copies go to their 'Borderline', 'Normal' and 'Abnormal' sub-folders
  -cath_type: catheter type we work on; labels are matched as '<cath_type> - <class>'
  -sample: annotation rows of a single catheter type, with 'label' and 'image_file' columns

  Testing is done on non-annotated images, so no test part is produced here.
  Returns the string 'Done'.
  '''
  train_part,val_part = train_test_split(sample, test_size=0.2,random_state=RS)

  # copy order: all training classes first, then validation;
  # within each part Borderline, Normal, Abnormal
  for subset,base_dir in ((train_part,train_dir),(val_part,val_dir)):
    for placement in ('Borderline','Normal','Abnormal'):
      class_rows=subset[subset['label'] == cath_type+' - '+placement]
      copy_files(destination_directory=base_dir+'/'+placement,
                 source_directory=origin,
                 image_files=class_rows['image_file'].tolist())

  return 'Done'
 

In [6]:
# Left-join the annotations onto train; the indicator column records where each row came from
merged = train.merge(train_annots, on='StudyInstanceUID', how='left', indicator=True)
# Rows found only in train have no annotation: keep them, drop the indicator and
# the (empty) annotation columns, and renumber the index
test = (
    merged.loc[merged['_merge'] == 'left_only']
    .drop(columns=['_merge','label','data'])
    .reset_index(drop=True)
)


In [7]:
def clean_dataframe(df,cath_type):
    '''
    Reduce df to the rows that carry a single, usable placement label for cath_type.

    only runs for train, not train annotations

    df: data frame with 'StudyInstanceUID', 'image_file' and the
        '<cath_type> - Abnormal/Normal/Borderline' indicator columns
        (plus 'NGT - Incompletely Imaged' when cath_type is 'NGT')
    cath_type: 'CVC', 'ETT' or 'NGT'

    A row is removed when
    - all label columns of the catheter type are 0 (catheter not present),
    - more than one of Abnormal / Normal / Borderline equals 1 (ambiguous), or
    - (NGT only) 'NGT - Incompletely Imaged' equals 1.

    Returns a new data frame restricted to the columns listed above; df itself
    is not modified. Raises ValueError for an unsupported cath_type.
    '''
    if cath_type not in ('CVC','ETT','NGT'):
        raise ValueError(f"Unsupported catheter type {cath_type!r}; expected 'CVC', 'ETT' or 'NGT'")

    placement_cols=[cath_type+' - Abnormal',cath_type+' - Normal',cath_type+' - Borderline']
    label_cols=list(placement_cols)
    if cath_type == 'NGT':
        label_cols.append('NGT - Incompletely Imaged')

    dataframe=df[['StudyInstanceUID',*label_cols,'image_file']]

    # a boolean mask filters by position, so repeated index labels cannot remove extra rows
    has_any_label=(dataframe[label_cols] != 0).any(axis=1)
    at_most_one_placement=(dataframe[placement_cols] == 1).sum(axis=1) <= 1
    keep=has_any_label & at_most_one_placement
    if cath_type == 'NGT':
        keep &= dataframe['NGT - Incompletely Imaged'] != 1
    return dataframe[keep]

Create the directory structure before executing the cells below. For more information, consult ChatGPT on how to format a directory for TensorFlow supervised learning methods. Remember that we only use annotated images for training and validation. For testing we will perhaps use non-annotated images.

In [8]:
#artificially balance the data by selecting a third of images to be normal, a third of images to be abnormal and a third of images to be borderline
#discount incomplete images for the reasons stated in other notebook
#first step: add the 'image_file' column (image file name) to the non-annotated and the annotated frame
test=set_files_on_dataframe(test)
train_annots=set_files_on_dataframe(train_annots)

In [9]:
#for simplicity only use UIDs with at most one of each type of catheter placement
#per CVC class: take the first 500 annotation rows, then keep one row per image file
cvc_annots_normal=train_annots.loc[train_annots['label']=='CVC - Normal']
cvc_selection_normal=cvc_annots_normal.iloc[:500].drop_duplicates(subset='image_file')

cvc_annots_abnormal=train_annots.loc[train_annots['label']=='CVC - Abnormal']
cvc_selection_abnormal=cvc_annots_abnormal.iloc[:500].drop_duplicates(subset='image_file')

cvc_annots_borderline=train_annots.loc[train_annots['label']=='CVC - Borderline']
cvc_selection_borderline=cvc_annots_borderline.iloc[:500].drop_duplicates(subset='image_file')
#must make sure this occurs when selecting rows by checking lengths of list of unique UIDs against length of df

In [10]:
#pre select rows: stack the three CVC class selections and renumber the rows from 0
cvc_selection=pd.concat(
    [cvc_selection_normal, cvc_selection_abnormal, cvc_selection_borderline],
    ignore_index=True,
)

In [11]:
#for simplicity only use UIDs with at most one of each type of catheter placement
#adjustable size parameters for number of images
#per ETT class: take the first 500 annotation rows, then keep one row per image file
ett_annots_normal=train_annots.loc[train_annots['label']=='ETT - Normal']
ett_selection_normal=ett_annots_normal.iloc[:500].drop_duplicates(subset='image_file')

ett_annots_abnormal=train_annots.loc[train_annots['label']=='ETT - Abnormal']
ett_selection_abnormal=ett_annots_abnormal.iloc[:500].drop_duplicates(subset='image_file')

ett_annots_borderline=train_annots.loc[train_annots['label']=='ETT - Borderline']
ett_selection_borderline=ett_annots_borderline.iloc[:500].drop_duplicates(subset='image_file')
#must make sure this occurs when selecting rows by checking lengths of list of unique UIDs against length of df

In [12]:
#stack the three ETT class selections and renumber the rows from 0
ett_selection=pd.concat(
    [ett_selection_normal, ett_selection_abnormal, ett_selection_borderline],
    ignore_index=True,
)

In [13]:
#for simplicity only use UIDs with at most one of each type of catheter placement
#per NGT class: take the first 500 annotation rows, then keep one row per StudyInstanceUID
ngt_annots_normal=train_annots.loc[train_annots['label']=='NGT - Normal']
ngt_selection_normal=ngt_annots_normal.iloc[:500].drop_duplicates(subset='StudyInstanceUID')

ngt_annots_abnormal=train_annots.loc[train_annots['label']=='NGT - Abnormal']
ngt_selection_abnormal=ngt_annots_abnormal.iloc[:500].drop_duplicates(subset='StudyInstanceUID')

ngt_annots_borderline=train_annots.loc[train_annots['label']=='NGT - Borderline']
ngt_selection_borderline=ngt_annots_borderline.iloc[:500].drop_duplicates(subset='StudyInstanceUID')
#must make sure this occurs when selecting rows by checking lengths of list of unique UIDs against length of df

In [14]:
#stack the three NGT class selections and renumber the rows from 0
ngt_selection=pd.concat(
    [ngt_selection_normal, ngt_selection_abnormal, ngt_selection_borderline],
    ignore_index=True,
)

In [15]:
#check quantity of images in a given folder is correct
def count_files_in_directory(directory):
    """Return the number of files found in ``directory`` and all of its sub-directories.

    Folders themselves are not counted; a path that cannot be walked yields 0.
    """
    return sum(len(filenames) for _dirpath, _dirnames, filenames in os.walk(directory))

In [16]:
#non-annotated rows with a usable label, one frame per catheter type (evaluated in the order CVC, NGT, ETT)
test_cvc,test_ngt,test_ett=(clean_dataframe(test,kind) for kind in ('CVC','NGT','ETT'))

In [17]:
from math import ceil
def form_test_dir(cath_type,test_dir,origin,quant,test,test_size=0.2):
  '''
  Draw a reproducible random sample of non-annotated rows and copy its images
  into the class sub-folders of the TEST folder.

  can only use rows without annotations in the train csv

  cath_type: catheter type; label columns are read as '<cath_type> - Normal/Abnormal/Borderline'
  test_dir: TEST folder; copies go to its 'Borderline', 'Normal' and 'Abnormal' sub-folders
  origin: directory containing the source image files
  quant: number of images in both training and validating folders
  test: data frame with catheter, eg. see test_ett (needs the label columns and 'image_file')
  test_size: fraction of quant to sample; ceil(test_size*quant) rows are drawn without replacement

  The sample is drawn over all classes together, so the classes are not balanced.
  Returns the string 'Done, files moved' (the files are copied, the source is left intact).
  '''
  # kept so the global `random` state is seeded as before; it does not influence pandas sampling
  random.seed(RS)
  # DataFrame.sample draws from numpy's random state, so the seed has to be passed explicitly
  test_sample=test.sample(n=ceil(test_size*quant),replace=False,random_state=RS)

  # same copy order as the training/validation folders: Borderline, Normal, Abnormal
  for placement in ('Borderline','Normal','Abnormal'):
    class_rows=test_sample[test_sample[cath_type+' - '+placement] == 1]
    copy_files(destination_directory=test_dir+'/'+placement,
               source_directory=origin,
               image_files=class_rows['image_file'].tolist())
  return 'Done, files moved'

In [18]:
#target folders, laid out as images_split/<catheter type>/<test|validate|train>
#the per-class sub-folders ('/Normal', '/Borderline', '/Abnormal') are appended by form_trval_dir and form_test_dir
cvc_test_directory='images_split/CVC/test'
ett_test_directory='images_split/ETT/test'
ngt_test_directory='images_split/NGT/test'
cvc_val_directory='images_split/CVC/validate'
ngt_val_directory='images_split/NGT/validate'
ett_val_directory='images_split/ETT/validate'
cvc_train_directory='images_split/CVC/train'
ngt_train_directory='images_split/NGT/train'
ett_train_directory='images_split/ETT/train'

In [19]:
#number of selected annotation rows per catheter type; later passed as `quant` to size the test sample
n_amount=len(ngt_selection)
e_amount=len(ett_selection)
c_amount=len(cvc_selection)

In [20]:
#number of files in the source directory before copying (compare with the same count at the end of the notebook)
count_files_in_directory(source)#no files removed from original source file

30083

In [21]:
def train_val_test_split(train_dir,val_dir,test_dir,test_size,cath_type,sample,origin,quant,test):
    '''
    Rebuild the train / validate / test image folders of one catheter type.

    train_dir, val_dir, test_dir: target folders; all files below them are deleted first
    test_size: fraction of quant that is sampled for the test folder
               (the train/validation ratio is fixed inside form_trval_dir)
    cath_type: catheter type, e.g. 'CVC'
    sample: selected annotation rows that are split into training and validation
    origin: directory containing the source image files
    quant: number of selected annotation rows, used to size the test sample
    test: non-annotated rows of this catheter type to sample the test images from

    see form_trval_dir and form_test_dir for details
    '''
    # empty all three targets; a stale validation folder would otherwise collect
    # renamed duplicate copies on every re-run
    for folder in (train_dir,val_dir,test_dir):
        remove_files_in_folder(folder)
    form_trval_dir(origin,train_dir,val_dir,cath_type,sample)
    # honour the caller's test_size instead of a hard-coded 0.2
    form_test_dir(cath_type,test_dir,origin,quant,test,test_size=test_size)


In [22]:
#CVC: rebuild the train / validate / test folders
cath_type='CVC'
origin=source
test_size=0.2
train_dir=cvc_train_directory
val_dir=cvc_val_directory
test_dir=cvc_test_directory
#note: `test` is rebound here from the full non-annotated frame to the CVC subset
sample,quant,test=cvc_selection,c_amount,test_cvc
train_val_test_split(train_dir=train_dir,val_dir=val_dir,test_dir=test_dir,test_size=test_size,
                     cath_type=cath_type,sample=sample,origin=origin,quant=quant,test=test)

All files in the folder and its subdirectories have been removed.
All files in the folder and its subdirectories have been removed.


In [23]:
#ETT: rebuild the train / validate / test folders
cath_type='ETT'
origin=source
test_size=0.2
train_dir=ett_train_directory
val_dir=ett_val_directory
test_dir=ett_test_directory
#note: `test` is rebound here to the ETT subset of the non-annotated rows
sample,quant,test=ett_selection,e_amount,test_ett
train_val_test_split(train_dir=train_dir,val_dir=val_dir,test_dir=test_dir,test_size=test_size,
                     cath_type=cath_type,sample=sample,origin=origin,quant=quant,test=test)

All files in the folder and its subdirectories have been removed.
All files in the folder and its subdirectories have been removed.


In [24]:
#NGT: rebuild the train / validate / test folders
cath_type='NGT'
origin=source
test_size=0.2
train_dir=ngt_train_directory
val_dir=ngt_val_directory
test_dir=ngt_test_directory
#note: `test` is rebound here to the NGT subset of the non-annotated rows
sample,quant,test=ngt_selection,n_amount,test_ngt
train_val_test_split(train_dir=train_dir,val_dir=val_dir,test_dir=test_dir,test_size=test_size,
                     cath_type=cath_type,sample=sample,origin=origin,quant=quant,test=test)

All files in the folder and its subdirectories have been removed.
All files in the folder and its subdirectories have been removed.


Done! Now we can use the current directory to map out coordinates during training and validation on annotated images, and then test on images that are not annotated.

In [25]:
#sanity check after copying: the count should equal the one taken before the split cells
count_files_in_directory(source)#no files removed from original source file

30083