In [8]:
import os
import matplotlib.pyplot as plt
from PIL import Image
import pandas as pd
import json
import cv2

In [9]:
ROOT = "../../data/Training_Data/Training_Data"
print(os.path.exists(ROOT))
video_names = os.listdir(ROOT)
video_paths = [os.path.join(ROOT, vn) for vn in video_names]

video_names[0]

True


'AN01-20210104-154854'

In [10]:
# Open and clean dataframe

df = pd.read_csv(r"..\..\data\Training_Data\Training_Data\Training_Data.csv")
df['Tool'] = df['Tool'].fillna('nothing')
df = df.drop('Folder', axis=1)

# Ensure sorted by frame
df

Unnamed: 0,FileName,Time Recorded,Tool,Overall Task,Tool bounding box
0,AN01-20210104-154854_0000.jpg,0.021,nothing,nothing,"[{'class': 'syringe', 'xmin': 482, 'ymin': 328..."
1,AN01-20210104-154854_0001.jpg,0.279,nothing,nothing,"[{'class': 'syringe', 'xmin': 482, 'ymin': 328..."
2,AN01-20210104-154854_0002.jpg,0.400,nothing,nothing,"[{'class': 'ultrasound', 'xmin': 365, 'ymin': ..."
3,AN01-20210104-154854_0003.jpg,0.819,nothing,nothing,"[{'class': 'ultrasound', 'xmin': 365, 'ymin': ..."
4,AN01-20210104-154854_0004.jpg,0.929,nothing,nothing,"[{'class': 'ultrasound', 'xmin': 365, 'ymin': ..."
...,...,...,...,...,...
187256,MS05-20210207-131301_2606.jpg,181.552,nothing,remove_guidewire,"[{'class': 'catheter', 'xmin': '414', 'ymin': ..."
187257,MS05-20210207-131301_2607.jpg,181.604,nothing,remove_guidewire,"[{'class': 'catheter', 'xmin': '414', 'ymin': ..."
187258,MS05-20210207-131301_2608.jpg,181.655,nothing,remove_guidewire,"[{'class': 'catheter', 'xmin': '414', 'ymin': ..."
187259,MS05-20210207-131301_2609.jpg,181.707,nothing,remove_guidewire,"[{'class': 'catheter', 'xmin': '414', 'ymin': ..."


In [11]:
# Group based on video

def process_name(file_name):
    video_name, frame_ext = file_name.split('_')
    frame_number = frame_ext.split('.')[0]
    return [*video_name.split('-'), frame_number]

name_tokens = df['FileName'].apply(process_name)

df['VideoName'] = df['FileName'].apply(lambda x: x.split('_')[0])
df['Date'] = name_tokens.str[1]
df['VideoID'] = name_tokens.str[2]
df['FrameNumber'] = name_tokens.str[3].astype(int)

df['AnnotatorName'] = name_tokens.str[0]
df['AnnotatorIsMS'] = name_tokens.str[0].apply(lambda x: True if x[:2] == 'MS' else False)
df['AnnotatorNumber'] = name_tokens.str[0].apply(lambda x: int(x[2:]))

In [12]:
# Parse bounding boxes for tools

tools = list([t for t in df['Tool'].unique() if t != 'nothing'])
for tool in tools:
    df['has_tool_' + tool] = df.apply(lambda x: f"'class': '{tool}'" in x['Tool bounding box'], axis=1)

df

Unnamed: 0,FileName,Time Recorded,Tool,Overall Task,Tool bounding box,VideoName,Date,VideoID,FrameNumber,AnnotatorName,AnnotatorIsMS,AnnotatorNumber,has_tool_ultrasound,has_tool_syringe,has_tool_anesthetic,has_tool_guidewire_casing,has_tool_guidewire,has_tool_scalpel,has_tool_dilator,has_tool_catheter
0,AN01-20210104-154854_0000.jpg,0.021,nothing,nothing,"[{'class': 'syringe', 'xmin': 482, 'ymin': 328...",AN01-20210104-154854,20210104,154854,0,AN01,False,1,False,True,False,False,False,False,False,False
1,AN01-20210104-154854_0001.jpg,0.279,nothing,nothing,"[{'class': 'syringe', 'xmin': 482, 'ymin': 328...",AN01-20210104-154854,20210104,154854,1,AN01,False,1,False,True,False,False,False,False,False,False
2,AN01-20210104-154854_0002.jpg,0.400,nothing,nothing,"[{'class': 'ultrasound', 'xmin': 365, 'ymin': ...",AN01-20210104-154854,20210104,154854,2,AN01,False,1,True,True,False,False,False,False,False,False
3,AN01-20210104-154854_0003.jpg,0.819,nothing,nothing,"[{'class': 'ultrasound', 'xmin': 365, 'ymin': ...",AN01-20210104-154854,20210104,154854,3,AN01,False,1,True,True,False,False,False,False,False,False
4,AN01-20210104-154854_0004.jpg,0.929,nothing,nothing,"[{'class': 'ultrasound', 'xmin': 365, 'ymin': ...",AN01-20210104-154854,20210104,154854,4,AN01,False,1,True,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187256,MS05-20210207-131301_2606.jpg,181.552,nothing,remove_guidewire,"[{'class': 'catheter', 'xmin': '414', 'ymin': ...",MS05-20210207-131301,20210207,131301,2606,MS05,True,5,False,False,False,False,True,False,False,True
187257,MS05-20210207-131301_2607.jpg,181.604,nothing,remove_guidewire,"[{'class': 'catheter', 'xmin': '414', 'ymin': ...",MS05-20210207-131301,20210207,131301,2607,MS05,True,5,False,False,False,False,True,False,False,True
187258,MS05-20210207-131301_2608.jpg,181.655,nothing,remove_guidewire,"[{'class': 'catheter', 'xmin': '414', 'ymin': ...",MS05-20210207-131301,20210207,131301,2608,MS05,True,5,False,False,False,False,True,False,False,True
187259,MS05-20210207-131301_2609.jpg,181.707,nothing,remove_guidewire,"[{'class': 'catheter', 'xmin': '414', 'ymin': ...",MS05-20210207-131301,20210207,131301,2609,MS05,True,5,False,False,False,False,True,False,False,True


In [13]:
# Group rows by video ID

df['VideoID'] = df['FileName'].apply(lambda x: x.split('_')[0].split('-')[2])
videos_df = pd.DataFrame()

# Annotators who only annotated videos in a different setting than the other annotators
annotators_setting_B = ['MS01', 'MS02', 'MS03', 'MS04']


for video_id, video_rows in df.groupby('VideoID'):

    first_row = video_rows.iloc[0]
    video_name = first_row['FileName'].split('_')[0]

    annotator = video_name.split('-')[0]

    is_annotator_ms = annotator[:2] == 'MS'
    annotator_num = int(annotator[2:])
    in_setting_B = annotator in annotators_setting_B

    #frame = video_rows.iloc[0]['FrameNumber']
    last_frame = video_rows.iloc[-1]['FrameNumber']


    # if video is over 2800 frames, split in half
    """
    if len(video_rows) > 2800:
        midpoint = len(video_rows) // 2
        video_rows_1 = video_rows.iloc[:midpoint]
        video_rows_2 = video_rows.iloc[midpoint:]

        new_row_1 = pd.DataFrame({
            'VideoID': [video_id],
            'VideoName': [video_name],

            'Annotator': [annotator],
            'IsAnnotatorMS': [is_annotator_ms],
            'AnnotatorNumber': [annotator_num],
            'InSettingB': [in_setting_B],

            'num_frames_total': len(video_rows_1),
            'first_file': video_rows_1.iloc[0]['FileName'],
            'last_file': video_rows_1.iloc[-1]['FileName']
        })
        

        for tool in tools:
            new_row_1[f'num_frames_with_tool_{tool}'] = video_rows_1[f'has_tool_{tool}'].sum()

        new_row_2 = pd.DataFrame({
            'VideoID': [video_id],
            'VideoName': [video_name],

            'Annotator': [annotator],
            'IsAnnotatorMS': [is_annotator_ms],
            'AnnotatorNumber': [annotator_num],
            'InSettingB': [in_setting_B],

            'num_frames_total': len(video_rows_2),
            'first_file': video_rows_2.iloc[0]['FileName'],
            'last_file': video_rows_2.iloc[-1]['FileName']
        })

        for tool in tools:
            new_row_2[f'num_frames_with_tool_{tool}'] = video_rows_2[f'has_tool_{tool}'].sum()

        videos_df = pd.concat([videos_df, new_row_1, new_row_2], axis=0)

    else:
    """
    new_row = pd.DataFrame({
        'VideoID': [video_id],
        'VideoName': [video_name],

        'Annotator': [annotator],
        'IsAnnotatorMS': [is_annotator_ms],
        'AnnotatorNumber': [annotator_num],
        'InSettingB': [in_setting_B],

        'num_frames_total': len(video_rows),
        #'first_file': video_rows.iloc[0]['FileName'],
        #'last_file': video_rows.iloc[-1]['FileName']
    })

    for tool in tools:
        new_row[f'num_frames_with_tool_{tool}'] = video_rows[f'has_tool_{tool}'].sum()

    videos_df = pd.concat([videos_df, new_row], axis=0)

videos_df = videos_df.reset_index(drop=True)
videos_df

Unnamed: 0,VideoID,VideoName,Annotator,IsAnnotatorMS,AnnotatorNumber,InSettingB,num_frames_total,num_frames_with_tool_ultrasound,num_frames_with_tool_syringe,num_frames_with_tool_anesthetic,num_frames_with_tool_guidewire_casing,num_frames_with_tool_guidewire,num_frames_with_tool_scalpel,num_frames_with_tool_dilator,num_frames_with_tool_catheter
0,122821,MS05-20210207-122821,MS05,True,5,False,4374,1038,640,96,449,1523,35,50,625
1,123321,MS05-20210207-123321,MS05,True,5,False,2966,1104,714,37,80,1053,8,46,328
2,123944,MS05-20210207-123944,MS05,True,5,False,4358,1380,320,25,597,2015,36,33,269
3,124429,MS05-20210207-124429,MS05,True,5,False,2455,770,476,0,283,1086,84,34,180
4,124744,MS05-20210207-124744,MS05,True,5,False,1857,589,260,35,193,786,18,19,239
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,205729,AN05-20210216-205729,AN05,False,5,False,1436,1339,203,43,122,474,7,8,267
96,210122,AN05-20210216-210122,AN05,False,5,False,1563,748,269,17,162,322,6,0,217
97,210426,AN05-20210216-210426,AN05,False,5,False,1675,1549,452,32,108,392,5,10,202
98,210734,AN05-20210216-210734,AN05,False,5,False,1441,840,356,8,120,355,9,1,336


---

# Start Splitting

In [185]:
import pandas as pd
from random import randint
from IPython.display import display


def pop_row(df, index):
    if index < 0:
        index = len(df) + index
    
    row = df.iloc[index]
    df.drop(index=index, inplace=True)
    df.reset_index(drop=True, inplace=True)
    return row


def pop_rand(orig_df):
    index = randint(0, len(orig_df) - 1)
    return pop_row(orig_df, index)



def train_val_test_split(df, train_size=0.8, val_size=0.1, test_size=0.1):
    # Assume we are stratifying by key 'num_frames_total'

    df = df.copy()

    tss = train_size + val_size + test_size
    if tss <= 1 - 1e5 or tss >= 1 + 1e5:
        raise ValueError("sizes must add up to 1", train_size + val_size + test_size)
    
    # DataFrames to store the train, validation, and test sets
    train_df = pd.DataFrame(columns=df.columns)
    val_df = pd.DataFrame(columns=df.columns)
    test_df = pd.DataFrame(columns=df.columns)

    total_frames = df['num_frames_total'].sum()
    TRAIN_TARGET_SIZE = total_frames * train_size
    VAL_TARGET_SIZE = total_frames * val_size
    TEST_TARGET_SIZE = total_frames * test_size


    # Main loop to split the data
    while len(df):

        # Add random to train
        if len(df) and train_df.empty or train_df['num_frames_total'].sum() < TRAIN_TARGET_SIZE:
            row = pop_rand(df)
            train_df.loc[len(df)] = row

        # Add random to val
        if len(df) and val_df.empty or val_df['num_frames_total'].sum() < VAL_TARGET_SIZE:
            row = pop_rand(df)
            val_df.loc[len(val_df)] = row

        # Add random to test
        if len(df) and test_df.empty or test_df['num_frames_total'].sum() < TEST_TARGET_SIZE:
            row = pop_rand(df)
            test_df.loc[len(test_df)] = row

    frames_sum = train_df['num_frames_total'].sum() + val_df['num_frames_total'].sum() + test_df['num_frames_total'].sum()
    if total_frames != frames_sum:
        raise Exception(f"Frames do not add up: {total_frames} (expected) {frames_sum} (actual)")

    return {
        'train': train_df,
        'val': val_df,
        'test': test_df
    }


def print_sets(train_df, val_df, test_df):
    # Final sizes of each dataset
    trn = train_df['num_frames_total'].sum()
    vn = val_df['num_frames_total'].sum()
    te = test_df['num_frames_total'].sum()

    total_frames = trn + vn + te
    print("TRAIN", len(train_df), trn, trn / total_frames)
    print("VALID", len(val_df), vn, vn / total_frames)
    print("TEST", len(test_df), te, te / total_frames)

    tool_columns = [c for c in train_df.columns if c.startswith('num_frames_with_tool_')]
    tool_totals_df = pd.DataFrame(columns=['column', 'train', 'val', 'test'])

    for c in tool_columns:
        new_row = pd.DataFrame({
            'column': [c],
            'train': [round(train_df[c].mean())],
            'val': [round(val_df[c].mean())],
            'test': [round(test_df[c].mean())]
        })
        tool_totals_df = pd.concat([tool_totals_df, new_row]) if not tool_totals_df.empty else new_row
        
    display(tool_totals_df)


In [411]:
# Assume videos_df is the DataFrame you start with
ASize = 0
BSize = 0

while ASize != 5 or BSize != 5:
    group_A_df = videos_df[~videos_df['InSettingB']].reset_index(drop=True)    # Starting with Setting A (majority)
    group_A_res = train_val_test_split(group_A_df, 0.85, 0.075, 0.075)
    ASize = len(group_A_res['val'])
    BSize = len(group_A_res['test'])

print_sets(group_A_res['train'], group_A_res['val'], group_A_res['test'])

TRAIN 50 94072 0.8318551203940329
VALID 5 9385 0.0829892030029977
TEST 5 9630 0.0851556766029694


Unnamed: 0,column,train,val,test
0,num_frames_with_tool_ultrasound,1390,1583,1014
0,num_frames_with_tool_syringe,473,633,384
0,num_frames_with_tool_anesthetic,32,47,29
0,num_frames_with_tool_guidewire_casing,175,148,143
0,num_frames_with_tool_guidewire,767,840,766
0,num_frames_with_tool_scalpel,78,20,27
0,num_frames_with_tool_dilator,137,144,193
0,num_frames_with_tool_catheter,291,218,219


In [391]:
group_B_df = videos_df[videos_df['InSettingB']].reset_index(drop=True)    # Starting with Setting A (majority)
group_B_res = train_val_test_split(group_B_df)
print_sets(group_B_res['train'], group_B_res['val'], group_B_res['test'])

TRAIN 30 58350 0.7866637905465527
VALID 5 7990 0.10771968614339256
TEST 5 7834 0.10561652331005474


Unnamed: 0,column,train,val,test
0,num_frames_with_tool_ultrasound,790,434,532
0,num_frames_with_tool_syringe,712,215,196
0,num_frames_with_tool_anesthetic,49,34,35
0,num_frames_with_tool_guidewire_casing,249,192,238
0,num_frames_with_tool_guidewire,793,770,682
0,num_frames_with_tool_scalpel,18,14,15
0,num_frames_with_tool_dilator,51,41,33
0,num_frames_with_tool_catheter,264,286,259


---

# Save Results

In [413]:
final_train_df = pd.concat([group_A_res['train'], group_B_res['train']]).sort_values(by='VideoName').reset_index(drop=True)
final_val_df = pd.concat([group_A_res['val'], group_B_res['val']]).sort_values(by='VideoName').reset_index(drop=True)
final_test_df = pd.concat([group_A_res['test'], group_B_res['test']]).sort_values(by='VideoName').reset_index(drop=True)

trn = final_train_df['num_frames_total'].sum()
vn = final_val_df['num_frames_total'].sum()
te = final_test_df['num_frames_total'].sum()

total_frames = trn + vn + te
print("TRAIN", len(final_train_df), trn, trn / total_frames)
print("VALID", len(final_val_df), vn, vn / total_frames)
print("TEST", len(final_test_df), te, te / total_frames)


TRAIN 80 152422 0.813954854454478
VALID 10 17375 0.09278493653243335
TEST 10 17464 0.09326020901308868


In [414]:
tool_columns = [c for c in final_train_df.columns if c.startswith('num_frames_with_tool_')]
tool_totals = {d: dict() for d in tool_columns}

tool_totals_df = pd.DataFrame(columns=['column', 'train', 'val', 'test'])

for c in tool_columns:
    new_row = pd.DataFrame({
        'column': [c],
        'train': [round(final_train_df[c].mean())],
        'val': [round(final_val_df[c].mean())],
        'test': [round(final_test_df[c].mean())]
    })
    tool_totals_df = pd.concat([tool_totals_df, new_row])
    
tool_totals_df

Unnamed: 0,column,train,val,test
0,num_frames_with_tool_ultrasound,1165,1008,773
0,num_frames_with_tool_syringe,563,424,290
0,num_frames_with_tool_anesthetic,39,41,32
0,num_frames_with_tool_guidewire_casing,203,170,191
0,num_frames_with_tool_guidewire,777,805,724
0,num_frames_with_tool_scalpel,55,17,21
0,num_frames_with_tool_dilator,105,92,113
0,num_frames_with_tool_catheter,281,252,239


In [415]:
final_train_df.to_csv('train.csv')
final_val_df.to_csv('val.csv')
final_test_df.to_csv('test.csv')