In [1]:
import os
import pathlib
import collections

import numpy as np
import pandas as pd

from PIL import Image

In [2]:
# Build an in-memory index of every image found under "C-30", keyed by the
# file name without its extension. Each entry is a 4-item list:
#   [raw image path, scored image path, sorted image path, cpc path]
column_names = ["id", "raw_image_filepath", "scored_image_filepath", "sorted_image_filepath", "cpc_filepath"]
index_dict = collections.defaultdict(lambda: [None, None, None, None])

# Slot inside an index_dict entry that receives the .jpg path found under each
# image folder; .cpc files always land in the last slot regardless of folder.
image_slots = {"RAW": 0, "SCORED": 1, "SORTED": 2}
cpc_slot = 3

for site_name in os.listdir("C-30"):
    # "_Scoring Excel" sits next to the site folders and is not scanned.
    if site_name != "_Scoring Excel":
        for image_type, image_slot in image_slots.items():
            type_root = os.path.join("C-30", site_name, image_type)
            # os.walk yields nothing when type_root does not exist, so sites
            # lacking one of the three folders are simply skipped for it.
            for dirpath, _, filenames in os.walk(type_root):
                for filename in filenames:
                    full_file_path = os.path.join(dirpath, filename)
                    image_id = pathlib.Path(full_file_path).stem
                    lowered_name = filename.lower()

                    # Files that are neither .jpg nor .cpc never touch
                    # index_dict, so they create no (empty) entries.
                    if lowered_name.endswith(".jpg"):
                        index_dict[image_id][image_slot] = full_file_path
                    elif lowered_name.endswith(".cpc"):
                        index_dict[image_id][cpc_slot] = full_file_path

    


In [3]:
# Turn the {image id -> [raw, scored, sorted, cpc]} mapping into one row per
# image: the id first, followed by its four file paths in column_names order.
#
# The rows are handed to pandas directly instead of being stacked through
# NumPy first: with an empty index_dict the NumPy route produces a (0, 2)
# array that cannot be matched against the five column names, whereas an
# empty list of records simply yields an empty frame with the right columns.
index_records = [[image_id, *filepaths] for image_id, filepaths in index_dict.items()]

# dtype=object keeps missing paths as None (not NaN), so later truthiness
# checks such as `if raw_filepath:` keep treating them as "no file".
index_df = pd.DataFrame(index_records, columns=column_names, dtype=object)
index_df.head()


Unnamed: 0,id,raw_image_filepath,scored_image_filepath,sorted_image_filepath,cpc_filepath
0,PB084364,C-30\SHINE-1737(Toroso A)\RAW\OLYMPUSTG6_CBRAC...,,,
1,PB084365,C-30\SHINE-1737(Toroso A)\RAW\OLYMPUSTG6_CBRAC...,,,
2,PB084366,C-30\SHINE-1737(Toroso A)\RAW\OLYMPUSTG6_CBRAC...,C-30\SHINE-1737(Toroso A)\SCORED\PB084366.JPG,C-30\SHINE-1737(Toroso A)\SORTED\2\PB084366.JPG,C-30\SHINE-1737(Toroso A)\SCORED\PB084366.cpc
3,PB084367,C-30\SHINE-1737(Toroso A)\RAW\OLYMPUSTG6_CBRAC...,C-30\SHINE-1737(Toroso A)\SCORED\PB084367.JPG,C-30\SHINE-1737(Toroso A)\SORTED\2\PB084367.JPG,C-30\SHINE-1737(Toroso A)\SCORED\PB084367.cpc
4,PB084368,C-30\SHINE-1737(Toroso A)\RAW\OLYMPUSTG6_CBRAC...,C-30\SHINE-1737(Toroso A)\SCORED\PB084368.JPG,C-30\SHINE-1737(Toroso A)\SORTED\2\PB084368.JPG,C-30\SHINE-1737(Toroso A)\SCORED\PB084368.cpc


In [4]:
# Per-row metadata collected in index_df row order; each list ends up with
# exactly one entry per row so it can be attached to index_df as a column.
raw_timestamps = []
scored_img_data = []  # width, height, timestamp
cpc_content = []

EXIF_ORIGINAL_DATETIME_TAG = 36867


def _read_image_metadata(image_filepath):
    """Return ``(width, height, timestamp)`` for the image at ``image_filepath``.

    ``timestamp`` is the value stored under ``EXIF_ORIGINAL_DATETIME_TAG``, or
    ``None`` when the image carries no EXIF data or lacks that tag.

    The image is opened in a ``with`` block so its file handle is released as
    soon as the metadata has been read, instead of staying open for every
    image visited by the loop below.
    """
    with Image.open(image_filepath, "r") as img:
        width, height = img.size
        # _getexif() returns None for images without EXIF data; fall back to
        # an empty mapping so the lookup below yields None instead of raising.
        exif = img._getexif() or {}
    return width, height, exif.get(EXIF_ORIGINAL_DATETIME_TAG)


for _, row in index_df.iterrows():
    raw_filepath = row["raw_image_filepath"]
    if raw_filepath:
        _, _, timestamp = _read_image_metadata(raw_filepath)
        raw_timestamps.append(timestamp)
    else:
        raw_timestamps.append(None)

    # NOTE(review): the values stored in scored_img_data are read from the
    # SORTED image, not the SCORED one -- confirm that this is intended.
    sorted_filepath = row["sorted_image_filepath"]
    if sorted_filepath:
        width, height, timestamp = _read_image_metadata(sorted_filepath)
        scored_img_data.append([width, height, timestamp])
    else:
        scored_img_data.append([None, None, None])

    cpc_filepath = row["cpc_filepath"]
    if cpc_filepath:
        # NOTE(review): read with the platform default text encoding --
        # confirm that it matches how the .cpc files were written.
        with open(cpc_filepath, "r") as f:
            cpc_content.append(f.read())
    else:
        cpc_content.append(None)


In [5]:
# Attach the per-row metadata gathered above to index_df as new columns.
#
# dtype=object stops NumPy from picking a common string dtype for the mixed
# int/str rows: without it, a run in which every row has image data (no None
# placeholders) would silently turn the widths and heights into strings.
# reshape(-1, 3) keeps the array two-dimensional even when there are no rows,
# so the column slices below do not fail on an empty index.
img_data_np = np.array(scored_img_data, dtype=object).reshape(-1, 3)
index_df["scored_image_width"] = img_data_np[:, 0]
index_df["scored_image_height"] = img_data_np[:, 1]
index_df["scored_image_timestamp"] = img_data_np[:, 2]

index_df["raw_image_timestamp"] = raw_timestamps
index_df["cpc_content"] = cpc_content

index_df.head()

Unnamed: 0,id,raw_image_filepath,scored_image_filepath,sorted_image_filepath,cpc_filepath,scored_image_width,scored_image_height,scored_image_timestamp,raw_image_timestamp,cpc_content
0,PB084364,C-30\SHINE-1737(Toroso A)\RAW\OLYMPUSTG6_CBRAC...,,,,,,,2023:11:08 14:34:38,
1,PB084365,C-30\SHINE-1737(Toroso A)\RAW\OLYMPUSTG6_CBRAC...,,,,,,,2023:11:08 14:35:14,
2,PB084366,C-30\SHINE-1737(Toroso A)\RAW\OLYMPUSTG6_CBRAC...,C-30\SHINE-1737(Toroso A)\SCORED\PB084366.JPG,C-30\SHINE-1737(Toroso A)\SORTED\2\PB084366.JPG,C-30\SHINE-1737(Toroso A)\SCORED\PB084366.cpc,4000.0,3000.0,2023:11:08 14:42:12,2023:11:08 14:42:12,"""C:\CPCe_41_inst\OMLC_code-7CAT.txt"",""C:\Users..."
3,PB084367,C-30\SHINE-1737(Toroso A)\RAW\OLYMPUSTG6_CBRAC...,C-30\SHINE-1737(Toroso A)\SCORED\PB084367.JPG,C-30\SHINE-1737(Toroso A)\SORTED\2\PB084367.JPG,C-30\SHINE-1737(Toroso A)\SCORED\PB084367.cpc,4000.0,3000.0,2023:11:08 14:42:55,2023:11:08 14:42:55,"""C:\CPCe_41_inst\OMLC_code-7CAT.txt"",""C:\Users..."
4,PB084368,C-30\SHINE-1737(Toroso A)\RAW\OLYMPUSTG6_CBRAC...,C-30\SHINE-1737(Toroso A)\SCORED\PB084368.JPG,C-30\SHINE-1737(Toroso A)\SORTED\2\PB084368.JPG,C-30\SHINE-1737(Toroso A)\SCORED\PB084368.cpc,4000.0,3000.0,2023:11:08 14:43:44,2023:11:08 14:43:44,"""C:\CPCe_41_inst\OMLC_code-7CAT.txt"",""C:\Users..."


In [6]:
# Write the finished index to disk; index=False leaves the DataFrame's row
# labels out of the file so only the named columns are written.
index_df.to_csv(
    path_or_buf="c30-filename-index.csv",
    index=False,
)