In [2]:
# rename files to give them all unique filenames

import os
import pandas as pd
import string

# directory path
parent_dir = r'/mnt/ssd-cluster/cara/data'

#os.listdir(parent_dir)

In [None]:
# define function to read all files
def list_files_in_directory(directory):
    file_paths = []

    for item in os.listdir(directory):
        item_path = os.path.join(directory, item)
        
        if os.path.isfile(item_path):
            file_paths.append(item_path)
        elif os.path.isdir(item_path):
            file_paths.extend(list_files_in_directory(item_path))
    
    return file_paths

# scan directory and save full paths to filenames
directory_path = '/mnt/ssd-cluster/cara/data'
file_paths = list_files_in_directory(directory_path)

#for path in file_paths:
#    print(path)


In [None]:
# filter out ".txt" files
file_paths = [path for path in file_paths if not path.endswith('.txt')]

# extract filename and study/species combo but keep also full path
names = []
study_species_code_mapping = {}  #to store unique ID for study/species combo
alphabet = string.ascii_lowercase  # All uppercase letters

for path in file_paths:
    file_name = os.path.basename(path)
    parts = path.split('/')
    project_species = '/'.join(parts[5:7])  # Extract "project/species" part

    if project_species not in study_species_code_mapping:
        if len(study_species_code_mapping) < len(alphabet) * len(alphabet):
            first_letter = alphabet[len(study_species_code_mapping) // len(alphabet)]
            second_letter = alphabet[len(study_species_code_mapping) % len(alphabet)]
            code = f"{first_letter}{second_letter}"
            study_species_code_mapping[project_species] = code
        else:
            print("Not enough two-letter combinations for all study/species values.")
    
    file_name_without_extension = os.path.splitext(file_name)[0]  # Remove ".JPG" extension
    combined_filename = f"{file_name_without_extension}_{study_species_code_mapping[project_species]}.JPG"

    names.append({'full_path': path, 'filename': file_name, 'project_species': project_species, 
                  'code': study_species_code_mapping[project_species],
                  'new_filename': combined_filename})

# create a dataframe
names_table = pd.DataFrame(names)

print(len(names_table))
print(len(study_species_code_mapping))
names_table.head()

#save
#names_table.to_csv('/mnt/ssd-cluster/cara/filename_key.csv')
#names_table.to_csv('/home/cara/oregon_critters/filename_key.csv')



In [None]:
#are they all unique?

unique_combined_filenames = names_table['new_filename'].nunique()
total_rows = names_table.shape[0]

if unique_combined_filenames == total_rows:
    print("All combined filenames are unique.")
else:
    print("There are duplicate combined filenames.")

In [None]:
#filename test

# test = pd.read_csv('/home/cara/oregon_critters/filename_test.csv')
# test.head()

# for index, row in test.iterrows():
#     full_filepath = row['full_path']
#     new_filename = row['new_filename']
    
#     if os.path.exists(full_filepath):
#         new_full_filepath = os.path.join(os.path.dirname(full_filepath), new_filename)
#         os.rename(full_filepath, new_full_filepath)
#         print(f"Renamed '{full_filepath}' to '{new_full_filepath}'")
#     else:
#         print(f"File '{full_filepath}' not found.")

In [None]:
# rename files

for index, row in names_table.iterrows():
    full_filepath = row['full_path']
    new_filename = row['new_filename']
    
    if os.path.exists(full_filepath):
        new_full_filepath = os.path.join(os.path.dirname(full_filepath), new_filename)
        os.rename(full_filepath, new_full_filepath)
        print(f"Renamed '{full_filepath}' to '{new_full_filepath}'")
    else:
        print(f"File '{full_filepath}' not found.")

#took ~13 minutes

In [None]:
#now rename YOLO .txt files

#create copy of new names dataframe
yolo_table = names_table

#replace .JPG with .txt
yolo_table['full_path'] = yolo_table['full_path'].str.replace('.JPG', '.txt')
yolo_table['filename'] = yolo_table['filename'].str.replace('.JPG', '.txt')
yolo_table['new_filename'] = yolo_table['new_filename'].str.replace('.JPG', '.txt')

#replace '/images' with '/labels'
yolo_table['full_path'] = yolo_table['full_path'].str.replace('/images/', '/labels/')

#view
pd.set_option('display.max_colwidth', None)
yolo_table.head()

#rename
for index, row in yolo_table.iterrows():
    full_filepath = row['full_path']
    new_filename = row['new_filename']
    
    if os.path.exists(full_filepath):
        new_full_filepath = os.path.join(os.path.dirname(full_filepath), new_filename)
        os.rename(full_filepath, new_full_filepath)
        print(f"Renamed '{full_filepath}' to '{new_full_filepath}'")
    else:
        print(f"File '{full_filepath}' not found.")



In [7]:
#now modify train.txt, val.txt, and test.txt files with new filenames

# Read the CSV file into a DataFrame
names_key = pd.read_csv('/home/cara/oregon_critters/filename_key.csv')

#add relative path to new name for matching
names_key['dir_path'] = names_key['full_path'].apply(lambda path: os.path.dirname(path))
names_key['new_filename_path'] = names_key.apply(lambda row: row['dir_path'].replace('/mnt/ssd-cluster/cara/', '') + '/' + row['new_filename'], axis=1)
names_key['old_filename_path'] = names_key.apply(lambda row: row['dir_path'].replace('/mnt/ssd-cluster/cara/', '') + '/' + row['filename'], axis=1)

#read files
old_train_path = '/home/cara/oregon_critters/sampled_ds_300_train.txt'
old_val_path = '/home/cara/oregon_critters/sampled_ds_300_val.txt'
old_test_path = '/home/cara/oregon_critters/sampled_ds_300_test.txt'

with open(old_train_path, 'r') as f:
    old_train = f.read().splitlines()

with open(old_val_path, 'r') as f:
    old_val = f.read().splitlines()

with open(old_test_path, 'r') as f:
    old_test = f.read().splitlines()

# map old filenames to new filenames
old_to_new_mapping_TRAIN = {}
for _, row in names_key.iterrows():
    if row['old_filename_path'] in old_train:
        old_to_new_mapping_TRAIN[row['old_filename_path']] = row['new_filename_path']

# map old filenames to new filenames
old_to_new_mapping_VAL = {}
for _, row in names_key.iterrows():
    if row['old_filename_path'] in old_val:
        old_to_new_mapping_VAL[row['old_filename_path']] = row['new_filename_path']

# map old filenames to new filenames
old_to_new_mapping_TEST = {}
for _, row in names_key.iterrows():
    if row['old_filename_path'] in old_test:
        old_to_new_mapping_TEST[row['old_filename_path']] = row['new_filename_path']


In [15]:
print(old_to_new_mapping_TRAIN)

{'data/COA_2021/DouglasSquirrel2_ZF/images/22195-3__22195-3-G__2021-03-25__08-04-53(1).JPG': 'data/COA_2021/DouglasSquirrel2_ZF/images/22195-3__22195-3-G__2021-03-25__08-04-53(1)_aa.JPG', 'data/COA_2021/DouglasSquirrel2_ZF/images/22103-4__22103-4-G__2021-05-28__14-47-50(1).JPG': 'data/COA_2021/DouglasSquirrel2_ZF/images/22103-4__22103-4-G__2021-05-28__14-47-50(1)_aa.JPG', 'data/COA_2021/DouglasSquirrel2_ZF/images/22105-3__22105-3-T__2021-05-15__17-56-35(5).JPG': 'data/COA_2021/DouglasSquirrel2_ZF/images/22105-3__22105-3-T__2021-05-15__17-56-35(5)_aa.JPG', 'data/COA_2021/DouglasSquirrel2_ZF/images/22103-4__22103-4-G__2021-04-19__10-15-14(2).JPG': 'data/COA_2021/DouglasSquirrel2_ZF/images/22103-4__22103-4-G__2021-04-19__10-15-14(2)_aa.JPG', 'data/COA_2021/DouglasSquirrel2_ZF/images/22105-1__22105-1-T__2021-05-02__12-47-15(3).JPG': 'data/COA_2021/DouglasSquirrel2_ZF/images/22105-1__22105-1-T__2021-05-02__12-47-15(3)_aa.JPG', 'data/COA_2021/DouglasSquirrel2_ZF/images/22105-4__22105-4-T__20

In [None]:
names_key.head()

In [16]:
# create a new .txt file (TRAIN)
new_txt_path = '/home/cara/oregon_critters/sampled_ds_300_renamed_train.txt'

with open(new_txt_path, 'w') as f:
    for old_filename in old_train:
        if old_filename in old_to_new_mapping_TRAIN:
            new_filename = old_to_new_mapping_TRAIN[old_filename]

            f.write(new_filename + '\n')
        else:
            f.write(old_filename + '\n')

print("New text file with updated filenames created:", new_txt_path)


New text file with updated filenames created: /home/cara/oregon_critters/sampled_ds_300_renamed_train.txt


In [18]:
# create a new .txt file (VAL)
new_txt_path_val = '/home/cara/oregon_critters/sampled_ds_300_renamed_val.txt'

with open(new_txt_path_val, 'w') as f:
    for old_filename in old_val:
        if old_filename in old_to_new_mapping_VAL:
            new_filename = old_to_new_mapping_VAL[old_filename]

            f.write(new_filename + '\n')
        else:
            f.write(old_filename + '\n')

print("New text file with updated filenames created:", new_txt_path_val)


New text file with updated filenames created: /home/cara/oregon_critters/sampled_ds_300_renamed_val.txt


In [17]:
# create a new .txt file (TEST)
new_txt_path_test = '/home/cara/oregon_critters/sampled_ds_300_renamed_test.txt'

with open(new_txt_path_test, 'w') as f:
    for old_filename in old_test:
        if old_filename in old_to_new_mapping_TEST:
            new_filename = old_to_new_mapping_TEST[old_filename]

            f.write(new_filename + '\n')
        else:
            f.write(old_filename + '\n')

print("New text file with updated filenames created:", new_txt_path_test)


New text file with updated filenames created: /home/cara/oregon_critters/sampled_ds_300_renamed_test.txt
