In [None]:
import pandas as pd
import os
import sys
import warnings
import shutil

# Suppress pandas PerformanceWarning messages (advisory only)
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_csv_from_directory

In [None]:
# Set-up: bucket and directory handed to the project pull helper.
bucket_name = 'ca-climate-index'
aws_dir = '3_fair_data/index_data/'

# Pull the CSVs into the local 'aws_csvs' folder.
# NOTE(review): what search_zipped / print_name toggle is defined in
# scripts.utils.file_helpers -- confirm there before changing them.
pull_csv_from_directory(
    bucket_name,
    aws_dir,
    output_folder='aws_csvs',
    search_zipped=False,
    print_name=False,
)

In [None]:
# Local folder the CSVs were pulled into
output_folder = 'aws_csvs'

# Tally regular files only; sub-directories do not count
file_count = sum(
    os.path.isfile(os.path.join(output_folder, entry_name))
    for entry_name in os.listdir(output_folder)
)

print(f'There are {file_count} files in the {output_folder} folder.')

In [None]:
# Relative path to the metrics metadata table
meta_csv = r'../utils/calcrai_metrics.csv'

# Load the metadata into a DataFrame
metadata_calculate = pd.read_csv(filepath_or_buffer=meta_csv)

In [None]:
# Preview the first rows of the metadata to sanity-check the load
metadata_calculate.head()

In [None]:
# Folder that receives downloaded files the metadata does not list
to_delete_folder = 'to_delete'

# Valid file names come from the metadata's 'Metric file name' column.
# Kept as a list because later cells iterate over it.
metadata_filenames = metadata_calculate['Metric file name'].tolist()

# Set copy of the same names for O(1) membership checks inside the loop
valid_filenames = set(metadata_filenames)

# exist_ok=True keeps the cell re-runnable and avoids the check-then-create race
os.makedirs(to_delete_folder, exist_ok=True)

# Walk everything in the output folder and move the unmatched files out
for file_name in os.listdir(output_folder):
    file_path = os.path.join(output_folder, file_name)

    # Only regular files are moved; sub-directories are left in place
    if os.path.isfile(file_path) and file_name not in valid_filenames:
        shutil.move(file_path, os.path.join(to_delete_folder, file_name))

print(f"Files that don't match the metadata have been moved to {to_delete_folder}.")

In [None]:
# Folder holding the files that did not match the metadata
delete_folder = 'to_delete'

# Count regular files only (sub-directories are ignored)
file_count = len([file for file in os.listdir(delete_folder) if os.path.isfile(os.path.join(delete_folder, file))])

# Report against delete_folder -- the folder that was actually counted.
# (Previously the message named output_folder, mislabelling the count.)
print(f'There are {file_count} files in the {delete_folder} folder.')

In [None]:
# Re-count the output folder (same folder name as used when pulling the CSVs)
output_folder = 'aws_csvs'

# Build each entry's full path, keep those that are regular files, and count them
file_count = len(
    list(
        filter(
            os.path.isfile,
            (os.path.join(output_folder, entry_name) for entry_name in os.listdir(output_folder)),
        )
    )
)

print(f'There are {file_count} files in the {output_folder} folder.')

In [None]:
# Everything currently present in the output folder (entry names, not full paths)
remaining_files = os.listdir(output_folder)

# Set copy for O(1) lookups; avoids rescanning the list for every metadata entry
remaining_lookup = set(remaining_files)

# Metadata file names that have no matching entry in the output folder
missing_files = [file_name for file_name in metadata_filenames if file_name not in remaining_lookup]

# Metadata rows corresponding to those missing file names
missing_rows = metadata_calculate[metadata_calculate['Metric file name'].isin(missing_files)]

# Show the missing rows; the message names the real variable and the folder actually checked
print(f"These are the rows in 'metadata_calculate' that don't have a corresponding file in '{output_folder}':")
display(missing_rows)

# Persist the missing rows for follow-up (overwrites any existing file of this name)
missing_rows.to_csv('missing_files_in_metadata.csv', index=False)