# 📝 Filename Renaming Template

This notebook creates a mapping CSV of original filenames to new filenames without processing the files.

## 📊 Output CSV Columns
- `original_s3_path` - Full S3 path to original file
- `original_filename` - Original filename only
- `new_filename` - Proposed new filename
- `category` - File category
- `file_size_gb` - File size in GB
- `nodata_value` - Nodata value detected in the file
- `status` - Validation status
- `output_s3_path` - Proposed destination path

---

## 📋 Step 1: Basic Configuration

Set your event details and S3 paths:

In [11]:
# ----------------------------------------
# NOTEBOOK INPUTS
# ----------------------------------------

# Fixed S3 locations (DO NOT CHANGE)
BUCKET = 'nasa-disasters'                   # S3 bucket that is read from and written to
GEOTIFF_DIR = 'drcs_activations'            # Prefix where the raw GeoTIFF files currently live
DESTINATION_BASE = 'drcs_activations_new'   # Prefix under which the COGs are saved

# Event being processed
EVENT_NAME = '202510_Flood_AK'
# Sub-directory within EVENT_NAME (RGB, trueColor, SWIR, etc.). Can be left blank.
SUB_PRODUCT_NAME = 'sentinel2'

# S3 prefix that is scanned for input files
SOURCE_PATH = '/'.join((GEOTIFF_DIR, EVENT_NAME, SUB_PRODUCT_NAME))

# CSV export settings
OUTPUT_DIR = 'file-mapping'   # Local directory that receives the CSV
SAVE_CSV = True               # Set to False to skip writing the CSV

print(f"Event: {EVENT_NAME}")
print(f"Source: s3://{BUCKET}/{SOURCE_PATH}")

Event: 202510_Flood_AK
Source: s3://nasa-disasters/drcs_activations/202510_Flood_AK/sentinel2


## 🔍 Step 2: Connect to S3 and List Files

Connect to S3 and list the available files with their sizes and nodata values (each file is downloaded temporarily so its nodata value can be read):

In [None]:
# Import necessary modules
import sys
import os
import boto3
import pandas as pd
from pathlib import Path
from datetime import datetime
import tempfile
import rasterio

# Add parent directory to path for importing functions
sys.path.insert(0, str(Path('..').resolve()))
from core.s3_operations import list_s3_files, get_file_size_from_s3

# Initialize S3 client
print("Connecting to S3...")

# The temporary-credential settings are optional. No earlier cell in this
# notebook defines them, so referencing the bare names would raise NameError
# on a fresh kernel. Look them up in the notebook namespace instead and fall
# back to the default credential path when they are absent.
_notebook_vars = globals()
_use_temp_credentials = bool(_notebook_vars.get('USE_TEMP_CREDENTIALS', False))
_temp_access_key_id = _notebook_vars.get('AWS_ACCESS_KEY_ID')

if _use_temp_credentials and _temp_access_key_id:
    # Use temporary credentials supplied by the user in an earlier cell
    s3_client = boto3.client(
        's3',
        aws_access_key_id=_temp_access_key_id,
        aws_secret_access_key=_notebook_vars.get('AWS_SECRET_ACCESS_KEY'),
        aws_session_token=_notebook_vars.get('AWS_SESSION_TOKEN')
    )
    print("Connected to S3 using temporary credentials\n")
else:
    # Use default credentials (from environment or external ID)
    from core.s3_operations import initialize_s3_client
    s3_client, _ = initialize_s3_client(bucket_name=BUCKET, verbose=False)
    print("Connected to S3 using default credentials\n")


def _read_nodata_from_s3(client, bucket, key):
    """Download one S3 object to a temporary file and return its raster nodata value.

    Args:
        client: S3 client exposing ``download_file(bucket, key, filename)``.
        bucket: Name of the S3 bucket.
        key: Object key of the raster file.

    Returns:
        The ``nodata`` attribute of the dataset opened by rasterio
        (``None`` when the file declares no nodata value).

    Raises:
        Any exception raised by the download or by ``rasterio.open``; the
        temporary file is removed in every case.
    """
    suffix = os.path.splitext(key)[1] or '.tif'
    # delete=False: only the name is needed here; the file is closed right
    # away so it can be written by download_file and reopened by rasterio.
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
        temp_file = tmp.name
    try:
        client.download_file(bucket, key, temp_file)
        with rasterio.open(temp_file) as src:
            return src.nodata
    finally:
        # Always clean up, including when the download or the read failed
        if os.path.exists(temp_file):
            os.unlink(temp_file)


# List all TIF files
print(f"Files in s3://{BUCKET}/{SOURCE_PATH}:")
print("="*80)

files = list_s3_files(s3_client, BUCKET, SOURCE_PATH, suffix='.tif')

if files:
    print(f"Found {len(files)} .tif files\n")

    # Collect one record per file, then build the DataFrame once
    file_data = []
    for idx, file_path in enumerate(files, 1):
        filename = os.path.basename(file_path)
        print(f"[{idx}/{len(files)}] Reading metadata for {filename}...", end=" ")

        # Size lookup is best-effort: report the failure instead of hiding it
        try:
            size_gb = get_file_size_from_s3(s3_client, BUCKET, file_path)
        except Exception as exc:
            size_gb = 0.0
            print(f"(size unavailable: {type(exc).__name__}: {exc})", end=" ")

        # Nodata lookup is best-effort too; None marks "unknown or not set"
        nodata_value = None
        try:
            nodata_value = _read_nodata_from_s3(s3_client, BUCKET, file_path)
            print(f"(nodata={nodata_value})")
        except Exception as exc:
            print(f"(nodata=?) [{type(exc).__name__}: {exc}]")

        file_data.append({
            'original_s3_path': file_path,
            'original_filename': filename,
            'file_size_gb': size_gb,
            'nodata_value': nodata_value
        })

    files_df = pd.DataFrame(file_data)

    # Display summary
    print(f"\nTotal files: {len(files)}")
    print(f"Total size: {files_df['file_size_gb'].sum():.2f} GB")

    # Show nodata value distribution; dropna=False keeps files whose nodata
    # value is missing in the tally instead of silently omitting them
    nodata_counts = files_df['nodata_value'].value_counts(dropna=False)
    print(f"\nNodata values found:")
    for nodata, count in nodata_counts.items():
        print(f"   {nodata}: {count} files")

    # Display ALL files
    print(f"\nComplete file list:")
    print("-" * 80)
    for i, row in files_df.iterrows():
        print(f"{i+1:3}. {row['original_filename']:<60} ({row['file_size_gb']:.2f} GB, nodata={row['nodata_value']})")
else:
    print("⚠️ No .tif files found in the specified path.")
    print("   Check your SOURCE_PATH configuration.")
    files_df = pd.DataFrame()

## 🏷️ Step 3: Define Filename Transformation Functions

Based on the files you see above, configure:
1. **Categorization patterns** - Regex patterns to identify file types
2. **Filename functions** - How to transform filenames
3. **Output directories** - Where each category should be saved

In [13]:
# ========================================
# CATEGORIZATION AND FILENAME TRANSFORMATION
# ========================================

import re

#Define helper function to extract dates from filenames
def extract_date_from_filename(filename):
    """Return the first valid calendar date found in ``filename`` as 'YYYY-MM-DD'.

    Compact ``YYYYMMDD`` runs are tried first, then dashed ``YYYY-MM-DD``
    tokens. A candidate is accepted only if it is a real calendar date, so an
    arbitrary 8-digit identifier (e.g. ``12345678``) is not reported as a date
    and cannot shadow a genuine date elsewhere in the name.

    Args:
        filename: File name (or stem) to search.

    Returns:
        The date as a 'YYYY-MM-DD' string, or None when no valid date is found.
    """
    def _is_calendar_date(year, month, day):
        # datetime() raises ValueError for impossible dates (month 13, day 32, ...)
        try:
            datetime(int(year), int(month), int(day))
        except ValueError:
            return False
        return True

    # Try YYYYMMDD format (\d{8} finds non-overlapping runs of 8 digits)
    for digits in re.findall(r'\d{8}', filename):
        year, month, day = digits[0:4], digits[4:6], digits[6:8]
        if _is_calendar_date(year, month, day):
            return f"{year}-{month}-{day}"

    # Try YYYY-MM-DD format
    for year, month, day in re.findall(r'(\d{4})-(\d{2})-(\d{2})', filename):
        if _is_calendar_date(year, month, day):
            return f"{year}-{month}-{day}"

    return None

#Define filename transformation functions for each category
def create_TC_SWIR_NC_CIR_filename(original_path, event_name):
    """Create the new filename for trueColor, shortwaveIR, naturalColor and colorInfrared products.

    The result has the form ``{event_name}_{stem}_{date}_day.tif``, where
    ``stem`` is the original file stem with the extracted date removed. When
    no date is found the stem is kept unchanged and no date is inserted.

    Only the date returned by ``extract_date_from_filename`` is removed from
    the stem. Other digit runs (IDs, times) are preserved so that distinct
    source files do not collapse onto the same new name.

    Args:
        original_path: Path or S3 key of the original file.
        event_name: Event name used as the filename prefix.

    Returns:
        The new filename, always ending in ``_day.tif``.
    """
    filename = os.path.basename(original_path)
    stem = os.path.splitext(filename)[0]
    date = extract_date_from_filename(stem)

    if not date:
        return f"{event_name}_{stem}_day.tif"

    # Match the extracted date in compact or dashed form, plus the
    # underscore that precedes it, if any.
    compact_date = date.replace('-', '')
    date_token = rf"_?(?:{re.escape(compact_date)}|{re.escape(date)})"
    # strip('_') avoids a doubled underscore when the date led the stem
    stem_clean = re.sub(date_token, '', stem).strip('_')

    if stem_clean:
        return f"{event_name}_{stem_clean}_{date}_day.tif"
    # The stem consisted of the date only
    return f"{event_name}_{date}_day.tif"



In [16]:
# Configure categorization patterns (REQUIRED)
# These regex patterns determine which files belong to which category.
# Every entry ends with a comma so that new entries can be appended safely.
CATEGORIZATION_PATTERNS = {
    'trueColor': r'trueColor|truecolor|true_color',
    'colorInfrared': r'colorInfrared|colorIR|color_infrared',
    'naturalColor': r'naturalColor|naturalcolor|natural_color',
    'shortwaveIR': r'shortwaveIR|shortwaveinfrared|shortwaveInfrared',
    # Add patterns for ALL file types you want to process
    # Files not matching any pattern will be marked as 'uncategorized'
}

# Map categories to filename transformation functions
FILENAME_CREATORS = {
    'trueColor': create_TC_SWIR_NC_CIR_filename,
    'colorInfrared': create_TC_SWIR_NC_CIR_filename,
    'naturalColor': create_TC_SWIR_NC_CIR_filename,
    'shortwaveIR': create_TC_SWIR_NC_CIR_filename,
    # Must have an entry for each category in CATEGORIZATION_PATTERNS
}

# Specify output directories for each category
OUTPUT_DIRS = {
    'trueColor': 'Sentinel-2/trueColor',
    'colorInfrared': 'Sentinel-2/colorIR',
    'naturalColor': 'Sentinel-2/naturalColor',
    'shortwaveIR': 'Sentinel-2/shortwaveIR',
    # Must have an entry for each category in CATEGORIZATION_PATTERNS
}

print("Filename transformation functions defined")
print(f"\nCategories configured: {len(CATEGORIZATION_PATTERNS)}")
for category in CATEGORIZATION_PATTERNS:
    print(f"   • {category}")

# Check the "must have an entry for each category" rule stated above.
# This only warns: the mapping is still built, but the gap is made visible.
for mapping_name, mapping in (('FILENAME_CREATORS', FILENAME_CREATORS),
                              ('OUTPUT_DIRS', OUTPUT_DIRS)):
    missing_categories = [c for c in CATEGORIZATION_PATTERNS if c not in mapping]
    if missing_categories:
        print(f"\n⚠️  {mapping_name} has no entry for: {', '.join(missing_categories)}")

Filename transformation functions defined

Categories configured: 4
   • trueColor
   • colorInfrared
   • naturalColor
   • shortwaveIR


## 🔍 Step 4: Preview Transformations

Apply the transformation functions and preview the mapping:

In [17]:
if not files_df.empty:
    print("Applying filename transformations...\n")

    def categorize_file(filename):
        """Return the first category whose pattern matches ``filename`` (case-insensitive).

        Returns 'uncategorized' when no pattern in CATEGORIZATION_PATTERNS matches.
        """
        for category, pattern in CATEGORIZATION_PATTERNS.items():
            if re.search(pattern, filename, re.IGNORECASE):
                return category
        return 'uncategorized'

    def transform_filename(row):
        """Return the new filename for a row using its category's creator function.

        Uncategorized files, and categories without an entry in
        FILENAME_CREATORS, keep their original filename.
        """
        category = row['category']
        original_path = row['original_s3_path']

        if category == 'uncategorized':
            return os.path.basename(original_path)  # Keep original

        if category in FILENAME_CREATORS:
            return FILENAME_CREATORS[category](original_path, EVENT_NAME)

        return os.path.basename(original_path)

    def get_output_path(row):
        """Return the destination S3 key for a row.

        Uses OUTPUT_DIRS for known categories, an 'uncategorized' folder for
        uncategorized files, and the category name itself as a fallback.
        """
        category = row['category']
        new_filename = row['new_filename']

        if category == 'uncategorized':
            return f"{DESTINATION_BASE}/uncategorized/{new_filename}"

        if category in OUTPUT_DIRS:
            return f"{DESTINATION_BASE}/{OUTPUT_DIRS[category]}/{new_filename}"

        return f"{DESTINATION_BASE}/{category}/{new_filename}"

    # Apply transformations
    files_df['category'] = files_df['original_filename'].apply(categorize_file)
    files_df['new_filename'] = files_df.apply(transform_filename, axis=1)
    files_df['output_s3_path'] = files_df.apply(get_output_path, axis=1)
    files_df['status'] = 'valid'

    # Flag uncategorized files
    is_uncategorized = files_df['category'] == 'uncategorized'
    uncategorized = files_df[is_uncategorized]
    files_df.loc[is_uncategorized, 'status'] = 'uncategorized'

    # Flag destination collisions: two sources mapped to the same output key
    # would overwrite each other when the mapping is applied.
    is_duplicate = files_df.duplicated(subset='output_s3_path', keep=False)
    duplicates = files_df[is_duplicate]
    # Uncategorized rows keep their status; only otherwise-valid rows are relabelled
    files_df.loc[is_duplicate & ~is_uncategorized, 'status'] = 'duplicate'

    # Display preview
    print("TRANSFORMATION PREVIEW")
    print("="*80)
    print(f"\nTotal files: {len(files_df)}")
    print(f"Categorized: {len(files_df[files_df['category'] != 'uncategorized'])}")
    print(f"Uncategorized: {len(uncategorized)}")

    # Show category breakdown
    print("\nFiles by category:")
    category_counts = files_df['category'].value_counts()
    for category, count in category_counts.items():
        print(f"   • {category}: {count} files")

    # Show sample transformations
    print("\nTransformation information:")
    print("-" * 80)
    for i, row in files_df.iterrows():
        print(f"\n{i+1}. Original: {row['original_filename']}")
        print(f"   Category: {row['category']}")
        print(f"   New name: {row['new_filename']}")
        print(f"   Output:   s3://{BUCKET}/{row['output_s3_path']}")

    if len(uncategorized) > 0:
        print("\n⚠️  UNCATEGORIZED FILES:")
        print("-" * 80)
        for i, row in uncategorized.iterrows():
            print(f"   • {row['original_filename']}")
        print("\nAdd patterns to CATEGORIZATION_PATTERNS to categorize these files")

    if len(duplicates) > 0:
        print("\n⚠️  OUTPUT PATH COLLISIONS:")
        print("-" * 80)
        for output_key, group in duplicates.groupby('output_s3_path'):
            print(f"   • {output_key}")
            for original_name in group['original_filename']:
                print(f"        <- {original_name}")
        print("\nAdjust the filename functions so every file gets a unique output path")

    print("\n" + "="*80)
else:
    print("⚠️ No files to process. Check Step 2.")

Applying filename transformations...

TRANSFORMATION PREVIEW

Total files: 123
Categorized: 123
Uncategorized: 0

Files by category:
   • colorInfrared: 41 files
   • shortwaveIR: 41 files
   • trueColor: 41 files

Transformation information:
--------------------------------------------------------------------------------

1. Original: S2B_MSIL2A_colorInfrared_20250913_222529_T03VVG.tif
   Category: colorInfrared
   New name: 202510_Flood_AK_S2B_MSIL2A_colorInfrared_222529_T03VVG_2025-09-13_day.tif
   Output:   s3://nasa-disasters/drcs_activations_new/Sentinel-2/colorIR/202510_Flood_AK_S2B_MSIL2A_colorInfrared_222529_T03VVG_2025-09-13_day.tif

2. Original: S2B_MSIL2A_colorInfrared_20250913_222529_T03VVH.tif
   Category: colorInfrared
   New name: 202510_Flood_AK_S2B_MSIL2A_colorInfrared_222529_T03VVH_2025-09-13_day.tif
   Output:   s3://nasa-disasters/drcs_activations_new/Sentinel-2/colorIR/202510_Flood_AK_S2B_MSIL2A_colorInfrared_222529_T03VVH_2025-09-13_day.tif

3. Original: S2

## 💾 Step 5: Export Mapping to CSV

Save the filename mapping to a CSV file:

In [None]:
if not files_df.empty and SAVE_CSV:
    # Columns written to the CSV, ordered for readability
    column_order = [
        'original_filename',
        'new_filename',
        'category',
        'file_size_gb',
        'nodata_value',
        'status',
        'original_s3_path',
        'output_s3_path'
    ]

    # The category/new_filename/status/output columns only exist after Step 4
    missing_columns = [col for col in column_order if col not in files_df.columns]

    if missing_columns:
        print(f"⚠️ Cannot export: missing column(s) {', '.join(missing_columns)}.")
        print("   Run Step 4 (Preview Transformations) before exporting.")
    else:
        # Create output directory
        output_path = Path(OUTPUT_DIR) / EVENT_NAME
        output_path.mkdir(parents=True, exist_ok=True)

        # SUB_PRODUCT_NAME may be blank or contain sub-directories; keep the
        # CSV name a single path component with no dangling separator.
        sub_product_label = SUB_PRODUCT_NAME.strip('/').replace('/', '-')
        if sub_product_label:
            csv_filename = f"{EVENT_NAME}-{sub_product_label}.csv"
        else:
            csv_filename = f"{EVENT_NAME}.csv"
        csv_path = output_path / csv_filename

        # Save to CSV
        files_df[column_order].to_csv(csv_path, index=False)

        valid_count = len(files_df[files_df['status'] == 'valid'])
        uncategorized_count = len(files_df[files_df['status'] == 'uncategorized'])
        other_count = len(files_df) - valid_count - uncategorized_count

        print(f"EXPORT COMPLETE. Saved mapping to: {csv_path}")
        print("="*80)
        print(f"   Total records: {len(files_df)}")
        print(f"   Total size:    {files_df['file_size_gb'].sum():.2f} GB")
        print(f"   Valid:         {valid_count}")
        print(f"   Uncategorized: {uncategorized_count}")
        if other_count:
            # Rows carrying any other status value are counted here
            print(f"   Other status:  {other_count}")

        # Show nodata value distribution; dropna=False keeps files whose
        # nodata value is missing in the tally
        nodata_counts = files_df['nodata_value'].value_counts(dropna=False)
        print(f"\n   Nodata values:")
        for nodata, count in nodata_counts.items():
            print(f"      {nodata}: {count} files")

        print("\nYou can now use this CSV in a separate script to perform actual file renaming/copying")

        # Display the DataFrame
        print("\nFull mapping table:")
        print("="*80)
        display(files_df[column_order])

elif files_df.empty:
    print("⚠️ No files to export. Check previous steps.")
else:
    print("ℹ️  CSV export disabled (SAVE_CSV = False)")