# Fix timestamps to be in the right timezone

The fix_timestamps.py script renames the raw audio, but there are several results files that also need updating for each dataset:
- Inference CSVs
- Validation CSVs
- raw_file_list CSVs
We do not adjust the labelled data in the agile outputs, as it does not matter whether the timestamps there are correct. If this ever changes, this notebook could be extended to update that data as well.

In [1]:
import os
import pandas as pd
import logging
from datetime import datetime, timedelta

logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

In [2]:
# Per-country settings. "offset" is the number of hours that correct_timestamp()
# SUBTRACTS from the timestamp embedded in each filename, so a negative value
# moves the time forward and a positive value moves it back.
COUNTRY_CONFIG = {
  "australia": {"offset": -10},
  "kenya": {"offset": -3},
  "maldives": {"offset": -4},
  "mexico": {"offset": 7},
}

# Maps the lower-cased filename prefix (the text before the first "_") to the
# matching COUNTRY_CONFIG key.
SHORT_CODES = {
  "aus": "australia",
  "ken": "kenya",
  "mal": "maldives",
  "mex": "mexico",
}

# Root directory under which "marrs_acoustics/data/output_dir_<country>" is
# looked up. Fail fast here so later cells never build paths from None.
BASE_DIR = os.getenv("BASE_DIR")
if not BASE_DIR:
  raise ValueError("BASE_DIR environment variable is not set.")

### Fix raw_file_list

In [None]:
def correct_timestamp(filename: str) -> str:
  """
  Return `filename` with its embedded date/time shifted by the country offset.

  The name (extension excluded) must split on "_" into exactly four parts,
  [prefix, site_key, yyyymmdd, hhmmss], e.g. 'ken_H2_20230301_031200.WAV'.
  The lower-cased prefix is looked up in SHORT_CODES to find the country, and
  COUNTRY_CONFIG[country]["offset"] hours are SUBTRACTED from the parsed
  timestamp (a negative offset therefore moves the time forward). The date
  part rolls over when the shift crosses midnight; prefix, site key and
  extension are kept exactly as given.

  The input is returned unchanged when it:
    - is not a string or path-like object (e.g. the NaN pandas yields for an
      empty CSV cell),
    - does not have exactly four "_"-separated parts,
    - has a prefix that is not in SHORT_CODES, or
    - has a date/time that cannot be parsed.

  NOTE: the shift is applied on every call, so passing an already-corrected
  name shifts it a second time.
  """
  # This function is mapped over DataFrame columns, where missing values are
  # floats/None; pass them through instead of raising TypeError in splitext.
  if not isinstance(filename, (str, os.PathLike)):
    return filename

  name_no_ext, ext = os.path.splitext(filename)
  parts = name_no_ext.split("_")

  # Expecting [prefix, site_key, yyyymmdd, hhmmss]
  if len(parts) != 4:
    return filename

  prefix, site_key, date_str, time_str = parts
  country = SHORT_CODES.get(prefix.lower())
  if not country:
    return filename

  offset = COUNTRY_CONFIG[country]["offset"]

  try:
    dt_original = datetime.strptime(f"{date_str}{time_str}", "%Y%m%d%H%M%S")
  except ValueError:
    return filename

  # Subtracting the offset is the sign convention used by COUNTRY_CONFIG.
  dt_new = dt_original - timedelta(hours=offset)
  new_date_str = dt_new.strftime("%Y%m%d")
  new_time_str = dt_new.strftime("%H%M%S")

  return f"{prefix}_{site_key}_{new_date_str}_{new_time_str}{ext}"

def loop_through_files() -> None:
  """
  Rewrite the 'filename' column of every country's raw_file_list.csv.

  For each of australia, kenya, maldives and mexico the file
  BASE_DIR/marrs_acoustics/data/output_dir_<country>/raw_file_list.csv is
  read, each 'filename' value is passed through correct_timestamp(), and the
  CSV is written back to the same path without the index. A country whose CSV
  is missing, or whose CSV has no 'filename' column, is skipped with a warning.
  """
  data_root = os.path.join(BASE_DIR, "marrs_acoustics", "data")

  for country in ("australia", "kenya", "maldives", "mexico"):
    raw_list_csv = os.path.join(data_root, f"output_dir_{country}", "raw_file_list.csv")

    if os.path.isfile(raw_list_csv):
      file_table = pd.read_csv(raw_list_csv)

      if "filename" in file_table.columns:
        # Swap every filename for its time-shifted version, then save over
        # the file that was just read.
        shifted_names = file_table["filename"].apply(correct_timestamp)
        file_table["filename"] = shifted_names
        file_table.to_csv(raw_list_csv, index=False)
        logging.info(f"Timestamps corrected for {country}. Updated file: {raw_list_csv}")
      else:
        logging.warning(f"No 'filename' column in CSV for {country}")
    else:
      logging.warning(f"CSV not found for {country}: {raw_list_csv}")

# NOTE: every run shifts the filenames again and overwrites the CSVs in place,
# so run this cell only once per dataset.
loop_through_files()


INFO: Timestamps corrected for australia. Updated file: /home/bwilliams/ucl_projects/marrs_acoustics/data/output_dir_australia/raw_file_list.csv
INFO: Timestamps corrected for kenya. Updated file: /home/bwilliams/ucl_projects/marrs_acoustics/data/output_dir_kenya/raw_file_list.csv
INFO: Timestamps corrected for maldives. Updated file: /home/bwilliams/ucl_projects/marrs_acoustics/data/output_dir_maldives/raw_file_list.csv
INFO: Timestamps corrected for mexico. Updated file: /home/bwilliams/ucl_projects/marrs_acoustics/data/output_dir_mexico/raw_file_list.csv


### Fix inference CSVs

In [7]:

def correct_timestamp(filename: str) -> str:
  """
  Adjusts the date/time in 'filename' (e.g. 'ken_H2_20230301_031200.WAV')
  according to COUNTRY_CONFIG. Returns the corrected filename.

  The name (extension excluded) must split on "_" into exactly four parts,
  [prefix, site_key, yyyymmdd, hhmmss]. The lower-cased prefix is looked up in
  SHORT_CODES to find the country, and COUNTRY_CONFIG[country]["offset"] hours
  are SUBTRACTED from the parsed timestamp (a negative offset therefore moves
  the time forward). The date part rolls over when the shift crosses midnight.

  The input is returned unchanged when it:
    - is not a string or path-like object (e.g. the NaN pandas yields for an
      empty CSV cell),
    - does not have exactly four "_"-separated parts,
    - has a prefix that is not in SHORT_CODES, or
    - has a date/time that cannot be parsed.

  NOTE: the shift is applied on every call, so passing an already-corrected
  name shifts it a second time.
  """
  # Missing values arrive as floats/None; pass them through instead of
  # raising TypeError in os.path.splitext.
  if not isinstance(filename, (str, os.PathLike)):
    return filename

  name_no_ext, ext = os.path.splitext(filename)
  parts = name_no_ext.split("_")

  # Expecting [prefix, site_key, yyyymmdd, hhmmss]
  if len(parts) != 4:
    return filename

  prefix, site_key, date_str, time_str = parts
  country = SHORT_CODES.get(prefix.lower())
  if not country:
    return filename

  offset = COUNTRY_CONFIG[country]["offset"]

  try:
    dt_original = datetime.strptime(f"{date_str}{time_str}", "%Y%m%d%H%M%S")
  except ValueError:
    return filename

  # Subtracting the offset is the sign convention used by COUNTRY_CONFIG.
  dt_new = dt_original - timedelta(hours=offset)
  new_date_str = dt_new.strftime("%Y%m%d")
  new_time_str = dt_new.strftime("%H%M%S")
  return f"{prefix}_{site_key}_{new_date_str}_{new_time_str}{ext}"

def fix_inference_files_for_country(country: str) -> None:
  """
  Shift the timestamps in every '<sound>_inference.csv' for 'country', in place.

  Looks in BASE_DIR/marrs_acoustics/data/output_dir_<country>/agile_outputs
  and treats each sub-directory as one sound. When a sub-directory contains
  '<sub-directory name>_inference.csv' with a 'filename' column, every value
  in that column is time-shifted with correct_timestamp() and the CSV is
  overwritten (index not written). Values of the form '<dir>/<name>' keep
  their directory part; only the final path component is corrected.

  A sub-directory is recorded as skipped when the CSV is absent or lacks a
  'filename' column (the latter also logs a warning). The processed and
  skipped folder names are logged at the end. If the agile_outputs directory
  itself does not exist, a warning is logged and nothing is done.

  NOTE: every call shifts the timestamps again, so run it once per dataset.
  """
  agile_outputs_dir = os.path.join(
    BASE_DIR,
    "marrs_acoustics",
    "data",
    f"output_dir_{country}",
    "agile_outputs"
  )

  if not os.path.isdir(agile_outputs_dir):
    logging.warning(f"No agile_outputs directory found for {country}: {agile_outputs_dir}")
    return

  def parse_and_correct(file_path: str) -> str:
    # Missing cells are read as NaN (a float); leave them untouched rather
    # than failing on the "/" handling below.
    if not isinstance(file_path, str):
      return file_path
    # Split on the LAST "/" so that only the final path component, which
    # carries the timestamp, is handed to correct_timestamp(). Without a "/"
    # both 'directory' and 'sep' are empty and the whole value is corrected.
    directory, sep, name = file_path.rpartition("/")
    return f"{directory}{sep}{correct_timestamp(name)}"

  processed_folders = []
  skipped_folders = []

  # Gather all subfolders in 'agile_outputs_dir' (one for each sound)
  for sound_folder in os.listdir(agile_outputs_dir):
    folder_path = os.path.join(agile_outputs_dir, sound_folder)
    if not os.path.isdir(folder_path):
      continue

    csv_path = os.path.join(folder_path, f"{sound_folder}_inference.csv")
    if not os.path.isfile(csv_path):
      skipped_folders.append(sound_folder)
      continue

    df = pd.read_csv(csv_path)
    if "filename" not in df.columns:
      logging.warning(f"No 'filename' column in {csv_path}")
      skipped_folders.append(sound_folder)
      continue

    # Fix timestamps in the 'filename' column and overwrite the original CSV
    df["filename"] = df["filename"].apply(parse_and_correct)
    df.to_csv(csv_path, index=False)
    processed_folders.append(sound_folder)

  logging.info(f"{country} - Processed folders: {processed_folders}")
  logging.info(f"{country} - Skipped folders: {skipped_folders}")

# Runs the inference-CSV fix for every country (comment out any to skip).
# NOTE: each run shifts the timestamps again and overwrites the CSVs in place,
# so run this cell only once per dataset.
fix_inference_files_for_country("kenya")
fix_inference_files_for_country("maldives")
fix_inference_files_for_country("australia")
fix_inference_files_for_country("mexico")


INFO: kenya - Processed folders: ['snaps', 'pulses', 'oink', 'growl', 'croak']
INFO: kenya - Skipped folders: ['laugh', 'scrape_abondoned_lowlogit_notgrazing', 'knock_abond_couldntunlearnsnaps', 'rasberry_abondoned_nofinds']
INFO: maldives - Processed folders: ['snaps', 'grunt', 'scrape', 'growl', 'pop', 'raspberry', 'croak']
INFO: maldives - Skipped folders: ['pulses_abandoned']
INFO: australia - Processed folders: ['snaps', 'oink', 'scrape', 'chirrup', 'pulse_train', 'huff', 'creek']
INFO: australia - Skipped folders: ['growl', 'pop']
INFO: mexico - Processed folders: ['snaps', 'pulses', 'laugh', 'scrape', 'pop']
INFO: mexico - Skipped folders: ['moan', 'drum', 'grunt', 'croak', 'rumble']


### Fix validation logs

In [9]:

def correct_timestamp(filename: str) -> str:
  """
  Adjusts the date/time in 'filename' (e.g. 'ken_H2_20230301_031200.WAV')
  according to COUNTRY_CONFIG. Returns the corrected filename.

  The name (extension excluded) must split on "_" into exactly four parts,
  [prefix, site_key, yyyymmdd, hhmmss]. The lower-cased prefix is looked up in
  SHORT_CODES to find the country, and COUNTRY_CONFIG[country]["offset"] hours
  are SUBTRACTED from the parsed timestamp (a negative offset therefore moves
  the time forward). The date part rolls over when the shift crosses midnight.

  The input is returned unchanged when it:
    - is not a string or path-like object (e.g. the NaN pandas yields for an
      empty CSV cell),
    - does not have exactly four "_"-separated parts,
    - has a prefix that is not in SHORT_CODES, or
    - has a date/time that cannot be parsed.

  NOTE: the shift is applied on every call, so passing an already-corrected
  name shifts it a second time.
  """
  # Missing values arrive as floats/None; pass them through instead of
  # raising TypeError in os.path.splitext.
  if not isinstance(filename, (str, os.PathLike)):
    return filename

  name_no_ext, ext = os.path.splitext(filename)
  parts = name_no_ext.split("_")

  # Expecting [prefix, site_key, yyyymmdd, hhmmss]
  if len(parts) != 4:
    return filename

  prefix, site_key, date_str, time_str = parts
  country = SHORT_CODES.get(prefix.lower())
  if not country:
    return filename

  offset = COUNTRY_CONFIG[country]["offset"]

  try:
    dt_original = datetime.strptime(f"{date_str}{time_str}", "%Y%m%d%H%M%S")
  except ValueError:
    return filename

  # Subtracting the offset is the sign convention used by COUNTRY_CONFIG.
  dt_new = dt_original - timedelta(hours=offset)
  new_date_str = dt_new.strftime("%Y%m%d")
  new_time_str = dt_new.strftime("%H%M%S")
  return f"{prefix}_{site_key}_{new_date_str}_{new_time_str}{ext}"

def fix_inference_files_for_country(country: str) -> None:
  """
  Shift the timestamps in every 'validation_<sound>.csv' for 'country', in place.

  NOTE: despite its name, this definition handles the VALIDATION CSVs. It
  reuses the name of the inference-CSV function defined in an earlier cell
  and replaces it once this cell has run.

  Looks in BASE_DIR/marrs_acoustics/data/output_dir_<country>/agile_outputs
  and treats each sub-directory as one sound. When a sub-directory contains
  'validation_<sub-directory name>.csv' with a 'filenames' column, every value
  in that column is time-shifted with correct_timestamp() and the CSV is
  overwritten (index not written). Values of the form '<dir>/<name>' keep
  their directory part; only the final path component is corrected.

  A sub-directory is recorded as skipped when the CSV is absent or lacks a
  'filenames' column (the latter also logs a warning). The processed and
  skipped folder names are logged at the end. If the agile_outputs directory
  itself does not exist, a warning is logged and nothing is done.

  NOTE: every call shifts the timestamps again, so run it once per dataset.
  """
  agile_outputs_dir = os.path.join(
    BASE_DIR,
    "marrs_acoustics",
    "data",
    f"output_dir_{country}",
    "agile_outputs"
  )

  if not os.path.isdir(agile_outputs_dir):
    logging.warning(f"No agile_outputs directory found for {country}: {agile_outputs_dir}")
    return

  def parse_and_correct(file_path: str) -> str:
    # Missing cells are read as NaN (a float); leave them untouched rather
    # than failing on the "/" handling below.
    if not isinstance(file_path, str):
      return file_path
    # Split on the LAST "/" so that only the final path component, which
    # carries the timestamp, is handed to correct_timestamp(). Without a "/"
    # both 'directory' and 'sep' are empty and the whole value is corrected.
    directory, sep, name = file_path.rpartition("/")
    return f"{directory}{sep}{correct_timestamp(name)}"

  processed_folders = []
  skipped_folders = []

  # Gather all subfolders in 'agile_outputs_dir' (one for each sound)
  for sound_folder in os.listdir(agile_outputs_dir):
    folder_path = os.path.join(agile_outputs_dir, sound_folder)
    if not os.path.isdir(folder_path):
      continue

    csv_path = os.path.join(folder_path, f"validation_{sound_folder}.csv")
    if not os.path.isfile(csv_path):
      skipped_folders.append(sound_folder)
      continue

    df = pd.read_csv(csv_path)
    if "filenames" not in df.columns:
      logging.warning(f"No 'filenames' column in {csv_path}")
      skipped_folders.append(sound_folder)
      continue

    # Fix timestamps in the 'filenames' column and overwrite the original CSV
    df["filenames"] = df["filenames"].apply(parse_and_correct)
    df.to_csv(csv_path, index=False)
    processed_folders.append(sound_folder)

  logging.info(f"{country} - Processed folders: {processed_folders}")
  logging.info(f"{country} - Skipped folders: {skipped_folders}")

# Runs the validation-CSV fix for every country (comment out any to skip).
# The function name is reused from the inference cell; the definition in this
# cell is the one that processes 'validation_<sound>.csv'.
# NOTE: each run shifts the timestamps again and overwrites the CSVs in place,
# so run this cell only once per dataset.
fix_inference_files_for_country("kenya")
fix_inference_files_for_country("maldives")
fix_inference_files_for_country("australia")
fix_inference_files_for_country("mexico")


INFO: kenya - Processed folders: ['snaps', 'pulses', 'oink', 'growl', 'croak']
INFO: kenya - Skipped folders: ['laugh', 'scrape_abondoned_lowlogit_notgrazing', 'knock_abond_couldntunlearnsnaps', 'rasberry_abondoned_nofinds']


INFO: maldives - Processed folders: ['snaps', 'grunt', 'scrape', 'growl', 'pop', 'raspberry', 'croak']
INFO: maldives - Skipped folders: ['pulses_abandoned']
INFO: australia - Processed folders: ['snaps', 'oink', 'scrape', 'chirrup', 'pulse_train', 'huff', 'creek']
INFO: australia - Skipped folders: ['growl', 'pop']
INFO: mexico - Processed folders: ['snaps', 'pulses', 'laugh', 'scrape', 'pop']
INFO: mexico - Skipped folders: ['moan', 'drum', 'grunt', 'croak', 'rumble']
