In [None]:
import pandas as pd
import numpy as np
import os
import zipfile
import csv
import shutil
import functools
from concurrent.futures import ThreadPoolExecutor, as_completed

In [None]:
"""
Extracts the dataset published at https://figshare.com/articles/dataset/Sounds_of_the_Eleutherodactylus_frog_community_from_Puerto_Rico/806302?file=3104183
Usage: unzip the downloaded archive, then pass the path of the resulting root folder.
Every zip file found directly inside that root folder is then extracted.
"""


def extract_zip_file(file_path, extract_to):
    """
    Unpack one zip archive into a target directory.

    Args:
        file_path (str): Location of the zip archive to read.
        extract_to (str): Directory that receives every member of the archive.
    """
    # ZipFile opens in read mode by default; the context manager closes the
    # archive again even if extraction fails part-way through.
    with zipfile.ZipFile(file_path) as archive:
        archive.extractall(path=extract_to)


def extract_zip_files(zip_folder, extract_to):
    """
    Unpack every zip archive found directly inside a folder, using a thread pool.

    Besides the archives, the species/site averages CSV that sits next to them
    is copied into the extraction directory.

    Args:
        zip_folder (str): Folder holding the zip archives and the averages CSV.
        extract_to (str): Directory the archives are unpacked into; it is
            created when it does not exist yet.
    """
    os.makedirs(extract_to, exist_ok=True)

    # Carry the averages table along with the extracted data
    averages_name = "FrequencyRange_by_species_and_site_Averages.csv"
    shutil.copyfile(
        os.path.join(zip_folder, averages_name),
        os.path.join(extract_to, averages_name),
    )

    # Only direct children of zip_folder are considered (no recursion)
    archives = []
    for entry in os.listdir(zip_folder):
        if entry.endswith(".zip"):
            archives.append(os.path.join(zip_folder, entry))

    with ThreadPoolExecutor() as pool:
        # Queue every archive first so the extractions can overlap ...
        jobs = []
        for archive in archives:
            jobs.append(pool.submit(extract_zip_file, archive, extract_to))

        # ... then block on each one; result() re-raises any worker exception
        for job in jobs:
            job.result()

    print("Done.")

In [None]:
# Root folder of the unzipped figshare download. The default is the original
# author's location; set the FROG_DATA_ROOT environment variable to run the
# notebook on another machine without editing this cell.
filepath = os.environ.get("FROG_DATA_ROOT", "/home/edwinc/Downloads/806302")
# Derive the extraction target from the root so the two paths cannot drift apart.
ExtractTo = os.path.join(filepath, "Extracted")
extract_zip_files(filepath, ExtractTo)

In [None]:
# Clean the AllData file
all_data_path = os.path.join(filepath, "FrequencyRange_by_species_and_site_AllData.csv")
# newline="" hands newline handling to the csv module, as its documentation
# requires; otherwise newlines embedded in quoted fields can be misread.
with open(all_data_path, mode="r", newline="") as all_data_file:
    reader = csv.DictReader(all_data_file)
    root_df = pd.DataFrame.from_records(list(reader))
# DictReader collects fields beyond the header under the key None, which then
# shows up as a column named None. That column only exists when at least one
# row has surplus fields, so ignore its absence instead of raising KeyError.
root_df = root_df.drop(columns=[None], errors="ignore")
# DictReader yields every value as a string; SoundID must be numeric to be
# merged with the SoundID column parsed by pd.read_csv later on.
root_df["SoundID"] = pd.to_numeric(root_df["SoundID"])
root_df

In [None]:
# Match each SoundID in our dataframe to a sound file by reading the data.csv
# files found anywhere below the extraction directory.
zip_walk = list(os.walk(ExtractTo))

# os.walk yields (dirpath, dirnames, filenames); keep the folders that hold a data.csv
leaf_paths = [
    os.path.join(dirpath, 'data.csv')
    for dirpath, _, filenames in zip_walk
    if 'data.csv' in filenames
]

# Only these three columns are needed; duplicates are dropped per file
id_columns = ['SoundID', 'SiteID', 'Filename']
raw_tables = (pd.read_csv(leaf_path) for leaf_path in leaf_paths)
fragments = [raw_table[id_columns].drop_duplicates() for raw_table in raw_tables]

soundid_to_filename = pd.concat(fragments)
soundid_to_filename


In [None]:
# The Filename column holds bare file names, but we need absolute paths.
# Nothing in the data says which directory each file was extracted into, so we have to locate the files ourselves.
# Rather than searching the whole tree once per file (O(N^2)), we build a single lookup from the os.walk results we already have.
def update_return(d1, d2):
    """
    Merge ``d2`` into ``d1`` in place and hand ``d1`` back.

    ``dict.update`` returns None, so this wrapper makes the merge usable as the
    folding function of ``functools.reduce``. Keys present in both mappings end
    up with the value from ``d2``.
    """
    merged = d1  # same object, not a copy: the caller's dict is mutated
    merged.update(d2)
    return merged
# Build one {bare file name: absolute path} lookup from the directory walk.
# zip_walk[1:] skips the walk's first entry, i.e. the extraction root itself.
# The explicit {} initial value keeps reduce from raising TypeError when the
# walk produced no sub-directories at all.
filename_dict = functools.reduce(
    update_return,
    [
        {
            filename: os.path.join(trace[0], filename)
            for filename in trace[2]
            if filename.endswith('.wav')
        }
        for trace in zip_walk[1:]
    ],
    {},
)

# Look names up by basename so this cell can safely be re-run: after the first
# run the column already holds absolute paths, whose basename is still the key.
# Non-string values (e.g. NaN from an empty CSV field) are passed through as-is
# and reported as missing below.
wav_names = soundid_to_filename.Filename.apply(
    lambda x: os.path.basename(x) if isinstance(x, str) else x
)

# Fail with a readable message rather than a bare KeyError on the first miss
missing_wavs = [name for name in wav_names.unique() if name not in filename_dict]
if missing_wavs:
    raise KeyError(
        f"{len(missing_wavs)} sound file(s) listed in data.csv were not found "
        f"in the extracted tree, e.g. {missing_wavs[:5]}"
    )

soundid_to_filename.Filename = wav_names.apply(lambda x: filename_dict[x])
soundid_to_filename


In [None]:
# Now merge them and save the result
# Both frames carry a SiteID column, so merge suffixes them with _x / _y;
# SiteID_x is the one coming from root_df (the left frame).
preprocess_final = pd.merge(root_df, soundid_to_filename, on="SoundID", validate="many_to_one")[['SiteID_x', 'Filename', 'Species']]
preprocess_final.columns = pd.Index(['siteId', 'filename', 'species'])

preprocess_final['siteId'] = pd.to_numeric(preprocess_final['siteId'])

species_classes = list(preprocess_final['species'].unique())

# Collect the set of species recorded for each file in a single pass, instead
# of re-filtering the whole dataframe once per (species, file) pair.
species_by_file = {}
for wav_path, species_name in zip(preprocess_final['filename'], preprocess_final['species']):
    species_by_file.setdefault(wav_path, set()).add(species_name)

# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of preprocess_final.
final_output = preprocess_final[['siteId', 'filename']].drop_duplicates().copy()
file_species = [species_by_file[wav_path] for wav_path in final_output['filename']]
for species in species_classes:
    # For each species, make a new column in the dataframe that says which files contain that species
    final_output[species] = [species in present for present in file_species]

# The first path is tried as-is; on OSError the cell retries with the
# machine_learning/ prefix — presumably to cope with the notebook being started
# from a different working directory (TODO confirm).
try:
    output_path = "processed/processed.csv"
    final_output.to_csv(output_path)
except OSError:
    output_path = "machine_learning/processed/processed.csv"
    final_output.to_csv(output_path)
final_output