In [1]:
import datetime
import numpy as np
import os
import pandas as pd
import pytz

# Location of the raw WAV files that still carry their original names.
data_dir = '/scratch/vl1019/bvd300h_data'
in_dir = os.path.join(data_dir, 'BirdVox-300h_wav_raw-entrofy-names')
in_wav_names = os.listdir(in_dir)

# Dawn recordings are the files with "_dawn" in their name. Three of them
# (unit03 on 2015-09-08 and 2015-09-09, unit01 on 2015-10-01) are exceptions
# which are treated manually, so they are filtered out of the automatic list.
manual_dawn_prefixes = (
    "unit03_2015-09-08",
    "unit03_2015-09-09",
    "unit01_2015-10-01")
in_dawn_names = [
    wav_name for wav_name in in_wav_names
    if "_dawn" in wav_name and not wav_name.startswith(manual_dawn_prefixes)]

# Table of excerpts; rows whose "utc" column equals "dawn" are kept separately.
in_df = pd.read_csv('etc/Entrofy_145_raw-entrofy-names.csv')
is_dawn_row = in_df["utc"] == "dawn"
in_dawn_df = in_df[is_dawn_row]


In [2]:
def _dawn_excerpt_datetime(excerpt_str):
    """Return the UTC datetime encoded by a dawn excerpt identifier.

    The identifier is read as "<6-character unit>/<YYYYmmddHHMMSS>_<h:mm:ss>":
    the first seven characters are skipped, the timestamp gives the start of
    the recording and the colon-separated field is an offset added to it.
    """
    fields = excerpt_str[7:].split("_")
    timestamp_str, offset_str = fields[0], fields[1]
    offset_parts = offset_str.split(":")
    start_dt = datetime.datetime(
        int(timestamp_str[:4]), int(timestamp_str[4:6]), int(timestamp_str[6:8]),
        int(timestamp_str[8:10]), int(timestamp_str[10:12]), int(timestamp_str[12:]),
        0, pytz.UTC)
    offset_td = datetime.timedelta(
        hours=int(offset_parts[0]),
        minutes=int(offset_parts[1]),
        seconds=int(offset_parts[2]))
    return start_dt + offset_td


def _dawn_out_name(dt, unit_str):
    """Format the output file name "YYYY-MM-DD_HH-MM-SS_<unit>.wav"."""
    return "{:04d}-{:02d}-{:02d}_{:02d}-{:02d}-{:02d}".format(
        dt.year, dt.month, dt.day,
        dt.hour, dt.minute, dt.second) + "_" + unit_str + ".wav"


def _dawn_excerpts_with_prefix(prefix):
    """Return the dawn excerpt identifiers that start with `prefix`."""
    return [x for x in list(in_dawn_df["excerpt"]) if x.startswith(prefix)]


dawn_converter = {}

for in_dawn_name in sorted(in_dawn_names):
    # File names are split on "_" into "<unit>", "<YYYY-MM-DD>", ...
    name_fields = in_dawn_name.split("_")
    unit_str = name_fields[0]
    name_day = datetime.datetime.strptime(name_fields[1], "%Y-%m-%d").date()

    # First attempt: an excerpt whose recording started on the day written in
    # the file name.
    same_day_excerpts = _dawn_excerpts_with_prefix(
        unit_str + "/" + name_day.strftime("%Y%m%d"))
    assert len(same_day_excerpts) < 2, (
        "several dawn excerpts match " + in_dawn_name)

    dt = None
    if same_day_excerpts:
        candidate_dt = _dawn_excerpt_datetime(same_day_excerpts[0])
        day_str = "{:04d}-{:02d}-{:02d}".format(
            candidate_dt.year, candidate_dt.month, candidate_dt.day)
        # Accept the candidate only if start + offset still falls on the day
        # written in the file name.
        if day_str in in_dawn_name:
            dt = candidate_dt

    if dt is None:
        # Second attempt: the recording started on the previous calendar day.
        # Real date arithmetic is used here; subtracting 1 from the integer
        # YYYYMMDD yields a non-existent date on the first day of a month
        # (e.g. 20151001 - 1 == 20151000).
        previous_day = name_day - datetime.timedelta(days=1)
        previous_day_excerpts = _dawn_excerpts_with_prefix(
            unit_str + "/" + previous_day.strftime("%Y%m%d"))
        assert len(previous_day_excerpts) == 1, (
            "expected exactly one previous-day dawn excerpt for " + in_dawn_name)
        dt = _dawn_excerpt_datetime(previous_day_excerpts[0])

    dawn_converter[in_dawn_name] = _dawn_out_name(dt, unit_str)

In [3]:
# Dawn recordings that are handled by hand instead of by the automatic
# matching: (input file name, excerpt identifier) pairs.
manual_dawn_excerpts = [
    ("unit03_2015-09-08_dawn.wav", "unit03/20150908000002_08:09:57"),
    ("unit03_2015-09-09_dawn.wav", "unit03/20150908235804_08:13:01"),
    # NOTE(review): unlike the two keys above, this one has no "_dawn" suffix.
    # It is kept exactly as it was; confirm it matches the file on disk.
    ("unit01_2015-10-01.wav", "unit01/20150930231902_09:14:28"),
]

for in_dawn_name, excerpt_str in manual_dawn_excerpts:
    # Excerpt layout: "<6-character unit>/<YYYYmmddHHMMSS>_<h:mm:ss>".
    unit_str = excerpt_str[:6]
    fields = excerpt_str[7:].split("_")
    timestamp_str, offset_str = fields[0], fields[1]
    offset_parts = offset_str.split(":")

    # Start of the recording (UTC) plus the offset of the excerpt.
    start_dt = datetime.datetime(
        int(timestamp_str[:4]), int(timestamp_str[4:6]), int(timestamp_str[6:8]),
        int(timestamp_str[8:10]), int(timestamp_str[10:12]), int(timestamp_str[12:]),
        0, pytz.UTC)
    offset_td = datetime.timedelta(
        hours=int(offset_parts[0]),
        minutes=int(offset_parts[1]),
        seconds=int(offset_parts[2]))
    dt = start_dt + offset_td

    out_dawn_name = "{:04d}-{:02d}-{:02d}_{:02d}-{:02d}-{:02d}".format(
        dt.year, dt.month, dt.day,
        dt.hour, dt.minute, dt.second) + "_" + unit_str + ".wav"
    dawn_converter[in_dawn_name] = out_dawn_name

In [4]:
def _night_out_name(wav_name, unit_str):
    """Build the output name "YYYY-MM-DD_HH-MM-SS_<unit>.wav" for a night file.

    `wav_name` is sliced by position: characters 0-13 are the recording start
    (YYYYmmddHHMMSS, taken as UTC) and characters 15-21 are an offset written
    as one hour digit, two minute digits and two second digits, each separated
    by a single character. The output time is start + offset.
    """
    start_dt = datetime.datetime(
        int(wav_name[:4]), int(wav_name[4:6]), int(wav_name[6:8]),
        int(wav_name[8:10]), int(wav_name[10:12]), int(wav_name[12:14]),
        0, pytz.UTC)
    offset_str = wav_name[15:22]
    offset_td = datetime.timedelta(
        hours=int(offset_str[:1]),
        minutes=int(offset_str[2:4]),
        seconds=int(offset_str[5:7]))
    dt = start_dt + offset_td
    return "{:04d}-{:02d}-{:02d}_{:02d}-{:02d}-{:02d}".format(
        dt.year, dt.month, dt.day,
        dt.hour, dt.minute, dt.second) + "_" + unit_str + ".wav"


# 20151114221604_1:43:56_fold-unit02_utc-22h_week-Nov14_cluster-5 is an exception which is treated manually
special_night_stem = "20151114221604_1:43:56_fold-unit02_utc-22h_week-Nov14_cluster-5"

# Every file that is not a dawn recording, minus the manual exception.
in_night_names = [
    x for x in in_wav_names
    if "_dawn" not in x and special_night_stem not in x]
in_night_df = in_df[in_df["utc"]!="dawn"]
# The excerpt column does not change inside the loop, so it is read once.
night_excerpts = list(in_night_df["excerpt"])

night_converter = {}
leftovers = []
for in_night_name in in_night_names:
    # An excerpt matches when it contains both the 14-character timestamp and
    # the 7-character offset of the file name.
    matching_excerpts = [
        excerpt for excerpt in night_excerpts
        if in_night_name[:14] in excerpt and in_night_name[15:22] in excerpt]
    assert len(matching_excerpts) < 2, (
        "several excerpts match " + in_night_name)
    if matching_excerpts:
        # The unit is the first six characters of the matched excerpt.
        unit_str = matching_excerpts[0][:6]
        night_converter[in_night_name] = _night_out_name(in_night_name, unit_str)
    else:
        # No matching excerpt: collected in `leftovers` instead of being renamed here.
        leftovers.append(in_night_name)


# Special case
in_night_name = special_night_stem + ".wav"
night_converter[in_night_name] = _night_out_name(in_night_name, "unit02")

In [5]:
extra_df = pd.read_csv('etc/BirdVox_robin2015.csv')

# Rename the night files that found no excerpt in the main table by looking
# them up in `extra_df` instead.
leftover_converter = {}
for wav_name in leftovers:
    # Single digit that follows "cluster-" in the file name.
    cluster_id = int(wav_name.split("cluster-")[1][0])
    # Lookup key for the "Offset (hh:mm:ss)" column: a leading space followed
    # by h:mm:ss. It is rebuilt from fixed positions so that the separators
    # used in the file name do not matter.
    offset_key = " " + wav_name[15] + ":" + wav_name[17:19] + ":" + wav_name[20:22]
    candidate_rows = extra_df[
        (extra_df["Cluster ID (0-9)"]==cluster_id) &
        (extra_df["File name"]==int(wav_name[:14])) &
        (extra_df["Offset (hh:mm:ss)"]==offset_key)]
    if len(candidate_rows) > 1:
        # Disambiguate with the units listed in the file name.
        # NOTE(review): the slice at 33 assumes the fold field is written as
        # "fold-unit-XX-YY"; confirm for other spellings.
        fold_units = ["unit" + x for x in wav_name[33:].split("_")[0].split("-")]
        candidate_rows = candidate_rows[candidate_rows["Unit"].isin(fold_units)]
    assert len(candidate_rows) == 1, (
        "expected exactly one row of extra_df for " + wav_name)

    # Recording start (UTC, characters 0-13) plus the offset (characters 15-21).
    start_dt = datetime.datetime(
        int(wav_name[:4]), int(wav_name[4:6]), int(wav_name[6:8]),
        int(wav_name[8:10]), int(wav_name[10:12]), int(wav_name[12:14]),
        0, pytz.UTC)
    offset_str = wav_name[15:22]
    offset_td = datetime.timedelta(
        hours=int(offset_str[:1]),
        minutes=int(offset_str[2:4]),
        seconds=int(offset_str[5:7]))
    dt = start_dt + offset_td

    unit_str = candidate_rows.iloc[0]["Unit"]
    out_wav_name = "{:04d}-{:02d}-{:02d}_{:02d}-{:02d}-{:02d}".format(
        dt.year, dt.month, dt.day,
        dt.hour, dt.minute, dt.second) + "_" + unit_str + ".wav"
    leftover_converter[wav_name] = out_wav_name

In [6]:
# Combine the three partial mappings into one; on a repeated key the mapping
# merged later wins.
wav_converter = dict(dawn_converter)
wav_converter.update(night_converter)
wav_converter.update(leftover_converter)

In [7]:
# Make sure that there are no collisions. The output should be empty
from collections import Counter
value_counter = Counter(wav_converter.values())
# Output names that are assigned to more than one input name.
dups = [out_name for out_name, n_uses in value_counter.items() if n_uses > 1]
# Input -> output pairs that take part in a collision (shown as the cell result).
{in_name: out_name for in_name, out_name in wav_converter.items() if out_name in dups}

{}

In [8]:
# Number of input file names that have been assigned an output name.
len(wav_converter)

150

In [9]:
# Display the complete input name -> output name mapping.
wav_converter

{'unit01_2015-09-01_dawn.wav': '2015-09-01_08-00-58_unit01.wav',
 'unit01_2015-09-09_dawn.wav': '2015-09-09_08-09-59_unit01.wav',
 'unit01_2015-09-29_dawn.wav': '2015-09-29_08-31-38_unit01.wav',
 'unit01_2015-10-02_dawn.wav': '2015-10-02_08-34-49_unit01.wav',
 'unit01_2015-10-18_dawn.wav': '2015-10-18_08-52-44_unit01.wav',
 'unit02_2015-08-27_dawn.wav': '2015-08-27_07-55-10_unit02.wav',
 'unit02_2015-08-30_dawn.wav': '2015-08-30_07-58-57_unit02.wav',
 'unit02_2015-08-31_dawn.wav': '2015-08-31_07-59-58_unit02.wav',
 'unit02_2015-09-15_dawn.wav': '2015-09-15_08-16-59_unit02.wav',
 'unit02_2015-09-16_dawn.wav': '2015-09-16_08-17-57_unit02.wav',
 'unit02_2015-09-26_dawn.wav': '2015-09-26_08-28-00_unit02.wav',
 'unit02_2015-11-02_dawn.wav': '2015-11-02_09-09-58_unit02.wav',
 'unit03_2015-08-04_dawn.wav': '2015-08-04_03-40-48_unit03.wav',
 'unit03_2015-09-04_dawn.wav': '2015-09-04_08-04-59_unit03.wav',
 'unit03_2015-09-29_dawn.wav': '2015-09-29_08-32-57_unit03.wav',
 'unit03_2015-10-02_dawn.

In [18]:
# Store the mapping as a table with one row per file: "in" is the original
# name and "out" is the new name.
df_converter = pd.DataFrame(list(wav_converter.items()), columns=['in', 'out'])
# The string literal was left unterminated; the closing quote and the ".csv"
# extension are restored. The row index is written as well (to_csv default).
df_converter.to_csv('etc/BirdVox-300h_wav-name-converter.csv')

Unnamed: 0,in,out
0,unit01_2015-09-01_dawn.wav,2015-09-01_08-00-58_unit01.wav
1,unit01_2015-09-09_dawn.wav,2015-09-09_08-09-59_unit01.wav
2,unit01_2015-09-29_dawn.wav,2015-09-29_08-31-38_unit01.wav
3,unit01_2015-10-02_dawn.wav,2015-10-02_08-34-49_unit01.wav
4,unit01_2015-10-18_dawn.wav,2015-10-18_08-52-44_unit01.wav
...,...,...
145,20150824211215_0:47:45_fold-unit-04-06_utc-22h...,2015-08-24_22-00-00_unit04.wav
146,20150826024815_1:11:45_fold-unit08_utc-08h_wee...,2015-08-26_04-00-00_unit08.wav
147,20151028223604_1:23:55_fold-unit03_utc-22h_wee...,2015-10-28_23-59-59_unit03.wav
148,20150924232902_2_30_58_fold-unit-01-10_utc-06h...,2015-09-25_02-00-00_unit01.wav
