In [27]:
import os
import glob
import soundfile as sf
import datetime
from intervaltree import IntervalTree,Interval
import csv
from fractions import Fraction

In [28]:
root_folder        = os.path.join(os.sep,'data','naveen','msda206','orca_project','orca_al','data_source','podcastR2')
all_train_dev_wavs = glob.glob(os.path.join(root_folder,'train_data','wav','*.wav'))
train_anno_path    = os.path.join(root_folder,'train_data','train.tsv')
dev_anno_path      = os.path.join(root_folder,'train_data','dev.tsv')

In [29]:
def get_positive_duration_r2(labelfile):
    interval_dict = dict()
    with open(labelfile) as f:
        for i,(wav,start,dur,loc,date,data_source,data_source_id) in enumerate(csv.reader(f, delimiter='\t')):
            if i == 0 or float(dur) <= 0.01: continue
            start = float(start)
            dur   = float(dur)
            end   = start+dur
            if wav not in interval_dict.keys():
                interval_dict[wav]=IntervalTree()
            interval_dict[wav].add(Interval(start,end,dur))
    tot_pos_dur =0
    for wav,interval_tree in interval_dict.items():
        for intvl in interval_tree:
            tot_pos_dur+=intvl.data

    return tot_pos_dur

In [30]:
total_duration = 0
count_files = 0
for wav in all_train_dev_wavs:
    f = sf.SoundFile(wav)
    sample_rate,dur = f.samplerate,len(f)/f.samplerate
    print('wav={} sample rate:{}, dur:{} seconds'.format(os.path.basename(wav),sample_rate,dur))
    total_duration+=dur
    count_files+=1
print('Total count of wav files ={}'.format(count_files))

wav=1562344334_0005.wav sample rate:20000, dur:61.25 seconds
wav=1562344334_0006.wav sample rate:20000, dur:61.25 seconds
wav=1562344334_000a.wav sample rate:20000, dur:61.25 seconds
wav=1562344334_001c.wav sample rate:20000, dur:61.25 seconds
wav=1562337136_0010.wav sample rate:20000, dur:61.25 seconds
wav=1562340736_0006.wav sample rate:20000, dur:61.25 seconds
wav=1562340736_0010.wav sample rate:20000, dur:61.25 seconds
wav=1562344334_0002.wav sample rate:20000, dur:61.25 seconds
wav=1562340736_0001.wav sample rate:20000, dur:61.25 seconds
wav=1562344334_0003.wav sample rate:20000, dur:61.25 seconds
wav=1562337136_001a.wav sample rate:20000, dur:61.25 seconds
wav=1562344334_000d.wav sample rate:20000, dur:61.25 seconds
wav=1562340736_0018.wav sample rate:20000, dur:61.25 seconds
wav=1562340736_000c.wav sample rate:20000, dur:61.25 seconds
wav=1562337136_0006.wav sample rate:20000, dur:61.25 seconds
wav=1562337136_000b.wav sample rate:20000, dur:61.25 seconds
wav=1562340736_0009.wav 

In [31]:
pos_duration_train = get_positive_duration_r2(train_anno_path)
pos_duration_dev = get_positive_duration_r2(dev_anno_path)

tot_dur = datetime.timedelta(seconds=total_duration)
pos_dur_train  = datetime.timedelta(seconds=pos_duration_train)
pos_dur_val  = datetime.timedelta(seconds=pos_duration_dev)
print('Total duration of recording {}'.format(tot_dur))
print('Total duration of SRKW calls train {}'.format(pos_dur_train))
print('Total duration of SRKW calls val {}'.format(pos_dur_val))
print('Total duration of SRKW calls {}'.format(pos_dur_train+pos_dur_val))

pos_frac = (pos_duration_train+pos_duration_dev)/total_duration
neg_frac = 1- pos_frac
print('Approx Ratio postive/negative {}'.format(Fraction(pos_frac/neg_frac).limit_denominator(10)))

Total duration of recording 0:58:11.250000
Total duration of SRKW calls train 0:12:33.646564
Total duration of SRKW calls val 0:02:28.105341
Total duration of SRKW calls 0:15:01.751905
Approx Ratio postive/negative 1/3
