In [8]:
from datetime import date, datetime
import numpy as np
import os
import pandas as pd
from pydub import AudioSegment
import re
from scipy import stats
import shlex
import subprocess
import sys

In [2]:
def date_from_int(int_date):
    """
    Function to turn an int into a date
    
    Parameter
    ---------
    int_date : int
        YYYYMMDD
    
    Returns
    -------
    date_date : date
        ``np.nan`` when ``int_date`` cannot be parsed as a valid YYYYMMDD
        calendar date (e.g. a missing value or an out-of-range month/day)
    """
    try:
        digits = str(int_date)
        return date(int(digits[:4]), int(digits[4:6]), int(digits[6:8]))
    except (TypeError, ValueError):
        # Only parsing failures map to NaN; anything else (notably
        # KeyboardInterrupt / SystemExit) must propagate to the caller.
        return np.nan
    
    
def dur_usable(dur, path, usable_table=None):
    """
    Function to discriminate usable durations
    
    A file counts as usable when the first seven characters of its file name,
    read as an integer, match a "RandID" in ``usable_table`` and either that
    row has no recording date or the file's creation date is on or after the
    recording date.
    
    Parameters
    ----------
    dur : float
       duration in seconds
    
    path : string
        path of the file ``dur`` was measured from
    
    usable_table : DataFrame, optional
        with columns ["RandID", "Date of recording"]; when omitted nothing is
        classified as usable
        
        
    Returns
    -------
    lens_usable : list of floats, given "usable", else empty list
    
    lens_all : list of floats
    
    Raises
    ------
    ValueError
        if ``usable_table`` is given and the file name does not start with
        seven characters that parse as an integer
    """
    lens_all = [dur]
    if usable_table is None:
        return([], lens_all)
    rand_id = int(os.path.basename(path)[:7])
    # Filter the table once; only the first matching row is consulted.
    recorded_dates = usable_table.loc[
        usable_table["RandID"] == rand_id, "Date of recording"]
    lens_usable = []
    if len(recorded_dates) > 0:
        recorded = recorded_dates.iloc[0]
        # NaN never compares equal to anything (and ``date >= nan`` raises
        # TypeError), so a missing recording date has to be detected with
        # pd.isnull *before* the date comparison is attempted.
        # NOTE(review): st_birthtime is not available on every platform
        # (e.g. Linux); this still assumes the scan runs where it exists.
        if pd.isnull(recorded) or (
            datetime.fromtimestamp(os.stat(path).st_birthtime).date() >= recorded
        ):
            lens_usable.append(dur)
    return(lens_usable, lens_all)

    
def _probe_duration(file_path):
    """
    Function to read one file's duration with ffprobe
    
    Parameters
    ----------
    file_path : string
    
    Returns
    -------
    dur : float
        value of the "duration=" line printed by ffprobe
    
    Raises
    ------
    subprocess.CalledProcessError
        if ffprobe exits with a non-zero status
    
    ValueError
        if ffprobe prints no "duration=" line or its value is not a number
    """
    # An argument list (no shell) keeps odd file names from being interpreted
    # by a shell; the "duration=" line is picked out here instead of by grep.
    probe = subprocess.check_output([
        "ffprobe", "-v", "quiet", "-show_entries", "format=duration", file_path
    ]).decode('utf-8')
    for line in probe.splitlines():
        if "duration=" in line:
            return float(line.replace("duration=", "").strip())
    raise ValueError(" ".join(["no duration reported for", file_path]))


def get_durs(path, usable=None):
    """
    Function to get durations from all files in a path and their subdirectories
    
    Files that ffprobe cannot measure are reported on stdout and skipped.
    Every measured file is counted in ``lens_all``, even when it cannot be
    checked against ``usable``.
    
    Parameters
    ----------
    path : string
    
    usable : DataFrame, optional
        with columns ["RandID", "Date of recording"]
        
        
    Returns
    -------
    lens_usable : list of floats, given "usable", else empty list
    
    lens_all : list of floats
    """
    lens_usable = []
    lens_all = []
    for fname in os.listdir(path):
        f_path = os.path.join(path, fname)
        if os.path.isdir(f_path):
            print(" ".join(["Loading", fname]))
            sub_usable, sub_all = get_durs(f_path, usable)
            lens_usable.extend(sub_usable)
            lens_all.extend(sub_all)
            continue
        try:
            dur = _probe_duration(f_path)
        except Exception:
            # Best-effort scan: report the file and move on. Catching
            # Exception (not a bare except) lets Ctrl-C stop a long scan.
            print(" ".join([
                "Could not load", shlex.quote(f_path), ":",
                str(sys.exc_info()[0])]))
            continue
        lens_all.append(dur)
        if usable is None:
            # Nothing to classify against; only the overall list is filled.
            continue
        try:
            file_usable, _ = dur_usable(dur, f_path, usable)
        except ValueError:
            # File name does not start with a numeric ID, so it cannot be
            # matched against the table; it stays in lens_all only.
            continue
        except Exception:
            print(" ".join([
                "Could not check usability of", shlex.quote(f_path), ":",
                str(sys.exc_info()[0])]))
            continue
        lens_usable.extend(file_usable)
    return(lens_usable, lens_all)

In [3]:
# Root folder of the voice-sample archive. Set the VOICE_SAMPLE_DATA
# environment variable to run this notebook against a copy stored elsewhere;
# the default is the original mounted volume.
storage = os.environ.get(
    "VOICE_SAMPLE_DATA",
    "/Volumes/Data/Research/Healthy Brain Network/Voice Sample Data")
usable_table = pd.read_excel(os.path.join(storage, "Post-MRI Audio Video", "Usable Voice Samples.xlsx"))
# Convert the "Date of recording" column with date_from_int (YYYYMMDD -> date,
# NaN where the value cannot be parsed).
usable_table["Date of recording"] = usable_table["Date of recording"].apply(date_from_int)

In [4]:
# Walk the whole archive once, then show the raw durations (seconds) and
# their mean for the usable subset and for every measured file.
lens_usable, lens_all = get_durs(storage, usable_table)
for lens in (lens_usable, lens_all):
    durations = np.array(lens)
    print(durations)
    print(durations.mean())

Could not load '/Volumes/Data/Research/Healthy Brain Network/Voice Sample Data/.DS_Store' : <class 'subprocess.CalledProcessError'>
Loading GFTA
Could not load '/Volumes/Data/Research/Healthy Brain Network/Voice Sample Data/GFTA/.DS_Store' : <class 'subprocess.CalledProcessError'>
Loading 5000677
Loading 5002406
Loading 5005437
Loading 5007611
Loading 5011146
Loading 5013573
Loading 5020640
Loading 5022889
Loading 5028550
Loading 5032610
Loading 5034881
Loading 5040514
Loading 5041333
Loading 5041416
Loading 5042427
Loading 5046057
Loading 5046420
Loading 5046805
Loading 5047708
Loading 5049983
Loading 5053304
Loading 5054883
Loading 5059447
Loading 5060956
Loading 5062330
Loading 5067026
Loading 5068348
Loading 5069228
Loading 5070376
Loading 5071739
Loading 5080387
Loading 5082352
Loading 5084002
Loading 5085726
Loading 5088945
Loading 5089058
Loading 5091629
Loading 5092466
Loading 5095453
Loading 5097714
Loading 5098958
Loading 5100737
Loading 5101383
Loading 5101687
Loading 510739

In [30]:
# Summary statistics, first for the usable subset and then for all files.
# Mean and median are converted to minutes; mode and describe stay in seconds.
for lens in (lens_usable, lens_all):
    durations = np.array(lens)
    print("mean (minutes)", durations.mean()/60, sep=": ")
    print("median (minutes)", np.median(durations)/60, sep=": ")
    print(stats.mode(durations, axis=None))
    print(stats.describe(lens))

mean (minutes): 29.7033446476
median (minutes): 6.17283333333
ModeResult(mode=array([ 181.289833]), count=array([2]))
DescribeResult(nobs=531, minmax=(52.218792000000001, 10306.272666999999), mean=1782.2006788549909, variance=4879324.042609903, skewness=1.3019438024292038, kurtosis=0.6321333512317704)
mean (minutes): 38.3174714181
median (minutes): 28.9038242583
ModeResult(mode=array([ 0.04]), count=array([4]))
DescribeResult(nobs=1540, minmax=(0.040000000000000001, 14761.195125), mean=2299.0482850831172, variance=4982272.9707542341, skewness=1.0931792875242157, kurtosis=1.2967084064995094)


In [33]:
# List every duration strictly between 10 and 60 seconds, first for the
# usable subset and then for all files, with blank lines after each group.
for lens in (lens_usable, lens_all):
    in_range = [sec for sec in lens if 10 < sec < 60]
    for sec in in_range:
        print(sec)
    print("\n\n")

52.218792



52.4524
47.281667
52.218792



