In [3]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pydub.utils import mediainfo

# Set the visual style of the plots: use seaborn's "whitegrid" style
# for the figures produced in this notebook.
sns.set(style="whitegrid")

In [10]:
def get_audio_files(directory, extensions):
    """Recursively collect files under ``directory`` that have one of ``extensions``.

    Args:
        directory: Root folder to search; all of its subdirectories are walked too.
        extensions: Iterable of file extensions, given with or without the
            leading dot (``"m4a"`` and ``".m4a"`` are equivalent). Matching is
            case-insensitive. Empty entries are ignored.

    Returns:
        A list of paths (each built by joining the walked folder with the file
        name). Folders and file names are visited in sorted order so the result
        does not depend on the order the file system happens to list entries in.
        An empty list is returned when no usable extension is given or nothing
        matches.
    """
    # Normalise every extension to a lower-case ".ext" suffix so that a bare
    # "m4a" cannot match a file that merely ends in those letters (e.g. "demom4a").
    suffixes = tuple(
        "." + ext.lower().lstrip(".")
        for ext in extensions
        if ext.strip(".")
    )
    if not suffixes:
        return []

    audio_files = []
    for root, dirs, files in os.walk(directory):
        # Sorting the directory list in place makes os.walk descend in a
        # deterministic order.
        dirs.sort()
        for file in sorted(files):
            # str.endswith accepts a tuple and tests all suffixes at once.
            if file.lower().endswith(suffixes):
                audio_files.append(os.path.join(root, file))
    return audio_files

def _format_hms(total_seconds):
    """Render a duration in seconds as a zero-padded ``hh:mm:ss`` string."""
    minutes, seconds = divmod(int(total_seconds), 60)
    hours, minutes = divmod(minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}"


def analyze_audio_files(directories, extensions):
    """Collect the name, duration and source directory of every audio file.

    Args:
        directories: Iterable of root folders; each is searched recursively
            via ``get_audio_files``.
        extensions: File extensions passed through to ``get_audio_files``.

    Returns:
        A ``pandas.DataFrame`` with one row per file and the columns
        ``File_name`` (base name), ``Duration`` (``hh:mm:ss`` string),
        ``Duration_sec`` (seconds, rounded to 2 decimals) and ``Directory``
        (the root folder the file was found under). The columns are present
        even when no file is found, so downstream column access keeps working.
    """
    columns = ['File_name', 'Duration', 'Duration_sec', 'Directory']
    data = []
    for directory in directories:
        for audio_file in get_audio_files(directory, extensions):
            info = mediainfo(audio_file)
            # A file without a usable 'duration' entry (key missing, or a value
            # that is not a number) is kept in the table with a duration of 0
            # rather than aborting the whole scan.
            try:
                duration = round(float(info['duration']), 2)
            except (KeyError, TypeError, ValueError):
                duration = 0.0

            data.append({
                'File_name': os.path.basename(audio_file),
                'Duration': _format_hms(duration),
                'Duration_sec': duration,
                'Directory': directory,
            })

    return pd.DataFrame(data, columns=columns)

In [11]:
directories = ["./data/OBE1", "./data/OBE2", "./data/Compassion"]
# "wav" replaces the original "waw", which matches no real audio extension.
extensions = ["m4a", "wav", "mp4"]

# Analyze audio files
audio_data = analyze_audio_files(directories, extensions)

# Display the collected table (pandas abbreviates long frames to their
# first and last rows).
audio_data

Unnamed: 0,File_name,Duration,Duration_sec,Directory
0,ID 05.mp4,00:10:15,615.68,./data/OBE1
1,Id 08.m4a,00:04:34,274.65,./data/OBE1
2,Id 13.m4a,00:07:19,439.70,./data/OBE1
3,Id 13b.m4a,00:03:21,201.60,./data/OBE1
4,Id 14.m4a,00:02:52,172.97,./data/OBE1
...,...,...,...,...
64,S304.m4a,00:07:05,425.92,./data/Compassion
65,S305con.m4a,00:01:41,101.10,./data/Compassion
66,S306.m4a,00:08:50,530.71,./data/Compassion
67,S307.m4a,00:06:15,375.70,./data/Compassion


In [12]:
audio_data.describe()

Unnamed: 0,Duration_sec
count,69.0
mean,334.409565
std,211.30592
min,11.97
25%,182.83
50%,274.92
75%,439.7
max,1049.77
