<a href="https://colab.research.google.com/github/Ak-Gautam/AudioDataPrerocess/blob/main/Audio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Download the data to a local directory

In [None]:
!pip install aiohttp aiofiles huggingface_hub -q

In [None]:
import os
import asyncio
import aiohttp
import aiofiles
from huggingface_hub import hf_hub_url, HfApi
from tqdm.asyncio import tqdm_asyncio

In [None]:
async def download_file(session, file, repo_id, repo_type, destination_dir, semaphore):
    """Download one repository file into ``destination_dir``.

    Args:
        session: Open ``aiohttp.ClientSession`` used for the GET request.
        file: Path of the file inside the repository; the same relative path
            is recreated under ``destination_dir``.
        repo_id: Repository identifier passed to ``hf_hub_url``.
        repo_type: Repository type passed to ``hf_hub_url``.
        destination_dir: Local directory the file is written under.
        semaphore: ``asyncio.Semaphore`` bounding the number of concurrent
            downloads.

    A non-200 response is reported with ``print`` and nothing is written.
    """
    chunk_size = 1 << 20  # stream in 1 MiB pieces instead of buffering the whole file

    async with semaphore:
        file_url = hf_hub_url(repo_id, file, repo_type=repo_type)
        dest_path = os.path.join(destination_dir, file)

        # dirname is '' when the file lands directly in the current directory,
        # and os.makedirs('') raises, so only create a parent that exists.
        parent_dir = os.path.dirname(dest_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        async with session.get(file_url) as response:
            if response.status != 200:
                print(f"Failed to download {file}: HTTP {response.status}")
                return

            # Write to a temporary name first so an interrupted transfer never
            # leaves a truncated file at the final path.
            part_path = dest_path + ".part"
            try:
                async with aiofiles.open(part_path, 'wb') as f:
                    async for chunk in response.content.iter_chunked(chunk_size):
                        await f.write(chunk)
                os.replace(part_path, dest_path)
            finally:
                # Only still present when the transfer failed before the rename.
                if os.path.exists(part_path):
                    os.remove(part_path)

async def download_dataset(repo_id, repo_type, folder_path, destination_dir, max_concurrent=10):
    """Download every file under one folder of a Hub repository.

    Args:
        repo_id: Repository identifier passed to ``HfApi.list_repo_files``.
        repo_type: Repository type passed to ``HfApi.list_repo_files``.
        folder_path: Folder (or single file) inside the repository to fetch.
            An empty string selects every file in the repository.
        destination_dir: Local directory the files are written under.
        max_concurrent: Maximum number of downloads in flight at once.
    """
    api = HfApi()
    all_files = api.list_repo_files(repo_id, repo_type=repo_type)

    # Match on whole path components: a bare prefix test would make folder "0"
    # also pick up "01/..." or "0.txt".
    folder = folder_path.strip("/")
    if folder:
        prefix = folder + "/"
        folder_files = [f for f in all_files if f == folder or f.startswith(prefix)]
    else:
        folder_files = list(all_files)

    if not folder_files:
        print(f"No files found under '{folder_path}' in repository '{repo_id}'")
        return

    semaphore = asyncio.Semaphore(max_concurrent)
    async with aiohttp.ClientSession() as session:
        tasks = [
            download_file(session, file, repo_id, repo_type, destination_dir, semaphore)
            for file in folder_files
        ]
        await tqdm_asyncio.gather(*tasks, desc="Downloading files")

# nest_asyncio is applied below so the event loop can be driven from a
# Jupyter/Colab cell.
import nest_asyncio

# Download settings: which repository folder to fetch and where to store it.
repo_id = "Alignment-Lab-AI/podcast-1-test-preprocessed"
repo_type = "dataset"
folder_path = "0"
destination_dir = "content/ddata"


async def main():
    """Download the configured folder, then report where it was saved."""
    await download_dataset(repo_id, repo_type, folder_path, destination_dir)
    message = f"Folder '{folder_path}' from repository '{repo_id}' has been saved to '{destination_dir}'"
    print(message)


# Run the coroutine to completion on the current event loop.
nest_asyncio.apply()
loop = asyncio.get_event_loop()
loop.run_until_complete(main())

### Processing starts from here!

In [None]:
!pip install pydub -q

In [None]:
# Download a static FFmpeg build and add it to PATH.
exist = !which ffmpeg
if not exist:
  !curl https://johnvansickle.com/ffmpeg/releases/ffmpeg-release-amd64-static.tar.xz -o ffmpeg.tar.xz \
     && tar -xf ffmpeg.tar.xz && rm ffmpeg.tar.xz
  ffmdir = !find . -iname ffmpeg-*-static
  path = %env PATH
  path = path + ':' + ffmdir[0]
  %env PATH $path
print('')
!which ffmpeg
print('Done!')

In [None]:
from pydub import AudioSegment

# Length of the silence, in milliseconds, placed in front of the recording.
spacermilli = 2000
spacer = AudioSegment.silent(duration=spacermilli)

audio = AudioSegment.from_mp3("content/ddata/0/10.mp3")

# Prepend the silence; crossfade=0 joins the two segments without blending them.
audio = spacer.append(audio, crossfade=0)

# export() returns the open output file handle; close it so the WAV file is
# not held open for the rest of the session.
audio.export('input_prep.wav', format='wav').close()

In [None]:
!pip install light-the-torch -q
!ltt install torch torchvision torchaudio -q

In [None]:
!pip install pyannote.audio -q

In [None]:
import os
from getpass import getpass

from pyannote.audio import Pipeline

# Read the Hugging Face access token from the HF_TOKEN environment variable,
# or prompt for it, so the secret is never stored in the notebook.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

pipeline = Pipeline.from_pretrained('pyannote/speaker-diarization-3.1', use_auth_token=hf_token)

# Fail here with a clear message instead of on the first use of `pipeline`.
# NOTE(review): from_pretrained appears to return None when the gated model
# cannot be downloaded; confirm against the installed pyannote.audio version.
if pipeline is None:
    raise RuntimeError(
        "Could not load 'pyannote/speaker-diarization-3.1'. Check that the access token "
        "is valid and that the model's user conditions have been accepted on Hugging Face."
    )

In [None]:
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
pipeline.to(device)

In [None]:
# Description of the prepared recording; only its 'audio' path is handed to the pipeline.
DEMO_FILE = dict(uri='blabla', audio='input_prep.wav')

# Run the diarization pipeline on the prepared WAV file.
dz = pipeline(DEMO_FILE['audio'])

# Save the text form of the diarization result.
with open("diarization.txt", "w") as out_file:
    print(dz, end="", file=out_file)

In [None]:
# Show the first ten tracks of the diarization result, one per line.
first_tracks = list(dz.itertracks(yield_label=True))[:10]
print("\n".join(map(str, first_tracks)))