<a href="https://colab.research.google.com/github/2024S-Ajou-ML-pandastic4/whaccent/blob/feature%2Fdataset/baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [102]:
from typing import List
import io
import os
import pandas as pd
import urllib.request
import tarfile
import time
import http
import types
from torch.utils.data import Dataset


# Headers:  id, audio_path, country, transcription, duration
class Pandastic4Dataset(Dataset):
    """Dataset yielding ``(audio_path, country, transcription)`` samples.

    The data is described by one CSV manifest per country, stored at
    ``<root>/output/<COUNTRY>_<train|test>.csv``.  The manifests of all
    countries are concatenated and shuffled deterministically with
    ``random_state``.  If the manifests, or any audio file they reference,
    are missing, the dataset archive is downloaded and unpacked into ``root``.

    Args:
        root: Directory that holds (or will receive) the dataset.
        transform: Stored on the instance; currently not applied by
            ``__getitem__`` (see the note there).
        train: Select the ``*_train.csv`` manifests when true, otherwise the
            ``*_test.csv`` manifests.
        random_state: Seed used to shuffle the concatenated manifests.
        url: Location of the gzip-compressed tar archive.  Defaults to
            ``DEFAULT_URL`` when ``None``.

    Raises:
        RuntimeError: If the dataset is still incomplete after downloading.
    """

    DEFAULT_URL = "http://mldataset.ddns.net/f/dae3e2b1a5374690a80c/?dl=1"
    COUNTRIES = ("KR", "US", "GB")

    def __init__(
        self,
        root="../pandastic4_dataset",
        transform=None,
        train: bool = True,
        random_state: int = 201820766,
        url=None,
    ):
        self.root = root
        self.transform = transform
        self.train = train
        self.random_state = random_state
        self.url = self.DEFAULT_URL if url is None else url

        # Populated by _try_load_manifest(); stays None until manifests load.
        self.data: "pd.DataFrame | None" = None
        self.download()

    def __len__(self):
        """Return the number of samples listed in the loaded manifests."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(audio_path, country, transcription)`` for row ``idx``.

        ``audio_path`` is the manifest's relative path joined onto ``root``.
        """
        sample = self.data.iloc[idx]
        audio_path = os.path.join(self.root, sample["audio_path"])
        # NOTE: the audio is not decoded here, so ``self.transform`` has
        # nothing to operate on yet and is intentionally left unused.
        return audio_path, sample["country"], sample["transcription"]

    def _check_exists(self) -> bool:
        """Return True when the manifests load and every audio file exists."""
        if not self._try_load_manifest():
            return False
        return all(
            os.path.isfile(os.path.join(self.root, audio_path))
            for audio_path in self.data["audio_path"]
        )

    def _try_load_manifest(self) -> bool:
        """Load and shuffle the per-country manifests into ``self.data``.

        Returns False, leaving ``self.data`` untouched, if any manifest file
        is missing.
        """
        split = "train" if self.train else "test"
        manifest_paths = [
            os.path.join(self.root, "output", f"{country}_{split}.csv")
            for country in self.COUNTRIES
        ]
        if not all(os.path.isfile(path) for path in manifest_paths):
            return False

        manifests = [pd.read_csv(path) for path in manifest_paths]
        self.data = (
            pd.concat(manifests)
            .sample(frac=1, random_state=self.random_state)
            .reset_index(drop=True)
        )
        return True

    def download(self) -> None:
        """Download and unpack the archive unless the dataset is complete.

        The archive is streamed straight into ``tarfile`` so it never has to
        be stored on disk in compressed form.

        Raises:
            RuntimeError: If the dataset is still incomplete after unpacking.
        """
        if self._check_exists():
            return
        os.makedirs(self.root, exist_ok=True)

        # Context managers close the response and the tar stream even when
        # the transfer or the extraction fails half-way.
        with urllib.request.urlopen(self.url) as stream:
            # Content-Length is optional (e.g. chunked responses).
            header = stream.headers.get("Content-Length")
            total = int(header) if header and header.strip().isdigit() else None
            reader = _ProgressReader(stream, total)
            with tarfile.open(fileobj=reader, mode="r|gz") as tar_file:
                if hasattr(tarfile, "data_filter"):
                    # The archive comes from the network: refuse members that
                    # would escape ``root`` (absolute paths, "..", bad links).
                    tar_file.extractall(path=self.root, filter="data")
                else:
                    # Older interpreters without extraction filters.
                    tar_file.extractall(path=self.root)

        # An explicit check instead of ``assert``: asserts are stripped under
        # ``python -O``, which would also skip loading the manifests.
        if not self._check_exists():
            raise RuntimeError(
                f"Dataset at {self.root!r} is incomplete after downloading "
                f"from {self.url!r}"
            )


class _ProgressReader:
    """Minimal file-like wrapper that prints download progress while reading.

    Only ``read`` is provided, which is all that ``tarfile`` needs in
    streaming mode.
    """

    def __init__(self, raw, total=None, print_interval=5.0):
        self._raw = raw
        self._total = total  # expected size in bytes, or None when unknown
        self._downloaded = 0
        self._started_at = time.time()
        self._last_printed = self._started_at
        self._print_interval = print_interval  # seconds between reports

    def read(self, size=-1):
        """Read up to ``size`` bytes from the wrapped stream."""
        chunk = self._raw.read(size)
        # Count what actually arrived, not what was requested.
        self._downloaded += len(chunk)

        now = time.time()
        if now - self._last_printed > self._print_interval:
            speed_mps = self._downloaded / (now - self._started_at) / 1024 / 1024
            if self._total:
                print(
                    f"Downloading: {self._downloaded}/{self._total}\t"
                    f"{100 * self._downloaded / self._total:.0f}%\t"
                    f"{speed_mps:.1f}MB/s"
                )
            else:
                print(f"Downloading: {self._downloaded}\t{speed_mps:.1f}MB/s")
            self._last_printed = now
        return chunk


In [101]:
from torch.utils.data.dataloader import DataLoader

# Smoke test: build the training split (downloading it if necessary) and print
# the country labels and transcriptions of every batch of 64 samples.
dataset = Pandastic4Dataset()
data = DataLoader(dataset, batch_size=64)
for batch in data:
    # Each batch is (audio paths, countries, transcriptions); the paths are
    # not needed for this check.
    country, transcription = batch[1], batch[2]
    print(country, transcription)

6810
('GB', 'GB', 'KR', 'US', 'US', 'US', 'US', 'GB', 'KR', 'KR', 'KR', 'KR', 'KR', 'KR', 'GB', 'US', 'US', 'KR', 'GB', 'GB', 'GB', 'GB', 'KR', 'US', 'GB', 'US', 'KR', 'GB', 'GB', 'KR', 'GB', 'KR', 'KR', 'GB', 'KR', 'KR', 'US', 'US', 'KR', 'GB', 'US', 'KR', 'US', 'KR', 'US', 'GB', 'US', 'GB', 'KR', 'GB', 'US', 'KR', 'KR', 'US', 'KR', 'GB', 'KR', 'US', 'KR', 'KR', 'US', 'KR', 'US', 'KR') ('He stopped as abruptly as she had done.', 'Still, she was on her own, and not very brave.', ' I left some donuts on the side of the road.', "but he didn't see none then the circus started an b'lieve me it was some circus jo hadn't had much action for some time", 'with it and that though they might peck and mend at the body he had received his final orders', 'daring vigorous and muscular fish armed with prickles on its head and stings on its fins a real scorpion measuring two to three meters the ruthless enemy', 'i have refused refused why for reasons of my own', 'That might be wise, given our record.'