In [None]:
# Environment setup: pin pip, install the Python packages used below, then
# install the libsox-dev system package.
# NOTE(review): the pip pin presumably works around a dependency-resolution
# problem with one of the packages on the next line — TODO confirm.
!pip install pip==23.1.2
!pip install datasets pydub torchaudio fairseq
!sudo apt install libsox-dev

In [None]:
# Build yt-dlp from a fresh clone and copy the resulting executable to
# /usr/local/bin.
# Note: %cd changes the kernel's working directory to ./yt-dlp for every
# later cell.
# NOTE(review): no `pip install yt-dlp` is visible, so the later
# `import yt_dlp` presumably resolves from this checkout — TODO confirm.
!git clone https://github.com/yt-dlp/yt-dlp.git yt-dlp
%cd yt-dlp
!make yt-dlp
!sudo cp yt-dlp /usr/local/bin/

In [None]:
# Cookie jar contents in Netscape cookie-file format (tab-separated fields);
# the values shown here are placeholders.
# NOTE(review): none of the visible cells writes this string to
# /content/cookies.txt, the path the download cell passes as 'cookiefile' —
# confirm how that file is meant to be created. The string also starts with a
# blank line before the "# Netscape HTTP Cookie File" header.
cookies = """
# Netscape HTTP Cookie File
# This file is generated by youtube-dl.  Do not edit.

.google.com	TRUE	/	TRUE	1748961130	AEC	wwwwwww
.youtube.com	TRUE	/	TRUE	1733432221	GPS	1
.youtube.com	TRUE	/	TRUE	1749240520	NID	wwwwwww
.youtube.com	TRUE	/	TRUE	1767989250	PREF	tz=wwwww
.youtube.com	TRUE	/	TRUE	0	SOCS	CAI
.youtube.com	TRUE	/	TRUE	1748960410	VISITOR_INFO1_LIVE	wwwww
.youtube.com	TRUE	/	TRUE	1748960410	VISITOR_PRIVACY_METADATA	wwwww
.youtube.com	TRUE	/	TRUE	0	YSC	wwwww
"""

In [None]:
# List handed to `ydl.download(...)` in the download cell. It is empty here,
# so nothing is downloaded until entries are added.
files_to_download_extended = []

In [None]:
from __future__ import unicode_literals
import yt_dlp

# Download every entry of `files_to_download_extended` with yt-dlp and run the
# FFmpegExtractAudio post-processor to produce WAV files under /content/ytm.
#
# Alternative (video-first) format selector kept for reference:
#    'format': '(bestvideo[width>=1080][ext=mp4]/bestvideo)+bestaudio/best', #Ensures best settings
ydl_opts = {
    'format': 'best', # yt-dlp format selector; audio is extracted by the post-processor below
    'postprocessors': [
      {
          'key': 'FFmpegExtractAudio',
          'preferredcodec': 'wav',
          # NOTE(review): a bitrate-style quality value likely has no effect
          # on WAV output — TODO confirm.
          'preferredquality': '320',
          'nopostoverwrites': True,  # Prevents overwriting files that already exist
      }
    ],
    # Output files are named after the video title.
    "outtmpl":'/content/ytm/%(title)s.%(ext)s',
    # NOTE(review): no visible cell creates this file from the `cookies`
    # string defined earlier — confirm it exists before running.
    'cookiefile': '/content/cookies.txt',
    "retries": 10,
    "fragment_retries": 10,
    "sleep_interval": 5,
    # Browser-like User-Agent sent with every HTTP request.
    "http_headers": {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    },}
# The context manager closes the downloader once all entries were processed.
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    ydl.download(files_to_download_extended)

In [None]:
import os
from pydub import AudioSegment
import time

import sys
import csv
import pandas as pd
import librosa
import tqdm
import pyarrow as pa
import pyarrow.parquet as pq
import soundfile as sf
import torch
import torchaudio
from fairseq.data.audio.audio_utils import get_features_or_waveform

In [None]:
def create_wav_meta_tsv(audio_dir, output_dir):
    """Append one metadata row per ``.wav`` file in ``audio_dir`` to a TSV file.

    Each row holds three tab-separated fields: the file path, its duration in
    seconds, and an integer id computed as ``int(time.time() + duration)`` at
    the moment the row is written.

    Args:
        audio_dir: Directory scanned (non-recursively) for ``.wav`` files.
        output_dir: Path of the TSV *file* to append to. The parameter keeps
            its historical name even though it is not a directory.

    Notes:
        * The parent directory of the TSV is created when missing.
        * Paths already present in the TSV are skipped, so calling the
          function again does not add duplicate rows.
        * Files are visited in sorted order to make the output deterministic.
        * Files that cannot be read are reported with ``print`` and skipped.
    """
    parent_dir = os.path.dirname(output_dir)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Remember which paths were recorded by an earlier call so that re-running
    # a notebook cell stays idempotent.
    already_listed = set()
    if os.path.exists(output_dir):
        with open(output_dir, "r") as existing:
            already_listed = {
                line.rstrip("\n").split("\t", 1)[0]
                for line in existing
                if line.strip()
            }

    with open(output_dir, "a") as f:
        for filename in sorted(os.listdir(audio_dir)):
            if not filename.endswith(".wav"):
                continue
            audio_path = os.path.join(audio_dir, filename)
            if audio_path in already_listed:
                continue
            try:
                audio = AudioSegment.from_wav(audio_path)
                duration = len(audio) / 1000.0  # pydub lengths are in milliseconds
                f.write(f"{audio_path}\t{duration}\t{int(time.time() + duration)}\n")
            except Exception as e:
                # Best effort: one unreadable file must not abort the whole scan.
                print(f"Error processing {audio_path}: {e}")

In [None]:
class MfccFeatureReader:
    """Loads audio files and converts them to MFCC-based feature matrices.

    The only state is ``sample_rate``, which is forwarded to the waveform
    loader and to ``torchaudio.compliance.kaldi.mfcc``.
    """

    def __init__(self, sample_rate):
        self.sample_rate = sample_rate

    def read_audio_mp3(self, path):
        """Export an MP3 as a mono WAV at ``self.sample_rate``.

        The WAV is written to ``<path>.wav`` and the name of the exported file
        object is returned.
        """
        converted = (
            AudioSegment.from_mp3(path)
            .set_channels(1)  # mono
            .set_frame_rate(self.sample_rate)
        )
        exported = converted.export(f"{path}.wav", format="wav")
        return exported.name

    def read_audio(self, path, ref_len=None):
        """Load ``path`` as a waveform at ``self.sample_rate``.

        When ``ref_len`` is given and differs from the loaded length by more
        than 160, a diagnostic line is printed; the waveform is returned
        either way.
        """
        wav = get_features_or_waveform(path, need_waveform=True, use_sample_rate=self.sample_rate)
        if ref_len is None:
            return wav
        if abs(ref_len - len(wav)) > 160:
            print(f"ref {ref_len} != read {len(wav)}")
        return wav

    def get_feats(self, path, ref_len=None):
        """Return MFCCs stacked with their deltas and delta-deltas.

        The three blocks are concatenated along the coefficient axis, and the
        result is returned as a contiguous tensor laid out as
        (time, coefficients).
        """
        samples = self.read_audio(path, ref_len=ref_len)
        with torch.no_grad():
            signal = torch.from_numpy(samples).float().view(1, -1)
            # kaldi.mfcc yields (time, freq); the delta helper works on the
            # last axis, hence the switch to (freq, time).
            base = torchaudio.compliance.kaldi.mfcc(
                waveform=signal,
                sample_frequency=self.sample_rate,
                use_energy=False,
            ).transpose(0, 1)
            first_order = torchaudio.functional.compute_deltas(base)
            second_order = torchaudio.functional.compute_deltas(first_order)
            stacked = torch.cat([base, first_order, second_order], dim=0)
            # Back to (time, coefficients) for the caller.
            return stacked.transpose(0, 1).contiguous()

In [None]:
def get_path_iterator(tsv):
    """Read a three-column TSV manifest and expose it as a restartable iterator.

    Args:
        tsv: Path to a tab-separated file whose rows are
            ``<path>, <numeric value>, <id>``.

    Returns:
        A tuple ``(iterate, count)``. ``iterate`` is a zero-argument function
        returning a fresh generator of ``(path, int(float(value)), id)``
        tuples; ``count`` is the number of rows it will yield.

    Blank lines (for example a trailing newline at the end of the file) are
    ignored, so they neither break the three-way unpacking nor inflate
    ``count``.
    """
    with open(tsv, "r") as f:
        lines = [line.rstrip() for line in f]
    # Drop empty rows up front so the reported count matches what is yielded.
    lines = [line for line in lines if line]

    def iterate():
        for line in lines:
            subpath, nsample, idx_id = line.split("\t")
            # The numeric column may be written as a float; it is truncated.
            yield f"{subpath}", int(float(nsample)), idx_id

    return iterate, len(lines)

def wav_to_mfcc(reader, generator, num, feat_dir):
    """Extract features for every manifest entry and write them to Parquet.

    Args:
        reader: Object with ``get_feats(path, ref_len)`` returning a 2-D
            tensor, e.g. ``MfccFeatureReader``.
        generator: Zero-argument callable returning an iterator of
            ``(path, nsample, idx_id)`` tuples (see ``get_path_iterator``).
        num: Number of entries, used only for the progress bar.
        feat_dir: Output location prefix. The file is written to
            ``f"{feat_dir}mfcc_features_1.parquet"``, so a directory must be
            passed with its trailing separator. The parent directory is
            created when missing.

    The Parquet file has three columns:
        * ``features``: one nested ``list<list<float64>>`` matrix per file.
        * ``duration_sec``: ``len(feat)`` for each file. NOTE(review): despite
          the column name this is the number of feature rows, not seconds;
          kept unchanged for compatibility.
        * ``indx_id``: the id taken from the manifest, stored as a string.
    """
    feat_path = f"{feat_dir}mfcc_features_1.parquet"
    parent_dir = os.path.dirname(feat_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    features = []
    lengths = []
    idx_val = []

    for path, nsample, idx_id in tqdm.tqdm(generator(), total=num):
        # NOTE(review): ``nsample`` is forwarded as the reference length; when
        # the manifest stores seconds rather than sample counts the reader's
        # length check is not meaningful — confirm the manifest format.
        feat = reader.get_feats(path, nsample)
        # Each feature matrix is 2-D, so it is stored as a list of rows; a
        # flat list type cannot hold it.
        features.append(feat.cpu().numpy().tolist())
        lengths.append(len(feat))
        idx_val.append(idx_id)

    table = pa.table({
        'features': pa.array(features, type=pa.list_(pa.list_(pa.float64()))),
        'duration_sec': pa.array(lengths, type=pa.int32()),
        # Previously this column was filled from ``lengths`` by mistake.
        'indx_id': pa.array([str(value) for value in idx_val], type=pa.string()),
    })
    # write_table opens, writes and closes the file in one call, so no writer
    # handle can leak when an error occurs.
    pq.write_table(table, feat_path)

def dump_duration(generator, num):
    """Collect the second field of every manifest entry into a list.

    ``generator`` is a zero-argument callable returning an iterator of
    ``(path, nsample, idx_id)`` tuples; ``num`` is only used as the total for
    the progress bar.
    """
    return [nsample for _path, nsample, _idx_id in tqdm.tqdm(generator(), total=num)]

In [None]:
def fix_file_name(audio_meta_file, correct_file_names_str, output_meta_csv_path):
    """Rewrite file names in a manifest by matching rows on their duration.

    Args:
        audio_meta_file: Existing tab-separated metadata file. Column 0 is a
            file name and column 1 a duration; extra columns are ignored.
        correct_file_names_str: Text with one ``name<TAB>duration<TAB>id`` row
            per line.
        output_meta_csv_path: Destination TSV. It is overwritten.

    For each row of ``correct_file_names_str`` the name is replaced by the
    name that ``audio_meta_file`` lists for the same duration (compared as
    floats). Rows without a matching duration keep their original name. If
    several metadata rows share a duration, the last one wins.

    Blank lines in either input are ignored, and an empty
    ``correct_file_names_str`` yields an empty output file.

    Raises:
        ValueError: If a non-blank row has too few columns or a duration that
            is not numeric.
    """
    # duration (as float) -> file name recorded in the existing metadata.
    file_name_mapping = {}
    with open(audio_meta_file, mode='r') as csv_file:
        for row in csv.reader(csv_file, delimiter='\t'):
            if not row:
                continue  # csv.reader yields [] for blank lines
            csv_file_name, csv_time_duration, *_ = row
            file_name_mapping[float(csv_time_duration)] = csv_file_name

    # Parse everything before opening the output so that malformed input does
    # not leave a truncated file behind for blank-line cases.
    parsed_lines = [
        line.rstrip("\r").split("\t")
        for line in correct_file_names_str.strip().split("\n")
        if line.strip()
    ]

    with open(output_meta_csv_path, mode='w', newline='') as out_csv:
        csv_writer = csv.writer(out_csv, delimiter='\t')
        for file_name, time_duration, identifier in parsed_lines:
            updated_file_name = file_name_mapping.get(float(time_duration), file_name)
            # The duration is written back exactly as it was supplied.
            csv_writer.writerow([updated_file_name, time_duration, identifier])


In [None]:
def resample_update_mono_channel(tsv_file_path, target_sample_rate):
    """Resample, in place, every audio file listed in a TSV manifest.

    The first column of the header-less TSV supplies the file paths. A
    single-channel file is first expanded to two identical channels; the
    audio is then resampled to ``target_sample_rate`` and written back over
    the original file. Failures are printed and do not stop the loop.
    """
    manifest = pd.read_csv(tsv_file_path, sep="\t", header=None)

    for file_path in manifest[0]:
        try:
            audio, sample_rate = sf.read(file_path)

            # Mono input: add a channel axis, then copy the channel once.
            if audio.ndim == 1:
                audio = audio[:, None].repeat(2, axis=1)

            multi_channel = audio.ndim > 1
            # librosa expects (channels, samples); soundfile uses the reverse.
            resampled_audio = librosa.resample(
                y=audio.T if multi_channel else audio,
                orig_sr=sample_rate,
                target_sr=target_sample_rate,
            )
            if multi_channel:
                resampled_audio = resampled_audio.T

            sf.write(file_path, resampled_audio, target_sample_rate)
            print(f"Processed and saved: {file_path}")
        except Exception as e:
            print(f"Failed to process {file_path}: {e}")


In [None]:
def waw_to_parquet(wav_meta_file_path, target_sample_rate=16000):
    """Load the audio listed in a TSV manifest into a DataFrame.

    The manifest rows are ``path, duration, id``. Every file is read,
    resampled to ``target_sample_rate`` and stored as a nested Python list.
    Files that fail to load are reported with ``print`` and left out.

    Returns:
        A ``pandas.DataFrame`` with the columns ``wav_features``, ``indx_id``
        and ``duration_sec`` (empty when nothing could be loaded).
    """
    # Pass 1: parse the whole manifest before touching any audio.
    with open(wav_meta_file_path, "r", encoding="utf-8") as tsv_file:
        audio_files = [
            {"file_path": file_path, "duration_sec": float(duration), "indx_id": idx_key}
            for file_path, duration, idx_key in csv.reader(tsv_file, delimiter="\t")
        ]

    # Pass 2: decode and resample each file.
    data = []
    for entry in audio_files:
        file_path = entry["file_path"]
        try:
            audio, sample_rate = sf.read(file_path)
            source = audio.T if audio.ndim > 1 else audio
            audio_features = librosa.resample(
                y=source, orig_sr=sample_rate, target_sr=target_sample_rate
            )
            data.append({
                "wav_features": audio_features.tolist(),
                "indx_id": entry["indx_id"],
                "duration_sec": entry["duration_sec"],
            })
        except Exception as e:
            print(f"Error processing {file_path}: {e}")

    return pd.DataFrame(data)

In [None]:
# MFCC pipeline driver: build the metadata TSV for the downloaded WAV files,
# then extract features for every listed file and write them to Parquet.
meta_file_path = "/content/voice_meta/train.tsv"
audio_dir = "/content/ytm"  # Directory containing wav files
# Trailing slash matters: wav_to_mfcc concatenates this prefix with the
# output file name.
dump_mfcc_feat_dir = "/content/km/"
sample_rate = 16000
reader = MfccFeatureReader(sample_rate)

# Step 1: one TSV row per .wav file found in audio_dir.
create_wav_meta_tsv(audio_dir, meta_file_path)
# Step 2: restartable iterator over the TSV rows plus the row count.
generator, num = get_path_iterator(f"{meta_file_path}")
# Step 3: compute features and write mfcc_features_1.parquet.
wav_to_mfcc(reader, generator, num, dump_mfcc_feat_dir)

In [None]:
# Report the total of the second TSV column across all rows. The sum is
# divided by 3600, i.e. the values are treated as seconds and shown as hours.
generator, num = get_path_iterator(f"{meta_file_path}")
duration_lst = dump_duration(generator, num)
print(f"{sum(duration_lst)/(60*60)} hrs")

In [None]:
# Waveform export driver: rewrite file names in the manifest, resample the
# listed files in place, load them into a DataFrame and save it as Parquet.
output_parquet_file = "wav_features_s1.parquet"
audio_meta_file = "/content/voice_meta/train.tsv"
# One "name<TAB>duration<TAB>id" row per line. fix_file_name splits each row on
# tab characters, so the separators are written as explicit "\t" escapes;
# space-separated text makes the three-way unpacking fail with ValueError.
correct_name_str = "audio.wav\t7455\t11114445"  # or read meta_file_path
output_meta_csv_path = "wav_meta_file_s1.tsv"
target_sample_rate = 16000

fix_file_name(audio_meta_file, correct_name_str, output_meta_csv_path)
# Overwrites the audio files listed in the corrected manifest.
resample_update_mono_channel(output_meta_csv_path, target_sample_rate)
df = waw_to_parquet(output_meta_csv_path, target_sample_rate)
df.to_parquet(output_parquet_file, engine="pyarrow", index=False)

In [None]:
# Upload the MFCC Parquet file to a Hugging Face dataset repository.
# Note the rename: the local file is mfcc_features_1.parquet, the remote file
# is mfcc_features_s1.parquet.
from huggingface_hub import HfApi
api = HfApi()
api.upload_file(
    path_or_fileobj="/content/km/mfcc_features_1.parquet",
    path_in_repo="mfcc_features_s1.parquet",
    # Blank in the notebook: set this to the target dataset repository before
    # running the cell.
    repo_id="",
    repo_type="dataset",
)