In [32]:
from pydub import AudioSegment

# === SETTINGS ===
# Source MP3 and the WAV file this cell writes.
input_mp3_path, output_wav_path = "1.6.mp3", "sir_audio.wav"

# === CONVERSION ===
# Decode the MP3 into an in-memory segment, then write that segment out as WAV.
audio = AudioSegment.from_mp3(input_mp3_path)
audio.export(output_wav_path, format="wav")

# Confirmation message naming both paths.
print(f"✅ Successfully converted {input_mp3_path} to {output_wav_path}")


✅ Successfully converted 1.6.mp3 to sir_audio.wav


In [33]:
import os
import subprocess
from pydub import AudioSegment

# === CONFIGURATION ===
FULL_AUDIO = 'sir.wav'                 # not read by this cell; kept so any other cell that refers to it still works
SOURCE_AUDIO = 'sir_audio.wav'         # WAV file loaded below (previously a bare literal in the body)
FINAL_AUDIO = 'finetuning_audio.wav'   # where the converted audio is written
TARGET_CHANNELS = 1                    # mono
TARGET_FRAME_RATE = 48000              # Hz

# === Step 1: Load the source WAV, downmix to mono and resample to 48 kHz ===

audio = AudioSegment.from_file(SOURCE_AUDIO)
audio = audio.set_channels(TARGET_CHANNELS).set_frame_rate(TARGET_FRAME_RATE)
audio.export(FINAL_AUDIO, format="wav")

# Report the properties of the converted segment so the result can be checked at a glance.
print(f"✅ Final converted audio saved as: {FINAL_AUDIO}")
print(f"Channels: {audio.channels}")
print(f"Frame rate (sample rate): {audio.frame_rate}")

✅ Final converted audio saved as: finetuning_audio.wav
Channels: 1
Frame rate (sample rate): 48000


In [1]:
import os
from pydub import AudioSegment
from pydub.silence import split_on_silence

def segment_audio(input_file, output_dir="tts_dataset_hindi_48k", min_duration=3, max_duration=7, silence_thresh=None, min_silence_len=500):
    """Split an audio file on silence and save the pieces as numbered WAV clips.

    The file is cut with ``split_on_silence`` and the resulting pieces are
    merged greedily: pieces are appended to a buffer until adding the next one
    would reach ``max_duration``; the buffer is then written out, provided it
    is at least ``min_duration`` long, and a new buffer is started.

    Args:
        input_file: Path of the audio file to segment.
        output_dir: Directory for the clips (created if missing). Clips are
            named ``chunk_0000.wav``, ``chunk_0001.wav``, ...
        min_duration: Minimum clip length in seconds. A buffer shorter than
            this is never written on its own.
        max_duration: Target upper bound in seconds. This is NOT a hard limit:
            a buffer shorter than ``min_duration`` is merged with the next
            piece even if the result is longer, and a single piece longer than
            ``max_duration`` is kept whole.
        silence_thresh: Silence threshold passed to ``split_on_silence``.
            When ``None`` it is set to ``audio.dBFS - 14``.
        min_silence_len: Minimum silence length passed to ``split_on_silence``.

    Returns:
        None. Progress and a summary are printed; clips are written to disk.

    Notes:
        Audio left in the buffer at the end is discarded when it is shorter
        than ``min_duration``. The reported chunk count is the number of files
        actually written.
    """
    # Load full audio
    audio = AudioSegment.from_file(input_file)
    total_original_duration = len(audio) / 1000.0  # segment length is in ms -> seconds
    print(f"🎧 Total original audio duration: {total_original_duration:.2f} seconds")

    # Set silence threshold dynamically if not provided
    if silence_thresh is None:
        silence_thresh = audio.dBFS - 14

    # Split based on silence
    chunks = split_on_silence(audio,
                              min_silence_len=min_silence_len,
                              silence_thresh=silence_thresh,
                              keep_silence=250)

    os.makedirs(output_dir, exist_ok=True)

    min_ms = min_duration * 1000
    max_ms = max_duration * 1000
    saved_durations = []  # seconds, one entry per clip written to disk

    def _save(clip):
        # Write one clip; its index is the number of clips saved so far.
        chunk_filename = f"chunk_{len(saved_durations):04d}.wav"
        # export() returns the open output file object; close it right away
        # instead of leaving that to garbage collection.
        clip.export(os.path.join(output_dir, chunk_filename), format="wav").close()
        duration_sec = len(clip) / 1000.0
        saved_durations.append(duration_sec)
        print(f"💾 Saved {chunk_filename}: Duration = {duration_sec:.2f}s")

    temp_chunk = AudioSegment.empty()
    for chunk in chunks:
        buffer_is_full = len(temp_chunk) + len(chunk) >= max_ms
        if buffer_is_full and len(temp_chunk) >= min_ms:
            _save(temp_chunk)
            temp_chunk = chunk
        else:
            # Either there is still room, or the buffer is too short to be
            # saved on its own and has to absorb the next piece.
            temp_chunk += chunk

    # Save remaining audio (dropped if shorter than min_duration)
    if len(temp_chunk) >= min_ms:
        _save(temp_chunk)

    chunk_count = len(saved_durations)
    total_chunks_duration = sum(saved_durations)

    # Final summary
    print("\n📊 Summary:")
    print(f"🔹 Total original audio duration   : {total_original_duration:.2f} seconds")
    print(f"🔹 Total duration of all chunks    : {total_chunks_duration:.2f} seconds")
    print(f"🔹 Number of chunks created        : {chunk_count}")

    # Optional warning
    diff = abs(total_original_duration - total_chunks_duration)
    if diff > 1.0:
        print(f"⚠️ Warning: Duration mismatch of {diff:.2f} seconds due to silence trimming or rounding.")

# Example usage: segment the WAV exported earlier as FINAL_AUDIO, relying on the
# function's defaults (clips go to "tts_dataset_hindi_48k", min 3 s / target max 7 s).
segment_audio("finetuning_audio.wav")


🎧 Total original audio duration: 1833.17 seconds
💾 Saved chunk_0000.wav: Duration = 10.06s
💾 Saved chunk_0001.wav: Duration = 15.97s
💾 Saved chunk_0002.wav: Duration = 5.06s
💾 Saved chunk_0003.wav: Duration = 6.13s
💾 Saved chunk_0004.wav: Duration = 6.58s
💾 Saved chunk_0005.wav: Duration = 6.76s
💾 Saved chunk_0006.wav: Duration = 6.92s
💾 Saved chunk_0007.wav: Duration = 3.04s
💾 Saved chunk_0008.wav: Duration = 4.17s
💾 Saved chunk_0009.wav: Duration = 6.57s
💾 Saved chunk_0010.wav: Duration = 4.68s
💾 Saved chunk_0011.wav: Duration = 5.96s
💾 Saved chunk_0012.wav: Duration = 6.63s
💾 Saved chunk_0013.wav: Duration = 8.53s
💾 Saved chunk_0014.wav: Duration = 7.20s
💾 Saved chunk_0015.wav: Duration = 6.76s
💾 Saved chunk_0016.wav: Duration = 4.20s
💾 Saved chunk_0017.wav: Duration = 8.19s
💾 Saved chunk_0018.wav: Duration = 5.67s
💾 Saved chunk_0019.wav: Duration = 7.20s
💾 Saved chunk_0020.wav: Duration = 7.37s
💾 Saved chunk_0021.wav: Duration = 6.95s
💾 Saved chunk_0022.wav: Duration = 7.38s
💾 Save

In [2]:
import os
from pydub import AudioSegment

def convert_to_16k(input_dir="tts_dataset_hindi_48k", output_dir="tts_dataset_hindi_16k"):
    """Re-save every ``.wav`` file of ``input_dir`` at a 16 kHz frame rate.

    Files are processed in sorted name order and written to ``output_dir``
    (created if needed) under the same file name. Progress is printed per
    file; nothing is returned.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Only names ending in ".wav" are picked up; sort for a stable order.
    wav_files = [entry for entry in os.listdir(input_dir) if entry.endswith(".wav")]
    wav_files.sort()
    print(f"🔍 Found {len(wav_files)} files in '{input_dir}'")

    for wav_name in wav_files:
        output_path = os.path.join(output_dir, wav_name)

        # Load the clip and change its frame rate to 16 kHz in one chain.
        resampled = AudioSegment.from_wav(os.path.join(input_dir, wav_name)).set_frame_rate(16000)

        # Write the resampled clip under the same name in the output directory.
        resampled.export(output_path, format="wav")
        print(f"🎵 Converted and saved: {output_path}")

    print(f"\n✅ All files saved in '{output_dir}' as 16 kHz WAV.")

# Example usage: with the defaults this reads "tts_dataset_hindi_48k" and
# writes the 16 kHz copies to "tts_dataset_hindi_16k".
convert_to_16k()


🔍 Found 238 files in 'tts_dataset_hindi_48k'
🎵 Converted and saved: tts_dataset_hindi_16k/chunk_0000.wav
🎵 Converted and saved: tts_dataset_hindi_16k/chunk_0001.wav
🎵 Converted and saved: tts_dataset_hindi_16k/chunk_0002.wav
🎵 Converted and saved: tts_dataset_hindi_16k/chunk_0003.wav
🎵 Converted and saved: tts_dataset_hindi_16k/chunk_0004.wav
🎵 Converted and saved: tts_dataset_hindi_16k/chunk_0005.wav
🎵 Converted and saved: tts_dataset_hindi_16k/chunk_0006.wav
🎵 Converted and saved: tts_dataset_hindi_16k/chunk_0007.wav
🎵 Converted and saved: tts_dataset_hindi_16k/chunk_0008.wav
🎵 Converted and saved: tts_dataset_hindi_16k/chunk_0009.wav
🎵 Converted and saved: tts_dataset_hindi_16k/chunk_0010.wav
🎵 Converted and saved: tts_dataset_hindi_16k/chunk_0011.wav
🎵 Converted and saved: tts_dataset_hindi_16k/chunk_0012.wav
🎵 Converted and saved: tts_dataset_hindi_16k/chunk_0013.wav
🎵 Converted and saved: tts_dataset_hindi_16k/chunk_0014.wav
🎵 Converted and saved: tts_dataset_hindi_16k/chunk_0015

In [6]:
import os
import json
from joblib import Parallel, delayed
import subprocess

# === CONFIGURATION ===
# Model checkpoint handed to the inference script. Alternative path (disabled):
#MODEL_PATH = "/home/karan/Fairseq-Inference/IndicWav2Vec_Hindi_SPRING_INX.pt"
MODEL_PATH = "/home/karan/Fairseq-Inference/SPRING_INX_data2vec_aqc_Hindi.pt"

AUDIO_DIR = "tts_dataset_hindi_16k"      # directory scanned for .wav inputs
SCRIPT_PATH = "infer_optimized.py"       # script launched once per batch
OUTPUT_JSONL = "transcriptions.jsonl"    # results file, one JSON object per line

NUM_GPUS = 1  # You can set to 4 if needed
BATCH_SIZE = 209 # <<< DIRECTLY SET BATCH SIZE HERE

print(f"🛠️ Using BATCH_SIZE = {BATCH_SIZE}")

# === Collect every .wav path under AUDIO_DIR, in sorted order
wav_files = [
    os.path.join(AUDIO_DIR, entry)
    for entry in os.listdir(AUDIO_DIR)
    if entry.endswith(".wav")
]
wav_files.sort()

# === Slice the list into consecutive batches (the last one may be shorter)
batches = [
    wav_files[start:start + BATCH_SIZE]
    for start in range(0, len(wav_files), BATCH_SIZE)
]

print(f"🔢 Total batches: {len(batches)}")

# === Worker for one batch
def process_batch(batch_idx, batch_files):
    """Run the inference script on one batch of files and parse its output.

    The script at ``SCRIPT_PATH`` is started as
    ``python SCRIPT_PATH MODEL_PATH <gpu_id> <file> ...`` where ``gpu_id`` is
    ``batch_idx % NUM_GPUS``. Each non-blank stdout line is expected to hold
    one result, written either as a Python literal or as JSON.

    Args:
        batch_idx: Index of the batch; used for logging and GPU selection.
        batch_files: List of audio file paths passed to the script.

    Returns:
        A list with one parsed object per successfully parsed stdout line.
        An empty list is returned when the subprocess exits with a non-zero
        status (its stderr is printed). Lines that cannot be parsed are
        reported and skipped.
    """
    # Local import: `ast` is not among this cell's top-level imports.
    import ast

    gpu_id = batch_idx % NUM_GPUS
    print(f"🚀 Processing batch {batch_idx} with {len(batch_files)} files on GPU {gpu_id}")

    cmd = ["python", SCRIPT_PATH, MODEL_PATH, str(gpu_id)] + batch_files

    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        print(f"❌ Error in batch {batch_idx}: {result.stderr.strip()}")
        return []

    parsed = []
    for line in result.stdout.strip().splitlines():
        text = line.strip()
        if not text:
            continue  # blank lines carry no result
        try:
            # literal_eval only accepts literals, so subprocess output is
            # never executed as code (the previous eval() would run it).
            res = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as literal_error:
            try:
                # Fall back to JSON for lines using true/false/null.
                res = json.loads(text)
            except ValueError:
                print(f"⚠️ Parse error: {literal_error} for line: {line}")
                continue
        parsed.append(res)
    return parsed

# === Parallel execution
# One joblib worker per configured GPU; each task transcribes one batch.
all_results = Parallel(n_jobs=NUM_GPUS)(delayed(process_batch)(idx, batch) for idx, batch in enumerate(batches))

# === Flatten results
flat_results = [item for sublist in all_results for item in sublist]

# === Save final JSONL
with open(OUTPUT_JSONL, 'w', encoding='utf-8') as f:
    for item in flat_results:
        json.dump(item, f, ensure_ascii=False)
        f.write('\n')

# process_batch returns an empty list when its subprocess fails (or when no
# output line could be parsed), so an empty result for a non-empty batch means
# that batch contributed nothing. Report it instead of claiming full success.
failed_batches = [
    idx
    for idx, (batch, batch_result) in enumerate(zip(batches, all_results))
    if batch and not batch_result
]

if failed_batches:
    print(f"\n⚠️ Saved {len(flat_results)} transcriptions to {OUTPUT_JSONL}, "
          f"but {len(failed_batches)} of {len(batches)} batches produced no results: {failed_batches}")
else:
    print(f"\n✅ All transcriptions saved to {OUTPUT_JSONL}")


🛠️ Using BATCH_SIZE = 209
🔢 Total batches: 2
🚀 Processing batch 0 with 209 files on GPU 0
❌ Error in batch 0: 2025-05-09 18:41:28.045960: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-09 18:41:28.047243: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-05-09 18:41:28.073641: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  state = torch.load(f, map_location=torch.device("cpu"))
Traceback (most recent call last):
  File "infer_optimized.py", line 59, in <modu