In [None]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("mobassir/multi-speaker-bangla-tts")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/mobassir/multi-speaker-bangla-tts?dataset_version_number=1...


100%|██████████| 958M/958M [00:48<00:00, 20.5MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/mobassir/multi-speaker-bangla-tts/versions/1


In [None]:
!ls /root/.cache/kagglehub/datasets/mobassir/multi-speaker-bangla-tts/versions/1/bn_bd/

line_index.tsv	wavs


In [None]:
ls

[0m[01;34msample_data[0m/


# mbart and whisper


In [None]:
import torch
import librosa
import numpy as np
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from transformers import MBartForConditionalGeneration, MBartTokenizer

def transcribe_and_translate_bengali(audio_file_path):
    """
    Transcribe Bengali audio and translate to English using Whisper and mBART.

    Args:
        audio_file_path: Path to the Bengali audio file

    Returns:
        dict: Contains the Bengali transcription and English translation
    """
    print("Loading audio file...")
    # Load and preprocess audio
    audio, sampling_rate = librosa.load(audio_file_path, sr=16000)

    # Load Whisper model and processor for ASR
    print("Loading Whisper model for transcription...")
    whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
    whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")

    # Set the device to GPU if available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    whisper_model = whisper_model.to(device)

    # Process the audio
    input_features = whisper_processor(audio, sampling_rate=16000, return_tensors="pt").input_features
    input_features = input_features.to(device)

    # Generate Bengali transcription
    print("Transcribing Bengali audio...")
    # Force the model to detect Bengali
    forced_decoder_ids = whisper_processor.get_decoder_prompt_ids(language="bn", task="transcribe")
    generated_tokens = whisper_model.generate(
        input_features,
        forced_decoder_ids=forced_decoder_ids,
        max_length=448
    )

    # Decode the generated tokens to get the Bengali transcription
    bengali_transcription = whisper_processor.batch_decode(generated_tokens, skip_special_tokens=True)[0]

    # Load mBART model for translation
    print("Loading mBART model for translation...")
    mbart_tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
    mbart_model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
    mbart_model = mbart_model.to(device)

    # Translate from Bengali to English
    print("Translating Bengali to English...")
    # Set source language to Bengali
    mbart_tokenizer.src_lang = "bn_IN"

    # Tokenize the Bengali text
    encoded_bn = mbart_tokenizer(bengali_transcription, return_tensors="pt")
    encoded_bn = {k: v.to(device) for k, v in encoded_bn.items()}

    # Generate the translation
    generated_tokens = mbart_model.generate(
        **encoded_bn,
        forced_bos_token_id=mbart_tokenizer.lang_code_to_id["en_XX"],
        max_length=1024
    )

    # Decode the translation
    english_translation = mbart_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

    return {
        "bengali_transcription": bengali_transcription,
        "english_translation": english_translation
    }

def batch_process_audio_files(audio_file_paths, output_path="results.txt"):
    """
    Process a batch of Bengali audio files and save results to a file.

    Args:
        audio_file_paths: List of paths to Bengali audio files
        output_path: Path where results will be saved
    """
    results = []

    for i, audio_path in enumerate(audio_file_paths):
        print(f"Processing file {i+1}/{len(audio_file_paths)}: {audio_path}")
        try:
            result = transcribe_and_translate_bengali(audio_path)
            result["file_path"] = audio_path
            results.append(result)

            # Save results incrementally to avoid losing everything if an error occurs
            print(f"File: {audio_path}\n")
            print(f"Bengali Transcription: {result['bengali_transcription']}\n")
            print(f"English Translation: {result['english_translation']}\n")
            print("-" * 80 + "\n")


        except Exception as e:
            print(f"Error processing {audio_path}: {e}")

    return results

# Example usage:
def caller():
    # List of audio files to process
    audio_files = [
        "/root/.cache/kagglehub/datasets/mobassir/multi-speaker-bangla-tts/versions/1/bn_bd/wavs/ban_00779_01116157520.wav",
        "/root/.cache/kagglehub/datasets/mobassir/multi-speaker-bangla-tts/versions/1/bn_bd/wavs/ban_02194_00528356747.wav",
        # Add more files as needed
    ]

    # Process the audio files
    results = batch_process_audio_files(audio_files)

    # Print a summary
    print(f"Successfully processed {len(results)} out of {len(audio_files)} files.")

caller()

Processing file 1/2: /root/.cache/kagglehub/datasets/mobassir/multi-speaker-bangla-tts/versions/1/bn_bd/wavs/ban_00779_01116157520.wav
Loading audio file...
Loading Whisper model for transcription...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.99k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.17G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/4.29k [00:00<?, ?B/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Transcribing Bengali audio...
Loading mBART model for translation...


tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MBart50Tokenizer'. 
The class this function is called from is 'MBartTokenizer'.


model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

Translating Bengali to English...
Error processing /root/.cache/kagglehub/datasets/mobassir/multi-speaker-bangla-tts/versions/1/bn_bd/wavs/ban_00779_01116157520.wav: 'bn_IN'
Processing file 2/2: /root/.cache/kagglehub/datasets/mobassir/multi-speaker-bangla-tts/versions/1/bn_bd/wavs/ban_02194_00528356747.wav
Loading audio file...
Loading Whisper model for transcription...
Transcribing Bengali audio...


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MBart50Tokenizer'. 
The class this function is called from is 'MBartTokenizer'.


Loading mBART model for translation...
Translating Bengali to English...
Error processing /root/.cache/kagglehub/datasets/mobassir/multi-speaker-bangla-tts/versions/1/bn_bd/wavs/ban_02194_00528356747.wav: 'bn_IN'
Successfully processed 0 out of 2 files.


# wv2


In [None]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch
import librosa

model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-xlsr-53-bengali")
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-xlsr-53-bengali")

audio, rate = librosa.load("/root/.cache/kagglehub/datasets/mobassir/multi-speaker-bangla-tts/versions/1/bn_bd/wavs/ban_00779_01116157520.wav", sr=16000)
input_values = processor(audio, return_tensors="pt", sampling_rate=16000).input_values
logits = model(input_values).logits
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)[0]

print(transcription)


OSError: facebook/wav2vec2-large-xlsr-53-bengali is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

# finetune


In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.4.1-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [None]:
!ls /root/.cache/kagglehub/datasets/mobassir/multi-speaker-bangla-tts/versions/1/bn_bd/

line_index.tsv	wavs


In [None]:
import torch
import pandas as pd
import numpy as np
from datasets import Dataset, Audio, DatasetDict
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
import os
from dataclasses import dataclass
from typing import Any, Dict, List, Union

# Set up error handling and logging
import logging
logging.basicConfig(level=logging.INFO)

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """
    Data collator for padding the speech and label sequences.
    """
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Extract audio and transcription
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        # Pad the inputs
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # Pad the labels
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # Replace padding token id with -100 so it's ignored in loss calculation
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # Add labels to batch
        batch["labels"] = labels

        return batch

def prepare_dataset(csv_path, audio_dir, processor):
    """
    Prepare the dataset for fine-tuning.

    Args:
        csv_path: Path to CSV file with audio file paths and transcriptions
        audio_dir: Directory containing audio files
        processor: Whisper processor

    Returns:
        Dataset ready for training
    """
    try:
        # Load the CSV file
        df = pd.read_csv(csv_path)

        # Ensure required columns exist
        required_columns = ["file_path", "transcription"]
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            raise ValueError(f"Missing required columns in CSV: {missing_columns}")

        # Create full paths to audio files
        df["file_path"] = df["file_path"].apply(lambda x: os.path.join(audio_dir, x))

        # Check if files exist
        missing_files = [file for file in df["file_path"] if not os.path.exists(file)]
        if missing_files:
            logging.warning(f"Missing {len(missing_files)} audio files. First few: {missing_files[:5]}")
            # Filter out missing files
            df = df[df["file_path"].apply(lambda x: os.path.exists(x))]

        # Create the dataset
        dataset = Dataset.from_pandas(df)

        # Load audio files
        dataset = dataset.cast_column("file_path", Audio(sampling_rate=16000))

        # Define preprocessing function
        def prepare_dataset(batch):
            # Load and resample audio data
            audio = batch["file_path"]

            # Compute input features
            batch["input_features"] = processor.feature_extractor(
                audio["array"], sampling_rate=audio["sampling_rate"]
            ).input_features[0]

            # Tokenize transcriptions
            batch["labels"] = processor.tokenizer(
                batch["transcription"], padding="max_length", max_length=128
            ).input_ids

            return batch

        # Apply preprocessing
        dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names)

        return dataset

    except Exception as e:
        logging.error(f"Error preparing dataset: {str(e)}")
        raise

def finetune_whisper(
    train_csv_path,
    val_csv_path,
    audio_dir,
    output_dir="./whisper-bengali-finetuned",
    base_model="openai/whisper-small",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
):
    """
    Fine-tune Whisper model on Bengali data.

    Args:
        train_csv_path: Path to training CSV
        val_csv_path: Path to validation CSV
        audio_dir: Directory containing audio files
        output_dir: Directory to save fine-tuned model
        base_model: Base Whisper model to fine-tune
        num_train_epochs: Number of training epochs
        per_device_train_batch_size: Batch size per device
        gradient_accumulation_steps: Number of gradient accumulation steps
        learning_rate: Learning rate for training
    """
    try:
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # Load processor and model
        processor = WhisperProcessor.from_pretrained(base_model)
        model = WhisperForConditionalGeneration.from_pretrained(base_model)

        # Set Bengali as the language and transcription as the task
        processor.tokenizer.set_prefix_tokens(language="bn", task="transcribe")

        # Prepare datasets
        train_dataset = prepare_dataset(train_csv_path, audio_dir, processor)
        val_dataset = prepare_dataset(val_csv_path, audio_dir, processor)

        # Create a data collator
        data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

        # Define training arguments
        training_args = Seq2SeqTrainingArguments(
            output_dir=output_dir,
            per_device_train_batch_size=per_device_train_batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps,
            learning_rate=learning_rate,
            warmup_steps=500,
            max_steps=4000,
            gradient_checkpointing=True,
            fp16=True,
            evaluation_strategy="steps",
            per_device_eval_batch_size=8,
            predict_with_generate=True,
            generation_max_length=225,
            save_steps=1000,
            eval_steps=1000,
            logging_steps=25,
            report_to=["tensorboard"],
            load_best_model_at_end=True,
            metric_for_best_model="wer",
            greater_is_better=False,
            push_to_hub=False,
        )

        # Create trainer
        trainer = Seq2SeqTrainer(
            args=training_args,
            model=model,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            data_collator=data_collator,
            tokenizer=processor.tokenizer,
        )

        # Start training
        logging.info("Starting fine-tuning...")
        trainer.train()

        # Save the fine-tuned model
        model.save_pretrained(output_dir)
        processor.save_pretrained(output_dir)
        logging.info(f"Model saved to {output_dir}")

    except Exception as e:
        logging.error(f"Error during fine-tuning: {str(e)}")
        raise

# Function to create a sample CSV file for testing
def create_sample_csv(output_path, audio_files, transcriptions):
    """
    Create a sample CSV file for testing.

    Args:
        output_path: Path to save the CSV file
        audio_files: List of audio file paths
        transcriptions: List of transcriptions
    """
    df = pd.DataFrame({
        "file_path": audio_files,
        "transcription": transcriptions
    })
    df.to_csv(output_path, index=False)
    logging.info(f"Sample CSV created at {output_path}")

# Main execution
if __name__ == "__main__":
    # Example usage
    try:
        # Define paths
        AUDIO_DIR = "./root/.cache/kagglehub/datasets/mobassir/multi-speaker-bangla-tts/versions/1/bn_bd/wavs"
        TRAIN_CSV = "./content/output_file.csv"
        VAL_CSV = "./content/output_file.csv"
        OUTPUT_DIR = "./whisper-bengali-finetuned"

        # Check if GPU is available
        if torch.cuda.is_available():
            logging.info(f"GPU available: {torch.cuda.get_device_name(0)}")
        else:
            logging.warning("No GPU available. Training will be slow.")

        # Start fine-tuning
        finetune_whisper(
            train_csv_path=TRAIN_CSV,
            val_csv_path=VAL_CSV,
            audio_dir=AUDIO_DIR,
            output_dir=OUTPUT_DIR,
            base_model="openai/whisper-small",  # Use small model for faster training
            num_train_epochs=3,
            per_device_train_batch_size=8,  # Adjust based on GPU memory
            gradient_accumulation_steps=4,  # Increase for smaller batch sizes
            learning_rate=1e-5,
        )

    except Exception as e:
        logging.error(f"Error in main execution: {str(e)}")

ERROR:root:Error preparing dataset: [Errno 2] No such file or directory: './content/output_file.csv'
ERROR:root:Error during fine-tuning: [Errno 2] No such file or directory: './content/output_file.csv'
ERROR:root:Error in main execution: [Errno 2] No such file or directory: './content/output_file.csv'


# speech to text

In [None]:
import pandas as pd

# Load the TSV file
df = pd.read_csv("/root/.cache/kagglehub/datasets/mobassir/multi-speaker-bangla-tts/versions/1/bn_bd/line_index.tsv", sep="\t")

# Save as CSV
df.to_csv("output_file.csv", index=False)


In [None]:
%pip install banglaspeech2text --upgrade -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m112.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m84.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m46.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import pandas as pd

# Define column names
column_names = ["audio", "text"]

# Read CSV without headers
df = pd.read_csv("/content/output_file.csv", header=None, names=column_names)

# Save back to CSV
df.to_csv("output_w_header_file.csv", index=False)


In [None]:
import pandas as pd
data= pd.read_csv("/content/output_w_header_file.csv")
data.head()

Unnamed: 0,audio,text
0,ban_00737_00012222450,এইচআর টেক্সটাইল বাংলাদেশের ভেতরে একাধিক আউটলেট...
1,ban_00737_00015581920,স্ট্যান্ডার্ড ব্যাংক এ ইসলামী ব্যাংকিং এর সুবি...
2,ban_00737_00028634754,লাফার্জ সুরমা সিমেন্ট সর্বাধিক ব্যবহৃত সিমেন্ট...
3,ban_00737_00035050432,পিপলস ইন্স্যুরেন্স অব চায়না ছেষট্টি বছর আগে ব্...
4,ban_00737_00068052117,বয়গেস একটি ইন্ডাস্ট্রিয়াল গ্রুপ


In [None]:
from banglaspeech2text import Speech2Text

# Let's see what models are available
available_models = Speech2Text.list_models()
print(available_models)

stt = Speech2Text("base")

# Now let's transcribe a file
transcription = stt.recognize("/content/ban_00737_00012222450.wav")
print(transcription)

Available models:
	tiny:
		whisper-tiny-bn	74 WER	~151 MB	by shhossain (apache-2.0)
		whisper-tiny-bn	75 WER	~151 MB	by Emrul Hasan Zawad (apache-2.0)

	base:
		whisper-base-bn	46 WER	~300 MB	by Sifat (apache-2.0)

	large:
		whisper-large-v2-bn	11 WER	~3.1 GB	by Anurag Singh (apache-2.0)

	small:
		whisper-small-bn	18 WER	~1 GB	by Anurag Singh (apache-2.0)



For more models, visit https://huggingface.co/models?pipeline_tag=automatic-speech-recognition&language=bn&sort=likes
2025-03-20 07:52:05,110 - BanglaSpeech2Text.speech2text - INFO - Initializing Speech2Text with model: base


INFO:BanglaSpeech2Text.speech2text:Initializing Speech2Text with model: base


2025-03-20 07:52:05,229 - BanglaSpeech2Text.speech2text - INFO - Using compute type: float16


INFO:BanglaSpeech2Text.speech2text:Using compute type: float16


2025-03-20 07:52:05,236 - BanglaSpeech2Text.converter - INFO - CTranslate2 model not found at /root/.cache/huggingface/shhossain--whisper-base-bn-ct2-float16. Converting...


INFO:BanglaSpeech2Text.converter:CTranslate2 model not found at /root/.cache/huggingface/shhossain--whisper-base-bn-ct2-float16. Converting...


2025-03-20 07:52:08,631 - BanglaSpeech2Text.converter - INFO - Converting model shhossain/whisper-base-bn to CTranslate2 format...


INFO:BanglaSpeech2Text.converter:Converting model shhossain/whisper-base-bn to CTranslate2 format...


2025-03-20 07:52:08,633 - BanglaSpeech2Text.converter - INFO - Command: ct2-transformers-converter --model shhossain/whisper-base-bn --output_dir /root/.cache/huggingface/shhossain--whisper-base-bn-ct2-float16 --quantization float16


INFO:BanglaSpeech2Text.converter:Command: ct2-transformers-converter --model shhossain/whisper-base-bn --output_dir /root/.cache/huggingface/shhossain--whisper-base-bn-ct2-float16 --quantization float16


2025-03-20 07:52:40,588 - BanglaSpeech2Text.converter - INFO - Successfully converted model to /root/.cache/huggingface/shhossain--whisper-base-bn-ct2-float16


INFO:BanglaSpeech2Text.converter:Successfully converted model to /root/.cache/huggingface/shhossain--whisper-base-bn-ct2-float16
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

FileNotFoundError: [Errno 2] No such file or directory: '/content/ban_00737_00012222450.wav'

In [None]:
transcription = stt.recognize("/root/.cache/kagglehub/datasets/mobassir/multi-speaker-bangla-tts/versions/1/bn_bd/wavs/ban_00737_00012222450.wav")
print(transcription)

এইচার টেক্সাইল বাংলাদেশের ভেতরে একাথিক আল্কলেটার মাধ্যমে শারী, বাচ্চাদের পোশাক, মহিলাদের পোশাক এবং অন্যান্য টে�


# facebook m4Tv2

In [None]:
import pandas as pd

# Load the TSV file
df = pd.read_csv("/root/.cache/kagglehub/datasets/mobassir/multi-speaker-bangla-tts/versions/1/bn_bd/line_index.tsv", sep="\t")

# Save as CSV
df.to_csv("output_file.csv", index=False)

In [None]:
# Define column names
column_names = ["audio", "text"]

# Read CSV without headers
df = pd.read_csv("/content/output_file.csv", header=None, names=column_names)

# Save back to CSV
df.to_csv("output_w_header_file.csv", index=False)

In [None]:
df.head()

Unnamed: 0,audio,text
0,ban_00737_00012222450,এইচআর টেক্সটাইল বাংলাদেশের ভেতরে একাধিক আউটলেট...
1,ban_00737_00015581920,স্ট্যান্ডার্ড ব্যাংক এ ইসলামী ব্যাংকিং এর সুবি...
2,ban_00737_00028634754,লাফার্জ সুরমা সিমেন্ট সর্বাধিক ব্যবহৃত সিমেন্ট...
3,ban_00737_00035050432,পিপলস ইন্স্যুরেন্স অব চায়না ছেষট্টি বছর আগে ব্...
4,ban_00737_00068052117,বয়গেস একটি ইন্ডাস্ট্রিয়াল গ্রুপ


In [None]:
%pip install git+https://github.com/huggingface/transformers.git sentencepiece


Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-_9510g83
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-_9510g83
  Resolved https://github.com/huggingface/transformers.git to commit bd41b9c1ac35f81b7672d0b908bad6784dfd768b
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-4.51.0.dev0-py3-none-any.whl size=11136707 sha256=f5f53cd33b37139e80675028b586e1aad447349c00834d5ba8220feeeeb0fa58
  Stored in directory: /tmp/pip-ephem-wheel-cache-piuk5d48/wheels/32/4b/78/f195c684dd3a9ed21f3b39fe8f85b48df7918581b6437be143
Successfully b

In [None]:
from transformers import AutoProcessor, SeamlessM4Tv2Model

processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
model = SeamlessM4Tv2Model.from_pretrained("facebook/seamless-m4t-v2-large")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/1.78k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/19.7k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.17M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.07k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.34k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.72k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/211k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.24G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/9.91M [00:00<?, ?B/s]

In [None]:
import torchaudio, torch

device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = model.to(device)

In [None]:
import numpy as np
import librosa
import soundfile as sf
from scipy import signal

def reduce_noise(audio_path, output_path, noise_reduction_strength=0.2, n_fft=2048, hop_length=512):
    """
    Reduce noise in an audio file using spectral gating.

    Parameters:
    -----------
    audio_path : str
        Path to input audio file
    output_path : str
        Path to save the noise-reduced audio file
    noise_reduction_strength : float
        Strength of noise reduction (0.0 to 1.0)
    n_fft : int
        FFT window size
    hop_length : int
        Number of samples between successive frames
    """
    # Load the audio file
    print(f"Loading audio file: {audio_path}")
    y, sr = librosa.load(audio_path, sr=None)

    # Compute spectrogram
    D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length)

    # Compute magnitude and phase
    magnitude, phase = librosa.magphase(D)

    # Estimate noise profile from the first few frames (adjust as needed)
    noise_frames = 20
    noise_profile = np.mean(magnitude[:, :noise_frames], axis=1, keepdims=True)

    # Apply spectral gating
    threshold = noise_reduction_strength * noise_profile
    mask = (magnitude > threshold)

    # Apply mask to magnitude
    magnitude_reduced = magnitude * mask

    # Reconstruct signal
    D_reduced = magnitude_reduced * phase
    y_reduced = librosa.istft(D_reduced, hop_length=hop_length)

    # Normalize audio
    y_reduced = y_reduced / np.max(np.abs(y_reduced))

    # Save the processed audio
    print(f"Saving noise-reduced audio to: {output_path}")
    sf.write(output_path, y_reduced, sr)

    print("Noise reduction completed successfully!")


reduce_noise(audio_path='/content/chittagong_sample2.wav', output_path='clean_chittagong_sample2.wav')

# if __name__ == "__main__":
#     import argparse

#     parser = argparse.ArgumentParser(description="Reduce noise in audio files")
#     parser.add_argument("input_file", help="Path to input audio file")
#     parser.add_argument("output_file", help="Path to save output audio file")
#     parser.add_argument("--strength", type=float, default=0.2,
#                         help="Noise reduction strength (0.0-1.0)")
#     args = parser.parse_args()

#     reduce_noise(args.input_file, args.output_file, noise_reduction_strength=args.strength)

Loading audio file: /content/chittagong_sample2.wav
Saving noise-reduced audio to: clean_chittagong_sample2.wav
Noise reduction completed successfully!


In [None]:
import numpy as np
import librosa
import soundfile as sf
import os
import math
audio_segment_array=[]

def split_audio(input_file, output_folder, segment_length=5.0):
    """
    Split an audio file into segments of specified length.

    Parameters:
    -----------
    input_file : str
        Path to input audio file
    output_folder : str
        Folder to save the split audio segments
    segment_length : float
        Length of each segment in seconds
    """
    # Create output folder if it doesn't exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
        print(f"Created output directory: {output_folder}")

    # Load the audio file
    print(f"Loading audio file: {input_file}")
    y, sr = librosa.load(input_file, sr=None)

    # Calculate segment size in samples
    segment_size = int(segment_length * sr)

    # Calculate number of segments
    num_segments = math.ceil(len(y) / segment_size)

    print(f"Splitting audio into {num_segments} segments of {segment_length} seconds each")

    # Extract filename without extension
    base_filename = os.path.splitext(os.path.basename(input_file))[0]

    # Split and save each segment
    for i in range(num_segments):
        # Calculate start and end positions
        start = i * segment_size
        end = min(start + segment_size, len(y))

        # Extract segment
        segment = y[start:end]

        # Create output filename
        output_file = os.path.join(output_folder, f"{base_filename}_segment_{i+1:03d}.wav")

        # Save segment
        sf.write(output_file, segment, sr)

        print(f"Saved segment {i+1}/{num_segments}: {output_file}")
        audio_segment_array.append(output_file)

    print(f"Successfully split audio into {num_segments} segments")

split_audio(input_file='/content/clean_chittagong_sample2.wav', output_folder='clean_chittagong_sample2')

Created output directory: clean_chittagong_sample2
Loading audio file: /content/clean_chittagong_sample2.wav
Splitting audio into 5 segments of 5.0 seconds each
Saved segment 1/5: clean_chittagong_sample2/clean_chittagong_sample2_segment_001.wav
Saved segment 2/5: clean_chittagong_sample2/clean_chittagong_sample2_segment_002.wav
Saved segment 3/5: clean_chittagong_sample2/clean_chittagong_sample2_segment_003.wav
Saved segment 4/5: clean_chittagong_sample2/clean_chittagong_sample2_segment_004.wav
Saved segment 5/5: clean_chittagong_sample2/clean_chittagong_sample2_segment_005.wav
Successfully split audio into 5 segments


In [None]:
 audio_segment_array

['clean_chittagong_sample2/clean_chittagong_sample2_segment_001.wav',
 'clean_chittagong_sample2/clean_chittagong_sample2_segment_002.wav',
 'clean_chittagong_sample2/clean_chittagong_sample2_segment_003.wav',
 'clean_chittagong_sample2/clean_chittagong_sample2_segment_004.wav',
 'clean_chittagong_sample2/clean_chittagong_sample2_segment_005.wav']

In [None]:
audio, orig_freq =  torchaudio.load("/content/splits_audios/clean_audio_segment_010.wav")
audio =  torchaudio.functional.resample(audio, orig_freq=orig_freq, new_freq=16000) # must be a 16 kHz waveform array
audio_inputs = processor(audios=audio, return_tensors="pt").to(device)
output_tokens = model.generate(**audio_inputs, tgt_lang="ben", generate_speech=False)
translated_text_from_audio = processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True)
print(f"Translation from audio: {translated_text_from_audio}")

It is strongly recommended to pass the `sampling_rate` argument to `SeamlessM4TFeatureExtractor()`. Failing to do so can result in silent errors that might be hard to debug.


Translation from audio: #আহ আনুমানিক হচ্ছে আপনার ৫০ হাজার টাকা মতো লস কেস


In [None]:
def concat_audio_segments(input_folder):
  text=""
  for filename in audio_segment_array:
    if filename.endswith(".wav"):
      print(filename)
      audio, orig_freq =  torchaudio.load(f"/content/{filename}")
      audio =  torchaudio.functional.resample(audio, orig_freq=orig_freq, new_freq=16000) # must be a 16 kHz waveform array
      audio_inputs = processor(audios=audio, return_tensors="pt").to(device)
      output_tokens = model.generate(**audio_inputs, tgt_lang="ben", generate_speech=False)
      translated_text_from_audio = processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True)
      print(f"Translation from audio: {translated_text_from_audio}")
      text+=translated_text_from_audio+" "
  print(text)


concat_audio_segments(input_folder='/content/clean_chittagong_sample2')

It is strongly recommended to pass the `sampling_rate` argument to `SeamlessM4TFeatureExtractor()`. Failing to do so can result in silent errors that might be hard to debug.


clean_chittagong_sample2/clean_chittagong_sample2_segment_001.wav


It is strongly recommended to pass the `sampling_rate` argument to `SeamlessM4TFeatureExtractor()`. Failing to do so can result in silent errors that might be hard to debug.


Translation from audio: আই আবদুর রহিম হর্নফুলি নদীর ফারাদ বারিয়ার, প্রতি বছর বাঁহ উড়ে এঁদে
clean_chittagong_sample2/clean_chittagong_sample2_segment_002.wav


It is strongly recommended to pass the `sampling_rate` argument to `SeamlessM4TFeatureExtractor()`. Failing to do so can result in silent errors that might be hard to debug.


Translation from audio: এবারের পান আগরবারের চুনবোদড় ছিল
clean_chittagong_sample2/clean_chittagong_sample2_segment_003.wav


It is strongly recommended to pass the `sampling_rate` argument to `SeamlessM4TFeatureExtractor()`. Failing to do so can result in silent errors that might be hard to debug.


Translation from audio: এইয়া নিয়ে এবারের পানিও খুব বেশি ছিল না, বিরাট জলের বাহিতে
clean_chittagong_sample2/clean_chittagong_sample2_segment_004.wav


It is strongly recommended to pass the `sampling_rate` argument to `SeamlessM4TFeatureExtractor()`. Failing to do so can result in silent errors that might be hard to debug.


Translation from audio: এই লক্ষ্মীর শতি এই তো আইনা উবারত জায়েনা
clean_chittagong_sample2/clean_chittagong_sample2_segment_005.wav
Translation from audio: আগে সিগন্যাল দিও এবার সিগন্যাল কম করে দিও না
আই আবদুর রহিম হর্নফুলি নদীর ফারাদ বারিয়ার, প্রতি বছর বাঁহ উড়ে এঁদে এবারের পান আগরবারের চুনবোদড় ছিল এইয়া নিয়ে এবারের পানিও খুব বেশি ছিল না, বিরাট জলের বাহিতে এই লক্ষ্মীর শতি এই তো আইনা উবারত জায়েনা আগে সিগন্যাল দিও এবার সিগন্যাল কম করে দিও না 


In [None]:
data= pd.read_csv("/content/output_w_header_file.csv")
data['text'][0]

'এইচআর টেক্সটাইল বাংলাদেশের ভেতরে একাধিক আউটলেটের মাধ্যমে শাড়ি বাচ্চাদের পোশাক মহিলাদের পোশাক এবং অন্যান্য টেক্সটাইল পণ্য উৎপাদন ও বিপণন করে'

In [None]:
def run_on_n_audio(n):
  for i in range(n):
    name = data['audio'][i]
    path = f"/root/.cache/kagglehub/datasets/mobassir/multi-speaker-bangla-tts/versions/1/bn_bd/wavs/{name}.wav"
    audio, orig_freq =  torchaudio.load(path)
    audio =  torchaudio.functional.resample(audio, orig_freq=orig_freq, new_freq=16000) # must be a 16 kHz waveform array
    audio_inputs = processor(audios=audio, return_tensors="pt").to(device)
    output_tokens = model.generate(**audio_inputs, tgt_lang="ben", generate_speech=False)
    translated_text_from_audio = processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True)
    print(f"Translation  from audio: {translated_text_from_audio}")
    print("----------")
    print(f"Original text from data: {data['text'][i]}")
    print("===================================================================================================")

run_on_n_audio(5)

It is strongly recommended to pass the `sampling_rate` argument to `SeamlessM4TFeatureExtractor()`. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the `sampling_rate` argument to `SeamlessM4TFeatureExtractor()`. Failing to do so can result in silent errors that might be hard to debug.


Translation  from audio: এইচআর টেক্সটাইল বাংলাদেশের ভেতরে একাধিক আউটলেট এর মাধ্যমে শাড়ি, বাচ্চাদের পোশাক, মহিলাদের পোশাক এবং অন্যান্য টেক্সটাইল পণ্য উৎপাদন ও বিপণন করে।
----------
Original text from data: এইচআর টেক্সটাইল বাংলাদেশের ভেতরে একাধিক আউটলেটের মাধ্যমে শাড়ি বাচ্চাদের পোশাক মহিলাদের পোশাক এবং অন্যান্য টেক্সটাইল পণ্য উৎপাদন ও বিপণন করে


It is strongly recommended to pass the `sampling_rate` argument to `SeamlessM4TFeatureExtractor()`. Failing to do so can result in silent errors that might be hard to debug.


Translation  from audio: স্ট্যান্ডার্ড ব্যাংকে ইসলামী ব্যাংকিংয়ের সুবিধা রয়েছে।
----------
Original text from data: স্ট্যান্ডার্ড ব্যাংক এ ইসলামী ব্যাংকিং এর সুবিধা রয়েছে


It is strongly recommended to pass the `sampling_rate` argument to `SeamlessM4TFeatureExtractor()`. Failing to do so can result in silent errors that might be hard to debug.


Translation  from audio: লাফার্স-সুর্মা সিমেন্ট সর্বাধিক ব্যবহৃত সিমেন্ট উৎপাদন করে।
----------
Original text from data: লাফার্জ সুরমা সিমেন্ট সর্বাধিক ব্যবহৃত সিমেন্ট উৎপাদন করে


It is strongly recommended to pass the `sampling_rate` argument to `SeamlessM4TFeatureExtractor()`. Failing to do so can result in silent errors that might be hard to debug.


Translation  from audio: পিপলস ইন্সুরেন্স অফ চায়না ৬৬ বছর আগে ব্যবসা চালু করে।
----------
Original text from data: পিপলস ইন্স্যুরেন্স অব চায়না ছেষট্টি বছর আগে ব্যবসা চালু করে
Translation  from audio: বয়েজেস একটি ইন্ডাস্ট্রিয়াল গ্রুপ
----------
Original text from data: বয়গেস একটি ইন্ডাস্ট্রিয়াল গ্রুপ


# Info extraction


In [None]:
prompt="""

You are an expert in extracting structured information from disaster reports. Analyze the following text and extract the following details in JSON format:

{
  "Name": "[if mentioned, otherwise empty]",
  "District": "[inferred from location details]",
  "Contact Number": "[if mentioned, otherwise empty]",
  "Location": "[specific place mentioned]",
  "Gender": "[inferred from pronouns or names]",
  "Occupation": "[if mentioned, otherwise empty]",
  "Village/Area": "[specific village or area mentioned]",
  "GPS Coordinate": "[if mentioned, otherwise empty]",
  "Disaster Type": "[type of disaster described]",
  "Damages": "[specific damages mentioned]",
  "Loss": "[quantified losses mentioned]"
}

Rules:
1. Only include fields that can be reasonably inferred from the text
2. For numeric values like losses, extract just the number
3. For disaster type, use standard terms like "flood", "storm", "earthquake",flash flood, droughts etc. Infer form the text.
4. Infer occupation and gender from the speakers description.
5. For location, infer the most specific place mentioned with respect to Bnagladesh
You can also use the internet for this task.

"""

In [None]:
input= """Aay Abdul Rahim, the Farad Barrier of the Hornful River, every year the bow flies' This time it was Ban, the last time it was Tunbudar. The water was not very good. The first one is the one that is called "Lakya" which means "Lake" in Hindi. The signal is now signaling to the ground."""

In [None]:
input2="আই আবদুর রহিম হর্নফুলি নদীর ফারাদ বারিয়ার, প্রতি বছর বাঁহ উড়ে এঁদে এবারের পান আগরবারের চুনবোদড় ছিল এইয়া নিয়ে এবারের পানিও খুব বেশি ছিল না, বিরাট জলের বাহিতে এই লক্ষ্মীর শতি এই তো আইনা উবারত জায়েনা আগে সিগন্যাল দিও এবার সিগন্যাল কম করে দিও না"

In [None]:
from google import genai
from pydantic import BaseModel


client = genai.Client(api_key="AIzaSyAaDHYn1dkiJIlZIudw4943SRjvYt581nE")
response = client.models.generate_content(
    model='gemini-2.0-flash',
    contents=f"According to this {prompt} extract info from {input} and {input2}"
)
# Use the response as a JSON string.
print(response.text)

# Use instantiated objects.
# my_recipes: list[Recipe] = response.parsed

```json
{
  "Name": "Abdul Rahim",
  "District": "Unknown",
  "Contact Number": "empty",
  "Location": "Hornfuli River",
  "Gender": "Male",
  "Occupation": "Farad Barrier",
  "Village/Area": "Tunbudar",
  "GPS Coordinate": "empty",
  "Disaster Type": "Flood",
  "Damages": "empty",
  "Loss": "empty"
}
```
