In [1]:
import torch
import numpy as np
import pandas as pd
import librosa
from datasets import Dataset, Audio
from torch.utils.data import DataLoader

In [2]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


In [3]:
def add_file_path(text):
    """Build the full path of a training mp3 from its identifier.

    Args:
        text: Clip identifier; it is formatted into ``<text>.mp3``.

    Returns:
        str: The training-audio directory followed by ``<text>.mp3``.
    """
    filename = f"{text}.mp3"
    return "/kaggle/input/bengaliai-speech/train_mp3s/" + filename

In [4]:
# Load the training manifest and keep only the clip id and its transcript.
df = pd.read_csv('/kaggle/input/bengaliai-speech/train.csv')
df = pd.DataFrame(df, columns=['id', 'sentence'])
# Derive each clip's mp3 path from its id. The previous trailing
# `.drop(columns='id')` was applied to the mapped Series, where pandas ignores
# `columns`, so it never removed anything; it is omitted here and the `id`
# column remains in `df` exactly as before.
df['path'] = df['id'].map(add_file_path)
print(len(df))

963636


In [5]:
class AudioDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing audio file paths with their transcripts.

    Audio is decoded lazily in ``__getitem__``; the dataset itself only keeps
    references to the two collections it was given.

    Args:
        paths: Sized, indexable collection of audio file paths.
        sentences: Sized, indexable collection of transcripts, aligned
            one-to-one with ``paths``.

    Raises:
        ValueError: If ``paths`` and ``sentences`` differ in length.
    """

    def __init__(self, paths, sentences):
        # Fail early on misaligned inputs instead of surfacing an IndexError
        # (or silently ignoring extra transcripts) in the middle of an epoch.
        if len(paths) != len(sentences):
            raise ValueError(
                f"paths and sentences must have the same length, "
                f"got {len(paths)} and {len(sentences)}"
            )
        self.paths = paths
        self.sentences = sentences

    def __len__(self):
        """Return the number of (audio, transcript) pairs."""
        return len(self.paths)

    def __getitem__(self, idx):
        """Load one clip and return ``(waveform, sentence)``.

        The waveform is whatever ``librosa.load`` returns for the file when
        asked for a 16 kHz sample rate.
        """
        # The sample rate is requested explicitly, so the value echoed back by
        # librosa carries no extra information and is discarded.
        speech, _ = librosa.load(self.paths[idx], sr=16000)
        return speech, self.sentences[idx]

In [6]:
# 80/20 split in file order (no shuffling): the first 80% of rows become the
# training set and the remainder the test set.
split_idx = int(len(df) * 0.8)

# Materialise each column once and slice it, instead of doing a label lookup
# per row in a Python loop over the whole frame.
all_paths = df['path'].tolist()
all_sentences = df['sentence'].tolist()

train_data = {'paths': all_paths[:split_idx], 'sentences': all_sentences[:split_idx]}
test_data = {'paths': all_paths[split_idx:], 'sentences': all_sentences[split_idx:]}

train_dataset = AudioDataset(train_data['paths'], train_data['sentences'])
test_dataset = AudioDataset(test_data['paths'], test_data['sentences'])
print(len(train_dataset), len(test_dataset))


770908 192728


In [7]:
def collate_fn(batch):
    """Collate ``(speech, sentence)`` pairs into a padded batch.

    Args:
        batch: Sequence of ``(speech, sentence)`` pairs, where each speech is
            a 1-D array-like accepted by ``torch.tensor``.

    Returns:
        tuple: ``(speeches, sentences)`` where ``speeches`` is a single tensor
        with every clip zero-padded on the right to the longest clip in the
        batch (batch dimension first) and ``sentences`` is a list in the same
        order.
    """
    waveforms, transcripts = zip(*batch)

    # pad_sequence needs tensors; it pads with zeros up to the longest one.
    padded = torch.nn.utils.rnn.pad_sequence(
        [torch.tensor(waveform) for waveform in waveforms],
        batch_first=True,
    )

    return padded, list(transcripts)

In [8]:
# Both loaders use the same batching configuration: 64 clips per batch,
# reshuffled on every pass, padded by collate_fn.
loader_options = dict(batch_size=64, shuffle=True, collate_fn=collate_fn)
train_dataloader = DataLoader(train_dataset, **loader_options)
test_dataloader = DataLoader(test_dataset, **loader_options)

# Pull one batch as a smoke test. Note that the messages say "shape" but the
# values printed are the batch sizes (len), not tensor shapes.
first_batch = next(iter(train_dataloader))
train_features, train_labels = first_batch
print(f"Feature batch shape: {len(train_features)}")
print(f"Labels batch shape: {len(train_labels)}")



Feature batch shape: 64
Labels batch shape: 64


In [9]:
from transformers import AutoTokenizer, AutoModel, AutoFeatureExtractor, AutoProcessor, pipeline

class CFG:
    """Namespace for the pretrained checkpoint's model and preprocessing objects.

    Everything is loaded once, when the class body executes, from the same
    local checkpoint directory.
    """

    # Single source of truth for the checkpoint location (previously repeated
    # in every from_pretrained call).
    model_dir = '/kaggle/input/bengali-model/bengali/'

    # NOTE(review): the load warning for this checkpoint reports that
    # `lm_head.weight` / `lm_head.bias` are not used when initialising
    # Wav2Vec2Model, i.e. AutoModel yields the bare encoder without the output
    # head. If per-label emissions are needed downstream, a CTC-headed model
    # class is presumably what is wanted here — confirm before relying on it.
    model = AutoModel.from_pretrained(model_dir)
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    feature_extractor = AutoFeatureExtractor.from_pretrained(model_dir)
    processor = AutoProcessor.from_pretrained(model_dir)

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']
Some weights of the model checkpoint at /kaggle/input/bengali-model/bengali/ were not used when initializing Wav2Vec2Model: ['lm_head.bias', 'lm_head.weight']
- This IS expected if you are initializing Wav2Vec2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mod

In [10]:
class GreedyCTCDecoder(torch.nn.Module):
    """Greedy (best-path) CTC decoder.

    Args:
        labels: Lookup from label index to its string, e.g. a list/tuple/str
            of tokens or a ``{index: token}`` mapping.
        blank: Index of the CTC blank label. Defaults to 0.
    """

    def __init__(self, labels, blank=0):
        super().__init__()
        self.labels = labels
        self.blank = blank

    def forward(self, emission: torch.Tensor) -> str:
        """Given a sequence emission over labels, get the best path string
        Args:
          emission (Tensor): Logit tensors. Shape `[num_seq, num_label]`.

        Returns:
          str: The resulting transcript
        """
        # Most likely label at every time step.
        best_path = torch.argmax(emission, dim=-1)  # [num_seq,]
        # CTC collapse, step 1: merge runs of the same label.
        collapsed = torch.unique_consecutive(best_path, dim=-1)
        # Convert to plain Python ints once: 0-dim tensors hash by identity, so
        # they cannot be used as keys into a mapping-style vocabulary, and
        # iterating a tensor element by element is needlessly slow.
        label_ids = collapsed.tolist()
        # CTC collapse, step 2: drop blanks, then map indices to their tokens.
        return "".join(self.labels[i] for i in label_ids if i != self.blank)