In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import whisperx
import pandas as pd
import os 
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm
  torchaudio.set_audio_backend("soundfile")
torchvision is not available - cannot save figures


In [2]:
DEVICE="cuda"
BATCH_SIZE=16
COMPUTE_TYPE="float16"

In [3]:
class AudioDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = pd.read_csv(dataframe)
        self.dirname=os.path.dirname(dataframe)

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        audio_name = self.dataframe.iloc[idx]['audio_name']
        audio_path=os.path.join(self.dirname,audio_name)

        audio = whisperx.load_audio(audio_path)
        return audio

In [4]:
def collate_fn(batch):
    # Find the longest audio in the batch
    max_length = max(audio.shape[0] for audio in batch)

    # Create tensor for padded batch
    padded_batch = torch.zeros((len(batch), max_length))

    # Fill padded batch with audio data
    for i, audio in enumerate(batch):
        length = audio.shape[0]
        padded_batch[i, :length] = torch.tensor(audio, dtype=torch.float32)  # Convert to tensor

    return np.array(padded_batch)

def create_dataloader(dataframe, batch_size):
    dataset = AudioDataset(dataframe)
    dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
    return dataloader

In [5]:
loader=create_dataloader(dataframe='/root/suyash/acoustic_stuff/hindi-acoustic-word-embedding/train_dataset/metadata.csv',batch_size=32)

In [6]:
data=next(iter(loader))

In [7]:
data

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [8]:
model=whisperx.load_model("whisper-tiny-ct2", DEVICE, compute_type=COMPUTE_TYPE)


Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.3.3. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../.cache/torch/whisperx-vad-segmentation.bin`


No language specified, language will be first be detected for each audio file (increases inference time).
Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.3.1+cu121. Bad things might happen unless you revert torch to 1.x.


In [10]:
df=pd.read_csv('/root/suyash/acoustic_stuff/hindi-acoustic-word-embedding/train_dataset/metadata.csv')

In [19]:
root_path=os.path.dirname('/root/suyash/acoustic_stuff/hindi-acoustic-word-embedding/train_dataset/metadata.csv')
audio_path=os.path.join(root_path,df['audio_name'][84])
audio=whisperx.load_audio(audio_path)

In [20]:
result=model.transcribe(audio,batch_size=BATCH_SIZE)


Detected language: tg (0.11) in first 30s of audio...


In [21]:
model_a, metadata = whisperx.load_align_model(language_code='hi', device=DEVICE)


Some weights of the model checkpoint at theainerd/Wav2Vec2-large-xlsr-hindi were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at theainerd/Wav2Vec2-large-xlsr-hindi and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably T

In [22]:
result = whisperx.align(result["segments"], model_a, metadata, audio, DEVICE, return_char_alignments=False)


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


In [23]:
result.keys()

dict_keys(['segments', 'word_segments'])

In [30]:
segments=result['segments'][0]['words']

In [25]:
result['word_segments']

[{'word': 'जगदेव', 'start': 0.009, 'end': 1.073, 'score': 0.177},
 {'word': 'चंद', 'start': 1.213, 'end': 1.414, 'score': 0.327},
 {'word': 'ठाकुर', 'start': 1.494, 'end': 1.876, 'score': 0.411},
 {'word': 'की', 'start': 1.976, 'end': 2.056, 'score': 0.496},
 {'word': 'दो', 'start': 2.197, 'end': 2.337, 'score': 0.338},
 {'word': 'बेटियाँ', 'start': 2.418, 'end': 2.819, 'score': 0.546},
 {'word': 'विजिया', 'start': 2.839, 'end': 3.281, 'score': 0.271},
 {'word': 'पटियाल', 'start': 3.361, 'end': 3.863, 'score': 0.48},
 {'word': 'व', 'start': 3.983, 'end': 4.144, 'score': 0.248},
 {'word': 'सुनीता', 'start': 4.365, 'end': 4.686, 'score': 0.415},
 {'word': 'ठाकुर', 'start': 4.786, 'end': 5.107, 'score': 0.473},
 {'word': 'हैं।', 'start': 5.147, 'end': 5.208, 'score': 0.497}]

In [34]:
for i,seg in enumerate(segments):
    print(i,seg['end'])

0 1.073
1 1.414
2 1.876
3 2.056
4 2.337
5 2.819
6 3.281
7 3.863
8 4.144
9 4.686
10 5.107
11 5.208
