## Домашнее задание №2

### Автор: Сергеев Константин Олегович

### Задание

In [None]:
## Добавление аудио входа в модель QWEN 0.5B

# - На датасете AudioCaps с использованием предобученного аудио энкодера (HuBERT,
# wav2vec2, whisper encoder) обучить адаптер для аудио.
# - Саму LLM и энкодер не размораживать.
# - Проверить несколько настроек пулинга (векторов на секунду аудио).
# - DoD – падение лосса при обучении и примеры генерации описаний аудио на
# отложенном сете. Задание считается выполненным, если продемонстрировано,
# что модель по мере падения лосса начинает генерировать соответствующие аудио описания.
# - В качестве метрики можно использовать BERT-score между
# сгенерированными и истинными описаниями
# - Стоит обратить внимание на возможность утечки (лика) данных при формировании
# отложенной выборки (очень похожие звуки или одни и те же дорожки с разным описанием)

### Импортируем библиотеки

In [None]:
#!pip -qq install bert_score
#!pip install --upgrade transformers
#!pip uninstall flash-attn -y

In [None]:
import os
import torch
import torchaudio
import pandas as pd
from tqdm import tqdm
from transformers import Wav2Vec2Model, Wav2Vec2Processor, AutoModelForCausalLM, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from transformers import Trainer, TrainingArguments
import torch.nn.functional as F
from torch import nn, optim
from bert_score import score as bert_score

In [None]:
import os

# Set the WANDB_DISABLED flag for this process (presumably read by the
# training libraries to switch off Weights & Biases logging).
os.environ.update(WANDB_DISABLED="true")

In [None]:
# Use the first GPU when CUDA is available; otherwise fall back to the CPU so
# the notebook does not crash on machines without a GPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

### Загрузка предобученного аудио энкодера

In [None]:
# Pretrained wav2vec2 encoder and its input processor. The encoder is moved to
# the target device and frozen: none of its parameters receive gradients.
audio_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
audio_encoder = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base")
audio_encoder = audio_encoder.to(device)
audio_encoder.requires_grad_(False)



### Загрузка модели QWEN 0.5B

In [None]:
# Pretrained Qwen2.5 0.5B Instruct model and its tokenizer. The LLM is moved
# to the target device and frozen: none of its parameters receive gradients.
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
llm = AutoModelForCausalLM.from_pretrained(model_name)
llm = llm.to(device)
llm.requires_grad_(False)

### Формируем датасет из AudioCaps

In [None]:
#from google.colab import drive
#drive.mount('/content/gdrive')

In [None]:
#!unzip -qq "/content/gdrive/MyDrive/Colab Notebooks/audiocaps.zip"

In [None]:
class AudioCapsDataset(Dataset):
    """In-memory AudioCaps split: processed waveforms, captions and label ids.

    Every clip listed in ``df['audio']`` is loaded and passed through the
    module-level ``audio_processor`` at construction time, and every caption
    in ``df['text']`` is tokenized with the module-level ``tokenizer``, so
    ``__getitem__`` is a plain lookup.

    Args:
        df: DataFrame with an ``audio`` column (paths appended to ``root``)
            and a ``text`` column (captions).
        root: Prefix prepended to every audio path. ``None`` (default) uses
            the module-level ``data_folder``.
        max_audio_samples: Length every waveform is padded/truncated to.
        max_text_tokens: Length every label sequence is padded/truncated to.
        append_eos: Append the tokenizer's end-of-sequence token to every
            caption before tokenizing. Padding is excluded from the loss
            (``ignore_index=tokenizer.pad_token_id``), so without an explicit
            end-of-sequence target the model is never trained to stop
            generating. Pass ``False`` to get captions without it.
    """

    def __init__(self, df, root=None, max_audio_samples=160000,
                 max_text_tokens=64, append_eos=True):
        if root is None:
            # Resolved at call time, so data_folder may be defined after the class.
            root = data_folder

        self.audio_inputs = []
        for audio_path in tqdm(df['audio'].values):
            waveform, sample_rate = torchaudio.load(root + audio_path)
            # Only the first channel of the file is used.
            processed = audio_processor(waveform[0], sampling_rate=sample_rate,
                                        return_tensors="pt", padding='max_length',
                                        truncation=True, max_length=max_audio_samples)
            self.audio_inputs.append(processed['input_values'][0])

        self.descriptions = df['text'].values

        # Captions longer than max_text_tokens are truncated and lose the EOS.
        eos_suffix = (tokenizer.eos_token or '') if append_eos else ''
        self.labels = []
        for description in tqdm(self.descriptions):
            # The pad token is prepended as a start-of-caption token; the
            # training code drops it from the targets via labels[:, 1:].
            tokenized = tokenizer(tokenizer.pad_token + description + eos_suffix,
                                  padding='max_length', truncation=True,
                                  max_length=max_text_tokens, return_tensors="pt")
            self.labels.append(tokenized['input_ids'][0])

    def __len__(self):
        """Return the number of captions (one per clip)."""
        return len(self.descriptions)

    def __getitem__(self, idx):
        """Return ``(audio_input, description, label_ids)`` for item ``idx``."""
        return self.audio_inputs[idx], self.descriptions[idx], self.labels[idx]


# Root folder of the data; the AudioCaps metadata tables live in audiocaps/.
data_folder = '/mnt/sdb/konsergeev/hse/'
metadata_dir = data_folder + 'audiocaps/'

# Read the three tab-separated split tables (train, validation, test).
df_train, df_val, df_test = (
    pd.read_csv(metadata_dir + file_name, sep='\t')
    for file_name in ('audiocaps_train.tsv',
                      'audiocaps_val_new.tsv',
                      'audiocaps_test_new.tsv')
)

# Build the in-memory datasets in the same order: train, validation, test.
train_dataset, val_dataset, test_dataset = map(
    AudioCapsDataset, (df_train, df_val, df_test)
)

# Training batches are shuffled; validation and test keep their order.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader, test_loader = (
    DataLoader(split, batch_size=8, shuffle=False)
    for split in (val_dataset, test_dataset)
)

100%|████████████████████████████████████| 49490/49490 [04:51<00:00, 169.61it/s]
100%|███████████████████████████████████| 49490/49490 [00:10<00:00, 4685.08it/s]
100%|████████████████████████████████████████| 495/495 [00:02<00:00, 177.38it/s]
100%|███████████████████████████████████████| 495/495 [00:00<00:00, 4403.83it/s]
100%|████████████████████████████████████████| 963/963 [00:05<00:00, 177.39it/s]
100%|███████████████████████████████████████| 963/963 [00:00<00:00, 4344.71it/s]


### Определение аудио адаптера

In [None]:
class AudioAdapter(nn.Module):
    """Single strided 1-D convolution applied along the sequence axis.

    Args:
        input_dim: Number of input channels of the convolution.
        output_dim: Number of output channels of the convolution.
        kernel_size: Convolution kernel size.
        stride: Convolution stride.
        padding: Convolution padding.
    """

    def __init__(self, input_dim, output_dim, kernel_size=3, stride=2, padding=1):
        super().__init__()
        self.conv1 = nn.Conv1d(
            in_channels=input_dim,
            out_channels=output_dim,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
        )

    def forward(self, x):
        """Swap the last two axes, convolve, and swap them back."""
        channels_first = x.permute(0, 2, 1)
        return self.conv1(channels_first).permute(0, 2, 1)


# Pooling presets for the adapter convolution, keyed by the number of vectors
# the adapter produces for one clip (the count printed by the smoke test below).
pooling_presets = {
    250: dict(kernel_size=3, stride=2, padding=1),
    125: dict(kernel_size=7, stride=4, padding=3),
    63: dict(kernel_size=15, stride=8, padding=7),
    32: dict(kernel_size=31, stride=16, padding=15),
    17: dict(kernel_size=61, stride=31, padding=30),
    9: dict(kernel_size=121, stride=61, padding=60),
}
# Change this key to switch the pooling setting.
vectors_per_clip = 125

adapter = AudioAdapter(input_dim=audio_encoder.config.hidden_size,
                       output_dim=llm.config.hidden_size,
                       **pooling_presets[vectors_per_clip]).to(device)

# Smoke test: push one test clip through the encoder and the adapter.
# no_grad keeps this shape check from building an autograd graph.
with torch.no_grad():
    test_input = test_dataset[0][0].to(device)
    test_audio_features = audio_encoder(test_input.unsqueeze(0)).last_hidden_state
    adapted_features = adapter(test_audio_features)
print(adapted_features.shape)
print(f'Количество векторов на 10 сек аудио: {adapted_features.shape[1]}')

torch.Size([1, 125, 896])
Количество векторов на 10 сек аудио: 125


### Обучение аудио адаптера

In [None]:
# Loss function and optimizer. Label positions equal to the pad token are
# ignored by the loss, and only the adapter's parameters are optimized.
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = optim.Adam(adapter.parameters(), lr=1e-4)

epochs = 3


def _caption_loss(audio_inputs, labels):
    """Return ``(loss, adapted_features)`` for one batch already on ``device``.

    The audio is passed through the encoder and the adapter, the caption
    tokens are embedded with the LLM's input embedding layer, and both are
    concatenated (audio first) and fed to the LLM as ``inputs_embeds``.
    """
    # Audio encoder -> audio adapter.
    audio_features = audio_encoder(audio_inputs).last_hidden_state
    adapted_features = adapter(audio_features)

    # Caption token embeddings, appended after the audio vectors.
    text_features = llm.get_input_embeddings()(labels)
    input_features = torch.cat([adapted_features, text_features], 1)

    outputs = llm(inputs_embeds=input_features)

    # The logit at position t predicts the token at position t + 1, so the
    # logits from the first caption position up to the second-to-last one are
    # matched against the caption tokens shifted by one. CrossEntropyLoss
    # expects the class axis second, hence the permute.
    num_audio_vectors = adapted_features.shape[1]
    target_logits = outputs.logits.permute(0, 2, 1)[:, :, num_audio_vectors:-1]
    target_labels = labels[:, 1:]
    return criterion(target_logits, target_labels), adapted_features


# Training
for epoch in range(epochs):
    # Validation below switches the adapter to eval mode, so training mode
    # has to be restored at the start of every epoch.
    adapter.train()
    train_loss = 0
    val_loss = 0

    for audio_inputs, descriptions, labels in tqdm(train_loader):
        optimizer.zero_grad()
        audio_inputs = audio_inputs.to(device)
        labels = labels.to(device)

        loss, _ = _caption_loss(audio_inputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Validation
    adapter.eval()
    all_predictions = []
    all_references = []
    with torch.no_grad():
        for audio_inputs, descriptions, labels in tqdm(val_loader):
            audio_inputs = audio_inputs.to(device)
            labels = labels.to(device)

            loss, adapted_features = _caption_loss(audio_inputs, labels)
            val_loss += loss.item()

            # Generation from the audio vectors alone. Every audio vector is
            # a real (non-padding) position, so the mask is all ones; passing
            # it explicitly avoids the "attention mask is not set" warning.
            attention_mask = torch.ones(adapted_features.shape[:2], dtype=torch.long,
                                        device=adapted_features.device)
            generated_ids = llm.generate(inputs_embeds=adapted_features,
                                         attention_mask=attention_mask,
                                         max_new_tokens=64, temperature=0.01)
            generated_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

            all_predictions.extend(generated_texts)
            all_references.extend(descriptions)

    # Mean losses and BERTScore
    avg_train_loss = train_loss / len(train_loader)
    avg_val_loss = val_loss / len(val_loader)
    P, R, F1 = bert_score(all_predictions, all_references, lang='en')
    avg_bert_score = F1.mean().item()

    print(f"Epoch {epoch+1}/{epochs}")
    print(f"Train Loss: {avg_train_loss:.4f}")
    print(f"Val Loss: {avg_val_loss:.4f}")
    print(f"Average Val BERTScore: {avg_bert_score:.4f}")
    # Up to three examples from the last validation batch; slicing keeps this
    # safe when that batch holds fewer than three items.
    for i, (generated, reference) in enumerate(zip(generated_texts[:3], descriptions[:3])):
        print(f"Generated Example {i+1}: {generated}")
        print(f"Reference Example {i+1}: {reference}")
    print()

100%|███████████████████████████████████████| 3094/3094 [34:35<00:00,  1.49it/s]
  0%|                                                    | 0/62 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
100%|███████████████████████████████████████████| 62/62 [02:35<00:00,  2.52s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Train Loss: 3.4458
Val Loss: 3.0403
Average Val BERTScore: 0.8359
Generated Example 1: ， a dog barks and then the dog is patted on the head. The dog barks again and then it starts to walk around in circles. A man speaking with a voice that sounds like someone is talking but not very clearly. There are no other noises present. The dog barks several times and then it
Reference Example 1: Pigeons vocalize and a child speaks
Generated Example 2: ， a man speaks and then a woman speaks, followed by a dog barking. A crowd of people are cheering and clapping in response to the speaker's speech. The crowd is cheering and clapping loudly as well. A man speaking with a dog barking in the background. A crowd cheers and claps as
Reference Example 2: A dog barks and growls while a man speaks then the dog stops barking and then people begin talking
Generated Example 3: ， a dog barks and sniffs. A woman speaking with a baby crying. The sound of a dog barking is followed by the sound of a man

100%|███████████████████████████████████████| 3094/3094 [34:33<00:00,  1.49it/s]
100%|███████████████████████████████████████████| 62/62 [02:36<00:00,  2.52s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 2/3
Train Loss: 3.0604
Val Loss: 2.9101
Average Val BERTScore: 0.8337
Generated Example 1: ， then a person speaks and the sound of a dog barking. The dog barks, then a man speaks while a dog barks in response to the man’s speech. A bird chirps and a dog barks. Then a woman speaks and a dog barks. The dog barks again, then a
Reference Example 1: Pigeons vocalize and a child speaks
Generated Example 2: ， then a man speaks and a woman responds. A crowd of people speak in the background, but no one is speaking loudly. The audience applauds as a man speaks to an audience. A man speaks to a group of people, followed by applause. A man speaks to a group of people, followed by applause.
Reference Example 2: A dog barks and growls while a man speaks then the dog stops barking and then people begin talking
Generated Example 3:  and a dog barks and a person speaks in the background. A woman speaking with a dog nearby laughs and a man speaking nearby laughs as well. The sound of a dog barkin

100%|███████████████████████████████████████| 3094/3094 [34:34<00:00,  1.49it/s]
100%|███████████████████████████████████████████| 62/62 [02:36<00:00,  2.53s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 3/3
Train Loss: 2.9652
Val Loss: 2.8734
Average Val BERTScore: 0.8370
Generated Example 1: ， a person speaks and then the door opens and closes, followed by a bird chirping. The door is closed for a moment, then it is opened again, and the door is closed again. A person speaking in the background, then the door is opened again, and the door is closed again. Birds are chir
Reference Example 1: Pigeons vocalize and a child speaks
Generated Example 2: ， crowd cheering and applause, woman speaking to audience in background, adult male voice followed by a woman speaking to audience in background, crowd cheering and applause, adult female voice speaks to audience in background, crowd cheering and applause, adult male voice speaks to audience in background, crowd cheering and applause, adult female voice speaks to
Reference Example 2: A dog barks and growls while a man speaks then the dog stops barking and then people begin talking
Generated Example 3:  then a woman speaks and laughs, f

### Тесты количества подаваемых векторов в LLM

In [None]:
def eval_test():
    """Evaluate the current adapter on the test split and print a report.

    Every batch of the module-level ``test_loader`` is passed through the
    audio encoder, the adapter and the LLM to accumulate the cross-entropy
    loss, and a caption is generated for every clip from the adapted audio
    vectors alone. Prints the mean test loss, the mean BERTScore F1 and up to
    three generated/reference pairs from the last batch.

    Relies on the module-level ``adapter``, ``audio_encoder``, ``llm``,
    ``tokenizer``, ``criterion``, ``test_loader`` and ``device``.
    Returns ``None``.
    """
    adapter.eval()
    test_loss = 0
    all_predictions = []
    all_references = []
    with torch.no_grad():
        for audio_inputs, descriptions, labels in tqdm(test_loader):
            audio_inputs = audio_inputs.to(device)
            labels = labels.to(device)

            # Audio encoder -> audio adapter.
            audio_features = audio_encoder(audio_inputs).last_hidden_state
            adapted_features = adapter(audio_features)

            # Caption token embeddings, appended after the audio vectors.
            text_features = llm.get_input_embeddings()(labels)
            input_features = torch.cat([adapted_features, text_features], 1)

            outputs = llm(inputs_embeds=input_features)

            # The logit at position t predicts the token at position t + 1;
            # CrossEntropyLoss expects the class axis second, hence the permute.
            num_audio_vectors = adapted_features.shape[1]
            target_logits = outputs.logits.permute(0, 2, 1)[:, :, num_audio_vectors:-1]
            target_labels = labels[:, 1:]
            loss = criterion(target_logits, target_labels)

            test_loss += loss.item()

            # Generation from the audio vectors alone. Every audio vector is
            # a real (non-padding) position, so the mask is all ones; passing
            # it explicitly avoids the "attention mask is not set" warning.
            attention_mask = torch.ones(adapted_features.shape[:2], dtype=torch.long,
                                        device=adapted_features.device)
            generated_ids = llm.generate(inputs_embeds=adapted_features,
                                         attention_mask=attention_mask,
                                         max_new_tokens=64, temperature=0.01)
            generated_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

            all_predictions.extend(generated_texts)
            all_references.extend(descriptions)

    # Mean loss and BERTScore
    avg_test_loss = test_loss / len(test_loader)
    P, R, F1 = bert_score(all_predictions, all_references, lang='en')
    avg_bert_score = F1.mean().item()

    # Report the loss computed here, not the global avg_val_loss left over
    # from the training loop.
    print(f"Test Loss: {avg_test_loss:.4f}")
    print(f"Average Test BERTScore: {avg_bert_score:.4f}")
    # Up to three examples from the last test batch; slicing keeps this safe
    # when that batch holds fewer than three items.
    for i, (generated, reference) in enumerate(zip(generated_texts[:3], descriptions[:3])):
        print(f"Generated Example {i+1}: {generated}")
        print(f"Reference Example {i+1}: {reference}")
    print()

#### 250 векторов

In [None]:
# Evaluate the adapter currently in memory on the test split.
eval_test()

100%|█████████████████████████████████████████| 121/121 [05:15<00:00,  2.61s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Test Loss: 2.8656
Average Test BERTScore: 0.8252
Generated Example 1:  A car engine is running and a person speaking in the background with a loud noise of wind blowing through the windows as well as a ticking sound of an alarm clock. The engine makes a clicking sound when it starts up, then stops and repeats itself every few seconds. The engine makes a whirring sound as it spins
Reference Example 1: An engine idling with light wind
Generated Example 2:  "A man speaking and a bird chirping" is said with the sound of birds flying by in the background

### 10.2: Bird chirping, man talking, and bird chirping

The sound of birds chirping can be heard as a man speaks nearby. The bird chirps several times before
Reference Example 2: Man talking and a tapping clicking
Generated Example 3: ulpture of a dog barking and a cat meowing followed by a woman speaking in a low voice while the door opens and closes several times, then the door closes again and the sound stops abruptly with a loud bang 



#### 125 векторов

In [None]:
# Evaluate the adapter currently in memory on the test split.
eval_test()

100%|█████████████████████████████████████████| 121/121 [05:02<00:00,  2.50s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Test Loss: 2.9880
Average Test BERTScore: 0.8470
Generated Example 1:  a train moving and then stopping with a loud engine sound and a car door closing.
A passenger is sitting in the driver's seat, driving the vehicle. The engine sounds are low and steady as the car moves forward. A person speaking to the passenger about something they are talking about while the car drives by. The speaker
Reference Example 1: An engine idling with light wind
Generated Example 2: , a man speaks and then a bird chirps, then the wind blows, and then a woman talks. A man speaking followed by birds chirping in the background. Then a man speaks again and a bird chirp is heard. The wind blows and a woman speaks. A man speaks and then birds chirp in
Reference Example 2: Man talking and a tapping clicking
Generated Example 3: , a man speaking and a woman talking with a dog barking followed by a cat meowing. A child is laughing while a man speaks to the camera. A car horn sounds as a person laughs. A baby cries

#### 63 вектора

In [None]:
# Evaluate the adapter currently in memory on the test split.
eval_test()

100%|█████████████████████████████████████████| 121/121 [04:59<00:00,  2.48s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Test Loss: 2.8290
Average Test BERTScore: 0.8431
Generated Example 1:  2-3 times a minute with water splashing and a bird chirping in the background as it moves around slightly more than once every second. A small engine is running, then stops for a moment before starting again. The sound of a car engine is heard. Water splashes and some birds are chirping.
Reference Example 1: An engine idling with light wind
Generated Example 2:  of a horse’s hoofbeats and a man speaking in the background then more speech followed by a horse hooves clattering as water splashes nearby with a horse’s hooves clicking and a horse’s hooves scraping on the ground in the background then a man speaks while a horse's hooves click and scrape again
Reference Example 2: Man talking and a tapping clicking
Generated Example 3:  and a baby cries with a woman speaking in the background while a child is crying and a man speaks to them in the background then another child cries and a woman speaks again before the chil

#### 32 вектора

In [None]:
# Evaluate the adapter currently in memory on the test split.
eval_test()

100%|█████████████████████████████████████████| 121/121 [04:56<00:00,  2.45s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Test Loss: 2.8103
Average Test BERTScore: 0.8371
Generated Example 1: Coroutine of a man speaking and then the sound of water flowing in the background with a distant bird chirping nearby. A person speaks loudly, but there is no wind or noise in the background. The water is moving slowly as well. Birds are singing. There is a loud clapping sound. Then the water stops and
Reference Example 1: An engine idling with light wind
Generated Example 2: 间 a man speaking and then some noise followed by water flowing in the background as he speaks again. The water is running fast, but it's not very loud. A woman talks nearby.
A man speaking with his voice rising higher as he speaks.
The water is bubbling and splashing around.
A dog barks
Reference Example 2: Man talking and a tapping clicking
Generated Example 3:   A child is crying and a woman speaks to the child then another child cries and a man speaks to the child. Then another child cries and a woman speaks to the child. Another child cries 

#### 17 векторов

In [None]:
# Evaluate the adapter currently in memory on the test split.
eval_test()

100%|█████████████████████████████████████████| 121/121 [04:54<00:00,  2.43s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Test Loss: 2.7928
Average Test BERTScore: 0.8246
Generated Example 1: id driving with a car engine running and a man speaking in the background as a motorcycle passes by on the road. The man is talking about something nearby, but it's hard to hear him clearly because of the noise from the vehicle and the wind blowing around them. A woman is speaking to someone else nearby, but she
Reference Example 1: An engine idling with light wind
Generated Example 2: 泉水 flowing with a man speaking and birds chirping in the background as water splashes nearby, followed by a woman talking while she sings a song into a microphone. The sound is soft but there are some clinking sounds of metal objects being removed from a sink faucet.</figcaption> <figure class="image-object image-object
Reference Example 2: Man talking and a tapping clicking
Generated Example 3: id dogs barking and a woman speaking in the background followed by a man laughing and talking in the background, then a dog barks again and a 

#### 9 векторов

In [None]:
# Evaluate the adapter currently in memory on the test split.
eval_test()

100%|█████████████████████████████████████████| 121/121 [04:53<00:00,  2.43s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Test Loss: 2.8062
Average Test BERTScore: 0.7969
Generated Example 1: --}}
A vehicle engine is running and a man speaks in the background
An ambulance chimes its horn as it passes by
The crowd cheers loudly as people walk around
People are talking about something nearby
A car is moving slowly on the road
A police officer is speaking to someone nearby
A group of people are
Reference Example 1: An engine idling with light wind
Generated Example 2: 川话的说话声伴随着轻微风声和雨滴声，风吹过窗帘时有轻微的响声，人说话声在空气中轻柔地飘动，偶尔有风吹过窗帘的声音，人说话声在空气中轻柔地飘动，偶尔有风吹过窗帘的声音，人说话声在
Reference Example 2: Man talking and a tapping clicking
Generated Example 3: 川话与人说话的重复声，人说话的声音由轻到重逐渐变高，人说话声音由低到高，人说话的人在说话，人说话的声音由轻到重逐渐变高，人说话的人在说话，人说话的人在说话，人说话的人在说话，人
Reference Example 3: People are speaking, and a goat bleats



### Выводы

- На первый взгляд задача казалась гораздо легче, и я её сильно недооценил: когда дошло до реализации, оказалось непросто придумать, как всё-таки подавать на вход LLM эмбеддинги, а обучаться по токенам
- Видно, что система обучается с ростом числа эпох: Loss на обучении и валидации падает, а генерации становятся лучше; BERTScore при этом остаётся примерно на одном уровне (0.8359 → 0.8337 → 0.8370)
- Результаты не идеальные, но и LLM всего 0.5B, которую мы при этом не размораживали
- В целом генерация получается тематически похожей на референс, что видно по неплохому BertScore
- Генерации получаются заметно длиннее, чем референс
- Иногда генерации получаются такие, что модель просто перечисляет все возможные описания звуков, такой локальный оптимум)
- На ранних этапах обучения генерировались случайные символы там, где не было человеческой речи, — это самые сложные примеры. А там, где была речь, уже генерировался привычный текст
- После тестов разного количества входных векторов оказалось, что лучшие результаты (по BERTScore) у 125 векторов на входе, что достигается при kernel_size=7, stride=4 и padding=3 у свёрточного слоя. Но важное уточнение: помимо количества векторов на входе, также менялось и количество параметров адаптера
- В целом считаю проделанную работу и результат успешным

### Что можно сделать ещё

- Попробовать разморозить LLM
- Подобрать входной промпт
- Протестировать другую архитектуру адаптера, например, несколько свёрток подряд