In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers==4.28.0
!pip install datasets

Collecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers==4.28.0)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.28.0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m81.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.16.4 tokenizers-0.13.3 transformers-4.28.0
Collecting datasets
  Downloading datasets-2.14.0-py3-none-any.whl (492 kB)
[2K     [9

In [None]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device='cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# HuBERT CTC checkpoint; it is handed to the dataset class as the ASR generator.
from transformers import HubertForCTC
hubert=HubertForCTC.from_pretrained('facebook/hubert-large-ls960-ft')
# Processor loaded from the same checkpoint as the HuBERT model above.
# NOTE(review): HubertModel is imported here but not referenced in the visible cells.
from transformers import AutoProcessor, HubertModel
processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
# GPT-2 tokenizer, used by the dataset for the ASR transcript and the prompt.
from transformers import AutoTokenizer
tokenizer=AutoTokenizer.from_pretrained('gpt2')
# Whisper feature extractor; its sampling_rate is the rate audio is loaded at.
from transformers import AutoFeatureExtractor
whisper_ckpt='openai/whisper-base'
feature_extractor=AutoFeatureExtractor.from_pretrained(whisper_ckpt)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

In [None]:
#importing csv file
"""
The CSV file must follow this format:
path: path to the audio clip (it is passed to librosa.load)
Label: class label of the sample, e.g. sad, happy, angry, ...
ID: integer value corresponding to Label, i.e. 0, 1, 2, ...
"""
import pandas as pd
df=pd.read_csv('/content/drive/MyDrive/ALM_models2/train_sound_event.csv')

In [None]:
# Number of distinct class names found in the CSV.
num_labels = len(df['Label'].unique())
# ID -> Label lookup taken from the CSV columns, and its inverse Label -> ID.
id2label = df.set_index('ID')['Label'].to_dict()
label2id = {label_name: label_id for label_id, label_name in id2label.items()}

In [None]:
from sklearn.model_selection import train_test_split
# Fixed seed so the train/validation/test partition is identical after a kernel restart.
SPLIT_SEED=42
# 80% of the rows go to training; the held-out 20% is then divided 30/70 into validation/test.
train_df,holdout_df=train_test_split(df,test_size=0.2,random_state=SPLIT_SEED)
val_df,test_df=train_test_split(holdout_df,test_size=0.7,random_state=SPLIT_SEED)

In [None]:
import numpy as np
import librosa
#Dataset class for the model
#Audio is truncated to at most 4 seconds
#returns audio features, labels and the respective prompt along with text transcription
class dataset(Dataset):
  """Per-sample pipeline producing Whisper features, label id, ASR token ids and prompt ids.

  Args:
    train_df: mapping with 'path', 'Label' and 'ID' entries that can be indexed by
      integer position (the notebook passes DataFrame.to_dict(orient='list')).
    feature_extractor: callable exposing `sampling_rate` and returning 'input_features'.
    tokenizer: tokenizer applied to both the ASR transcript and the prompt.
    processor: processor that prepares 'input_values' for `asr_generator` and decodes its ids.
    asr_generator: model whose output exposes `.logits`; decoded greedily (argmax).
    max_seconds: audio longer than this many seconds is truncated (default 4, as before).

  Each item is the tuple (audio_feats, labels, asr, prompt_ids), all moved to `device`.
  """
  def __init__(self,train_df,feature_extractor,tokenizer,processor,asr_generator,max_seconds=4):
    super().__init__()
    self.train_df=train_df
    self.feature_extractor=feature_extractor
    self.target_sampling_rate=feature_extractor.sampling_rate
    self.tokenizer=tokenizer
    self.processor=processor
    self.asr_generator=asr_generator.to(device)
    # Maximum number of samples kept from each clip.
    self.max_samples=int(self.target_sampling_rate*max_seconds)
  def __len__(self):
    return len(self.train_df['Label'])
  def __getitem__(self,idx):
    path=self.train_df['path'][idx]
    waveform,_=librosa.load(path,sr=self.target_sampling_rate)
    # Keep at most max_samples samples; shorter clips are used as they are (no padding here).
    audio_feat=torch.tensor(waveform[:self.max_samples],dtype=torch.float32)
    utterance="Generate Emotion"

    #1 Audio features extracted with the Whisper feature extractor
    audio_feats=self.feature_extractor(audio_feat,sampling_rate=self.target_sampling_rate,return_tensors='pt')['input_features']

    #2 Label is the integer class ID taken straight from the table (it is not tokenized)
    labels=self.train_df['ID'][idx]

    #3 Text produced using Hubert
    processed=self.processor(audio_feat,sampling_rate=self.target_sampling_rate,return_tensors='pt')['input_values'].to(device)
    # The ASR model is only used for inference, so no autograd graph should be built per sample.
    with torch.no_grad():
      asr_logits=self.asr_generator(processed).logits.cpu()
    asr_ids=torch.argmax(asr_logits,dim=-1)
    transcript=self.processor.batch_decode(asr_ids,skip_special_tokens=True)
    asr=self.tokenizer(transcript,max_length=100,padding='max_length',truncation=True,return_tensors='pt')['input_ids']

    #4 Prompt for specific task
    prompt_ids=self.tokenizer(utterance,return_tensors='pt',max_length=10,padding='max_length',truncation=True)['input_ids']

    # Drop the leading batch dimension of size 1; the DataLoader adds the real batch dimension.
    audio_feats=torch.squeeze(audio_feats,dim=0).to(device)
    labels=torch.tensor(labels).to(device)
    prompt_ids=torch.squeeze(prompt_ids,dim=0).to(device)
    asr=torch.squeeze(asr,dim=0).to(device)

    return audio_feats,labels,asr,prompt_ids

In [None]:
# Convert each split into a dict of column lists; the dataset class then reads
# samples by integer position, e.g. train_dict['path'][idx].
train_dict=train_df.to_dict(orient='list')
val_dict=val_df.to_dict(orient='list')
test_dict=test_df.to_dict(orient='list')

In [None]:
tokenizer.pad_token=tokenizer.eos_token

In [None]:
# The three splits share the same feature extractor, tokenizer, processor and HuBERT model.
train_dataset=dataset(train_dict,feature_extractor,tokenizer,processor,hubert)
val_dataset=dataset(val_dict,feature_extractor,tokenizer,processor,hubert)
test_dataset=dataset(test_dict,feature_extractor,tokenizer,processor,hubert)

BATCH_SIZE=4
# Only the training data is shuffled; validation/test keep a fixed order so that
# evaluation batches (and the per-batch metrics computed on them) are reproducible.
train_loader=DataLoader(train_dataset,batch_size=BATCH_SIZE,shuffle=True)
val_loader=DataLoader(val_dataset,batch_size=BATCH_SIZE,shuffle=False)
test_loader=DataLoader(test_dataset,batch_size=BATCH_SIZE,shuffle=False)

In [None]:
import torch.nn as nn
from transformers import AutoConfig
from transformers import GPT2LMHeadModel
from transformers import WhisperModel
#Classification head on the output of GPT2 based on the number of classes
class ClassificationHead(nn.Module):
  """One linear projection from `input_dim` features to `output_dim` class scores.

  The projection weight is re-initialised with Xavier-uniform; the bias keeps
  the default nn.Linear initialisation.
  """
  def __init__(self,input_dim,output_dim):
    super(ClassificationHead,self).__init__()
    projection=nn.Linear(in_features=input_dim,out_features=output_dim)
    nn.init.xavier_uniform_(projection.weight)
    self.linear_layer=projection
  def forward(self,x):
    """Return the class scores for `x` (last dimension must be `input_dim`)."""
    return self.linear_layer(x)
#Transformer mapper network
"""Arguments- input-dimension, number of heads, hidden size dimension, number of blocks, input sequence length,
output_sequence length and output dimension.
Projects input to output dimension followed by multiheadself attention and sequence length reduction
"""
class TransformerModel(nn.Module):
    """Mapper network: project, self-attend over the sequence, project, then truncate.

    Args:
        input_dim: feature size of the incoming embeddings.
        nhead: number of attention heads in every encoder layer.
        nhid: model width used inside the transformer encoder.
        nlayers: number of stacked encoder layers.
        inp_seq_len: accepted for interface compatibility; not used by the network.
        out_seq: number of leading sequence positions kept in the output.
        output_dim: feature size of the returned embeddings.

    Input and output are batch-first: (batch, seq, feature). This is the layout the
    forward pass already assumes when it slices dimension 1 as the sequence axis.
    """
    def __init__(self, input_dim, nhead, nhid, nlayers, inp_seq_len,out_seq, output_dim):
        super(TransformerModel, self).__init__()

        # Stage 1: Self-attention mechanism that changes the output dimension
        self.encoder = nn.Linear(input_dim, nhid)
        # batch_first=True is required: nn.TransformerEncoderLayer defaults to
        # (seq, batch, feature), which would make a (batch, seq, feature) input attend
        # across the samples of a batch instead of across time steps.
        # NOTE: parameter names are unchanged, so existing checkpoints still load, but
        # weights trained with the old sequence-first layout should be re-validated.
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(nhid, nhead, batch_first=True), nlayers
        )
        self.proj = nn.Linear(nhid,output_dim)

        # Stage 2: the sequence is shortened by keeping the first `out_seq` positions
        self.out_seq=out_seq


    def forward(self, src):
        """Map (batch, seq, input_dim) to (batch, min(seq, out_seq), output_dim)."""
        src = self.encoder(src)
        src = self.transformer_encoder(src)
        src = self.proj(src)

        # Adjust the sequence length
        src = src[:,:self.out_seq,:]

        return src

"""
Main model consists of
whisper encoder, Mapper transformer, GPT2 Decoder
returns loss,logits
"""
class pengi_model_alike(nn.Module):
  """Whisper encoder -> transformer mapper -> GPT-2 language model.

  Args:
    whisper_ckpt: checkpoint name/path given to WhisperModel.from_pretrained.
    gpt2_ckpt: checkpoint name/path given to GPT2LMHeadModel.from_pretrained.

  Note that, after construction, `self.whisper_ckpt` and `self.gpt2_ckpt` hold the
  *config objects* of the two sub-models (other code reads e.g. `gpt2_ckpt.vocab_size`).
  """
  def __init__(self,whisper_ckpt,gpt2_ckpt):
    super().__init__()
    self.whisper_encoder=WhisperModel.from_pretrained(whisper_ckpt).encoder
    self.whisper_ckpt=self.whisper_encoder.config
    self.gpt2=GPT2LMHeadModel.from_pretrained(gpt2_ckpt)
    self.gpt2_ckpt=self.gpt2.config
    self.mapping=TransformerModel(input_dim=self.whisper_ckpt.d_model,
                                      nhead=6,
                                     nhid=1440,
                                    nlayers=8,
                                    inp_seq_len=self.whisper_ckpt.max_source_positions,
                                     out_seq=500,
                                     output_dim=self.gpt2_ckpt.n_embd)


  def forward(self,
              input_audio_features,
              asr_text_ids=None,
              input_text_ids=None,
              input_text_embeds=None,
              labels=None,
              return_dict=False):
    """Run GPT-2 on the sequence [audio embeddings | ASR tokens | prompt tokens].

    Args:
      input_audio_features: input for the Whisper encoder.
      asr_text_ids: optional token ids of the ASR transcript; skipped when None.
      input_text_ids: optional prompt token ids.
      input_text_embeds: optional pre-computed prompt embeddings, used only when
        `input_text_ids` is None.
      labels: optional integer targets for the trailing (prompt) positions; the audio
        and ASR positions are filled with -100 so the loss ignores them.
      return_dict: accepted for interface compatibility; GPT-2 is always called with
        return_dict=True.

    Returns:
      The GPT-2 output object (includes the loss when `labels` is given).
    """
    encoder_out=self.whisper_encoder(input_audio_features)
    audio_embeddings=self.mapping(encoder_out[0])

    token_embedding=self.gpt2.transformer.wte
    segments=[audio_embeddings]
    # Number of leading positions that carry no target (audio, then ASR tokens).
    ignored_len=audio_embeddings.shape[-2]
    if asr_text_ids is not None:
      segments.append(token_embedding(asr_text_ids))
      ignored_len+=asr_text_ids.shape[-1]
    if input_text_ids is not None:
      segments.append(token_embedding(input_text_ids))
    elif input_text_embeds is not None:
      segments.append(input_text_embeds)
    input_embeds=torch.cat(segments,dim=-2)

    if labels is None:
      return self.gpt2(inputs_embeds=input_embeds,return_dict=True)

    # Build the ignore prefix in the labels' own integer dtype: a float prefix would
    # promote the concatenated labels to float, which the cross-entropy loss rejects.
    ignore_prefix=torch.full((labels.shape[0],ignored_len),-100,
                             dtype=labels.dtype,device=labels.device)
    labels=torch.cat([ignore_prefix,labels],dim=-1)
    return self.gpt2(inputs_embeds=input_embeds,labels=labels,return_dict=True)

In [None]:
#Model combining Classification head and pengi module
#Returns the logits for the final classes
class ClassifierModel(nn.Module):
  """Wraps the audio-language model and classifies from its last-position logits.

  Args:
    pengi_model: backbone whose config is reachable as `gpt2_ckpt` and whose output
      exposes `.logits`.
    output_dim: number of target classes.
  """
  def __init__(self,pengi_model,output_dim):
    super(ClassifierModel,self).__init__()
    self.model=pengi_model
    vocab_size=self.model.gpt2_ckpt.vocab_size
    self.classification=ClassificationHead(vocab_size,output_dim)
  def forward(self,
              input_audio_features,
              asr_text_ids=None,
              input_text_ids=None,
              input_text_embeds=None,
              labels=None,
              return_dict=False):
    """Return class scores computed from the backbone logits at the final position."""
    backbone_out=self.model(input_audio_features,
                            asr_text_ids,
                            input_text_ids,
                            input_text_embeds,
                            labels,
                            return_dict=False)
    # Only the logits of the last sequence position feed the classification head.
    final_position_logits=backbone_out.logits[:,-1,:]
    return self.classification(final_position_logits)

In [None]:
pengi=pengi_model_alike(whisper_ckpt='openai/whisper-base',gpt2_ckpt='gpt2')
model_path='/content/drive/MyDrive/ALM_models2/Experiment.pt'
# map_location remaps the stored tensors to the current device, so a checkpoint saved
# on a GPU can also be restored on a CPU-only runtime.
state_dict = torch.load(model_path,map_location=device)
pengi.load_state_dict(state_dict)
# Wrap the restored backbone with a fresh classification head (one output per label).
model=ClassifierModel(pengi,num_labels)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/290M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
from sklearn.metrics import accuracy_score,precision_score,recall_score
#The final report metrics are accuracy, precision and recall
def metrics(logits,labels):
  """Compute accuracy, weighted precision and weighted recall for one batch.

  Args:
    logits: tensor of class scores; the predicted class is the argmax over the last axis.
    labels: tensor of ground-truth class ids.

  Returns:
    Tuple (accuracy, precision, recall).
  """
  predicted=logits.detach().cpu().numpy().argmax(axis=-1)
  truth=labels.detach().cpu().numpy()

  # scikit-learn expects (y_true, y_pred). Swapping them leaves accuracy unchanged but
  # exchanges precision and recall, so the order matters here.
  # zero_division=0 keeps the default value for classes absent from a batch without warning.
  acc=accuracy_score(truth,predicted)
  prec=precision_score(truth,predicted,average='weighted',zero_division=0)
  rec=recall_score(truth,predicted,average='weighted',zero_division=0)
  return acc,prec,rec

In [None]:
model=model.to(device)

In [None]:
!pip install wandb
import wandb
wandb.login()

Collecting wandb
  Downloading wandb-0.15.7-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.32-py3-none-any.whl (188 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.5/188.5 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.28.1-py2.py3-none-any.whl (214 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m214.7/214.7 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting pathtools (from wandb)
  Downloading pathtools-0.1.2.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.2-cp310-cp310-manylinux_2_5_x86_64.manyli

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
import torch.optim as optim
wandb.init(
    project='alm_model_run26_sound_event_pretrained',
)

[34m[1mwandb[0m: Currently logged in as: [33mdevansh20053[0m ([33mdevansh2002[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Freeze every parameter of the wrapped backbone (model.model).
# The trailing semicolon suppresses the module repr in the cell output.
model.model.requires_grad_(False);

In [None]:
from tqdm import tqdm
def train(model,train_loader,val_loader,epochs,logging_steps,optim,scheduler,loss_fn,grad_acc=1):
  """Train `model` with gradient accumulation and evaluate it after every epoch.

  Args:
    model: module called with input_audio_features / asr_text_ids / input_text_ids and
      returning class scores.
    train_loader: iterable of (audio, labels, asr, prompt) training batches.
    val_loader: iterable of (audio, labels, asr, prompt) validation batches.
    epochs: number of passes over `train_loader`.
    logging_steps: running averages are recorded and sent to wandb every this many batches.
    optim: the optimizer to step (the parameter keeps its original name).
    scheduler: learning-rate scheduler or None. A ReduceLROnPlateau receives the running
      training loss; any other scheduler is advanced by one step per optimizer update.
    loss_fn: callable (scores, labels) -> scalar loss.
    grad_acc: number of batches whose gradients are accumulated before one update.

  Returns:
    Tuple (train_loss, val_loss, train_acc, val_acc) of the recorded running averages.
  """
  # Use the optimizer that was passed in, not whatever global happens to be named `optimizer`.
  optimizer=optim
  grad_acc=max(1,int(grad_acc))
  # Only plateau schedulers take a metric; for the others the argument of step() is an
  # epoch index, so feeding it a loss value would corrupt the schedule.
  metric_driven=isinstance(scheduler,torch.optim.lr_scheduler.ReduceLROnPlateau)

  def _apply_update(mean_loss):
    # Clip, step the optimizer, advance the schedule, then clear the accumulated gradients.
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    if scheduler is not None:
      if metric_driven:
        scheduler.step(mean_loss)
      else:
        scheduler.step()
    optimizer.zero_grad()

  #wandb.watch(model,log='all',log_freq=20)
  train_loss=[]
  val_loss=[]
  train_acc=[]
  train_rec=[]
  train_prec=[]

  val_acc=[]
  val_prec=[]
  val_rec=[]

  def _log_validation(n_seen,loss_sum,acc_sum,prec_sum,rec_sum):
    # Record the running validation averages and mirror them to wandb.
    val_loss.append(loss_sum/n_seen)
    val_acc.append(acc_sum/n_seen)
    val_prec.append(prec_sum/n_seen)
    val_rec.append(rec_sum/n_seen)
    wandb.log({"val/loss":val_loss[-1],
               "val/step":len(val_loss),
               "val/acc":val_acc[-1],
               "val/prec":val_prec[-1],
               "val/rec":val_rec[-1]})

  optimizer.zero_grad()
  for ep in range(epochs):
    running_loss=0.0
    accuracy=0.0
    precision=0.0
    recall=0.0
    seen=0
    model.train()
    for i,(audio,labels,asr,prompt) in enumerate(tqdm(train_loader)):
      out=model(input_audio_features=audio.to(device),
              asr_text_ids=asr.to(device),
              input_text_ids=prompt.to(device))

      labels=labels.to(out.device)
      loss=loss_fn(out,labels)
      running_loss+=loss.item()
      # Scale so the accumulated gradient is the mean over the window, not the sum.
      (loss/grad_acc).backward()
      acc,prec,rec=metrics(out,labels)
      accuracy+=acc
      precision+=prec
      recall+=rec
      seen=i+1

      if seen % grad_acc == 0:
        _apply_update(running_loss/seen)

      if(seen%logging_steps==0):
        train_loss.append(running_loss/seen)
        train_acc.append(accuracy/seen)
        train_prec.append(precision/seen)
        train_rec.append(recall/seen)
        wandb.log({"tr/loss":train_loss[-1],
                   "tr/step":len(train_loss),
                   "tr/acc":train_acc[-1],
                   "tr/prec":train_prec[-1],
                   "tr/rec":train_rec[-1]})

    # Flush gradients of an incomplete accumulation window. The check uses the batch
    # count (not the last index) so no update is made when nothing is pending.
    if seen % grad_acc != 0:
      _apply_update(running_loss/seen)

    model.eval()
    running_loss=0.0
    accuracy=0.0
    precision=0.0
    recall=0.0
    val_seen=0
    with torch.no_grad():
      for i,(audio,labels,asr,prompt) in enumerate(tqdm(val_loader)):
        out=model(input_audio_features=audio.to(device),
              asr_text_ids=asr.to(device),
              input_text_ids=prompt.to(device))
        labels=labels.to(out.device)
        loss=loss_fn(out,labels)
        running_loss+=loss.item()
        acc,prec,rec=metrics(out,labels)
        accuracy+=acc
        precision+=prec
        recall+=rec
        val_seen=i+1

        if(val_seen%logging_steps==0):
          _log_validation(val_seen,running_loss,accuracy,precision,recall)

    # Always record the end-of-epoch averages, even when the validation loader is
    # shorter than logging_steps (otherwise nothing would ever be logged).
    if val_seen and val_seen%logging_steps!=0:
      _log_validation(val_seen,running_loss,accuracy,precision,recall)
  return train_loss,val_loss,train_acc,val_acc

In [None]:
from transformers import get_linear_schedule_with_warmup

# Only the classification head is optimised; weight decay is set per parameter group.
learning_rates=[
    {'params':model.classification.parameters(),"lr":0.001,'weight_decay':0.01},
]
optimizer=optim.Adam(learning_rates)

# The linear warmup/decay schedule counts optimizer updates, so its horizon has to be the
# total number of updates (not the number of epochs) and the warmup must be shorter than it.
# These two values must match the epochs / grad_acc arguments given to train(...).
total_epochs = 4
grad_acc_steps = 4
updates_per_epoch = (len(train_loader) + grad_acc_steps - 1) // grad_acc_steps
total_steps = updates_per_epoch * total_epochs
# Warm up over the first 10% of the updates (at least one update).
warmup_steps = max(1, total_steps // 10)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)

In [None]:
model=model.to(device)

In [None]:
import warnings
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')
loss_fn=nn.CrossEntropyLoss()
# Positional arguments: epochs=4, logging_steps=15; gradients are accumulated over 4 batches.
train(model,train_loader,val_loader,4,15,optimizer,scheduler,loss_fn,grad_acc=4)

100%|██████████| 64/64 [01:59<00:00,  1.87s/it]
100%|██████████| 5/5 [00:10<00:00,  2.06s/it]
100%|██████████| 64/64 [00:51<00:00,  1.25it/s]
100%|██████████| 5/5 [00:03<00:00,  1.26it/s]
100%|██████████| 64/64 [00:54<00:00,  1.17it/s]
100%|██████████| 5/5 [00:03<00:00,  1.37it/s]
100%|██████████| 64/64 [00:56<00:00,  1.13it/s]
100%|██████████| 5/5 [00:04<00:00,  1.20it/s]


([1031.0332473754884,
  1010.4556739807128,
  1097.243089972602,
  1014.0724081675212,
  1171.3993794759115,
  894.2603800455729,
  958.4782212999132,
  906.5079010009765,
  903.3610636393229,
  998.6787902832032,
  978.5004964192708,
  916.10724512736,
  843.35458984375,
  936.8789591471354,
  948.327727593316,
  946.6590339660645],
 [],
 [0.05,
  0.05,
  0.05555555555555555,
  0.05416666666666667,
  0.06666666666666667,
  0.06666666666666667,
  0.06666666666666667,
  0.07083333333333333,
  0.03333333333333333,
  0.075,
  0.06111111111111111,
  0.0625,
  0.1,
  0.075,
  0.06666666666666667,
  0.075],
 [])

In [None]:
def predict(model,test_loader):
  """Run `model` over `test_loader` in eval mode and gather scores and labels.

  Returns:
    Tuple (pred, ground): the model outputs and the labels of all batches stacked
    along dimension 0, both on the CPU. Both are None when the loader yields nothing.
  """
  collected_scores=[]
  collected_labels=[]
  model.eval()
  with torch.no_grad():
    for audio,labels,asr,prompt in tqdm(test_loader):
      scores=model(input_audio_features=audio.to(device),
                   asr_text_ids=asr.to(device),
                   input_text_ids=prompt.to(device))
      # Keep the raw scores (no argmax) so callers can derive any metric later.
      collected_scores.append(scores.cpu().detach())
      collected_labels.append(labels.cpu())

    if len(collected_scores)==0:
      return None,None
    pred=torch.concat(collected_scores,axis=0)
    ground=torch.concat(collected_labels,axis=0)

  return pred,ground

In [None]:
pred,labels=predict(model,test_loader)

100%|██████████| 12/12 [00:08<00:00,  1.39it/s]
