In [1]:
import os
import pandas as pd
import numpy as np
import pytorch_lightning as pl
from sklearn.model_selection import train_test_split, StratifiedKFold
import librosa
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm, trange
from sklearn.metrics import accuracy_score, f1_score, balanced_accuracy_score
import pydub 
import numpy as np
from transformers import get_cosine_schedule_with_warmup, AutoModel, AutoFeatureExtractor, AutoModelForAudioClassification
import wandb
pl.seed_everything(56)

56

In [2]:
class CFG:
    class data:
        train_path = '/kaggle/input/tinkoff/train-4/train/'
        test_path = '/kaggle/input/tinkoff/test-5/test/'
        train_data = '/kaggle/input/tinkoff/train_gt.csv'
        test_data = '/kaggle/input/tinkoff/test-10.csv'
        feature_extractor = 'openai/whisper-small'
        sampling_rate=16_000
        num_workers = 4
        nfolds = 5
        batch_size = 16
        seed = 56
    class model:
        model = 'openai/whisper-small'
        num_labels = 2
        scheduler= True
        warnap_steps = 0.25
        pooling = False
        max_epoches = 10
        lr = 1e-4
        num_cycles = 0.5
        eps = 1e-6
        weight_decay = 0.0
        betas = (0.9, 0.999)
    seed = 56
    fold_number = 0

In [3]:
def make_df(data,path,is_test=False,sr=CFG.data.sampling_rate):
    data = pd.read_csv(data,header=None)
    df = pd.DataFrame()
    df['id'] = data[0]
    tqdm.pandas()
    df['array'] = df['id'].progress_apply(lambda x: librosa.load(path+x,sr=sr)[0])
    if not is_test:
        df['label'] = data[1]
    else:
        df['label'] = 0
    return df

In [4]:
class PLDataset(Dataset):
    def __init__(self, df, feature_extr):
        super().__init__()
        self.cfg = CFG.data
        self.data = df
        self.feature_extr = feature_extr
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        row = self.data.iloc[index]
        encode = self.feature_extr(
            row['array'],
            sampling_rate=self.cfg.sampling_rate,
            return_tensors="pt",
            #max_length=self.cfg.max_length,
            #padding='max_length',
        )
        
        return {'input_features':encode.input_features.squeeze(0),
                #'attention_mask':encode.attention_mask.squeeze(0),
                'labels':row['label']}

In [5]:
class PLDataModule(pl.LightningDataModule):
    def __init__(self):
        super().__init__()
        self.cfg = CFG.data
        self.is_setup = False
        
    def prepare_data(self):
        if self.is_setup:
            return None
        
        self.df = make_df(self.cfg.train_data,self.cfg.train_path)
        self.test_df = make_df(self.cfg.test_data,self.cfg.test_path,is_test=True)
        self.feature_extr = AutoFeatureExtractor.from_pretrained(self.cfg.feature_extractor)
        self.feature_extr.return_attention_mask = True
        
    def setup(self, stage: str):
        if self.is_setup:
            return None

        kf = StratifiedKFold(n_splits=self.cfg.nfolds, shuffle=True, random_state=self.cfg.seed)
        splits = [(x,y) for x,y in  kf.split(self.df,self.df['label'])][CFG.fold_number]
        self.train_df, self.val_df = self.df.iloc[splits[0]], self.df.iloc[splits[1]]
        self.train_dataset = PLDataset(self.train_df,self.feature_extr)
        self.val_dataset = PLDataset(self.val_df,self.feature_extr)
        self.predict_dataset = PLDataset(self.test_df,self.feature_extr)
        self.is_setup = True
    
    def train_dataloader(self):
        return DataLoader(self.train_dataset,
                         batch_size=self.cfg.batch_size,
                         num_workers=self.cfg.num_workers,
                         shuffle=True)
    
    def val_dataloader(self):
        return DataLoader(self.val_dataset,
                          batch_size=self.cfg.batch_size,
                          num_workers=self.cfg.num_workers,
                          shuffle=False)
    
    def predict_dataloader(self):
        return DataLoader(self.predict_dataset,
                          batch_size=self.cfg.batch_size,
                          num_workers=self.cfg.num_workers,
                          shuffle=False)

In [6]:
class AverageMeter():
    def __init__(self):
        self.preds = []
        self.labels = []
    
    def update(self,y_t,y_p):
        self.labels += y_t
        self.preds += y_p
        
    def clean(self):
        self.preds = []
        self.labels = []

    def calc_metrics(self):
        metrics = {}
        metrics['accuracy'] = accuracy_score(self.labels, self.preds)
        metrics['balanced_accuracy'] = balanced_accuracy_score(self.labels, self.preds)
        metrics['f1'] = f1_score(self.labels, self.preds)
        
        return metrics

In [7]:
class PLModule(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.cfg = CFG.model
        self.model = AutoModelForAudioClassification.from_pretrained(self.cfg.model)
        self.criterion = nn.CrossEntropyLoss(weight=torch.tensor([0.367375, 0.632625]))
        self.avg_meter = AverageMeter()
        
    def forward(self, batch):
        output = self.model(**batch)
        return output

    def training_step(self, batch, i):
        out = self(batch)
        loss = out.loss#self.criterion(out.logits,batch['labels'])
        self.log('train_loss', loss.item())
        return loss
            
    def validation_step(self, batch, i):
        out = self(batch)
        loss = out.loss#self.criterion(out.logits,batch['labels'])
        self.log('val_loss',loss.item())
        preds = out.logits.argmax(dim=-1).tolist()
        self.avg_meter.update(batch['labels'].tolist(),preds)
    
    def predict_step(self, batch, i):
        out = self(batch)
        logits = out.logits
        return logits.argmax(dim=-1).tolist()
                
    def on_validation_epoch_end(self):
        metrics = self.avg_meter.calc_metrics()
        self.log_dict(metrics)
        self.avg_meter.clean()
            
    def configure_optimizers(self):
        optim = torch.optim.AdamW(self.parameters(),
                                  lr=self.cfg.lr,
                                  betas=self.cfg.betas,
                                  weight_decay=self.cfg.weight_decay,
                                  eps=self.cfg.eps)
        
        scheduler = get_cosine_schedule_with_warmup(optim,
                                                    num_training_steps=self.cfg.num_training_steps,
                                                    num_warmup_steps=self.cfg.num_training_steps * self.cfg.warnap_steps,
                                                    num_cycles=self.cfg.num_cycles)
        
        scheduler = {'scheduler': scheduler,'interval': 'step', 'frequency': 1}

        return [optim], [scheduler]

In [8]:
dm = PLDataModule()
#dm.feature_extr = AutoFeatureExtractor.from_pretrained(dm.cfg.feature_extractor)
#dm.df = df
#dm.test_df = test_df
dm.prepare_data()
dm.setup(0)

  0%|          | 0/8803 [00:00<?, ?it/s]

  0%|          | 0/2870 [00:00<?, ?it/s]

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

In [9]:
CFG.model.num_training_steps = len(dm.train_dataloader()) * CFG.model.max_epoches

In [10]:
model = PLModule()

config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

Some weights of WhisperForAudioClassification were not initialized from the model checkpoint at openai/whisper-small and are newly initialized: ['model.classifier.bias', 'model.classifier.weight', 'model.projector.bias', 'model.projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
wandb.login(key="31520b01739d418e5d77a11fd8a79a70b189b8bc")
os.environ['WANDB_API_KEY'] = "31520b01739d418e5d77a11fd8a79a70b189b8bc"
wandb.init(project='AIIJC_task4',name='whisper_small_fastv2')

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mandrewkhl[0m ([33mandlh[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.17.4 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.17.0
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20240708_132728-1hku7nbr[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mwhisper_small_fastv2[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/andlh/AIIJC_task4[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/andlh/AIIJC_task4/runs/1hku7nbr[0m


In [12]:
lr_monitor = pl.callbacks.LearningRateMonitor(logging_interval='step')
checkpoint_cb = pl.callbacks.ModelCheckpoint(
    dirpath='./outputs/',
    filename='model_{epoch:02d}-{accuracy:.4f}',
    monitor='accuracy',
    mode='max',
    save_last=True
)

trainer = pl.Trainer(
    accelerator="gpu",
    precision=16,
    callbacks = [lr_monitor,checkpoint_cb],
    logger = pl.loggers.WandbLogger(),
    log_every_n_steps=1,
    min_epochs=1,
    devices=2,
    check_val_every_n_epoch=1,
    max_epochs=CFG.model.max_epoches
)

/opt/conda/lib/python3.10/site-packages/lightning_fabric/connector.py:563: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!


In [13]:
trainer.fit(model, datamodule=dm)

/opt/conda/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

2024-07-08 13:27:53.385725: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-08 13:27:53.385719: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-08 13:27:53.385726: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-08 13:27:53.385729: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-08 13:27:53.385800: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory 

Training: |          | 0/? [00:00<?, ?it/s]

2024-07-08 13:28:15.798184: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-08 13:28:15.808518: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-08 13:28:15.810745: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-08 13:28:15.847924: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-08 13:28:15.848251: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factor

Validation: |          | 0/? [00:00<?, ?it/s]

2024-07-08 13:33:48.858940: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-08 13:33:48.859404: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-08 13:33:48.862612: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-08 13:33:48.873284: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-08 13:33:48.873612: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factor

Validation: |          | 0/? [00:00<?, ?it/s]

2024-07-08 13:40:10.596879: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-08 13:40:10.598468: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-08 13:40:10.601424: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-08 13:40:10.720709: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-08 13:40:10.723861: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factor

Validation: |          | 0/? [00:00<?, ?it/s]

2024-07-08 13:46:34.410650: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-08 13:46:34.411169: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-08 13:46:34.415747: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-08 13:46:34.595183: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-08 13:46:34.596694: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factor

Validation: |          | 0/? [00:00<?, ?it/s]

2024-07-08 13:52:55.392154: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-08 13:52:55.394790: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-08 13:52:55.397800: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-08 13:52:55.528357: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-08 13:52:55.539525: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factor

Validation: |          | 0/? [00:00<?, ?it/s]

2024-07-08 13:59:11.990357: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-08 13:59:11.990827: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-08 13:59:11.994362: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-08 13:59:12.020354: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-08 13:59:12.030531: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factor

Validation: |          | 0/? [00:00<?, ?it/s]

2024-07-08 14:05:27.418342: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-08 14:05:27.419019: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-08 14:05:27.439772: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-08 14:05:27.586923: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-08 14:05:27.591543: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factor

Validation: |          | 0/? [00:00<?, ?it/s]

2024-07-08 14:11:37.914381: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-08 14:11:37.925520: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-08 14:11:37.928469: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-08 14:11:37.967665: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-08 14:11:37.977544: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factor

Validation: |          | 0/? [00:00<?, ?it/s]

2024-07-08 14:17:48.565819: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-08 14:17:48.567013: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-08 14:17:48.569722: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-08 14:17:48.566348: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-08 14:17:48.570308: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factor

Validation: |          | 0/? [00:00<?, ?it/s]

2024-07-08 14:23:51.497824: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-08 14:23:51.497901: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-08 14:23:51.502035: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-08 14:23:51.606973: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-08 14:23:51.608242: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factor

Validation: |          | 0/? [00:00<?, ?it/s]

2024-07-08 14:29:52.373226: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-08 14:29:52.373309: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-08 14:29:52.386072: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-08 14:29:52.407691: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-08 14:29:52.408632: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factor