IMPORTING REQUIRED LIBRARIES

In [None]:
# Mount Google Drive into the Colab filesystem. The metadata CSVs, the audio
# folders and the saved model used below all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers datasets torchaudio librosa
!pip install -U transformers
!pip install -U accelerate
!pip install -U datasets
!pip install -U bertviz
!pip install -U umap-learn
!pip install seaborn --upgrade
!pip install evaluate

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.5.1->torchaudio)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.5.1->torchaudio)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from

In [None]:
import torch
import torchaudio
import librosa
import numpy as np
import pandas as pd
from transformers import ASTFeatureExtractor, ASTForAudioClassification, TrainingArguments, Trainer
from datasets import load_dataset, Dataset, DatasetDict
import torchaudio.transforms as T
import os



LOADING CSV FILE FOR FEATURE EXTRACTION

In [None]:
# Metadata columns that are not needed for training.
columns = ['File ID', 'Duration in ms', 'Classname', 'augmentation  id', 'Augmentation  type', 'source']

# Read the training metadata, discard the unused columns and expose the
# class id under the additional column name "label".
df = (
    pd.read_csv("/content/drive/MyDrive/metadata of train set .csv")
    .drop(columns=columns)
    .assign(label=lambda frame: frame['Class ID'])
)

# Wrap the trimmed frame as a Hugging Face Dataset.
dataset = Dataset.from_pandas(df)


In [None]:
# Summarise the trimmed training metadata: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6289 entries, 0 to 6288
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Filename  6289 non-null   object
 1   Class ID  6289 non-null   int64 
 2   label     6289 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 147.5+ KB


FUNCTION FOR FEATURE EXTRACTION AND PREPROCESSING

In [None]:

# Use the GPU for waveform resampling when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Feature extractor with its default configuration; it is shared by every
# preprocessing call in this notebook.
feature_extractor = ASTFeatureExtractor()

def preprocess_audio(examples, audio_dir='/content/drive/MyDrive/train'):
    """Turn a batch of audio file names into AST input features.

    Args:
        examples: Batch dictionary as passed by ``Dataset.map(batched=True)``;
            ``examples["Filename"]`` is the list of file names to load.
        audio_dir: Directory the file names are resolved against. Defaults to
            the training folder, so existing one-argument calls are unchanged.

    Returns:
        The same ``examples`` dictionary with an added ``"input_values"`` key
        holding one nested Python list of features per file.
    """
    target_rate = 16000
    resamplers = {}  # one Resample module per distinct source rate in this batch
    input_values_list = []

    for filename in examples["Filename"]:
        waveform, sample_rate = torchaudio.load(os.path.join(audio_dir, filename))

        # torchaudio returns (channels, time) but the feature extractor expects
        # mono audio. Averaging the channel axis downmixes multi-channel files
        # and leaves single-channel files numerically unchanged.
        waveform = waveform.to(device).mean(dim=0)

        if sample_rate != target_rate:
            if sample_rate not in resamplers:
                resamplers[sample_rate] = T.Resample(
                    orig_freq=sample_rate, new_freq=target_rate
                ).to(device)
            waveform = resamplers[sample_rate](waveform)

        waveform = waveform.cpu().numpy()
        inputs = feature_extractor(waveform, sampling_rate=target_rate, return_tensors="pt", padding=True)

        # Drop only the leading batch axis added by the extractor; a bare
        # squeeze() would also remove any other axis that happens to have size 1.
        input_values_list.append(inputs["input_values"].squeeze(0).tolist())

    examples["input_values"] = input_values_list
    return examples




In [None]:
# Run feature extraction over the training metadata, eight files per call.
dataset = dataset.map(
    preprocess_audio,
    batched=True,
    batch_size=8,
)


Map:   0%|          | 0/6289 [00:00<?, ? examples/s]

In [None]:
# Hold out 20% of the training data for validation. The fixed seed makes the
# split identical on every run, so metrics are comparable between runs.
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)

IMPORTING TRANSFORMERS FOR CLASSIFICATION

In [None]:
from transformers import ASTConfig

# Count the classes from the "label" column. At this point `df` is the training
# metadata, whose class column is named "Class ID"; "Class_id" exists only in
# the test metadata, so indexing it here raises KeyError.
num_labels = int(df["label"].nunique())
config = ASTConfig(num_labels=num_labels)

# The model is constructed from the config alone (no from_pretrained call),
# so no checkpoint weights are loaded here.
model = ASTForAudioClassification(config)

In [None]:
# Training configuration shared by the training run and the later test-set evaluation.
# NOTE(review): recent transformers releases name the first strategy argument
# `eval_strategy`; confirm which spelling the installed version accepts.
training_args = TrainingArguments(
    output_dir="./ast_trained",
    evaluation_strategy="epoch",  # evaluate on the validation split once per epoch
    save_strategy="epoch",        # write a checkpoint once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # The recorded training log and the reported validation metrics come from a
    # 5-epoch run (global_step=3145, 'epoch': 5.0), so 5 is what reproduces them.
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
)




In [None]:
from sklearn.metrics import accuracy_score, f1_score
from evaluate import load



# Metric objects are loaded once and reused on every evaluation pass.
recall_metric = load("recall")
accuracy_metric = load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged recall for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        Dictionary with the keys ``"accuracy"`` and ``"recall"``.
    """
    logits, labels = eval_pred
    predicted_ids = logits.argmax(axis=-1)
    scored = {"predictions": predicted_ids, "references": labels}

    return {
        "accuracy": accuracy_metric.compute(**scored)["accuracy"],
        "recall": recall_metric.compute(average="macro", **scored)["recall"],
    }

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/7.38k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
# Trainer for fine-tuning: trains on the 80% split and reports compute_metrics
# on the held-out 20% split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset['train'],
    eval_dataset=split_dataset['test'],
    compute_metrics=compute_metrics,
)



In [None]:
# Run the training loop; the per-epoch validation metrics are listed in the output below.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Recall
1,1.1237,0.939706,0.63752,0.429339
2,1.0005,0.651022,0.761526,0.586656
3,0.652,0.585862,0.793323,0.706659
4,0.3133,0.501511,0.82035,0.74325
5,0.4452,0.506024,0.825119,0.772578


TrainOutput(global_step=3145, training_loss=0.7507315528790788, metrics={'train_runtime': 10985.2466, 'train_samples_per_second': 2.29, 'train_steps_per_second': 0.286, 'total_flos': 1.7051534020037837e+18, 'train_loss': 0.7507315528790788, 'epoch': 5.0})

Validation accuracy = 82.5%, recall = 77%

In [None]:
# Persist the trained model to Google Drive so it can be reloaded in a later cell or session.
save_path = "/content/drive/MyDrive/my_custom_model"


model.save_pretrained(save_path)

In [None]:
from transformers import ASTForAudioClassification, ASTFeatureExtractor
# Reload the model from the directory it was saved to on Google Drive.
save_path = "/content/drive/MyDrive/my_custom_model"

model = ASTForAudioClassification.from_pretrained(save_path)

NOW TESTING IT ON TEST SET

In [None]:
import evaluate


# Metric objects for the test-set evaluation; macro F1 is added to the pair
# used during training.
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")
recall_metric = evaluate.load("recall")


def compute_metrics(eval_pred):
    """Compute accuracy, macro F1 and macro recall for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        Dictionary with the keys ``"accuracy"``, ``"f1_score"`` and ``"recall"``.
    """
    logits, labels = eval_pred
    predicted_ids = logits.argmax(axis=-1)
    scored = {"predictions": predicted_ids, "references": labels}

    accuracy_value = accuracy_metric.compute(**scored)["accuracy"]
    f1_value = f1_metric.compute(average="macro", **scored)["f1"]
    recall_value = recall_metric.compute(average="macro", **scored)["recall"]

    return {"accuracy": accuracy_value, "f1_score": f1_value, "recall": recall_value}


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.38k [00:00<?, ?B/s]

LOADING THE TEST SET CSV

In [None]:
# Metadata columns that are not needed for evaluation.
columns = ['File_ID', 'Durationin ms', 'Classname', 'Augment Id', 'Augmentation type', 'source']

# Read the test metadata, discard the unused columns and expose the class id
# under the additional column name "label".
df = (
    pd.read_csv("/content/drive/MyDrive/metadata of test set.csv")
    .drop(columns=columns)
    .assign(label=lambda frame: frame['Class_id'])
)

# Wrap the trimmed frame as a Hugging Face Dataset.
dataset_test = Dataset.from_pandas(df)

In [None]:
# Summarise the trimmed test metadata: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 725 entries, 0 to 724
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Filename  725 non-null    object
 1   Class_id  725 non-null    int64 
 2   label     725 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 17.1+ KB


CHANGING THE FUNCTION FOR THE TEST SET

In [None]:
def preprocess_audio(examples, audio_dir='/content/drive/MyDrive/test'):
    """Turn a batch of audio file names into AST input features.

    Args:
        examples: Batch dictionary as passed by ``Dataset.map(batched=True)``;
            ``examples["Filename"]`` is the list of file names to load.
        audio_dir: Directory the file names are resolved against. Defaults to
            the test folder, so existing one-argument calls are unchanged.

    Returns:
        The same ``examples`` dictionary with an added ``"input_values"`` key
        holding one nested Python list of features per file.
    """
    target_rate = 16000
    resamplers = {}  # one Resample module per distinct source rate in this batch
    input_values_list = []

    for filename in examples["Filename"]:
        waveform, sample_rate = torchaudio.load(os.path.join(audio_dir, filename))

        # torchaudio returns (channels, time) but the feature extractor expects
        # mono audio. Averaging the channel axis downmixes multi-channel files
        # and leaves single-channel files numerically unchanged.
        waveform = waveform.to(device).mean(dim=0)

        if sample_rate != target_rate:
            if sample_rate not in resamplers:
                resamplers[sample_rate] = T.Resample(
                    orig_freq=sample_rate, new_freq=target_rate
                ).to(device)
            waveform = resamplers[sample_rate](waveform)

        waveform = waveform.cpu().numpy()
        inputs = feature_extractor(waveform, sampling_rate=target_rate, return_tensors="pt", padding=True)

        # Drop only the leading batch axis added by the extractor; a bare
        # squeeze() would also remove any other axis that happens to have size 1.
        input_values_list.append(inputs["input_values"].squeeze(0).tolist())

    examples["input_values"] = input_values_list
    return examples
# Extract features for the test set. The source must be `dataset_test` (built
# from the test metadata); `dataset` is the training set.
dataset_test = dataset_test.map(preprocess_audio, batched=True, batch_size=8)

Map:   0%|          | 0/725 [00:00<?, ? examples/s]

RECONFIGURING THE TRAINER FOR EVALUATION ON THE TEST SET

In [None]:
# Evaluation-only Trainer: no train_dataset is passed, and the reloaded model
# is scored on the test set with the three-metric compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,

    eval_dataset=dataset_test,
    compute_metrics=compute_metrics,
)

In [None]:
# Score the model on the test set and print the resulting metrics dictionary.
results = trainer.evaluate()
print(results)

{'eval_loss': 0.882420539855957, 'eval_model_preparation_time': 0.003, 'eval_accuracy': 0.7158620689655173, 'eval_f1_score': 0.6742143997743737, 'eval_recall': 0.7147930147668824, 'eval_runtime': 124.4278, 'eval_samples_per_second': 5.827, 'eval_steps_per_second': 0.731}


Test set: F1 score = 0.67, accuracy = 71.6%, recall = 71.5%