In [1]:
import os
import librosa
import gc
import ast
import torch
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import torchaudio
import sys
import datasets
from datasets import load_dataset,load_metric
from transformers import AutoFeatureExtractor
from transformers import AutoModelForAudioClassification, Trainer
from transformers import TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_checkpoint = "facebook/wav2vec2-base"
batch_size = 16

In [3]:
#load the dataset
dataset = load_dataset("audiofolder", data_dir="FoR Dataset Audio Detection\\for-norm\\for-norm")

In [4]:
#verify
dataset

DatasetDict({
    train: Dataset({
        features: ['audio', 'label'],
        num_rows: 53868
    })
    validation: Dataset({
        features: ['audio', 'label'],
        num_rows: 10798
    })
    test: Dataset({
        features: ['audio', 'label'],
        num_rows: 4634
    })
})

In [5]:
#Check how it is loaded
dataset["test"][50]

{'audio': {'path': 'C:\\Users\\aagas\\Desktop\\Deepfake Audio_Detection\\FoR Dataset Audio Detection\\for-norm\\for-norm\\testing\\fake\\file1043.wav_16k.wav_norm.wav_mono.wav_silence.wav',
  'array': array([ 0.08981323,  0.08868408,  0.08905029, ...,  0.02703857,
         -0.00488281, -0.02178955]),
  'sampling_rate': 16000},
 'label': 0}

In [6]:
dataset["train"].features["label"]

ClassLabel(names=['fake', 'real'], id=None)

In [7]:
#label the real and fake
labels = dataset["train"].features["label"].names
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label

In [8]:
#verify the label
id2label

{'0': 'fake', '1': 'real'}

In [9]:
feature_extractor = AutoFeatureExtractor.from_pretrained(model_checkpoint)
feature_extractor



Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000
}

In [10]:
max_duration = 5.0  # seconds

In [11]:
def preprocess_function(examples):
    audio_arrays = [x["array"] for x in examples["audio"]]
    inputs = feature_extractor(
        audio_arrays,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=int(feature_extractor.sampling_rate * max_duration),
        truncation=True,
    )

    if "label" in examples:
        inputs["labels"] = [int(label) for label in examples["label"]]

    return inputs


In [12]:
encoded_dataset = dataset.map(preprocess_function, remove_columns=["audio"], batched=True)
encoded_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'input_values', 'labels'],
        num_rows: 53868
    })
    validation: Dataset({
        features: ['label', 'input_values', 'labels'],
        num_rows: 10798
    })
    test: Dataset({
        features: ['label', 'input_values', 'labels'],
        num_rows: 4634
    })
})

In [13]:
num_labels = len(id2label)
model = AutoModelForAudioClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Wav2Vec2ForSequenceClassification(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)


In [15]:
device

device(type='cuda')

In [16]:
#load_gpu
def to_device(data,device):
    if isinstance(data,(list,tuple)):
        return [to_device(x,device) for x in data]
    return data.to(device,non_blocking=True)

In [17]:
to_device(model,device)

Wav2Vec2ForSequenceClassification(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)


In [18]:
model_name = model_checkpoint.split("/")[-1]


args = TrainingArguments(
    f"{model_name}-finetuned-ks",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",

)



In [19]:
metric = load_metric("accuracy", trust_remote_code=True)

def compute_metrics(eval_pred):
    """Computes accuracy on a batch of predictions"""
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


  metric = load_metric("accuracy", trust_remote_code=True)


In [20]:
class MyTrainer(Trainer):
    def training_step(self, model, inputs):
        if "labels" in inputs:
            inputs["labels"] = inputs["labels"].to(torch.int64)
        return super().training_step(model, inputs)


trainer = MyTrainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics
)

In [21]:
trainer.train()

  attn_output = torch.nn.functional.scaled_dot_product_attention(
  1%|          | 10/841 [01:47<2:24:34, 10.44s/it]

{'loss': 0.6844, 'grad_norm': 1.5011638402938843, 'learning_rate': 3.5294117647058825e-06, 'epoch': 0.01}


  2%|▏         | 20/841 [03:47<2:43:43, 11.97s/it]

{'loss': 0.6539, 'grad_norm': 1.681717872619629, 'learning_rate': 7.058823529411765e-06, 'epoch': 0.02}


  4%|▎         | 30/841 [05:50<2:46:05, 12.29s/it]

{'loss': 0.6044, 'grad_norm': 1.3781179189682007, 'learning_rate': 1.0588235294117648e-05, 'epoch': 0.04}


  5%|▍         | 40/841 [07:52<2:42:30, 12.17s/it]

{'loss': 0.506, 'grad_norm': 2.9065463542938232, 'learning_rate': 1.411764705882353e-05, 'epoch': 0.05}


  6%|▌         | 50/841 [09:53<2:42:16, 12.31s/it]

{'loss': 0.3824, 'grad_norm': 14.558954238891602, 'learning_rate': 1.7647058823529414e-05, 'epoch': 0.06}


  7%|▋         | 60/841 [11:48<2:31:43, 11.66s/it]

{'loss': 0.2624, 'grad_norm': 2.774967670440674, 'learning_rate': 2.1176470588235296e-05, 'epoch': 0.07}


  8%|▊         | 70/841 [13:45<2:35:10, 12.08s/it]

{'loss': 0.1829, 'grad_norm': 5.831363677978516, 'learning_rate': 2.4705882352941174e-05, 'epoch': 0.08}


 10%|▉         | 80/841 [15:33<2:09:01, 10.17s/it]

{'loss': 0.1173, 'grad_norm': 0.8220967650413513, 'learning_rate': 2.823529411764706e-05, 'epoch': 0.1}


 11%|█         | 90/841 [17:08<2:01:49,  9.73s/it]

{'loss': 0.0804, 'grad_norm': 0.5081084966659546, 'learning_rate': 2.98015873015873e-05, 'epoch': 0.11}


 12%|█▏        | 100/841 [18:48<2:02:39,  9.93s/it]

{'loss': 0.1056, 'grad_norm': 0.3603779971599579, 'learning_rate': 2.9404761904761905e-05, 'epoch': 0.12}


 13%|█▎        | 110/841 [20:23<1:55:03,  9.44s/it]

{'loss': 0.0528, 'grad_norm': 0.39001619815826416, 'learning_rate': 2.900793650793651e-05, 'epoch': 0.13}


 14%|█▍        | 120/841 [21:55<1:51:13,  9.26s/it]

{'loss': 0.0534, 'grad_norm': 4.0807061195373535, 'learning_rate': 2.8611111111111113e-05, 'epoch': 0.14}


 15%|█▌        | 130/841 [23:35<1:57:24,  9.91s/it]

{'loss': 0.053, 'grad_norm': 0.2828693389892578, 'learning_rate': 2.8214285714285714e-05, 'epoch': 0.15}


 17%|█▋        | 140/841 [25:14<1:55:35,  9.89s/it]

{'loss': 0.0633, 'grad_norm': 5.372171401977539, 'learning_rate': 2.781746031746032e-05, 'epoch': 0.17}


 18%|█▊        | 150/841 [26:51<1:48:26,  9.42s/it]

{'loss': 0.0253, 'grad_norm': 7.888138294219971, 'learning_rate': 2.7420634920634922e-05, 'epoch': 0.18}


 19%|█▉        | 160/841 [28:25<1:49:15,  9.63s/it]

{'loss': 0.0293, 'grad_norm': 0.15606823563575745, 'learning_rate': 2.7023809523809527e-05, 'epoch': 0.19}


 20%|██        | 170/841 [29:57<1:49:26,  9.79s/it]

{'loss': 0.018, 'grad_norm': 0.8475608825683594, 'learning_rate': 2.6626984126984127e-05, 'epoch': 0.2}


 21%|██▏       | 180/841 [31:33<1:46:46,  9.69s/it]

{'loss': 0.0295, 'grad_norm': 6.218556880950928, 'learning_rate': 2.623015873015873e-05, 'epoch': 0.21}


 23%|██▎       | 190/841 [33:05<1:45:54,  9.76s/it]

{'loss': 0.0234, 'grad_norm': 0.12025588005781174, 'learning_rate': 2.5833333333333336e-05, 'epoch': 0.23}


 24%|██▍       | 200/841 [34:45<1:46:11,  9.94s/it]

{'loss': 0.0381, 'grad_norm': 0.12027119100093842, 'learning_rate': 2.5436507936507936e-05, 'epoch': 0.24}


 25%|██▍       | 210/841 [36:17<1:42:09,  9.71s/it]

{'loss': 0.006, 'grad_norm': 0.09219257533550262, 'learning_rate': 2.503968253968254e-05, 'epoch': 0.25}


 26%|██▌       | 220/841 [37:49<1:34:21,  9.12s/it]

{'loss': 0.0068, 'grad_norm': 0.315120667219162, 'learning_rate': 2.464285714285714e-05, 'epoch': 0.26}


 27%|██▋       | 230/841 [39:28<1:40:35,  9.88s/it]

{'loss': 0.0212, 'grad_norm': 0.07669974863529205, 'learning_rate': 2.424603174603175e-05, 'epoch': 0.27}


 29%|██▊       | 240/841 [41:08<1:39:08,  9.90s/it]

{'loss': 0.0042, 'grad_norm': 0.07142796367406845, 'learning_rate': 2.384920634920635e-05, 'epoch': 0.29}


 30%|██▉       | 250/841 [42:45<1:38:06,  9.96s/it]

{'loss': 0.0321, 'grad_norm': 0.5085096955299377, 'learning_rate': 2.3452380952380954e-05, 'epoch': 0.3}


 31%|███       | 260/841 [44:23<1:32:36,  9.56s/it]

{'loss': 0.0038, 'grad_norm': 0.0709853246808052, 'learning_rate': 2.3055555555555554e-05, 'epoch': 0.31}


 32%|███▏      | 270/841 [46:00<1:33:39,  9.84s/it]

{'loss': 0.0191, 'grad_norm': 2.9118826389312744, 'learning_rate': 2.2658730158730162e-05, 'epoch': 0.32}


 33%|███▎      | 280/841 [47:38<1:32:33,  9.90s/it]

{'loss': 0.0168, 'grad_norm': 0.1290123015642166, 'learning_rate': 2.2261904761904763e-05, 'epoch': 0.33}


 34%|███▍      | 290/841 [49:06<1:23:34,  9.10s/it]

{'loss': 0.0097, 'grad_norm': 0.5669963955879211, 'learning_rate': 2.1865079365079367e-05, 'epoch': 0.34}


 36%|███▌      | 300/841 [50:39<1:19:35,  8.83s/it]

{'loss': 0.0396, 'grad_norm': 0.13628707826137543, 'learning_rate': 2.1468253968253967e-05, 'epoch': 0.36}


 37%|███▋      | 310/841 [52:16<1:26:37,  9.79s/it]

{'loss': 0.0205, 'grad_norm': 0.07147075980901718, 'learning_rate': 2.107142857142857e-05, 'epoch': 0.37}


 38%|███▊      | 320/841 [53:55<1:26:20,  9.94s/it]

{'loss': 0.0065, 'grad_norm': 0.06894975155591965, 'learning_rate': 2.0674603174603176e-05, 'epoch': 0.38}


 39%|███▉      | 330/841 [55:35<1:24:50,  9.96s/it]

{'loss': 0.0167, 'grad_norm': 0.3323654234409332, 'learning_rate': 2.027777777777778e-05, 'epoch': 0.39}


 40%|████      | 340/841 [57:12<1:19:33,  9.53s/it]

{'loss': 0.0076, 'grad_norm': 0.046580493450164795, 'learning_rate': 1.988095238095238e-05, 'epoch': 0.4}


 42%|████▏     | 350/841 [58:43<1:09:19,  8.47s/it]

{'loss': 0.0029, 'grad_norm': 0.04440963640809059, 'learning_rate': 1.9484126984126985e-05, 'epoch': 0.42}


 43%|████▎     | 360/841 [1:00:20<1:18:45,  9.82s/it]

{'loss': 0.0022, 'grad_norm': 0.041120413690805435, 'learning_rate': 1.908730158730159e-05, 'epoch': 0.43}


 44%|████▍     | 370/841 [1:02:00<1:17:46,  9.91s/it]

{'loss': 0.0116, 'grad_norm': 0.04151719808578491, 'learning_rate': 1.869047619047619e-05, 'epoch': 0.44}


 45%|████▌     | 380/841 [1:03:34<1:15:08,  9.78s/it]

{'loss': 0.0183, 'grad_norm': 15.677123069763184, 'learning_rate': 1.8293650793650794e-05, 'epoch': 0.45}


 46%|████▋     | 390/841 [1:05:11<1:11:37,  9.53s/it]

{'loss': 0.0154, 'grad_norm': 0.04176979139447212, 'learning_rate': 1.7896825396825394e-05, 'epoch': 0.46}


 48%|████▊     | 400/841 [1:06:51<1:13:51, 10.05s/it]

{'loss': 0.0112, 'grad_norm': 0.04166216403245926, 'learning_rate': 1.7500000000000002e-05, 'epoch': 0.48}


 49%|████▉     | 410/841 [1:08:30<1:10:49,  9.86s/it]

{'loss': 0.002, 'grad_norm': 0.052082180976867676, 'learning_rate': 1.7103174603174603e-05, 'epoch': 0.49}


 50%|████▉     | 420/841 [1:10:04<1:08:20,  9.74s/it]

{'loss': 0.0046, 'grad_norm': 11.858551025390625, 'learning_rate': 1.6706349206349207e-05, 'epoch': 0.5}


 51%|█████     | 430/841 [1:11:29<1:02:49,  9.17s/it]

{'loss': 0.0205, 'grad_norm': 0.033999864012002945, 'learning_rate': 1.6309523809523807e-05, 'epoch': 0.51}


 52%|█████▏    | 440/841 [1:13:04<1:05:00,  9.73s/it]

{'loss': 0.0017, 'grad_norm': 0.03290587663650513, 'learning_rate': 1.5912698412698415e-05, 'epoch': 0.52}


 54%|█████▎    | 450/841 [1:14:35<55:34,  8.53s/it]  

{'loss': 0.0016, 'grad_norm': 0.03140263259410858, 'learning_rate': 1.5515873015873016e-05, 'epoch': 0.53}


 55%|█████▍    | 460/841 [1:16:15<1:03:33, 10.01s/it]

{'loss': 0.0161, 'grad_norm': 0.03361336141824722, 'learning_rate': 1.511904761904762e-05, 'epoch': 0.55}


 56%|█████▌    | 470/841 [1:17:52<1:00:08,  9.73s/it]

{'loss': 0.0017, 'grad_norm': 0.030031247064471245, 'learning_rate': 1.4722222222222222e-05, 'epoch': 0.56}


 57%|█████▋    | 480/841 [1:19:24<58:24,  9.71s/it]  

{'loss': 0.0118, 'grad_norm': 0.028808485716581345, 'learning_rate': 1.4325396825396825e-05, 'epoch': 0.57}


 58%|█████▊    | 490/841 [1:21:02<55:39,  9.51s/it]

{'loss': 0.0263, 'grad_norm': 1.882121205329895, 'learning_rate': 1.3928571428571429e-05, 'epoch': 0.58}


 59%|█████▉    | 500/841 [1:22:31<50:21,  8.86s/it]

{'loss': 0.0145, 'grad_norm': 0.05417279899120331, 'learning_rate': 1.3531746031746031e-05, 'epoch': 0.59}


 61%|██████    | 510/841 [1:24:04<51:47,  9.39s/it]

{'loss': 0.0101, 'grad_norm': 0.1501542329788208, 'learning_rate': 1.3134920634920635e-05, 'epoch': 0.61}


 62%|██████▏   | 520/841 [1:25:38<51:53,  9.70s/it]

{'loss': 0.0068, 'grad_norm': 0.027501359581947327, 'learning_rate': 1.2738095238095238e-05, 'epoch': 0.62}


 63%|██████▎   | 530/841 [1:27:18<51:38,  9.96s/it]

{'loss': 0.0013, 'grad_norm': 0.02625393494963646, 'learning_rate': 1.2341269841269842e-05, 'epoch': 0.63}


 64%|██████▍   | 540/841 [1:28:55<49:29,  9.87s/it]

{'loss': 0.0012, 'grad_norm': 0.025296321138739586, 'learning_rate': 1.1944444444444444e-05, 'epoch': 0.64}


 65%|██████▌   | 550/841 [1:30:25<43:16,  8.92s/it]

{'loss': 0.0012, 'grad_norm': 0.02458897978067398, 'learning_rate': 1.1547619047619048e-05, 'epoch': 0.65}


 67%|██████▋   | 560/841 [1:31:56<42:32,  9.08s/it]

{'loss': 0.0012, 'grad_norm': 0.02380664087831974, 'learning_rate': 1.1150793650793651e-05, 'epoch': 0.67}


 68%|██████▊   | 570/841 [1:33:35<44:52,  9.93s/it]

{'loss': 0.0011, 'grad_norm': 0.024169402197003365, 'learning_rate': 1.0753968253968255e-05, 'epoch': 0.68}


 69%|██████▉   | 580/841 [1:35:14<43:06,  9.91s/it]

{'loss': 0.0115, 'grad_norm': 0.024211712181568146, 'learning_rate': 1.0357142857142857e-05, 'epoch': 0.69}


 70%|███████   | 590/841 [1:36:53<41:21,  9.89s/it]

{'loss': 0.0064, 'grad_norm': 0.024340959265828133, 'learning_rate': 9.960317460317462e-06, 'epoch': 0.7}


 71%|███████▏  | 600/841 [1:38:29<38:06,  9.49s/it]

{'loss': 0.0012, 'grad_norm': 0.023085715249180794, 'learning_rate': 9.563492063492064e-06, 'epoch': 0.71}


 73%|███████▎  | 610/841 [1:40:05<37:45,  9.81s/it]

{'loss': 0.0183, 'grad_norm': 0.6708444952964783, 'learning_rate': 9.166666666666668e-06, 'epoch': 0.72}


 74%|███████▎  | 620/841 [1:41:44<36:32,  9.92s/it]

{'loss': 0.0108, 'grad_norm': 0.024306319653987885, 'learning_rate': 8.76984126984127e-06, 'epoch': 0.74}


 75%|███████▍  | 630/841 [1:43:23<34:53,  9.92s/it]

{'loss': 0.0012, 'grad_norm': 0.030083732679486275, 'learning_rate': 8.373015873015875e-06, 'epoch': 0.75}


 76%|███████▌  | 640/841 [1:44:55<27:32,  8.22s/it]

{'loss': 0.0053, 'grad_norm': 0.022931547835469246, 'learning_rate': 7.976190476190477e-06, 'epoch': 0.76}


 77%|███████▋  | 650/841 [1:46:34<31:25,  9.87s/it]

{'loss': 0.001, 'grad_norm': 0.021655995398759842, 'learning_rate': 7.57936507936508e-06, 'epoch': 0.77}


 78%|███████▊  | 660/841 [1:48:11<28:25,  9.42s/it]

{'loss': 0.001, 'grad_norm': 0.020880209282040596, 'learning_rate': 7.182539682539683e-06, 'epoch': 0.78}


 80%|███████▉  | 670/841 [1:49:48<28:11,  9.89s/it]

{'loss': 0.001, 'grad_norm': 0.02063470147550106, 'learning_rate': 6.785714285714286e-06, 'epoch': 0.8}


 81%|████████  | 680/841 [1:51:25<26:29,  9.87s/it]

{'loss': 0.001, 'grad_norm': 0.020167533308267593, 'learning_rate': 6.388888888888889e-06, 'epoch': 0.81}


 82%|████████▏ | 690/841 [1:52:59<24:44,  9.83s/it]

{'loss': 0.0009, 'grad_norm': 0.033954668790102005, 'learning_rate': 5.992063492063493e-06, 'epoch': 0.82}


 83%|████████▎ | 700/841 [1:54:37<22:40,  9.65s/it]

{'loss': 0.0009, 'grad_norm': 0.01970587484538555, 'learning_rate': 5.595238095238095e-06, 'epoch': 0.83}


 84%|████████▍ | 710/841 [1:56:09<19:18,  8.84s/it]

{'loss': 0.0086, 'grad_norm': 0.019577007740736008, 'learning_rate': 5.198412698412698e-06, 'epoch': 0.84}


 86%|████████▌ | 720/841 [1:57:55<20:33, 10.19s/it]

{'loss': 0.0009, 'grad_norm': 0.01989012584090233, 'learning_rate': 4.801587301587302e-06, 'epoch': 0.86}


 87%|████████▋ | 730/841 [1:59:27<17:45,  9.60s/it]

{'loss': 0.0009, 'grad_norm': 0.0194618571549654, 'learning_rate': 4.404761904761905e-06, 'epoch': 0.87}


 88%|████████▊ | 740/841 [2:01:03<16:01,  9.52s/it]

{'loss': 0.0009, 'grad_norm': 0.01909959875047207, 'learning_rate': 4.007936507936508e-06, 'epoch': 0.88}


 89%|████████▉ | 750/841 [2:02:40<14:35,  9.62s/it]

{'loss': 0.0026, 'grad_norm': 0.02590973861515522, 'learning_rate': 3.611111111111111e-06, 'epoch': 0.89}


 90%|█████████ | 760/841 [2:04:12<12:38,  9.36s/it]

{'loss': 0.0051, 'grad_norm': 0.018616249784827232, 'learning_rate': 3.2142857142857143e-06, 'epoch': 0.9}


 92%|█████████▏| 770/841 [2:05:51<11:41,  9.88s/it]

{'loss': 0.0009, 'grad_norm': 0.7668159008026123, 'learning_rate': 2.8174603174603176e-06, 'epoch': 0.91}


 93%|█████████▎| 780/841 [2:07:27<09:32,  9.39s/it]

{'loss': 0.0009, 'grad_norm': 0.019522687420248985, 'learning_rate': 2.4206349206349204e-06, 'epoch': 0.93}


 94%|█████████▍| 790/841 [2:09:02<08:08,  9.58s/it]

{'loss': 0.0009, 'grad_norm': 0.018811659887433052, 'learning_rate': 2.0238095238095237e-06, 'epoch': 0.94}


 95%|█████████▌| 800/841 [2:10:36<06:31,  9.54s/it]

{'loss': 0.0009, 'grad_norm': 0.018242577090859413, 'learning_rate': 1.626984126984127e-06, 'epoch': 0.95}


 96%|█████████▋| 810/841 [2:12:10<04:19,  8.38s/it]

{'loss': 0.0009, 'grad_norm': 0.018262123689055443, 'learning_rate': 1.23015873015873e-06, 'epoch': 0.96}


 98%|█████████▊| 820/841 [2:13:47<03:26,  9.84s/it]

{'loss': 0.0008, 'grad_norm': 0.018291091546416283, 'learning_rate': 8.333333333333333e-07, 'epoch': 0.97}


 99%|█████████▊| 830/841 [2:15:22<01:44,  9.48s/it]

{'loss': 0.0114, 'grad_norm': 0.018059974536299706, 'learning_rate': 4.365079365079365e-07, 'epoch': 0.99}


100%|█████████▉| 840/841 [2:17:00<00:09,  9.64s/it]

{'loss': 0.0008, 'grad_norm': 0.018556518480181694, 'learning_rate': 3.968253968253968e-08, 'epoch': 1.0}


100%|██████████| 841/841 [2:17:10<00:00,  9.74s/it]

RuntimeError: "nll_loss_forward_reduce_cuda_kernel_2d_index" not implemented for 'Int'

In [1]:
#Check
import os
import torch
import librosa
import numpy as np
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_checkpoint = "wav2vec2-base-finetuned-ks/checkpoint-841"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_checkpoint)
model = AutoModelForAudioClassification.from_pretrained(model_checkpoint)

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
def process_audio(audio_file_path):
    # Load and resample the audio
    audio, sr = librosa.load(audio_file_path, sr=16000)

    # Extract features
    inputs = feature_extractor(audio, sampling_rate=16000, return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Make prediction
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

    predicted_class_id = predictions.argmax().item()
    predicted_label = model.config.id2label[predicted_class_id]
    confidence = predictions[0][predicted_class_id].item()

    return predicted_label, confidence

# Specify the path to your audio file
audio_file_path = "AIVoiceGenerator_com_2-09-2024T14_0_51_ Salli.mp3"
# Process the audio file
predicted_label, confidence = process_audio(audio_file_path)

# Display results
print(f"Prediction: {predicted_label}")
print(f"Confidence: {confidence:.2f}")


Prediction: fake
Confidence: 1.00
