In [77]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
import random 
import torch
import os 
from torch.utils.data import Dataset, DataLoader

In [28]:
paths = []
labels = []
audio_files = ["Audio_Speech_Actors_01-24", "Audio_Song_Actors_01-24"]
for main_file in audio_files:
    for actor in os.listdir(main_file):
        actor_path = os.path.join(main_file,actor)
        for audio in os.listdir(actor_path):
            if audio.endswith(".wav"):
                path = os.path.join(actor_path, audio)
                paths.append(path)
                emotion = int(audio.split("-")[2])
                labels.append(emotion)


In [29]:
print("Length of paths: ",len(paths))
print("Length of labels: ",len(labels))
print("Maximum label: ",max(labels))
print("Minimum label: ",min(labels))
print("Data type of paths: ",type(paths[0]))
print("Data type of lables: ",type(labels[0]))


Length of paths:  2452
Length of labels:  2452
Maximum label:  8
Minimum label:  1
Data type of paths:  <class 'str'>
Data type of lables:  <class 'int'>


In [30]:
data = list(zip(paths, labels))
seed = random.Random(42)
seed.shuffle(data)
data[:5]

[('Audio_Song_Actors_01-24\\Actor_24\\03-02-03-02-02-01-24.wav', 3),
 ('Audio_Song_Actors_01-24\\Actor_02\\03-02-03-02-02-02-02.wav', 3),
 ('Audio_Speech_Actors_01-24\\Actor_14\\03-01-03-01-02-01-14.wav', 3),
 ('Audio_Speech_Actors_01-24\\Actor_17\\03-01-07-01-02-01-17.wav', 7),
 ('Audio_Song_Actors_01-24\\Actor_06\\03-02-06-01-02-01-06.wav', 6)]

In [31]:
df_combined = pd.DataFrame(data, columns=["path","label"])
df_combined.head()


Unnamed: 0,path,label
0,Audio_Song_Actors_01-24\Actor_24\03-02-03-02-0...,3
1,Audio_Song_Actors_01-24\Actor_02\03-02-03-02-0...,3
2,Audio_Speech_Actors_01-24\Actor_14\03-01-03-01...,3
3,Audio_Speech_Actors_01-24\Actor_17\03-01-07-01...,7
4,Audio_Song_Actors_01-24\Actor_06\03-02-06-01-0...,6


In [32]:
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2452 entries, 0 to 2451
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   path    2452 non-null   object
 1   label   2452 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 38.4+ KB


In [33]:
df_combined.describe()

Unnamed: 0,label
count,2452.0
mean,4.318108
std,2.020284
min,1.0
25%,3.0
50%,4.0
75%,6.0
max,8.0


In [34]:
df_combined.loc[0]

path     Audio_Song_Actors_01-24\Actor_24\03-02-03-02-0...
label                                                    3
Name: 0, dtype: object

In [35]:
from pathlib import Path
df_combined["path"] = df_combined["path"].str.replace("\\", "/", regex=False)


In [36]:
df_combined.head()

Unnamed: 0,path,label
0,Audio_Song_Actors_01-24/Actor_24/03-02-03-02-0...,3
1,Audio_Song_Actors_01-24/Actor_02/03-02-03-02-0...,3
2,Audio_Speech_Actors_01-24/Actor_14/03-01-03-01...,3
3,Audio_Speech_Actors_01-24/Actor_17/03-01-07-01...,7
4,Audio_Song_Actors_01-24/Actor_06/03-02-06-01-0...,6


In [37]:
training_data, validation_data = train_test_split(df_combined, test_size=0.2, random_state=32)

In [38]:
training_data.head()

Unnamed: 0,path,label
2138,Audio_Speech_Actors_01-24/Actor_08/03-01-08-01...,8
623,Audio_Speech_Actors_01-24/Actor_05/03-01-01-01...,1
1942,Audio_Song_Actors_01-24/Actor_15/03-02-03-02-0...,3
1331,Audio_Speech_Actors_01-24/Actor_05/03-01-08-02...,8
401,Audio_Speech_Actors_01-24/Actor_20/03-01-02-02...,2


In [39]:
validation_data.head()

Unnamed: 0,path,label
1794,Audio_Speech_Actors_01-24/Actor_16/03-01-08-02...,8
1833,Audio_Speech_Actors_01-24/Actor_20/03-01-08-02...,8
1488,Audio_Song_Actors_01-24/Actor_17/03-02-05-02-0...,5
2267,Audio_Song_Actors_01-24/Actor_24/03-02-04-01-0...,4
290,Audio_Speech_Actors_01-24/Actor_13/03-01-04-02...,4


In [40]:
print("Length of Training Data: ", len(training_data))
print("Length of Validation Data: ", len(validation_data))

Length of Training Data:  1961
Length of Validation Data:  491


In [66]:
import soundfile as sf 
import torchaudio
import librosa
waveform, sr = sf.read(training_data.loc[0,"path"])
print((waveform.shape),sr)
waveform, sr = librosa.load(training_data.loc[0,"path"], sr=16000)
print(waveform.shape, sr)

(237037,) 48000
(79013,) 16000


In [74]:
max_duration = 0
longest_file = ""
for path in df_combined['path']:
    # full_path = os.path.join(base_dir, path)
    if os.path.isfile(path):
        waveform, sample_rate = librosa.load(path, sr=16000)
        duration = waveform.shape[0] / sample_rate
        if duration > max_duration:
            max_duration = duration
            longest_file = path
    else:
        print(f"one file path is not available {path}")
        break

print(f"Max duration: {max_duration:.8f} seconds")
print(f"sample rate: {sample_rate:.2f}")
print(f"Longest file: {longest_file}")

Max duration: 6.37306250 seconds
sample rate: 16000.00
Longest file: Audio_Song_Actors_01-24/Actor_22/03-02-02-02-02-01-22.wav


In [73]:
waveform, sr = librosa.load("Audio_Song_Actors_01-24/Actor_22/03-02-02-02-02-01-22.wav", sr =16000)
print(waveform.shape, sr)

(101969,) 16000


In [75]:
from collections import Counter
sample_rates = []
for full_path in df_combined['path']:
    # full_path = os.path.join(base_dir, path)
    if os.path.isfile(full_path):
        _, sr = librosa.load(full_path, sr=16000)
        sample_rates.append(sr)

# Count frequency of each sample rate
rate_counts = Counter(sample_rates)
print(rate_counts)

Counter({16000: 2452})


In [120]:
class AudioDataset(Dataset):
    def __init__(self, data, processor, max_length= 6.37306250*16000):
        self.data = data
        self.processor = processor
        self.max_length = max_length
    def __len__(self):
        return len(self.data)
    def __getitem__(self, index):
        audio_path = self.data.iloc[index]["path"]
        label = self.data.iloc[index]["label"]

        audio, sr = librosa.load(audio_path)
        audio = audio.squeeze()

        if len(audio) > self.max_length :
            audio = audio[:self.max_length]
            print(f"found a audio file greater than max length : {audio_path}")
        else:
            audio = np.pad(audio, (0,int(self.max_length-len(audio))), "constant")
            
        inputs = self.processor(audio, sampling_rate=16000, return_tensors='pt', padding=True, truncate=True, max_length=self.max_length)
        input_values = inputs.input_values.squeeze()

        return {'input_values': input_values, 'labels': torch.tensor(label, dtype=torch.long)}

    

In [121]:
import torch
print(torch.__version__)


2.5.1+cu121


In [133]:
from transformers import Wav2Vec2Model, Wav2Vec2Processor, Wav2Vec2ForSequenceClassification, Wav2Vec2Config, Trainer, TrainingArguments
config = Wav2Vec2Config.from_pretrained("facebook/wav2vec2-large-960h-lv60-self", num_labels=8)
model = Wav2Vec2ForSequenceClassification(config)
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")

In [134]:
train_dataset = AudioDataset(training_data, processor)
validation_dataset = AudioDataset(validation_data, processor)

In [135]:
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
validation_dataloader = DataLoader(validation_dataset, batch_size=8, shuffle=False)

In [138]:
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    gradient_accumulation_steps=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
)

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`