In [7]:
import pandas as pd
import numpy as np
import librosa
import os
from sklearn.model_selection import train_test_split
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
import tqdm
metadata = pd.read_csv('TinySOL_metadata (1).csv')
metadata.head()

Unnamed: 0,Path,Fold,Family,Instrument (abbr.),Instrument (in full),Technique (abbr.),Technique (in full),Pitch,Pitch ID,Dynamics,Dynamics ID,Instance ID,String ID (if applicable),Needed digital retuning
0,Brass/Bass_Tuba/ordinario/BTb-ord-F#1-pp-N-N.wav,2,Brass,BTb,Bass Tuba,ord,ordinario,F#1,30,pp,0,0,,False
1,Brass/Bass_Tuba/ordinario/BTb-ord-G1-pp-N-R100...,4,Brass,BTb,Bass Tuba,ord,ordinario,G1,31,pp,0,0,,True
2,Brass/Bass_Tuba/ordinario/BTb-ord-G#1-pp-N-T16...,3,Brass,BTb,Bass Tuba,ord,ordinario,G#1,32,pp,0,0,,True
3,Brass/Bass_Tuba/ordinario/BTb-ord-A1-pp-N-T23d...,2,Brass,BTb,Bass Tuba,ord,ordinario,A1,33,pp,0,0,,True
4,Brass/Bass_Tuba/ordinario/BTb-ord-A#1-pp-N-N.wav,0,Brass,BTb,Bass Tuba,ord,ordinario,A#1,34,pp,0,0,,False


In [23]:
def features_extractor(file):
    audio, sample_rate = librosa.load(file, res_type='scipy') 
    mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
    mfccs_scaled_features = np.mean(mfccs_features.T,axis=0)
    
    return mfccs_scaled_features

In [29]:
from tqdm import tqdm
extracted_features=[]
for index_num,row in tqdm(metadata.iterrows()):
    file_name = str(row["Path"])
    final_class_labels=row["Instrument (in full)"]
    data=features_extractor(file_name)
    extracted_features.append([data,final_class_labels])

2913it [04:35, 10.58it/s]


In [30]:
### converting extracted_features to Pandas dataframe
extracted_features_df=pd.DataFrame(extracted_features,columns=['feature','class'])
extracted_features_df.head()

Unnamed: 0,feature,class
0,"[-796.161, 154.30495, 115.17337, 69.384094, 33...",Bass Tuba
1,"[-780.7726, 155.0216, 114.37417, 66.248245, 28...",Bass Tuba
2,"[-737.51624, 197.43735, 118.784454, 46.450912,...",Bass Tuba
3,"[-779.69147, 155.60567, 113.64972, 64.04005, 2...",Bass Tuba
4,"[-745.1969, 203.65991, 119.42016, 43.040768, 4...",Bass Tuba


In [31]:
### Split the dataset into independent and dependent dataset
X=np.array(extracted_features_df['feature'].tolist())
y=np.array(extracted_features_df['class'].tolist())

In [32]:
### Label Encoding
###y=np.array(pd.get_dummies(y))
### Label Encoder
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
labelencoder=LabelEncoder()
y=to_categorical(labelencoder.fit_transform(y))

In [33]:
### Train Test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

In [40]:
# Reshape X to match the input shape expected by Conv2D
X = X.reshape(X.shape[0], 40, 1, 1)  # (number_of_samples, height, width, channels)

In [34]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,Activation,Flatten
from tensorflow.keras.optimizers import Adam
from sklearn import metrics

In [69]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Flatten, Dense
model = Sequential([
    Conv1D(32, 3, activation='relu', input_shape=(40, 1)),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(y.shape[1],activation='softmax'),
])

In [70]:
model.compile(loss='categorical_crossentropy',metrics=['accuracy'],optimizer='adam')

In [71]:
model.fit(X_train, y_train, batch_size=32, epochs=30)


Epoch 1/30
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.3744 - loss: 7.9363  
Epoch 2/30
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8922 - loss: 0.3184
Epoch 3/30
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9614 - loss: 0.1335
Epoch 4/30
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9710 - loss: 0.0917
Epoch 5/30
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9803 - loss: 0.0655
Epoch 6/30
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9812 - loss: 0.0535
Epoch 7/30
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9863 - loss: 0.0454
Epoch 8/30
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9830 - loss: 0.0579
Epoch 9/30
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

<keras.src.callbacks.history.History at 0x1f24a73df90>

In [None]:
import os
import pandas as pd
import numpy as np
import librosa
from sklearn.preprocessing import LabelEncoder

class AudioDataset:
    def __init__(self, csv_path, audio_folder):
        self.csv_path = csv_path
        self.audio_folder = audio_folder
        self.data = pd.read_csv(csv_path)
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(self.data['Path'])  # Assuming 'label' is the column containing ground truth labels
    
    def get_data_at_index(self, index):
        if index < 0 or index >= len(self.data):
            raise IndexError("Index out of range")
        
        row = self.data.iloc[index]
        audio_file = str(row(['Path']))
        audio, sample_rate = librosa.load(audio_file, res_type='scipy')  # Load audio file
        mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=sample_rate)  # Compute Mel spectrogram
        mel_spectrogram = np.expand_dims(mel_spectrogram, axis=0)  # Add batch dimension
        
        ground_truth = self.label_encoder.transform([row['Path']])[0]  # Encode ground truth label
        
        # Assuming you have a function to get pseudo labels based on model inference
        pseudo_label = self.get_pseudo_label(audio)  # Get pseudo label for audio
        
        return {
            'file': row['Path'],
            'audio': audio.reshape(1, -1),  # Reshape audio to [1, T]
            'mel': mel_spectrogram,  # Mel spectrogram shape: [1, F, T]
            'gt': ground_truth,
            'pseudo': pseudo_label
        }
    
    def get_pseudo_label(self, audio):
        # Placeholder function for generating pseudo labels (replace with actual implementation)
        # This could involve loading a trained model and performing inference on the audio
        return 0  # Placeholder label for illustration
    
# Example usage:
csv_path = 'TinySOL_metadata (1).csv'
audio_folder = 'path/to/audio/folder'
dataset = AudioDataset(csv_path, audio_folder)

# Get data at index 0
data_at_index_0 = dataset.get_data_at_index(0)
print(data_at_index_0)
