In [1]:
import pandas as pd
import librosa
import numpy as np

In [2]:
import tensorflow as tf
from tensorflow.keras import layers,models
from sklearn.model_selection import train_test_split

In [4]:
# Step 1: Read the CSV file containing file names and labels
csv_file_path = '/Users/dheemankumar/github/audio-ai/hindi_broken_3s_audio_data.csv'
df = pd.read_csv(csv_file_path)

In [62]:
audio_data = []
labels = []

# Step 2 and 3: Load audio files and process the data with a sample rate of 22050
for index, row in df.iterrows():
    audio_file_path = '/Users/dheemankumar/github/audio-ai/3sec_audio/' + row['name']  # Adjust the path as needed
    audio, sample_rate = librosa.load(audio_file_path, sr=22050)  # Load audio with a sample rate of 22050

    # Perform additional processing if needed, e.g., creating spectrograms\

    d=librosa.stft(audio)
    s_db=librosa.amplitude_to_db(np.abs(d),ref=np.max)

    s_db_with_channel = np.expand_dims(s_db, axis=-1)

    #print(s_db.shape)


    # Append the processed audio data and label to the lists
    audio_data.append(s_db_with_channel)
    labels.append(row[['male','female']])

In [63]:
# Step 4: Create NumPy arrays
audio_data = np.array(audio_data)
labels = np.array(labels)


In [64]:
labels.shape

(779, 2)

In [65]:
audio_data.shape


(779, 1025, 130, 1)

In [11]:
# Create a Sequential model
model = models.Sequential()

# Add a Flatten layer to convert the input shape to a 1D tensor
model.add(layers.Flatten(input_shape=(1025,130,1)))

# Add one or more hidden layers with desired units and activation functions
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(32, activation='relu'))

# Add the output layer with 7 units (since you want 7 outputs) and a suitable activation function (e.g., softmax for classification)
model.add(layers.Dense(5, activation='softmax'))

# Compile the model with an appropriate loss function, optimizer, and metrics
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Display the model summary
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten (Flatten)           (None, 133250)            0         
                                                                 
 dense (Dense)               (None, 64)                8528064   
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 dense_2 (Dense)             (None, 5)                 165       
                                                                 
Total params: 8530309 (32.54 MB)
Trainable params: 8530309 (32.54 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [9]:
X_train, X_test, y_train, y_test = train_test_split(audio_data, labels, test_size=0.2, random_state=42)

In [10]:
X_train = tf.convert_to_tensor(X_train, dtype=tf.float32)
y_train = tf.convert_to_tensor(y_train, dtype=tf.float32)
X_test = tf.convert_to_tensor(X_test, dtype=tf.float32)
y_test = tf.convert_to_tensor(y_test, dtype=tf.float32)

<tf.Tensor: shape=(623, 1025, 130, 1), dtype=float32, numpy=
array([[[[-65.06258 ],
         [-68.19394 ],
         [-80.      ],
         ...,
         [-69.91271 ],
         [-80.      ],
         [-70.99723 ]],

        [[-64.97798 ],
         [-68.402695],
         [-78.414604],
         ...,
         [-67.16352 ],
         [-69.45882 ],
         [-66.37231 ]],

        [[-65.77309 ],
         [-66.4268  ],
         [-72.61808 ],
         ...,
         [-64.82132 ],
         [-64.48978 ],
         [-68.21896 ]],

        ...,

        [[-80.      ],
         [-80.      ],
         [-80.      ],
         ...,
         [-80.      ],
         [-80.      ],
         [-80.      ]],

        [[-80.      ],
         [-80.      ],
         [-80.      ],
         ...,
         [-80.      ],
         [-80.      ],
         [-80.      ]],

        [[-80.      ],
         [-80.      ],
         [-80.      ],
         ...,
         [-80.      ],
         [-80.      ],
         [-80.      ]]],



In [12]:
model.fit(X_train,y_train,epochs=2)

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x28ad36190>

In [76]:
# Create a Sequential model
model2 = models.Sequential()

# Add a 2D convolutional layer with 32 filters, a 3x3 kernel, and 'relu' activation
model2.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(1025, 130, 1)))

# Add a max-pooling layer
model2.add(layers.MaxPooling2D((2, 2)))

# Add another 2D convolutional layer with 64 filters and 'relu' activation
model2.add(layers.Conv2D(64, (3, 3), activation='relu'))

# Add another max-pooling layer
model2.add(layers.MaxPooling2D((2, 2)))

# Add a flattening layer to convert to 1D tensor
model2.add(layers.Flatten())

# Add a fully connected (dense) layer with 64 units and 'relu' activation
model2.add(layers.Dense(64, activation='relu'))

# Add the output layer with 7 units (since you want 7 outputs) and 'softmax' activation
model2.add(layers.Dense(2, activation='softmax'))

# Compile the model
model2.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Display the model summary
model2.summary()


Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_8 (Conv2D)           (None, 1023, 128, 32)     320       
                                                                 
 max_pooling2d_8 (MaxPoolin  (None, 511, 64, 32)       0         
 g2D)                                                            
                                                                 
 conv2d_9 (Conv2D)           (None, 509, 62, 64)       18496     
                                                                 
 max_pooling2d_9 (MaxPoolin  (None, 254, 31, 64)       0         
 g2D)                                                            
                                                                 
 flatten_5 (Flatten)         (None, 503936)            0         
                                                                 
 dense_11 (Dense)            (None, 64)               

In [77]:
# Assuming you have already created the model and loaded your data

# Train the model
history = model2.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test), batch_size=32)




Epoch 1/10


ValueError: in user code:

    File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/keras/src/engine/training.py", line 1377, in train_function  *
        return step_function(self, iterator)
    File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/keras/src/engine/training.py", line 1360, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/keras/src/engine/training.py", line 1349, in run_step  **
        outputs = model.train_step(data)
    File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/keras/src/engine/training.py", line 1127, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/keras/src/engine/training.py", line 1185, in compute_loss
        return self.compiled_loss(
    File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/keras/src/engine/compile_utils.py", line 277, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/keras/src/losses.py", line 143, in __call__
        losses = call_fn(y_true, y_pred)
    File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/keras/src/losses.py", line 270, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/keras/src/losses.py", line 2221, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/keras/src/backend.py", line 5575, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (None, 5) and (None, 2) are incompatible


In [68]:
# Evaluate the model on the test data
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test loss: {test_loss}, Test accuracy: {test_accuracy}")

Test loss: 0.42092469334602356, Test accuracy: 0.9871794581413269


In [70]:
#audio_file_path = '/Users/dheemankumar/github/audio-ai/ab.wav'

audio_file_path = '/Users/dheemankumar/github/audio-ai/female_eng.wav'  # Adjust the path as needed
audio, sample_rate = librosa.load(audio_file_path, sr=22050)  # Load audio with a sample rate of 22050

In [71]:
d=librosa.stft(audio)
s_db=librosa.amplitude_to_db(np.abs(d),ref=np.max)

s_db_with_channel = np.expand_dims(s_db, axis=-1)

In [72]:
new_data = tf.convert_to_tensor(audio, dtype=tf.float32)
audio_= np.array(s_db_with_channel)

In [73]:
input_data = audio_.reshape(1, 1025, 130, 1)

In [74]:
predictions = model.predict(input_data)



In [75]:
predictions

array([[1.000000e+00, 2.627575e-36, 0.000000e+00, 0.000000e+00,
        0.000000e+00]], dtype=float32)