In [2]:
import pandas as pd
import librosa
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers,models
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import save_model


In [3]:
# Load the clip metadata table. Later cells read its 'name' column (audio file name)
# and the gender / language / noise label columns.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
csv_file_path = '/Users/dheemankumar/github/audio-ai/broken_3s_audio_data.csv'
df = pd.read_csv(csv_file_path)

In [4]:
# Per-clip accumulators: one dB spectrogram and three label selections per CSV row.
audio_data, labels_gender, labels_language, labels_noise = [], [], [], []

# Walk the metadata table, load each clip at 22050 Hz and turn it into a
# single-channel dB-scaled STFT magnitude spectrogram.
for row_index, row in df.iterrows():
    clip_path = '/Users/dheemankumar/github/audio-ai/3sec_audio/' + row['name']  # adjust the folder as needed
    waveform, sample_rate = librosa.load(clip_path, sr=22050)

    # Magnitude of the STFT, converted to dB relative to the clip's own maximum.
    magnitude = np.abs(librosa.stft(waveform))
    spectrogram_db = librosa.amplitude_to_db(magnitude, ref=np.max)

    # Trailing axis of length 1 is the channel dimension expected by Conv2D.
    audio_data.append(spectrogram_db[..., np.newaxis])

    # Labels are kept as the row's column selections; they are converted to
    # arrays in a later cell.
    labels_gender.append(row[['male', 'female']])
    labels_language.append(row[["english", "hindi", "punjabi", "bangoli"]])
    labels_noise.append(row[["noise"]])

In [5]:
# Step 4: Create NumPy arrays
# Turn each per-clip list into one stacked NumPy array (first axis = clip).
audio_data, labels_gender, labels_language, labels_noise = (
    np.array(collected)
    for collected in (audio_data, labels_gender, labels_language, labels_noise)
)

In [6]:
# Sanity check: one row per clip, one column per entry of ['male', 'female'].
labels_gender.shape

(13388, 2)

In [7]:
# Sanity check: one row per clip, one column per language label column.
labels_language.shape

(13388, 4)

In [8]:
# Sanity check: one row per clip, a single 'noise' column.
labels_noise.shape

(13388, 1)

In [9]:
# Sanity check: (clips, frequency bins, STFT frames, channel). The last three
# dimensions must match the input_shape given to the models below.
audio_data.shape

(13388, 1025, 130, 1)

In [11]:
# Hold out 20% of the clips for validating the gender model; random_state fixes the shuffle.
X_train_g, X_test_g, y_train_g, y_test_g = train_test_split(audio_data, labels_gender, test_size=0.2, random_state=42)

In [12]:
# Language split. train_test_split derives its shuffle only from the number of
# samples and random_state, so with the same test_size/random_state as the gender
# split the spectrogram partitions are identical to X_train_g / X_test_g. Reuse
# those arrays instead of materialising another full copy of the spectrogram
# data, and split only the labels (their rows stay aligned with the reused arrays).
X_train_l, X_test_l = X_train_g, X_test_g
y_train_l, y_test_l = train_test_split(labels_language, test_size=0.2, random_state=42)

In [13]:
# Noise split. train_test_split derives its shuffle only from the number of
# samples and random_state, so with the same test_size/random_state as the gender
# split the spectrogram partitions are identical to X_train_g / X_test_g. Reuse
# those arrays instead of materialising another full copy of the spectrogram
# data, and split only the labels (their rows stay aligned with the reused arrays).
X_train_n, X_test_n = X_train_g, X_test_g
y_train_n, y_test_n = train_test_split(labels_noise, test_size=0.2, random_state=42)

In [14]:
# Peek at one language label row (column order: english, hindi, punjabi, bangoli).
y_train_l[0]

array([0, 0, 0, 1], dtype=object)

In [15]:
# Create a Sequential model_gender
# Gender classifier: a small CNN over (1025, 130, 1) dB spectrograms that ends
# in a 2-way softmax, one unit per ['male', 'female'] label column.
model_gender = models.Sequential([
    # Conv block 1: 32 filters, 3x3 kernel, ReLU; also declares the input shape.
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(1025, 130, 1)),
    layers.MaxPooling2D((2, 2)),
    # Conv block 2: 64 filters, 3x3 kernel, ReLU.
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    # Classifier head: flatten, 64-unit hidden layer, 2-unit softmax output.
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(2, activation='softmax'),
])

# Labels are one-hot rows, hence categorical cross-entropy.
model_gender.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Print the layer-by-layer summary.
model_gender.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 1023, 128, 32)     320       
                                                                 
 max_pooling2d (MaxPooling2  (None, 511, 64, 32)       0         
 D)                                                              
                                                                 
 conv2d_1 (Conv2D)           (None, 509, 62, 64)       18496     
                                                                 
 max_pooling2d_1 (MaxPoolin  (None, 254, 31, 64)       0         
 g2D)                                                            
                                                                 
 flatten (Flatten)           (None, 503936)            0         
                                                                 
 dense (Dense)               (None, 64)                3

In [16]:
# Create a Sequential model_noise
# Noise detector: same convolutional stack as the other models, ending in a
# single sigmoid unit for the one 'noise' label column.
model_noise = models.Sequential([
    # Conv block 1: 32 filters, 3x3 kernel, ReLU; also declares the input shape.
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(1025, 130, 1)),
    layers.MaxPooling2D((2, 2)),
    # Conv block 2: 64 filters, 3x3 kernel, ReLU.
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    # Classifier head: flatten, 64-unit hidden layer, 1-unit sigmoid output.
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

# Single-probability output, hence binary cross-entropy.
model_noise.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Print the layer-by-layer summary.
model_noise.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_2 (Conv2D)           (None, 1023, 128, 32)     320       
                                                                 
 max_pooling2d_2 (MaxPoolin  (None, 511, 64, 32)       0         
 g2D)                                                            
                                                                 
 conv2d_3 (Conv2D)           (None, 509, 62, 64)       18496     
                                                                 
 max_pooling2d_3 (MaxPoolin  (None, 254, 31, 64)       0         
 g2D)                                                            
                                                                 
 flatten_1 (Flatten)         (None, 503936)            0         
                                                                 
 dense_2 (Dense)             (None, 64)               

In [23]:
# Create a Sequential model_language
# Language classifier: same convolutional stack as the other models, ending in
# a 4-way softmax, one unit per language label column
# (english, hindi, punjabi, bangoli).
model_language = models.Sequential([
    # Conv block 1: 32 filters, 3x3 kernel, ReLU; also declares the input shape.
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(1025, 130, 1)),
    layers.MaxPooling2D((2, 2)),
    # Conv block 2: 64 filters, 3x3 kernel, ReLU.
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    # Classifier head: flatten, 64-unit hidden layer, 4-unit softmax output.
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(4, activation='softmax'),
])

# Labels are one-hot rows, hence categorical cross-entropy.
model_language.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Print the layer-by-layer summary.
model_language.summary()


Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_6 (Conv2D)           (None, 1023, 128, 32)     320       
                                                                 
 max_pooling2d_6 (MaxPoolin  (None, 511, 64, 32)       0         
 g2D)                                                            
                                                                 
 conv2d_7 (Conv2D)           (None, 509, 62, 64)       18496     
                                                                 
 max_pooling2d_7 (MaxPoolin  (None, 254, 31, 64)       0         
 g2D)                                                            
                                                                 
 flatten_3 (Flatten)         (None, 503936)            0         
                                                                 
 dense_6 (Dense)             (None, 64)               

In [18]:
# Convert data types if needed
# Cast every array of the gender split to float32 (the label arrays come out of
# the split with object dtype and must be numeric before training).
X_train_g, X_test_g, y_train_g, y_test_g = (
    np.asarray(part, dtype=np.float32)
    for part in (X_train_g, X_test_g, y_train_g, y_test_g)
)

In [19]:
# Convert data types if needed
# Cast every array of the language split to float32 (the label arrays come out
# of the split with object dtype and must be numeric before training).
X_train_l, X_test_l, y_train_l, y_test_l = (
    np.asarray(part, dtype=np.float32)
    for part in (X_train_l, X_test_l, y_train_l, y_test_l)
)

In [20]:
# Convert data types if needed
# Cast every array of the noise split to float32 so features and labels are
# numeric before training.
X_train_n, X_test_n, y_train_n, y_test_n = (
    np.asarray(part, dtype=np.float32)
    for part in (X_train_n, X_test_n, y_train_n, y_test_n)
)

In [24]:
# Train the model
# Number of training epochs; the gender and noise fits in the following cells
# read this same `ep`, so this cell must run before them.
ep=1

# Fit the language model on the training split, validating on the held-out split.
history_language = model_language.fit(X_train_l, y_train_l, epochs=ep, validation_data=(X_test_l, y_test_l), batch_size=32)




: 

In [22]:

# Fit the gender model; `ep` comes from the language-training cell.
history_gender = model_gender.fit(X_train_g, y_train_g, epochs=ep, validation_data=(X_test_g, y_test_g), batch_size=32)



In [None]:
# Fit the noise model; `ep` comes from the language-training cell.
# NOTE(review): batch_size is 64 here but 32 for the other two models — confirm this is intentional.
history_noise = model_noise.fit(X_train_n, y_train_n, epochs=ep, validation_data=(X_test_n, y_test_n), batch_size=64)

In [25]:
# Persist the trained gender model under models/ as an .h5 file (path relative to the working directory).
model_gender.save("models/GenderModel.h5")

  saving_api.save_model(


In [None]:

# Persist the language and noise models next to the gender model, also as .h5 files.
model_language.save("models/LanguageModel.h5")
model_noise.save("models/NoiseModel.h5")

In [None]:
# Clip to run the trained models on.
# NOTE(review): absolute, machine-specific path.
audio_file_path = '/Users/dheemankumar/github/audio-ai/ab.wav'

# Alternative test clip, kept for quick switching:
#audio_file_path = '/Users/dheemankumar/github/audio-ai/female_eng.wav'  # Adjust the path as needed
# Load at 22050 Hz, the same sample rate used for the training clips.
audio, sample_rate = librosa.load(audio_file_path, sr=22050)  # Load audio with a sample rate of 22050

In [None]:
# Same preprocessing as the training loop: STFT -> magnitude -> dB relative to the clip's maximum.
d=librosa.stft(audio)
s_db=librosa.amplitude_to_db(np.abs(d),ref=np.max)

# Append the trailing channel axis expected by the Conv2D input.
s_db_with_channel = np.expand_dims(s_db, axis=-1)

In [None]:
# Raw waveform as a float32 tensor.
# NOTE(review): `new_data` is not read by any later cell visible here — confirm whether it is still needed.
new_data = tf.convert_to_tensor(audio, dtype=tf.float32)
# Spectrogram (with channel axis) as a NumPy array; this is what gets reshaped and fed to the models.
audio_= np.array(s_db_with_channel)

In [None]:
# The models were built with input_shape=(1025, 130, 1), i.e. exactly 130 STFT
# frames. A clip whose spectrogram has any other number of frames made the bare
# reshape raise ValueError, so fit the time axis first:
#   - longer clips are cropped to their first 130 frames (only that part is scored);
#   - shorter clips are padded on the right with the spectrogram's own minimum,
#     i.e. its quietest dB value.
# Clips that already have 130 frames pass through unchanged.
expected_frames = 130
frames_missing = expected_frames - audio_.shape[1]
if frames_missing > 0:
    spectrogram_fitted = np.pad(
        audio_,
        ((0, 0), (0, frames_missing), (0, 0)),
        mode='constant',
        constant_values=audio_.min(),
    )
else:
    spectrogram_fitted = audio_[:, :expected_frames, :]

# Add the leading batch axis: one sample of shape (1025, 130, 1).
input_data = spectrogram_fitted.reshape(1, 1025, 130, 1)

In [None]:
# predictions1: gender model output, column order ['male', 'female'].
predictions1 = model_gender.predict(input_data)
# predictions2: language model output, column order english, hindi, punjabi, bangoli.
predictions2 = model_language.predict(input_data)


In [None]:
# Display the gender model's output for the clip.
predictions1

In [None]:
# Display the language model's output for the clip.
predictions2

In [None]:
# predictions3: noise model output (single sigmoid unit), computed and displayed.
predictions3 = model_noise.predict(input_data)
predictions3

In [None]:
# Check the container type returned by predict().
type(predictions1)

numpy.ndarray