In [107]:
import pandas as pd
import librosa
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers,models
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import save_model


In [None]:
# Metadata table for the training clips: the feature-extraction loop reads the
# `name` column (clip filename) and the label columns from it.
csv_file_path = '/Users/dheemankumar/github/audio-ai/hindi_broken_3s_audio_data.csv'
df = pd.read_csv(filepath_or_buffer=csv_file_path)

In [None]:
# Feature extraction: turn every clip listed in `df` into a dB-scaled STFT
# spectrogram and collect its gender / language / noise label vectors.
audio_data = []
labels_gender = []
labels_language = []
labels_noise = []

# Every clip is forced to exactly 3 s at 22050 Hz. With librosa's default STFT
# settings (n_fft=2048, hop_length=512, centred frames) that gives
# 1 + 66150 // 512 = 130 frames of 1025 bins, i.e. the (1025, 130, 1) shape the
# models in this notebook are built for. Without this, a single clip of a
# different length makes `np.array(audio_data)` ragged.
CLIP_SAMPLES = 3 * 22050

# Load audio files and process the data with a sample rate of 22050
for index, row in df.iterrows():
    audio_file_path = '/Users/dheemankumar/github/audio-ai/3sec_audio/' + row['name']  # Adjust the path as needed
    audio, sample_rate = librosa.load(audio_file_path, sr=22050)  # resampled to 22050 Hz on load

    # Zero-pad short clips / truncate long ones; a no-op for clips that are already 3 s.
    audio = librosa.util.fix_length(audio, size=CLIP_SAMPLES)

    # Magnitude spectrogram in dB, referenced to the clip's own peak.
    d = librosa.stft(audio)
    s_db = librosa.amplitude_to_db(np.abs(d), ref=np.max)

    # Trailing channel axis so Conv2D receives (freq_bins, frames, 1).
    s_db_with_channel = np.expand_dims(s_db, axis=-1)

    # Append the processed audio data and labels to the lists.
    # `row` is an object-dtype Series (it also carries the filename), so the
    # label slices are converted to float32 arrays here instead of storing
    # pandas Series objects.
    audio_data.append(s_db_with_channel)
    labels_gender.append(row[['male', 'female']].to_numpy(dtype=np.float32))
    labels_language.append(row[["english", "hindi", "punjabi", "bangoli"]].to_numpy(dtype=np.float32))
    labels_noise.append(row[["noise"]].to_numpy(dtype=np.float32))

In [None]:
# Step 4: stack the per-clip Python lists into batch-first NumPy arrays.
audio_data, labels_gender, labels_language, labels_noise = (
    np.array(collected)
    for collected in (audio_data, labels_gender, labels_language, labels_noise)
)

In [None]:
# Sanity check: one [male, female] label row per clip, so (n_clips, 2) is expected.
labels_gender.shape

In [None]:
# Sanity check: one [english, hindi, punjabi, bangoli] label row per clip, so (n_clips, 4) is expected.
labels_language.shape

In [None]:
# Sanity check: a single `noise` label per clip, so (n_clips, 1) is expected.
labels_noise.shape

In [None]:
# Sanity check: should be (n_clips, 1025, 130, 1) to match the models' input_shape;
# anything else means the clips do not all have the same length.
audio_data.shape

In [None]:
# Hold out 20% of the clips for the gender task; the fixed seed keeps the split repeatable.
X_train_gender, X_test_gender, y_train_gender, y_test_gender = train_test_split(
    audio_data,
    labels_gender,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Hold out 20% of the clips for the language task.
# NOTE(review): same features, test_size and random_state as the gender split,
# so X_train_language / X_test_language should hold the same clips as the
# gender arrays -- this is a second in-memory copy of the spectrograms.
X_train_language, X_test_language, y_train_language, y_test_language = train_test_split(
    audio_data,
    labels_language,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Hold out 20% of the clips for the noise task.
# NOTE(review): same features, test_size and random_state as the gender split,
# so X_train_noise / X_test_noise should hold the same clips as the gender
# arrays -- this is another in-memory copy of the spectrograms.
X_train_noise, X_test_noise, y_train_noise, y_test_noise = train_test_split(
    audio_data,
    labels_noise,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Peek at one training label; the two entries are the [male, female] columns.
y_train_gender[0]

In [None]:
# Gender classifier: a small 2-D CNN over (1025, 130, 1) spectrograms.
model_gender = models.Sequential([
    # Conv block 1: 32 filters of 3x3 with ReLU, followed by 2x2 max-pooling.
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(1025, 130, 1)),
    layers.MaxPooling2D((2, 2)),
    # Conv block 2: 64 filters of 3x3 with ReLU, followed by 2x2 max-pooling.
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    # Classifier head: flatten the feature maps, then a 64-unit ReLU layer.
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    # Output: 2 softmax units, one per gender label column (male, female).
    layers.Dense(2, activation='softmax'),
])

# One-hot targets with softmax outputs -> categorical cross-entropy.
model_gender.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Print the layer-by-layer summary.
model_gender.summary()


In [None]:
# Noise detector: a small 2-D CNN over (1025, 130, 1) spectrograms.
model_noise = models.Sequential([
    # Conv block 1: 32 filters of 3x3 with ReLU, followed by 2x2 max-pooling.
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(1025, 130, 1)),
    layers.MaxPooling2D((2, 2)),
    # Conv block 2: 64 filters of 3x3 with ReLU, followed by 2x2 max-pooling.
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    # Classifier head: flatten the feature maps, then a 64-unit ReLU layer.
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    # Output: a single sigmoid unit for the `noise` label.
    layers.Dense(1, activation='sigmoid'),
])

# Single sigmoid output -> binary cross-entropy.
model_noise.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Print the layer-by-layer summary.
model_noise.summary()


In [None]:
# Language classifier: a small 2-D CNN over (1025, 130, 1) spectrograms.
model_language = models.Sequential([
    # Conv block 1: 32 filters of 3x3 with ReLU, followed by 2x2 max-pooling.
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(1025, 130, 1)),
    layers.MaxPooling2D((2, 2)),
    # Conv block 2: 64 filters of 3x3 with ReLU, followed by 2x2 max-pooling.
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    # Classifier head: flatten the feature maps, then a 64-unit ReLU layer.
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    # Output: 4 softmax units, one per language label column
    # (english, hindi, punjabi, bangoli).
    layers.Dense(4, activation='softmax'),
])

# One-hot targets with softmax outputs -> categorical cross-entropy.
model_language.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Print the layer-by-layer summary.
model_language.summary()


In [None]:
# Make sure the gender features and labels are float32 arrays before training.
X_train_gender, X_test_gender, y_train_gender, y_test_gender = (
    np.asarray(split_part, dtype=np.float32)
    for split_part in (X_train_gender, X_test_gender, y_train_gender, y_test_gender)
)

In [None]:

# Make sure the language features and labels are float32 arrays before training.
X_train_language, X_test_language, y_train_language, y_test_language = (
    np.asarray(split_part, dtype=np.float32)
    for split_part in (X_train_language, X_test_language, y_train_language, y_test_language)
)

In [None]:


# Make sure the noise features and labels are float32 arrays before training.
X_train_noise, X_test_noise, y_train_noise, y_test_noise = (
    np.asarray(split_part, dtype=np.float32)
    for split_part in (X_train_noise, X_test_noise, y_train_noise, y_test_noise)
)

In [None]:
# Train the gender model. `ep` (epoch count) is also read by the language and
# noise training cells. The held-out 20% split is used as validation data.
ep = 5

history_gender = model_gender.fit(
    X_train_gender,
    y_train_gender,
    epochs=ep,
    batch_size=32,
    validation_data=(X_test_gender, y_test_gender),
)



In [None]:
# Train the language model for `ep` epochs, validating on the held-out split.
history_language = model_language.fit(
    X_train_language,
    y_train_language,
    epochs=ep,
    batch_size=32,
    validation_data=(X_test_language, y_test_language),
)

In [None]:
# Train the noise model for `ep` epochs, validating on the held-out split.
history_noise = model_noise.fit(
    X_train_noise,
    y_train_noise,
    epochs=ep,
    batch_size=32,
    validation_data=(X_test_noise, y_test_noise),
)

In [111]:
# Persist the three trained models under models/ (the .h5 suffix selects the HDF5 format).
save_model(model_gender, "models/GenderModel.h5")
save_model(model_language, "models/LanguageModel.h5")
save_model(model_noise, "models/NoiseModel.h5")

  saving_api.save_model(


In [None]:
# Clip to run through the trained models.
audio_file_path = '/Users/dheemankumar/github/audio-ai/ab.wav'

# Alternative sample clip:
#audio_file_path = '/Users/dheemankumar/github/audio-ai/female_eng.wav'  # Adjust the path as needed

# Load at 22050 Hz, the same sample rate used for the training clips.
audio, sample_rate = librosa.load(path=audio_file_path, sr=22050)

In [None]:
# Build the model input for the loaded clip with the same
# STFT -> dB -> channel-axis steps used for the training data.

# The models accept exactly (1025, 130, 1), which corresponds to 3 s of audio at
# 22050 Hz under librosa's default STFT settings (1 + 66150 // 512 = 130 frames).
# Pad or truncate first so a clip of any other length does not make the later
# reshape to (1, 1025, 130, 1) fail. For longer clips only the first 3 s are
# scored. `audio` itself is left untouched.
inference_audio = librosa.util.fix_length(audio, size=3 * 22050)

# Magnitude spectrogram in dB, referenced to the clip's own peak.
d = librosa.stft(inference_audio)
s_db = librosa.amplitude_to_db(np.abs(d), ref=np.max)

# Trailing channel axis so Conv2D receives (freq_bins, frames, 1).
s_db_with_channel = np.expand_dims(s_db, axis=-1)

In [None]:
# Copy the spectrogram into a fresh NumPy array; this is what gets reshaped
# into the model input.
audio_ = np.array(s_db_with_channel)

# Raw waveform as a float32 tensor.
# NOTE(review): `new_data` is not read by any cell visible in this notebook.
new_data = tf.convert_to_tensor(audio, dtype=tf.float32)

In [None]:
# Add a leading batch axis of size 1; raises if the spectrogram is not 1025 x 130 x 1.
input_data = np.reshape(audio_, (1, 1025, 130, 1))

In [None]:
# Gender scores: softmax over the [male, female] label columns.
predictions1 = model_gender.predict(x=input_data)
# Language scores: softmax over the [english, hindi, punjabi, bangoli] label columns.
predictions2 = model_language.predict(x=input_data)


In [None]:
# Gender model output for the clip: one row of softmax scores in [male, female] order.
predictions1

In [None]:
# Language model output for the clip: one row of softmax scores in
# [english, hindi, punjabi, bangoli] order.
predictions2

In [None]:
# Noise model output for the clip: a single sigmoid score for the `noise` label.
predictions3 = model_noise.predict(x=input_data)
predictions3