In [2]:
import librosa
import numpy as np
import os

In [4]:
def normalize_spectrogram(spec):
    # Normalize the spectrogram by subtracting the mean and dividing by the standard deviation
    mean = np.mean(spec)
    std = np.std(spec)
    normalized_spec = (spec - mean) / std
    return normalized_spec

In [5]:
def load_audio_and_extract_spectrogram(audio_file,duration=5, n_mels=128, hop_length=512):

    if not os.path.exists(audio_file):
        print(f"Error: File '{audio_file}' not found.")
        return None
    else:
        try:
            # Load audio file
            y, sr = librosa.load(audio_file,duration=duration)

            # Extract Mel spectrogram
            mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels, hop_length=hop_length)
            
          
            # Convert to dB scale
            mel_spectrogram_db = librosa.power_to_db(mel_spectrogram, ref=np.max)
            #Normalize spectogram
            norm_spectogram=normalize_spectrogram(mel_spectrogram_db)
            # Transpose to have time steps as the first dimension (compatible with Conv1D input)
            norm_spectogram = norm_spectogram.T

            return norm_spectogram

        
        except Exception as e:
            print(f"Error encountered while processing '{audio_file}': {e}")
            return None


In [6]:
import pandas as pd
# Load the Excel file
dataset= pd.read_csv('dataset.csv')
dataset.head()

Unnamed: 0,Public filename,Interpreter,Song,Interpretation
0,0000.wav,216,Potter,Hum
1,0001.wav,100,Potter,Hum
2,0002.wav,177,Potter,Hum
3,0003.wav,159,Potter,Hum
4,0004.wav,160,Potter,Whistle


In [7]:
import numpy as np
from tqdm import tqdm
### Now we iterate through every audio file and extract features
### using Mel-Frequency Cepstral Coefficients
audio_dataset_path='audio/'
y_class=[]
for index_num,row in tqdm(dataset.iterrows()):
    y_class.append(row["Song"])
    

6611it [00:00, 8329.48it/s] 


In [None]:
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
labelencoder=LabelEncoder()
y_class=to_categorical(labelencoder.fit_transform(y_class))

In [None]:
# Load the saved model
import tensorflow as tf

model = tf.keras.models.load_model('model.keras',compile=False)


In [None]:
import pyaudio
import wave

def record_audio(output_file, duration=10, rate=44100, chunk=1024, format=pyaudio.paInt16, channels=2):
    """
    Record audio from the default audio input device for a specified duration.
    
    Parameters:
    - output_file: Output WAV file path where the recorded audio will be saved.
    - duration: Duration of the recording in seconds (default is 10 seconds).
    - rate: Sampling rate (samples/second).
    - chunk: Number of frames per buffer.
    - format: Audio sample format (e.g., pyaudio.paInt16).
    - channels: Number of audio channels (1 for mono, 2 for stereo).
    """
    audio = pyaudio.PyAudio()
    
    # Open stream
    stream = audio.open(format=format,
                        channels=channels,
                        rate=rate,
                        input=True,
                        frames_per_buffer=chunk)
    
    print("Recording...")
    
    frames = []
    
    # Record audio in chunks and store in frames
    for i in range(0, int(rate / chunk * duration)):
        data = stream.read(chunk)
        frames.append(data)
    
    print("Finished recording.")
    
    # Stop and close the stream
    stream.stop_stream()
    stream.close()
    audio.terminate()
    
    # Save recorded audio to a WAV file
    with wave.open(output_file, 'wb') as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(audio.get_sample_size(format))
        wf.setframerate(rate)
        wf.writeframes(b''.join(frames))

if __name__ == "__main__":
    output_file = "recorded_audio.wav"
    record_duration = 10  # seconds
    
    # Record audio and save to WAV file
    record_audio(output_file, duration=record_duration)
    
    print(f"Audio recorded and saved to {output_file}.")


In [None]:
test_spectogram=load_audio_and_extract_spectrogram('recorded_audio.wav')

In [11]:
test_spectogram= np.expand_dims(test_spectogram, axis=0) 

In [12]:
y_decoded = np.argmax(model.predict(np.array(test_spectogram)), axis=1)  # Get the index of the highest value in each row
y_original = labelencoder.inverse_transform(y_decoded)
y_original

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step


array(['Hakuna'], dtype='<U8')