- First, save the trained model from the model maker.
- Get the audio file.
- Run librosa tempo tracking and convert the tempo to an integer.
- At this BPM, generate the segment timings, e.g. [0.0, 0.5, 1.0, ...] for 120 BPM.
- Look for filters to apply here (band-pass, etc.) to remove noise from ordinary recordings.
- Apply process_audio_and_save_pcp.
- Load the model weights into this file (the weights saved with model.save_weights('model.keras')).
- Predict using the PCPs.
- Display the mapped chords:
    - Group the segment times that share the same chord and display one chord per segment group.



In [None]:
import tensorflow as tf
import librosa
# Load the complete saved Keras model from 'model.keras' in the working directory.
# NOTE(review): the inference cell further down loads "model.h5" instead — confirm
# which file is the intended one.
model = tf.keras.models.load_model('model.keras')

In [None]:
# Load the audio file to analyse (librosa.load called with its default arguments).
audio_filename = '0001_mix.mp3'
y,sr = librosa.load(audio_filename)
# Estimate the tempo and the frame index of every detected beat.
tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr )
# A "frame" is a short analysis window taken at a regular interval
# (e.g. every 512 samples); a "beat" is the higher-level regular pulse of the music.
# Because the tempo may vary over the recording, every detected beat frame is
# used rather than a fixed grid derived from a single BPM value. The first beat
# not falling at time zero does not cause a problem.
beat_times = librosa.frames_to_time(beat_frames, sr=sr)
# beat_times holds the beat positions in seconds; consecutive entries delimit
# audio segments — presumably comparable to column 0 of the annotation file
# (TODO confirm against the annotation format).

In [5]:
import tensorflow as tf
import librosa
import numpy as np
import os
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout, InputLayer
from model_maker import create_ffnn_model
from pcp_module import pcp_vectorise_segment
from itertools import groupby
from operator import itemgetter

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # Suppress all logs except errors
tf.get_logger().setLevel('ERROR') 

def load_trained_model(model_path):
    """
    Load the trained chord-classification model for inference.

    The file at ``model_path`` is first loaded as a complete saved model. If
    that fails, the architecture is rebuilt with ``create_ffnn_model()`` and
    only the weights are read from the same path. For inference only the
    architecture and weights are needed, not the optimizer state.

    Parameters:
        model_path (str): Path to the saved model or weights file.

    Returns:
        The Keras model, compiled with binary cross-entropy loss and an
        accuracy metric.

    Raises:
        Exception: Whatever ``load_weights`` raises when the weights-only
            fallback fails as well.
    """
    try:
        # First try to load as a complete model.
        model = load_model(model_path)
    except Exception as load_error:
        # Deliberately broad (the loader can fail with several unrelated
        # exception types), but no longer a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate. The reason is
        # reported instead of being silently discarded.
        print(f"Could not load '{model_path}' as a full model ({load_error}); "
              "falling back to weights-only loading.")
        model = create_ffnn_model()
        # NOTE(review): skip_mismatch=True presumably skips layers whose saved
        # weights do not fit, which would leave them untrained without an
        # error — confirm this is acceptable for inference.
        model.load_weights(model_path, by_name=True, skip_mismatch=True)

    # Recompile for inference only (no training is performed here).
    model.compile(loss='binary_crossentropy', metrics=['accuracy'])
    return model
   

def predict_chord(pcp_vector, model):
    """
    Map a PCP vector to a chord label using the trained model.

    Parameters:
        pcp_vector: Sequence of PCP values for one audio segment.
        model: Object exposing ``predict(batch, verbose=0)``; its output is
            searched for the highest-scoring entry.

    Returns:
        str: The chord label (e.g. ``'Cmaj'``) at the arg-max position.
    """
    # Labels are ordered chromatically from C, major before minor for each
    # root: index 0 is 'Cmaj', index 1 is 'Cmin', ..., index 23 is 'Bmin'.
    note_names = ('C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B')
    chord_labels = [
        note + quality
        for note in note_names
        for quality in ('maj', 'min')
    ]

    # The model is fed a batch containing a single row.
    model_input = np.array(pcp_vector).reshape(1, -1)
    scores = model.predict(model_input, verbose=0)

    return chord_labels[np.argmax(scores)]

def apply_audio_filters(audio_data, sr):
    """
    Clean a signal with pre-emphasis followed by harmonic extraction.

    Parameters:
        audio_data: Audio samples to filter.
        sr: Sampling rate. Accepted for interface compatibility; it is not
            used by the current filters.

    Returns:
        The harmonic component of the pre-emphasized signal.
    """
    # Pre-emphasis filter. Note that no band-pass filtering is applied here.
    emphasized = librosa.effects.preemphasis(audio_data)

    # HPSS (Harmonic-Percussive Source Separation) yields a
    # (harmonic, percussive) pair; only the harmonic part is kept.
    return librosa.effects.hpss(emphasized)[0]

def group_consecutive_chords(times, chords):
    """
    Merge runs of identical consecutive chords into single time spans.

    Parameters:
        times: Segment boundaries; segment ``i`` runs from ``times[i]`` to
            ``times[i + 1]``.
        chords: Chord label for each segment.

    Returns:
        list: ``(start_time, end_time, chord)`` tuples, one per run of equal
        chords, in chronological order.
    """
    merged = []

    for seg_start, seg_end, label in zip(times[:-1], times[1:], chords):
        if merged and merged[-1][2] == label:
            # Same chord as the run in progress: stretch that run's end.
            run_start, _, run_label = merged[-1]
            merged[-1] = (run_start, seg_end, run_label)
        else:
            # A different chord (or the very first segment) opens a new run.
            merged.append((seg_start, seg_end, label))

    return merged

def _half_beat_grid(beat_times):
    """Return the beat times interleaved with the midpoint of each beat pair."""
    beat_times = np.asarray(beat_times, dtype=float)
    grid = np.empty(2 * len(beat_times) - 1, dtype=float)
    grid[0::2] = beat_times
    # Same formula as start + (end - start) / 2, applied to every beat pair.
    grid[1::2] = beat_times[:-1] + np.diff(beat_times) / 2
    return grid


def infer_chords(audio_file, model_weights_path):
    """
    Run the full inference pipeline on one audio file.

    The audio is beat-tracked, split into half-beat segments, and each
    segment is filtered, converted to a PCP vector and classified. Runs of
    identical consecutive predictions are then merged.

    Only the audio between the first and the last detected beat is
    classified; anything before or after is not covered by a segment.

    Parameters:
        audio_file (str): Path to the audio file to analyse.
        model_weights_path (str): Path passed to ``load_trained_model``.

    Returns:
        list: ``(start_time, end_time, chord)`` tuples with times in seconds.
        Empty when fewer than two beats are detected, because no segment can
        be formed.
    """
    # Load the audio file
    print(f"Loading audio file: {audio_file}")
    y, sr = librosa.load(audio_file)

    # Get tempo and beat frames
    tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
    # The tempo may come back as a one-element array (it was printed as
    # "[92.28515625]"), so reduce it to a plain float before formatting.
    tempo_bpm = float(np.atleast_1d(tempo)[0])
    print(f"Detected tempo: {tempo_bpm:.2f} BPM")

    # Convert beat frames to time
    beat_times = librosa.frames_to_time(beat_frames, sr=sr)
    if len(beat_times) < 2:
        # With no beats the segment grid cannot be built at all; with a
        # single beat there is no interval to classify.
        return []

    # Segment boundaries: every beat plus the midpoint between beats
    times = _half_beat_grid(beat_times)

    # Load trained model
    model = load_trained_model(model_weights_path)

    # Process each segment
    predictions = []
    for start_time, end_time in zip(times[:-1], times[1:]):
        # Convert times to sample indices and extract the segment
        segment = y[int(start_time * sr):int(end_time * sr)]

        # Apply filters
        filtered_segment = apply_audio_filters(segment, sr)

        # pcp_vectorise_segment returns the vector as text ("[v0, v1, ...]"),
        # so it is parsed back into floats here.
        pcp_vector_str = pcp_vectorise_segment(filtered_segment, sr, f"segment_{start_time}")
        pcp_vector = [float(x) for x in pcp_vector_str.strip('[]').split(',')]

        # Predict chord
        predictions.append(predict_chord(pcp_vector, model))

    # Group consecutive identical chords
    return group_consecutive_chords(times, predictions)

def format_time(seconds):
    """
    Format a time in seconds as ``MM:SS.mmm``.

    The value is rounded to whole milliseconds *before* it is split into
    minutes and seconds, so a value such as 59.9996 is rendered as
    ``01:00.000`` rather than the invalid ``00:60.000``.

    Parameters:
        seconds (float): Time offset in seconds.

    Returns:
        str: The formatted time, with minutes zero-padded to two digits.
    """
    # Round first so that the carry from milliseconds reaches the minutes.
    total = round(float(seconds), 3)
    minutes = int(total // 60)
    seconds_remainder = total % 60
    return f"{minutes:02d}:{seconds_remainder:06.3f}"

# Script entry point: run chord inference on a fixed audio file and print the
# resulting chord progression, one line per merged chord span.
if __name__ == "__main__":
    # Input recording and model file, both resolved relative to the working directory.
    audio_file = "0001_infer.mp3"
    model_weights_path = "model.h5"
    
    # Run inference; the result is a list of (start_time, end_time, chord) tuples.
    chord_segments = infer_chords(audio_file, model_weights_path)
    
    # Display results as "MM:SS.mmm - MM:SS.mmm: chord".
    print("\nPredicted Chord Progression:")
    print("-----------------------------")
    for start_time, end_time, chord in chord_segments:
        print(f"{format_time(start_time)} - {format_time(end_time)}: {chord}")


Loading audio file: 0001_infer.mp3
Detected tempo: [92.28515625] BPM





Predicted Chord Progression:
-----------------------------
00:00.697 - 00:01.997: Fmaj
00:01.997 - 00:02.647: Emin
00:02.647 - 00:03.622: Gmin
00:03.622 - 00:03.947: Amin
00:03.947 - 00:04.272: Cmin
00:04.272 - 00:04.598: A#maj
00:04.598 - 00:04.934: Fmaj
00:04.934 - 00:05.271: A#min
00:05.271 - 00:05.596: Amin
00:05.596 - 00:05.921: Emaj
00:05.921 - 00:06.246: Fmaj
00:06.246 - 00:06.571: Amin
00:06.571 - 00:06.896: Amaj
00:06.896 - 00:07.221: Fmaj
00:07.221 - 00:07.546: Cmaj
00:07.546 - 00:07.872: Dmin
00:07.872 - 00:08.197: Cmaj
00:08.197 - 00:08.522: Amin
00:08.522 - 00:09.822: Fmaj
00:09.822 - 00:10.147: Emin
00:10.147 - 00:10.472: Dmin
00:10.472 - 00:10.797: Gmaj
00:10.797 - 00:11.122: Gmin
00:11.122 - 00:11.459: Gmaj
00:11.459 - 00:12.121: Amin
00:12.121 - 00:12.446: Fmaj
00:12.446 - 00:12.771: Gmaj
00:12.771 - 00:13.096: Dmaj
00:13.096 - 00:15.697: Fmaj
00:15.697 - 00:16.022: Fmin
00:16.022 - 00:17.647: Fmaj
00:17.647 - 00:18.297: Emin
00:18.297 - 00:19.284: Gmin
00:19.284 - 00

In [None]:
%%script false --no-raise-error
'''NOISE REDUCTION CODE USING SPECTRAL SUBTRACTION - BUT A NOISE SAMPLE IS NEEDED FOR THIS TO WORK'''
import librosa
import numpy as np
from scipy import signal

def calculate_noise_profile(y, sr, frame_length=2048, hop_length=512, n_fft=2048,
                            noise_frames=10):
    """
    Calculate a noise profile from an audio signal

    The profile is the per-frequency-bin median magnitude over the leading
    STFT frames of ``y``.

    Parameters:
        y (np.ndarray): Input audio signal
        sr (int): Sampling rate (not used by the computation)
        frame_length (int): Length of each frame
        hop_length (int): Number of samples between frames
        n_fft (int): Length of FFT
        noise_frames (int or None): Number of leading frames to take the
            median over. ``None`` uses every frame. Defaults to 10, the
            value that was previously hard-coded.

    Returns:
        np.ndarray: Noise profile spectrum, one value per frequency bin

    Raises:
        ValueError: If ``noise_frames`` is given and is smaller than 1.
    """
    if noise_frames is not None and noise_frames < 1:
        raise ValueError("noise_frames must be at least 1 or None")

    # Calculate the magnitude spectrogram
    D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length, win_length=frame_length)
    mag_spec = np.abs(D)

    # Median over the selected frames; slicing tolerates a signal with fewer
    # frames than requested.
    noise_profile = np.median(mag_spec[:, :noise_frames], axis=1)

    return noise_profile

def spectral_subtraction(y, sr, noise_profile, frame_length=2048, hop_length=512, n_fft=2048, 
                        reduction_factor=1.0, smoothing=0.1):
    """
    Perform spectral subtraction using the calculated noise profile

    The noise profile is subtracted from the magnitude spectrogram, the
    result is floored at a fraction of the original magnitude, and the
    signal is rebuilt with the original phase.

    Parameters:
        y (np.ndarray): Input audio signal
        sr (int): Sampling rate (not used by the computation)
        noise_profile (np.ndarray): Pre-calculated noise profile with one
            value per frequency bin
        frame_length (int): Length of each frame
        hop_length (int): Number of samples between frames
        n_fft (int): Length of FFT
        reduction_factor (float): Factor to control noise reduction strength
        smoothing (float): Floor, as a fraction of the original magnitude,
            below which a bin is never reduced

    Returns:
        np.ndarray: Noise-reduced audio signal with the same number of
        samples as ``y``

    Raises:
        ValueError: If ``noise_profile`` does not have one value per
            frequency bin of the spectrogram.
    """
    # Calculate STFT
    D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length, win_length=frame_length)
    mag_spec = np.abs(D)
    phase_spec = np.angle(D)

    # Reshape noise profile to a column so it broadcasts over all frames
    noise_profile = np.asarray(noise_profile).reshape(-1, 1)
    if noise_profile.shape[0] != mag_spec.shape[0]:
        raise ValueError(
            f"noise_profile has {noise_profile.shape[0]} bins but the spectrogram "
            f"has {mag_spec.shape[0]}; compute both with the same n_fft"
        )

    # Subtract noise profile from magnitude spectrogram
    mag_spec_reduced = mag_spec - (reduction_factor * noise_profile)

    # Apply flooring to avoid negative values
    mag_spec_reduced = np.maximum(mag_spec_reduced, smoothing * mag_spec)

    # Reconstruct signal with the original phase. Passing length keeps the
    # output exactly as long as the input instead of a whole number of hops.
    D_reduced = mag_spec_reduced * np.exp(1j * phase_spec)
    y_reduced = librosa.istft(D_reduced, hop_length=hop_length, win_length=frame_length,
                              length=len(y))

    return y_reduced

def reduce_noise(audio_path, output_path, noise_start_time=0, noise_duration=1.0):
    """
    Complete noise reduction pipeline

    Loads the audio, estimates a noise profile from the given time window,
    applies spectral subtraction, normalizes the result and writes it to
    ``output_path`` as a WAV file.

    Parameters:
        audio_path (str): Path to input audio file
        output_path (str): Path to save processed audio
        noise_start_time (float): Start time (in seconds) of noise sample
        noise_duration (float): Duration (in seconds) of noise sample

    Returns:
        tuple: ``(y_reduced, sr)`` — the processed signal and its sampling rate

    Raises:
        ValueError: If the requested noise window contains no samples.
    """
    # Imported locally so this function does not depend on the cell's import list.
    from scipy.io import wavfile

    # Load audio file
    y, sr = librosa.load(audio_path)

    # Extract noise sample
    noise_start = int(noise_start_time * sr)
    noise_length = int(noise_duration * sr)
    noise_sample = y[noise_start:noise_start + noise_length]
    if len(noise_sample) == 0:
        raise ValueError(
            "The noise window is empty: check noise_start_time and noise_duration "
            "against the length of the audio"
        )

    # Calculate noise profile
    noise_profile = calculate_noise_profile(noise_sample, sr)

    # Apply noise reduction
    y_reduced = spectral_subtraction(y, sr, noise_profile)

    y_reduced = librosa.util.normalize(y_reduced)
    # librosa.output.write_wav was removed from librosa (0.8.0), so the file is
    # written with SciPy instead; float32 data is stored as a floating-point WAV.
    wavfile.write(output_path, sr, np.asarray(y_reduced, dtype=np.float32))

    return y_reduced, sr

# Example usage
# if __name__ == "__main__":
#     input_file = "noisy_audio.wav"
#     output_file = "cleaned_audio.wav"
    
#     # Process the audio with custom parameters
#     reduced_audio, sr = reduce_noise(
#         input_file,
#         output_file,
#         noise_start_time=0,  # Assume noise sample is at the beginning
#         noise_duration=1.0   # Use 1 second of noise for profile
#     )