# Data Exploration for GuitarTab Project

This notebook explores the Guitar Chords V2 and GuitarSet datasets to understand their structure, distribution, and characteristics.
01_data_exploration is focused solely on exploration
## Table of Contents
1. [Import Libraries](#Import-Libraries)
2. [Define Paths and Parameters](#Define-Paths-and-Parameters)
3. [Load Sample Audio Files](#Load-Sample-Audio-Files)
4. [Visualize Waveforms](#Visualize-Waveforms)
5. [Compute and Visualize Spectrograms](#Compute-and-Visualize-Spectrograms)
6. [Class Distribution](#Class-Distribution)
7. [Conclusion](#Conclusion)

In [2]:
# 1. Import Libraries
import os
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import pickle
import seaborn as sns
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix, classification_report
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

from src.data_preprocessing import get_file_paths, encode_labels, create_tf_datasets
from src.models import create_crnn_model
from src.data_preprocessing import load_wav_16k_mono, standardize_audio_length, compute_mel_spectrogram
import librosa
import pandas as pd



## 2. Define Paths and Parameters

Setting the paths to the datasets and defining any necessary parameters.

In [7]:
# Define paths and parameters
# To match the actual raw data path:

RAW_IDMT_PATH = "data/raw/IDMT-SMT-CHORDS/guitar"
PROCESSED_DIR = "data/processed/IDMT_CHORDS"
TRAIN_DIR = os.path.join(PROCESSED_DIR, "Training")
TEST_DIR = os.path.join(PROCESSED_DIR, "Test")
TRAIN_TEST_RATIO = 0.8
batch_size = 32



In [8]:
def load_wav_16k_mono(file_path):
    """Load a WAV file, resample to 16kHz and convert to mono"""
    wav, sr = librosa.load(file_path, sr=16000, mono=True)
    return wav

def standardize_audio_length(wav, target_length=16000):
    """Standardize audio length by padding or truncating"""
    if len(wav) > target_length:
        return wav[:target_length]
    else:
        return np.pad(wav, (0, max(0, target_length - len(wav))), 'constant')

def compute_mel_spectrogram(wav, sr=16000, n_mels=128, n_fft=2048, hop_length=512):
    """Compute mel spectrogram from waveform"""
    mel_spec = librosa.feature.melspectrogram(
        y=wav, sr=sr, n_mels=n_mels, n_fft=n_fft, hop_length=hop_length
    )
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    
    # Resize to fixed dimensions (128, 128) if needed
    if mel_spec_db.shape[1] != 128:
        # Using simple padding/truncation for consistency
        target_length = 128
        if mel_spec_db.shape[1] > target_length:
            mel_spec_db = mel_spec_db[:, :target_length]
        else:
            padding = np.zeros((mel_spec_db.shape[0], target_length - mel_spec_db.shape[1]))
            mel_spec_db = np.hstack((mel_spec_db, padding))
    
    return mel_spec_db

In [9]:
def get_chords():
    """Auto-detect chord classes"""
    chords = []
    for d in os.listdir(TRAIN_DIR):
        dir_path = os.path.join(TRAIN_DIR, d)
        if os.path.isdir(dir_path) and len(os.listdir(dir_path)) > 0:
            chords.append(d)
    return sorted(chords)

In [10]:
def verify_paths():
    """Verify all required directories and files exist"""
    # Check base directories
    assert os.path.exists(PROCESSED_DIR), f"Missing base directory: {PROCESSED_DIR}"
    assert os.path.exists(TRAIN_DIR), f"Missing training directory: {TRAIN_DIR}"
    assert os.path.exists(TEST_DIR), f"Missing test directory: {TEST_DIR}"
    
    # Check chord subfolders - using auto-detected chords
    detected_chords = get_chords()
    for split in [TRAIN_DIR, TEST_DIR]:
        for chord in detected_chords:
            chord_dir = os.path.join(split, chord)
            assert os.path.exists(chord_dir), f"Missing chord directory: {chord_dir}"
            assert len(os.listdir(chord_dir)) > 0, f"No files in {chord_dir}"
    
    print("All paths validated successfully!")

## 3. Load Sample Audio Files

Load a few sample audio files from each dataset to inspect their content.

In [13]:
# Load sample audio file

#chords = get_chords()  
#sample_chord = chords[0]  # Use first available chord
#sample_files = glob.glob(os.path.join(TRAIN_DIR, sample_chord, '*.wav'))
#if sample_files:
#    sample_file = sample_files[0]
#    wav = load_wav_16k_mono(sample_file)
#    print(f"Audio Loaded: {sample_file}, Duration: {len(wav)/16000:.2f} seconds")

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'data/processed/IDMT_CHORDS\\Training'

## 4. Visualize Waveforms

Plot the waveform of the sample audio to understand its amplitude variations over time.

In [12]:
# 4. Visualize Waveforms
def plot_waveform(wav, sr=16000, title="Waveform"):
    plt.figure(figsize=(12, 4))
    plt.plot(np.arange(len(wav))/sr, wav)
    plt.title(title)
    plt.xlabel("Time (s)")
    plt.ylabel("Amplitude")
    plt.tight_layout()
    plt.show()

plot_waveform(wav, title=f"Waveform of {sample_chord} Chord")

NameError: name 'wav' is not defined

## 5. Compute and Visualize Spectrograms

Convert the audio waveform into a spectrogram and visualize it.

In [4]:
# 5. Compute and Visualize Spectrograms
def plot_spectrogram(spectrogram, title="Spectrogram"):
    plt.figure(figsize=(12, 8))
    plt.imshow(spectrogram, aspect='auto', origin='lower', cmap='viridis')
    plt.title(title)
    plt.xlabel("Time Frame")
    plt.ylabel("Frequency Bin")
    plt.colorbar(format='%+2.0f dB')
    plt.tight_layout()
    plt.show()

# Standardize and compute mel spectrogram
wav_standardized = standardize_audio_length(wav)
mel_spec = compute_mel_spectrogram(wav_standardized)
plot_spectrogram(mel_spec, title=f"Mel Spectrogram of {sample_chord} Chord")


NameError: name 'wav' is not defined

## 6. Class Distribution

Analyze the distribution of chords in the Guitar Chords V2 dataset.

In [5]:
def get_all_files(dataset_path, chords):
    """Get all audio files and their labels from dataset path"""
    files = []
    labels = []
    for chord in chords:
        chord_path = os.path.join(dataset_path, chord)
        if os.path.exists(chord_path):
            chord_files = glob.glob(os.path.join(chord_path, '*.wav'))
            files += chord_files
            labels += [chord] * len(chord_files)
    return files, labels


In [6]:

def main():
    """Main execution function"""
    # Verify paths
    try:
        verify_paths()
    except AssertionError as e:
        print(f"WARNING: {e}")
        print("Some paths may not exist yet. Run data_preparation.py first.")
        return
    
    # Auto-detect chords
    chords = get_chords()
    print(f"Detected chords: {chords}")
    
    # Load sample audio file
    sample_chord = chords[0] if chords else None  # Use first available chord
    if sample_chord:
        sample_files = glob.glob(os.path.join(TRAIN_DIR, sample_chord, '*.wav'))
        if sample_files:
            sample_file = sample_files[0]
            wav = load_wav_16k_mono(sample_file)
            print(f"Audio Loaded: {sample_file}, Duration: {len(wav)/16000:.2f} seconds")
            
            # Visualize waveform
            plot_waveform(wav, title=f"Waveform of {sample_chord} Chord")
            
            # Standardize and compute mel spectrogram
            wav_standardized = standardize_audio_length(wav)
            mel_spec = compute_mel_spectrogram(wav_standardized)
            plot_spectrogram(mel_spec, title=f"Mel Spectrogram of {sample_chord} Chord")
    
    # Analyze class distribution
    train_files, train_labels = get_all_files(TRAIN_DIR, chords)
    test_files, test_labels = get_all_files(TEST_DIR, chords)
    
    # Combine and create a DataFrame
    all_labels = train_labels + test_labels
    df = pd.DataFrame({'Chord': all_labels})
    
    # Plot distribution
    plt.figure(figsize=(10,6))
    sns.countplot(data=df, x='Chord', order=chords)
    plt.title('Chord Distribution in Dataset')
    plt.xlabel('Chord')
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
    
    # Print summary stats
    print(f"Total samples: {len(all_labels)}")
    print(f"Training samples: {len(train_labels)}")
    print(f"Testing samples: {len(test_labels)}")
    print("\nChord distribution:")
    for chord in chords:
        count = df['Chord'].value_counts().get(chord, 0)
        percentage = 100 * count / len(df)
        print(f"  {chord}: {count} samples ({percentage:.1f}%)")

if __name__ == "__main__":
    main()

NameError: name 'verify_paths' is not defined