# Milestone 1

This milestone focuses on understanding the dataset and establishing a baseline performance through **exploratory data analysis (EDA)** and simple **heuristic-based methods** using `librosa`.

---

## Suggested Readings
- [Hugging Face Audio Course](https://huggingface.co/learn/audio-course/en/chapter0/introduction)
- [Librosa Documentation](https://librosa.org/doc/main/core.html#audio-loading)

---

## Instructions
Use this notebook to answer **all Milestone-1 questions**.

---

## Resources
- Notebook Link:  
  https://colab.research.google.com/drive/1m6UczhxQIke_raWSqukSWuiKbIVt7MMb?usp=sharing  

- Competition Link:  
  https://www.kaggle.com/competitions/jan-2026-dl-gen-ai-project/


In [4]:
import os
import glob
import numpy as np
import pandas as pd
from tqdm import tqdm
import librosa
import librosa.display
import matplotlib.pyplot as plt
import random
import torch

import warnings
warnings.filterwarnings("ignore")

In [5]:
#----------------------------- DON'T CHANGE THIS --------------------------
# Fixed experiment constants and RNG seeding shared by the cells below.
DATA_SEED = 67  # seed applied below to `random`, NumPy and torch
TRAINING_SEED = 1234  # not used in the cells shown here; presumably reserved for model training - TODO confirm
SR = 22050  # sampling rate passed as `sr` to librosa.load
DURATION = 5.0  # default `threshold_sec` (seconds) of find_long_silences
N_FFT = 2048  # not used in the cells shown here; presumably the STFT window size - TODO confirm
HOP_LENGTH = 512  # not used in the cells shown here; presumably the STFT hop size - TODO confirm
N_MELS = 128  # not used in the cells shown here; presumably the number of mel bands - TODO confirm
TOP_DB=20  # default `top_db` handed to librosa.effects.split in find_long_silences
TARGET_SNR_DB = 10  # not used in the cells shown here; presumably a noise-mixing SNR target - TODO confirm

# Seed every random number generator with DATA_SEED so data-dependent steps repeat across runs.
random.seed(DATA_SEED)
np.random.seed(DATA_SEED)
torch.manual_seed(DATA_SEED)
torch.cuda.manual_seed(DATA_SEED)

In [6]:
# CONFIGURATION
# Root folder that holds one sub-directory per genre.
DATA_ROOT = '/kaggle/input/jan-2026-dl-gen-ai-project/messy_mashup/genres_stems'

# Every directory directly under DATA_ROOT is a genre; keep them in alphabetical order.
GENRES = sorted(
    entry for entry in os.listdir(DATA_ROOT)
    if os.path.isdir(os.path.join(DATA_ROOT, entry))
)

# Stem names, and the mapping from each stem's file name to its stem name.
STEM_KEYS = ['drums', 'vocals', 'bass', 'other']
STEMS = {f"{key}.wav": key for key in STEM_KEYS}

GENRE_TO_TEST = 'rock'
# SONG_INDEX = #Enter index as per Q10.

In [7]:
def build_dataset(root_dir, val_split=0.17, seed=42):
    """Scan ``root_dir`` and split the complete songs of each genre into train/val.

    A song is "complete" when every file name listed in ``STEMS`` exists in its
    folder. Complete songs are shuffled per genre with one ``random.Random(seed)``
    instance shared across genres, then the last ``val_split`` fraction goes to
    validation.

    While scanning, file-size statistics are gathered for every stem file that
    is visited. Stems seen before a missing one still count, even though the
    song itself is then skipped. The answers to Q1-Q3 are printed at the end.

    Args:
        root_dir: Directory containing one folder per genre in ``GENRES``.
        val_split: Fraction of each genre's complete songs kept for validation.
        seed: Seed of the private shuffling generator.

    Returns:
        ``(train_dataset, val_dataset)``, each shaped
        ``{genre: {stem_key: [file paths]}}``.
    """
    # Size thresholds in bytes.
    corrupt_limit = 4 * 1024
    small_limit = 5.0491 * 1024 * 1024
    large_limit = 5.0493 * 1024 * 1024

    def empty_split():
        return {genre: {key: [] for key in STEM_KEYS} for genre in GENRES}

    train_dataset, val_dataset = empty_split(), empty_split()
    shuffler = random.Random(seed)
    n_corrupted = n_small = n_large = 0

    for genre in GENRES:
        genre_dir = os.path.join(root_dir, genre)
        complete_songs = []

        for song_name in sorted(os.listdir(genre_dir)):
            song_dir = os.path.join(genre_dir, song_name)
            stems_seen = 0

            for stem_name in STEMS:
                stem_path = os.path.join(song_dir, stem_name)
                if not os.path.exists(stem_path):
                    # Incomplete song: stop looking at its remaining stems.
                    break

                n_bytes = os.path.getsize(stem_path)
                n_corrupted += int(n_bytes < corrupt_limit)
                n_small += int(n_bytes < small_limit)
                n_large += int(n_bytes > large_limit)
                stems_seen += 1

            if stems_seen == 4:
                complete_songs.append(song_dir)

        shuffler.shuffle(complete_songs)
        cut = int(len(complete_songs) * (1 - val_split))

        # The first part of the shuffled list is training data, the rest validation.
        for target, chosen in ((train_dataset, complete_songs[:cut]),
                               (val_dataset, complete_songs[cut:])):
            for song_dir in chosen:
                for stem_name, stem_key in STEMS.items():
                    target[genre][stem_key].append(os.path.join(song_dir, stem_name))

    print("\n--- Q1 ---")
    print("Corrupted + (<5.0491MB):", n_corrupted + n_small)

    print("\n--- Q2 ---")
    print("Absolute difference:", abs(n_large - n_small))

    print("\n--- Q3 ---")
    n_reggae_train_drums = len(train_dataset['reggae']['drums'])
    n_country_val_vocals = len(val_dataset['country']['vocals'])
    print("Absolute difference:", abs(n_reggae_train_drums - n_country_val_vocals))

    return train_dataset, val_dataset


# Build the per-genre train/validation path dictionaries; the call also prints the Q1-Q3 answers.
# NOTE(review): `seed` is left at build_dataset's default (42), so the split is not driven by DATA_SEED.
tr, val = build_dataset(DATA_ROOT)



--- Q1 ---
Corrupted + (<5.0491MB): 1256

--- Q2 ---
Absolute difference: 1072

--- Q3 ---
Absolute difference: 66


In [8]:
def _summarize_silence(intervals, n_samples, sr):
    """Return ``(longest_silence_sec, location_labels)`` for one clip.

    ``intervals`` holds the ``(start, end)`` sample indices of the non-silent
    regions. Each location label ("Start", "End", "Middle", or "Full" when no
    non-silent region exists) appears at most once.
    """
    if len(intervals) == 0:
        # No non-silent region at all: the whole clip is silence.
        return n_samples / sr, ["Full"]

    longest = 0
    labels = []

    leading = intervals[0][0]
    if leading > 0:
        longest = max(longest, leading / sr)
        labels.append("Start")

    trailing = n_samples - intervals[-1][1]
    if trailing > 0:
        longest = max(longest, trailing / sr)
        labels.append("End")

    has_middle_gap = False
    for (_, prev_end), (next_start, _) in zip(intervals[:-1], intervals[1:]):
        gap = (next_start - prev_end) / sr
        if gap > 0:
            longest = max(longest, gap)
            has_middle_gap = True
    if has_middle_gap:
        # Recorded once, however many gaps there are, so that a comparison
        # such as Silence_Location == "Middle" matches multi-gap files too.
        labels.append("Middle")

    return longest, labels


def find_long_silences(dataset_dict, sr=SR, threshold_sec=DURATION, top_db=TOP_DB):
    """List the files whose longest silent stretch lasts at least ``threshold_sec``.

    Each file is loaded at ``sr`` and cut into non-silent intervals with
    ``librosa.effects.split(y, top_db=top_db)``. Silence is whatever lies
    before the first interval, after the last one, or between two neighbours.

    Args:
        dataset_dict: ``{genre: {stem: [file paths]}}``.
        sr: Sampling rate used when loading the audio.
        threshold_sec: Minimum length (seconds) of the longest silence for a
            file to be reported.
        top_db: Threshold forwarded to ``librosa.effects.split``.

    Returns:
        A DataFrame with one row per reported file and the columns Genre, Stem,
        Duration, Max_Silence_Sec, Silence_Location and File_Path. The columns
        exist even when no file qualifies. Silence_Location names every place
        where some silence occurs, not only the longest stretch.
    """
    columns = ["Genre", "Stem", "Duration", "Max_Silence_Sec",
               "Silence_Location", "File_Path"]
    records = []

    for genre, stems in dataset_dict.items():
        for stem, file_paths in stems.items():
            for file_path in tqdm(file_paths, leave=False):

                y, _ = librosa.load(file_path, sr=sr)
                total_duration = len(y) / sr

                intervals = librosa.effects.split(y, top_db=top_db)
                max_silence, silence_type = _summarize_silence(intervals, len(y), sr)

                if max_silence >= threshold_sec:
                    records.append({
                        "Genre": genre,
                        "Stem": stem,
                        "Duration": round(total_duration, 2),
                        "Max_Silence_Sec": round(max_silence, 2),
                        "Silence_Location": ", ".join(silence_type),
                        "File_Path": file_path
                    })

    # Passing the column list keeps the schema stable for an empty result, so
    # filters such as df['Stem'] == 'vocals' do not raise KeyError.
    return pd.DataFrame(records, columns=columns)


# Long-silence report for the training split (default threshold and top_db).
df_silence = find_long_silences(tr)

print("\n--- Q4 ---")
print("Total files silence >=5:", len(df_silence))

# Q5-Q6 only look at the vocal stems.
print("\n--- Q5 ---")
vocals_silence = df_silence[df_silence['Stem'] == 'vocals']
print("Vocals silence >=5:", len(vocals_silence))

print("\n--- Q6 ---")
print("Average silence vocals:", vocals_silence['Max_Silence_Sec'].mean())

# Q7-Q9 only look at the jazz drum stems.
print("\n--- Q7 ---")
jazz_drums_silence = df_silence[(df_silence['Genre'] == 'jazz') &
                                (df_silence['Stem'] == 'drums')]
print("Jazz drums silence >=5:", len(jazz_drums_silence))

print("\n--- Q8 ---")
print("Jazz drums middle only:",
      len(jazz_drums_silence[jazz_drums_silence['Silence_Location'] == 'Middle']))

print("\n--- Q9 ---")
print("Jazz drums silence >=10:",
      len(jazz_drums_silence[jazz_drums_silence['Max_Silence_Sec'] >= 10]))

# ----------------------------------------
# Q10-Q12 MIX SAMPLE
# ----------------------------------------
# The mix sample is the alphabetically first entry of the rock folder.
rock_dir = os.path.join(DATA_ROOT, 'rock')
rock_songs = sorted(os.listdir(rock_dir))
first_song = rock_songs[0]


                                               


--- Q4 ---
Total files silence >=5: 680

--- Q5 ---
Vocals silence >=5: 304

--- Q6 ---
Average silence vocals: 12.590789473684211

--- Q7 ---
Jazz drums silence >=5: 24

--- Q8 ---
Jazz drums middle only: 0

--- Q9 ---
Jazz drums silence >=10: 7




In [9]:
# Load the first DURATION seconds of every stem of `first_song` (rock) at SR.
# `stems_audio` is only filled once all stems have loaded, so a failure never
# leaves a partial list behind for the mixing cell to combine.
stems_audio = []
try:
    loaded_stems = []
    for stem_file in STEMS:
        path = os.path.join(DATA_ROOT, 'rock', first_song, stem_file)
        y, _ = librosa.load(path, sr=SR, duration=DURATION)
        loaded_stems.append(y)
    stems_audio = loaded_stems

    print("Audio loaded successfully.")
except NameError as e:
    # A name such as DATA_ROOT, STEMS or first_song is missing: an earlier cell was not run.
    print(f"ERROR: {e}. Please run the configuration and song-selection cells first.")
except Exception as e:
    print(f"ERROR: {e}")

Audio loaded successfully.


In [10]:
# ------------------- write your code here -------------------------------
# Stack the stems into a numpy array (Shape: 4 x Samples)
stems_stack = np.vstack(stems_audio)

# Mix the stems by summing them element-wise
mix_raw = np.sum(stems_stack, axis=0)

# Calculate RMS Amplitude MANUALLY: square root of the mean squared sample
rms_val = np.sqrt(np.mean(mix_raw**2))

# Peak Normalization: scale so the largest absolute sample becomes 1.0
max_val = np.max(np.abs(mix_raw))

if max_val > 0:
    mix_norm = mix_raw / max_val
else:
    # An all-zero mix cannot be scaled; keep it unchanged.
    mix_norm = mix_raw

# VALIDATION
# A silent mix stays at peak 0, so only a non-silent mix is expected to peak at 1.0.
expected_peak = 1.0 if max_val > 0 else 0.0
assert np.isclose(np.max(np.abs(mix_norm)), expected_peak), "Normalization failed."
#------------------------------------------------------------------------
print("\n--- Q10 ---")
print("Mix length:", len(mix_raw))

print("\n--- Q11 ---")
print("RMS:", round(rms_val,2))

print("\n--- Q12 ---")
print("Max peak before norm:", max_val)


--- Q10 ---
Mix length: 110250

--- Q11 ---
RMS: 0.2

--- Q12 ---
Max peak before norm: 0.96006984
