<a href="https://colab.research.google.com/github/23f2002498/genai-project-milestones/blob/main/milestone_1_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Milestone 1

This milestone focuses on understanding the dataset and establishing a baseline performance through **exploratory data analysis (EDA)** and simple **heuristic-based methods** using `librosa`.

---

## Suggested Readings
- [Hugging Face Audio Course](https://huggingface.co/learn/audio-course/en/chapter0/introduction)
- [Librosa Documentation](https://librosa.org/doc/main/core.html#audio-loading)

---

## Instructions
Use this notebook to answer **all Milestone 1 questions**.

---

## Resources
- Notebook Link:  
  https://colab.research.google.com/drive/1m6UczhxQIke_raWSqukSWuiKbIVt7MMb?usp=sharing  

- Competition Link:  
  https://www.kaggle.com/competitions/jan-2026-dl-gen-ai-project/


In [None]:
import os
import glob
import numpy as np
import pandas as pd
from tqdm import tqdm
import librosa
import librosa.display
import matplotlib.pyplot as plt
import random
import torch

import warnings
warnings.filterwarnings("ignore")

In [None]:
#----------------------------- DON'T CHANGE THIS --------------------------
# Fixed constants and RNG seeding shared by the cells below.

# Seed applied further down to `random`, NumPy and torch (CPU + current CUDA device).
DATA_SEED = 67
# Not used in the cells visible here; presumably reserved for model training — TODO confirm.
TRAINING_SEED = 1234
# Sample rate; used as the default `sr` of find_long_silences.
SR = 22050
# Seconds; used as the default silence threshold of find_long_silences.
DURATION = 5.0
# Presumably the STFT window size, hop size and number of mel bands for later
# feature extraction — not referenced in the cells visible here, TODO confirm.
N_FFT = 2048
HOP_LENGTH = 512
N_MELS = 128
# Used as the default `top_db` of find_long_silences.
TOP_DB=20
# Not used in the cells visible here; presumably a target signal-to-noise ratio in dB — TODO confirm.
TARGET_SNR_DB = 10

# Seed every random number generator imported by this notebook with DATA_SEED.
random.seed(DATA_SEED)
np.random.seed(DATA_SEED)
torch.manual_seed(DATA_SEED)
torch.cuda.manual_seed(DATA_SEED)

In [None]:
# CONFIGURATION
# NOTE(review): the template left DATA_ROOT and SONG_INDEX without a value, which is
# a SyntaxError. Every name now has a valid value; the ones marked TODO are
# placeholders that still have to be filled in for the assignment.

# TODO: Enter dataset path here, or export DATA_ROOT before starting the kernel.
DATA_ROOT = os.environ.get("DATA_ROOT", "")

# Keys used to look stems up in the dataset dictionaries: {genre: {stem: [paths...]}}.
STEM_KEYS = ['drums', 'vocals', 'bass', 'other']

# Stem file name -> stem key. Iterating STEMS yields the file names, which is what
# build_dataset's `s.replace('.wav', '')` expects.
# Assumes every stem is stored as "<key>.wav" — TODO confirm against the dataset.
STEMS = {f"{key}.wav": key for key in STEM_KEYS}

# All genres available, in alphabetical order.
# Assumes every immediate sub-folder of DATA_ROOT is one genre — TODO confirm.
if os.path.isdir(DATA_ROOT):
    GENRES = sorted(
        entry for entry in os.listdir(DATA_ROOT)
        if os.path.isdir(os.path.join(DATA_ROOT, entry)) and not entry.startswith(".")
    )
else:
    GENRES = []

GENRE_TO_TEST = 'rock'

# TODO: Enter index as per Q10. 0 is only a placeholder so the notebook parses.
SONG_INDEX = 0

In [None]:
def build_dataset(root_dir, val_split=0.17, seed=42):
    """Scan ``root_dir`` and split the valid songs of every genre into train/validation.

    Assumed layout (inferred from the template hints — TODO confirm against the
    dataset): ``root_dir/<genre>/<song folder>/<stem file>``, with one stem file
    per entry of the module-level ``STEMS``.

    A song is kept only if it has every stem file (completeness) and none of
    them is smaller than 4 KiB (corruption). The split is done per genre, so
    each genre contributes roughly ``val_split`` of its songs to validation.

    Args:
        root_dir: Dataset root folder containing one sub-folder per genre.
        val_split: Fraction of each genre's valid songs sent to validation.
        seed: Seed of the private RNG used for shuffling; the global ``random``
            state is not touched.

    Returns:
        ``(train_dataset, val_dataset)``, both shaped
        ``{genre: {stem: [paths...]}}``. Within a genre, index ``i`` of every
        stem list refers to the same song.

    Raises:
        ValueError: If ``root_dir`` is empty or ``None``.
    """
    if not root_dir:
        raise ValueError("root_dir is empty - set DATA_ROOT to the dataset path first.")

    # Files below this size are treated as corrupt ("less than 4kb").
    # NOTE(review): 4 KiB (4096 bytes) is used; confirm if 4000 bytes was meant.
    min_file_bytes = 4 * 1024

    def stem_file_name(entry):
        # STEMS may hold file names ('drums.wav') or bare keys ('drums'),
        # optionally as a dict mapping key -> file name; accept all of these.
        if entry.endswith('.wav'):
            return entry
        mapped = STEMS.get(entry) if isinstance(STEMS, dict) else None
        if isinstance(mapped, str) and mapped:
            return mapped
        return entry + '.wav'

    # Stem key (file name without '.wav') -> file name on disk.
    stem_files = {entry.replace('.wav', ''): stem_file_name(entry) for entry in STEMS}

    # Initialize empty dictionaries
    train_dataset = {g: {stem: [] for stem in stem_files} for g in GENRES}
    val_dataset = {g: {stem: [] for stem in stem_files} for g in GENRES}

    rng = random.Random(seed)

    # Helper function to populate dict
    def add_to_dict(target_dict, genre, song_list):
        for stem_paths in song_list:
            for stem, path in stem_paths.items():
                target_dict[genre][stem].append(path)

    # Iterate through Genres
    for genre in GENRES:
        genre_dir = os.path.join(root_dir, genre)
        # Check: if genre folder exists
        if not os.path.isdir(genre_dir):
            continue

        valid_songs = []
        # os.listdir order is arbitrary; sort so the seeded shuffle is reproducible.
        for song in sorted(os.listdir(genre_dir)):
            song_dir = os.path.join(genre_dir, song)
            if not os.path.isdir(song_dir):
                continue
            stem_paths = {
                stem: os.path.join(song_dir, file_name)
                for stem, file_name in stem_files.items()
            }
            # CHECK : Completeness (Does it have all stems?)
            if not all(os.path.isfile(path) for path in stem_paths.values()):
                continue
            # CHECK : Corruption (Is any file too small? (less than 4kb))
            if any(os.path.getsize(path) < min_file_bytes for path in stem_paths.values()):
                continue
            valid_songs.append(stem_paths)

        # Stratified Shuffle Split: shuffle and split inside each genre.
        # NOTE(review): validation takes the first int(n * val_split) songs after
        # the shuffle — confirm this matches the expected split convention.
        rng.shuffle(valid_songs)
        n_val = int(len(valid_songs) * val_split)
        add_to_dict(val_dataset, genre, valid_songs[:n_val])
        add_to_dict(train_dataset, genre, valid_songs[n_val:])

    return train_dataset, val_dataset

# Build the train/validation dictionaries once; the silence scan below reads `tr`.
# NOTE(review): this call relies on build_dataset's defaults (val_split=0.17, seed=42),
# not on DATA_SEED — confirm that is intended.
tr, val = build_dataset(DATA_ROOT)

In [None]:
def _silence_gaps(y, sr, top_db):
    """Return ``[(location, seconds), ...]`` for the silent stretches of ``y``.

    ``location`` is one of "Full", "Start", "End" or "Middle".
    """
    total_sec = len(y) / sr

    # CASE A: Fully silent
    # librosa.effects.split measures loudness relative to the signal's own peak
    # by default, so an all-zero signal is not reported as silent and has to be
    # checked explicitly.
    if y.size == 0 or not np.any(y):
        return [("Full", total_sec)]

    # Find Non-Silent Intervals: array of [start_sample, end_sample] rows.
    intervals = librosa.effects.split(y, top_db=top_db)
    if len(intervals) == 0:
        return [("Full", total_sec)]

    gaps = [
        # CASE B: START silence (before the first non-silent interval)
        ("Start", intervals[0][0] / sr),
        # CASE C: END silence (after the last non-silent interval)
        ("End", (len(y) - intervals[-1][1]) / sr),
    ]
    # CASE D: MIDDLE silence (between consecutive non-silent intervals)
    for (_, prev_end), (next_start, _) in zip(intervals[:-1], intervals[1:]):
        gaps.append(("Middle", (next_start - prev_end) / sr))
    return gaps


def find_long_silences(dataset_dict, sr=SR, threshold_sec=DURATION, top_db=TOP_DB):
    """
    Input:
        dataset_dict: The dictionary structure {genre: {stem: [paths...]}}
        sr: Sample rate the audio is resampled to when loading
        threshold_sec: Minimum continuous silence (seconds) for a file to be reported
        top_db: Threshold (dB below the file's peak) under which audio counts as silent
    Output:
        df: Pandas DataFrame containing details of all files whose longest
            continuous silence is >= threshold_sec. Columns: Genre, Stem,
            Duration, Max_Silence_Sec, Silence_Location, File_Path. The columns
            are present even when no file qualifies.
    """
    columns = ["Genre", "Stem", "Duration", "Max_Silence_Sec", "Silence_Location", "File_Path"]
    records = []

    # ---- COUNT TOTAL FILES ----
    total_files = sum(
        len(paths) for stems in dataset_dict.values() for paths in stems.values()
    )

    with tqdm(total=total_files, desc="Scanning for silence") as progress:
        for genre, stems in dataset_dict.items():
            for stem_name, paths in stems.items():
                for file_path in paths:
                    progress.update(1)

                    # Load Audio (use the returned rate so sr=None also works)
                    y, file_sr = librosa.load(file_path, sr=sr)
                    total_duration = len(y) / file_sr

                    gaps = _silence_gaps(y, file_sr, top_db)
                    max_silence = max(seconds for _, seconds in gaps)

                    # Store result
                    if max_silence >= threshold_sec:
                        # Report each location that holds a long enough gap, once.
                        # NOTE(review): the label spellings are a choice made here —
                        # confirm against the expected answers.
                        silence_type = []
                        for location, seconds in gaps:
                            if seconds >= threshold_sec and location not in silence_type:
                                silence_type.append(location)
                        records.append({
                            "Genre": genre,
                            "Stem": stem_name,
                            "Duration": round(float(total_duration), 2),
                            "Max_Silence_Sec": round(float(max_silence), 2),
                            "Silence_Location": ", ".join(silence_type),
                            "File_Path": file_path
                        })

    # Fixed column list keeps the frame usable (e.g. for pivoting) when empty.
    df = pd.DataFrame(records, columns=columns)
    return df


# --- EXECUTION ---
# Pass your 'tr' (training) dictionary here.
# Ensure 'tr' is defined from your previous build_dataset code.
df_silence = find_long_silences(tr, threshold_sec=DURATION, top_db=TOP_DB)

# --- RESULTS ANALYSIS ---
print(f"Files with continuous silence >= {DURATION}s: {len(df_silence)}")

# Pivot Table: Count by Genre vs Stem
# An empty result has nothing to pivot (and may not even have the columns),
# so fall back to an empty frame instead of raising.
if df_silence.empty:
    silence_pivot = pd.DataFrame()
else:
    silence_pivot = df_silence.pivot_table(
        index="Genre",
        columns="Stem",
        values="File_Path",
        aggfunc="count",
        fill_value=0,
    )

# Last expression of the cell, so the notebook renders it as a table.
silence_pivot


In [None]:
# Load the first DURATION seconds of every stem of one song
# (genre GENRE_TO_TEST, position SONG_INDEX in the training dictionary).
# stems_audio stays empty unless every stem loads, so a partial load is never
# mistaken for a complete one by later cells.
stems_audio = []
try:
    loaded_stems = []
    for key in STEM_KEYS:
        # tr has the structure {genre: {stem: [paths...]}}
        stem_path = tr[GENRE_TO_TEST][key][SONG_INDEX]
        # Load audio (Duration 5.0s for speed/consistency)
        audio, _ = librosa.load(stem_path, sr=SR, duration=DURATION)
        loaded_stems.append(audio)
except NameError:
    print("ERROR: 'tr' dictionary not found. Please run build_dataset() first.")
except IndexError:
    print(f"ERROR: Song index {SONG_INDEX} out of range for genre {GENRE_TO_TEST}.")
except KeyError as e:
    print(f"ERROR: genre or stem {e} not found in 'tr'.")
except Exception as e:
    print(f"ERROR: {e}")
else:
    # Only report success when the loop really loaded every stem.
    stems_audio = loaded_stems
    print("Audio loaded successfully.")

In [None]:
# Mix the loaded stems into one signal, measure its RMS and peak-normalise it.

# The loading cell only prints on failure, so fail here with a clear message
# instead of a cryptic stacking error.
if len(stems_audio) != len(STEM_KEYS):
    raise RuntimeError(
        f"Expected {len(STEM_KEYS)} loaded stems, found {len(stems_audio)}. "
        "Re-run the audio loading cell and fix any reported error first."
    )

# Stack them into a numpy array (Shape: 4 x Samples)
stems_stack = np.stack(stems_audio)

# Mix the stems by summing them element-wise
mix_raw = np.sum(stems_stack, axis=0)

# Calculate RMS Amplitude MANUALLY: square root of the mean of the squared samples
rms_val = np.sqrt(np.mean(np.square(mix_raw)))

#Peak Normalization
max_val = np.max(np.abs(mix_raw))

if max_val > 0:
    mix_norm = mix_raw / max_val
else:
    # A completely silent mix cannot be scaled; keep it unchanged.
    mix_norm = mix_raw

# VALIDATION
# The peak must be 1.0 after normalisation. A silent mix is exempt: it is left
# untouched by the branch above, so its peak is legitimately 0.
assert max_val == 0 or np.isclose(np.max(np.abs(mix_norm)), 1.0), "Normalization failed."
