# 🎧 Audio Feature Extraction for Content-Based Music Recommendation

This notebook implements the feature extraction process for our **content-based music recommender system**.
We will analyze the **audio characteristics** of tracks from the **MP3-Example** folder, which contains **30-second previews** across **15 music genres**.

## 🛠️ Objectives

- Extract key audio features from the MP3 files, including:
  - 🎼 **MFCCs (Mel-Frequency Cepstral Coefficients)** – capturing timbral characteristics.
  - 🎚️ **Chroma Features** – representing harmonic and pitch information.
  - 🌊 **Spectral Contrast** – measuring the level difference between spectral peaks and valleys in each frequency band.
  - 🥁 **Tempo** – detecting the song's speed (beats per minute).

- Save the extracted features into a **CSV file** for further analysis and modeling.

## 📂 Dataset Overview

- **Location:** `ContentBased_data/MP3-Example/`
- **Structure:** One folder per genre, each containing 100 tracks.
- **Genres:** Blues, Country, Electronic, Folk, Jazz, Latin, Metal, New Age, Pop, Punk, Rap, Reggae, RnB, Rock, World.

## ⚙️ Steps

1. Traverse each genre folder and process all audio files.
2. Extract relevant features using **`librosa`**.
3. Store the resulting dataset as `extracted_audio_features.csv`.

---

Let's get started! 🚀🎶



In [None]:
import os

# Root of the MP3-Example dataset, given relative to the current working directory.
# Each immediate sub-folder is treated as one genre by the code below.
AUDIO_DIR = "../ContentBased_data/MP3-Example"

def list_audio_files(base_dir):
    """List the MP3 files of every genre folder under ``base_dir``.

    Each immediate sub-directory of ``base_dir`` is treated as one genre;
    entries that are not directories are ignored.

    ``os.listdir`` returns entries in arbitrary order, so both the genres and
    the file names are sorted here. That keeps positional picks such as
    ``files[0]`` or ``files[:3]`` reproducible across runs and machines.

    Args:
        base_dir: Path of the directory that holds the genre folders.

    Returns:
        A dict mapping each genre folder name to the sorted list of file
        names (not full paths) in that folder that end with ``.mp3``. A genre
        folder without any matching file maps to an empty list.

    Raises:
        OSError: If ``base_dir`` cannot be listed (for example, it does not
            exist).
    """
    genre_files = {}

    # Sorted so the dict's insertion order is deterministic as well.
    for genre in sorted(os.listdir(base_dir)):
        genre_path = os.path.join(base_dir, genre)

        # Stray files next to the genre folders are not genres.
        if not os.path.isdir(genre_path):
            continue

        genre_files[genre] = sorted(
            name for name in os.listdir(genre_path) if name.endswith(".mp3")
        )

    return genre_files

# Index the dataset: genre name -> list of MP3 file names
audio_files = list_audio_files(AUDIO_DIR)

# Per-genre summary: the track count, then the first three file names as a sample
for genre, files in audio_files.items():
    print(f"{genre}: {len(files)} files", f"Sample: {files[:3]}", sep="\n")

In [None]:
from mutagen.mp3 import MP3
import os

def extract_basic_features(file_path):
    """Read the sample rate and duration of an MP3 file from its metadata.

    Args:
        file_path: Path of the MP3 file to inspect.

    Returns:
        A dict with the keys ``"sample_rate"`` and ``"duration_sec"`` (the
        reported length rounded to two decimals). If anything goes wrong
        while reading the file, the error is printed and an empty dict is
        returned instead, so callers can test the result for truthiness.
    """
    try:
        info = MP3(file_path).info
        length_sec = round(info.length, 2)
        rate = info.sample_rate
    except Exception as err:
        # Best-effort: report the problem and let the caller move on.
        print(f"Error processing {file_path}: {err}")
        return {}

    return {"sample_rate": rate, "duration_sec": length_sec}

# Smoke test: take the first listed track of three different genres
sample_files = [
    os.path.join(AUDIO_DIR, genre_name, audio_files[genre_name][0])
    for genre_name in ("Rock", "Jazz", "Blues")
]

# Print the basic metadata of each sample, or a failure notice when
# extract_basic_features returned an empty dict
for file in sample_files:
    features = extract_basic_features(file)
    if not features:
        print(f"Failed to extract features from: {file}")
        continue

    print(f"{file}")
    print(f"Sample Rate: {features.get('sample_rate', 'N/A')} Hz")
    print(f"Duration: {features.get('duration_sec', 'N/A')} sec")

In [None]:
import os
import librosa
import soundfile as sf

# Paths
# Source dataset root (same value as the AUDIO_DIR assigned earlier in this file).
AUDIO_DIR = "../ContentBased_data/MP3-Example"
# Destination root for the resampled copies; used as the target_dir argument
# in the (currently commented-out) downsample_all_audio call below.
DOWNSAMPLED_DIR = "../ContentBased_data/MP3-Example-Downsampled"
# Target sample rate in Hz, used as the target_sr argument in that same call.
TARGET_SR = 22050

def downsample_all_audio(base_dir, target_dir, target_sr=22050):
    """Resample every ``.mp3`` under ``base_dir`` and save it under ``target_dir``.

    Each immediate sub-directory of ``base_dir`` is treated as a genre folder.
    A folder of the same name is created in ``target_dir`` and every resampled
    track is written there under its original file name. Tracks are loaded as
    mono with ``sr=None`` and then resampled to ``target_sr``.

    Progress is printed per genre and per file. A file that raises during
    loading, resampling or writing is reported and skipped, so one bad track
    does not stop the batch.

    Args:
        base_dir: Directory that holds the genre folders to read from.
        target_dir: Directory to write to; created if it does not exist.
        target_sr: Sample rate, in Hz, of the written files.

    Returns:
        None.
    """
    # Make sure the output root exists before any genre folder is created
    os.makedirs(target_dir, exist_ok=True)

    for genre in os.listdir(base_dir):
        src_genre_dir = os.path.join(base_dir, genre)

        # Only directories count as genres; skip anything else
        if not os.path.isdir(src_genre_dir):
            continue

        # Mirror the genre folder in the output tree
        dst_genre_dir = os.path.join(target_dir, genre)
        os.makedirs(dst_genre_dir, exist_ok=True)

        print(f"Processing genre: {genre}")

        for file_name in os.listdir(src_genre_dir):
            if not file_name.endswith(".mp3"):
                continue

            file_path = os.path.join(src_genre_dir, file_name)
            out_path = os.path.join(dst_genre_dir, file_name)

            try:
                samples, native_sr = librosa.load(file_path, sr=None, mono=True)
                resampled = librosa.resample(
                    samples, orig_sr=native_sr, target_sr=target_sr
                )

                # Write the resampled signal next to its genre siblings
                sf.write(out_path, resampled, target_sr)
                print(f"Downsampled: {file_name} -> {target_sr} Hz")

            except Exception as err:
                # Best-effort batch: report the failure and continue
                print(f"Failed to process {file_path}: {err}")

# Run the batch downsampling
# downsample_all_audio(AUDIO_DIR, DOWNSAMPLED_DIR, TARGET_SR) # Done