In [1]:
# IMPORTANT: Before running any cell, please update the HEADERS value in config.py so that it includes your e-mail address.

import os
import sys

# Put the directory above the notebook's working directory at the front of the
# module search path so that the `src` package imported below can be resolved.
# The guard keeps the path free of duplicates when this cell is re-run.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
if sys.path.count(parent_dir) == 0:
    sys.path.insert(0, parent_dir)

from src.extract_training_data import extract_and_save_raw_audio_features
from src.train_model import train_model
from src.output_trained_model_data import output_model_data
from src.predict_with_model import search_for_album_you_like

In [None]:
# === STEP 1: Data Extraction ===
# Builds the base dataset used for training.
# - Input: your 'my_liked_and_disliked_tracks.csv' in the raw data folder.
# - For each track, detailed audio features are fetched from AcousticBrainz and
#   metadata from MusicBrainz.
# - Output: an enriched CSV file that includes all features.
# NOTE(review): this step presumably needs network access and the HEADERS value
# from config.py — confirm in src/extract_training_data.py.
extract_and_save_raw_audio_features()

In [None]:
# === STEP 2: Model Training ===
# Hyperparameters for the neural-network training run. Tune them to trade off
# model quality against training time.
TEST_SIZE = 0.1                      # fraction of the data held out for testing (10%)
NUMBER_OF_EPOCHS = 2000              # full passes over the training dataset
BATCH_SIZE = 32                      # samples per mini-batch update
LEARNING_RATE = 1e-5                 # optimizer step size; smaller = slower but more precise
ADAMS_OPTIMIZER_WEIGHT_DECAY = 1e-4  # L2 regularization strength, to limit overfitting
EVALUATE_EVERY_X_EPOCHS = 1          # evaluate and print progress every this many epochs
TARGET_TEST_ACCURACY = 0.85          # stop early once test accuracy reaches this value

# Run training on the extracted dataset. As described for this pipeline, train_model:
#   1. loads and cleans the raw feature CSV
#   2. scales numeric features with MinMaxScaler
#   3. encodes categorical variables (such as musical key and scale)
#   4. splits the data into train and test sets
#   5. builds the neural network (src.model.Net)
#   6. trains with binary cross-entropy loss and the Adam optimizer
#   7. periodically evaluates on the train and test sets and prints progress
#   8. stops early when the test-accuracy threshold is reached
train_model(
    test_size=TEST_SIZE,
    num_epochs=NUMBER_OF_EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    adams_optimizer_weight_decay=ADAMS_OPTIMIZER_WEIGHT_DECAY,
    evaluate_every_x_epochs=EVALUATE_EVERY_X_EPOCHS,
    target_test_accuracy=TARGET_TEST_ACCURACY,  # required for early stopping to take effect
)

In [None]:
# === STEP 3: Output Model Data ===
# Generates output based on the trained model. The items below are possibilities
# only; they cannot be verified from this notebook (see
# src/output_trained_model_data.py for what is actually produced):
# - evaluation reports such as final accuracy and loss curves
# - model summary statistics or metadata saved for future analysis
# - information needed for downstream tasks such as recommendation
output_model_data()

In [None]:
# === STEP 4: Album Recommendation Search ===
# Thresholds that steer the album recommendation logic.
LIKED_SONG_THRESHOLD = 0.9            # predicted probability at which a song counts as "liked"
ALBUM_RECOMMENDATION_THRESHOLD = 0.8  # average "liked" probability an album's tracks must reach
ALBUM_TRACK_LENGTH_THRESHOLD = 5      # minimum number of tracks an album needs to qualify

# Look for albums (in your dataset or external sources) that match the criteria:
#   1. every song is scored with the trained model's predictions
#   2. the scores are aggregated per album
#   3. albums are filtered by track count and average liked-prediction score
#   4. albums you are likely to enjoy are output or printed
# The thresholds are passed positionally, in the order shown.
search_for_album_you_like(
    LIKED_SONG_THRESHOLD,
    ALBUM_RECOMMENDATION_THRESHOLD,
    ALBUM_TRACK_LENGTH_THRESHOLD,
)