In [None]:
# IMPORTANT: Before proceeding please update the HEADERS value in config.py to include your e-mail address

import os
import sys

# Put the parent of the current working directory at the front of sys.path
# (unless it is already listed) so the `src.*` imports below can be resolved.
# NOTE: assumes the notebook is started from a sub-folder of the project root.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)

from src.extract_training_data import extract_and_save_raw_audio_features
from src.train_model import train_model
from src.output_trained_model_data import output_model_data
from src.predict_with_model import search_for_album_you_like

In [None]:
# === STEP 1: Data Extraction ===
# This function reads your 'my_liked_and_disliked_tracks.csv' from the raw data folder and gathers the associated
# audio features from AcousticBrainz and MusicBrainz.
#
# A set of 1500 liked and disliked songs has already been included, as well as the raw data this function extracts.
# Both can be used without modification: if you want to train the neural network on that data, you can skip
# directly to the training step.
#
# If you want to use your own data, update the 'my_liked_and_disliked_tracks.csv' file in data/raw.
# The file needs to contain:
#   - a column 'mbid' holding the unique MusicBrainz ID of each song you like or dislike, and
#   - a column 'liked' holding 1 if you like the given song or 0 if you dislike it.
# Ideally, include about 50% liked and 50% disliked songs.
#
# For each track, this function fetches detailed audio features from AcousticBrainz and metadata from MusicBrainz,
# then saves an enriched CSV file including all features. This CSV is saved in data/raw and is the base dataset
# for training.
extract_and_save_raw_audio_features()

In [None]:
# === STEP 2: Model Training ===
# Hyperparameters for the neural network training run.
# Adjust them to trade off model performance against training speed.
TEST_SIZE = 0.1                        # Fraction of the data held out for testing the model (0.1 = 10%)
NUMBER_OF_EPOCHS = 1000                # How many full passes are made over the training dataset
BATCH_SIZE = 32                        # Samples per training update (mini-batch size)
LEARNING_RATE = 0.0001                 # Optimizer step size; smaller = slower but more precise
ADAMS_OPTIMIZER_WEIGHT_DECAY = 0.0001  # Weight-decay regularization strength, used to reduce overfitting
EVALUATE_EVERY_X_EPOCHS = 1            # Interval (in epochs) between evaluations / progress printouts; the early
                                       # stopping threshold set by TARGET_TEST_ACCURACY is checked at the same time.
TARGET_TEST_ACCURACY = 0.85            # Early stopping threshold (0.85 = 85%): once test accuracy reaches it, training
                                       # stops and the model is saved. It is not passed to train_model until the
                                       # 'target_test_accuracy' entry below is uncommented.

# train_model works on the extracted and transformed dataset together with the model defined in src/model.py.
# To experiment with different numbers of layers, activation functions, etc., edit the model structure in src/model.py.
# Steps performed by train_model:
#   - Loads and cleans the raw feature CSV
#   - Scales numeric features with MinMaxScaler and saves the scaler object to scaler.joblib
#   - Encodes categorical variables (like musical key and scale)
#   - Splits the data into train and test sets
#   - Initializes the neural network architecture (src.model.Net in src/model.py)
#   - Trains the model using binary cross-entropy loss and the Adam optimizer
#   - Periodically evaluates on the train and test sets and prints the average loss.
#     The average loss indicates how far the predictions are from the target liked/disliked values (1 or 0),
#     averaged over the epoch.
#   - Stops early if the test accuracy threshold is hit (see 'target_test_accuracy' below)
#   - Saves the trained model to song_pref_model_weights.pth in the output/models folder
training_options = {
    "test_size": TEST_SIZE,
    "num_epochs": NUMBER_OF_EPOCHS,
    "batch_size": BATCH_SIZE,
    "learning_rate": LEARNING_RATE,
    "adams_optimizer_weight_decay": ADAMS_OPTIMIZER_WEIGHT_DECAY,
    "evaluate_every_x_epochs": EVALUATE_EVERY_X_EPOCHS,
    # "target_test_accuracy": TARGET_TEST_ACCURACY,  # <-- important to enable early stopping once an ideal stopping point has been identified
}
train_model(**training_options)

In [None]:
# === STEP 3: Output Model Data ===
# This function generates output based on the trained model and saves it in the data/output/weights folder.
# It produces CSV files that give information on bias, grouped importance of features, the top 10 features, and
# the weights per layer of the neural network.
# These can be used to glean information about what the neural network learned from the dataset, how it is making
# recommendations, and why it has overfit or underfit the dataset.
output_model_data()

In [None]:
# === STEP 4: Album Recommendation Search ===
# Thresholds that steer the album recommendation logic:
LIKED_SONG_THRESHOLD = 0.5            # Minimum predicted probability for a song to count as "liked"
ALBUM_RECOMMENDATION_THRESHOLD = 0.5  # Minimum fraction of "liked" tracks on an album for it to be recommended (0.5 = 50%)
ALBUM_TRACK_LENGTH_THRESHOLD = 5      # Minimum number of tracks an album must have to qualify for recommendation

# search_for_album_you_like searches MusicBrainz for albums matching the criteria above:
#   - It scores each song with the trained model's weights to make a prediction.
#   - It checks whether the album has enough total songs and enough liked songs to be recommended.
#   - It outputs or prints the albums you are likely to enjoy based on the learned preferences.
# Please note that this is slow: MusicBrainz and AcousticBrainz are free APIs which allow one query per second,
# and a high number of MusicBrainz albums do not have the necessary low-level features that the neural network
# has been trained on.
#
# IMPORTANT: Before proceeding please update the HEADERS value in config.py to include your e-mail address
#
album_search_thresholds = (
    LIKED_SONG_THRESHOLD,
    ALBUM_RECOMMENDATION_THRESHOLD,
    ALBUM_TRACK_LENGTH_THRESHOLD,
)
search_for_album_you_like(*album_search_thresholds)