Registro de Modelos en DagsHub - Audio Classification

Este notebook entrena y registra modelos de clasificación de audio en DagsHub, incluyendo:
- Logistic Regression
- Linear Support Vector Classifier
- Random Forest
- XGBoost

## Libraries

In [1]:
import tensorflow as tf
from tensorflow.keras import layers, models
import os
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from tensorflow.keras.metrics import Metric
from tensorflow.keras.models import load_model
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_curve, classification_report, confusion_matrix, f1_score, accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split, learning_curve
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.datasets import make_classification
import mlflow
import mlflow.sklearn
import dagshub

## Paths

In [2]:
# base data directory
base_dir = '../data/'

# raw data directory
raw_dir = os.path.join(base_dir, 'raw/')

# interim data directory
interim_dir = os.path.join(base_dir, 'interim/')

# processed data directory
proc_dir = os.path.join(base_dir, 'processed/')

# splits 
splits = ['bal_train/', 'eval/', 'unbal_train/']

# segments files
segments = ['balanced_train_segments.csv', 'unbalanced_train_segments.csv', 'eval_segments.csv']

# labels file path
labels_path = os.path.join(raw_dir, 'class_labels_indices.csv')

# ontology file path
ont_path = os.path.join(raw_dir, 'ontology.json')

# [raw/interim/processed][split]
data_path = {'raw': {}, 'interim': {}, 'processed': {}}  
data_dir = {'raw': {}, 'interim': {}, 'processed': {}}  

for i, seg in enumerate(splits):
    seg_rm = seg.replace('/', '')
    
    raw = os.path.join(raw_dir, splits[i])
    data_dir['raw'][seg_rm] = raw
    raw = os.path.join(raw, segments[i])
    data_path['raw'][seg_rm] = raw
    
    interim = os.path.join(interim_dir, splits[i])
    data_dir['interim'][seg_rm] = interim
    interim = os.path.join(interim, segments[i])
    data_path['interim'][seg_rm] = interim
    
    processed = os.path.join(proc_dir, splits[i])
    data_dir['processed'][seg_rm] = processed
    processed = os.path.join(processed, segments[i])
    data_path['processed'][seg_rm] = processed

## Functions

In [3]:
@tf.function(reduce_retracing=True)
def parse_music_example(example_proto, music_ids, id_labels_dict, seq_length=10):
    """Parse TFRecord example."""
    # Convert dict to constant tensor
    id_labels_tensor = tf.constant(list(id_labels_dict.values()))

    # Define features
    context_features = {
        "video_id": tf.io.FixedLenFeature([], tf.string),
        "labels": tf.io.VarLenFeature(tf.int64)
    }
    sequence_features = {
        "audio_embedding": tf.io.FixedLenSequenceFeature([], tf.string)
    }

    # Parse the example
    context, sequences = tf.io.parse_single_sequence_example(
        example_proto, 
        context_features=context_features, 
        sequence_features=sequence_features
    )

    # Process audio embeddings
    audio_embeddings = tf.io.decode_raw(sequences['audio_embedding'], tf.uint8)
    audio_embeddings = tf.reshape(audio_embeddings, [-1, 128])
    audio_embeddings = (tf.cast(audio_embeddings, tf.float32) - 127.5) / 127.5
    audio_embeddings = audio_embeddings[:seq_length]
    padding = [[0, seq_length - tf.shape(audio_embeddings)[0]], [0, 0]]
    audio_embeddings = tf.pad(audio_embeddings, padding)
    audio_embeddings.set_shape([seq_length, 128])

    # Process labels
    labels = tf.sparse.to_dense(context['labels'])
    id_labels = tf.gather(id_labels_tensor, labels)
    
    # Check if any label matches music_ids
    is_music = tf.reduce_any(tf.equal(tf.expand_dims(id_labels, -1), music_ids))
    
    return audio_embeddings, tf.cast(is_music, tf.float32)

def create_dataset(tfrecord_dir, music_ids, batch_size=32, seq_length=10, is_training=True):
    """Create TF dataset pipeline."""
    tfrecord_files = tf.io.gfile.glob(os.path.join(tfrecord_dir, "*.tfrecord"))
    if not tfrecord_files:
        raise ValueError(f"No TFRecord files found in {tfrecord_dir}")
        
    # Convert music_ids to tensor
    music_ids_tensor = tf.constant([str(id) for id in music_ids], dtype=tf.string)
    
    # Create dataset
    dataset = tf.data.TFRecordDataset(tfrecord_files, num_parallel_reads=tf.data.AUTOTUNE)
    
    # Parse examples
    parse_fn = lambda x: parse_music_example(x, music_ids_tensor, id_labels_dict, seq_length)
    dataset = dataset.map(parse_fn, num_parallel_calls=tf.data.AUTOTUNE)
    
    # Shuffle if training
    if is_training:
        dataset = dataset.shuffle(buffer_size=1000)
    
    # Batch and prefetch
    dataset = dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    
    return dataset

## Dataframes

In [4]:
# Read balanced train segments
with open(data_path['raw']['bal_train'], "r", encoding="utf-8") as f:
    lines = f.readlines()

fixed_rows = []
for line in lines[3:]:  
    parts = line.strip().split(",")  
    if len(parts) >= 4:  
        fixed_rows.append([parts[0], parts[1], parts[2], ",".join(parts[3:])]) 

df_segments = pd.DataFrame(fixed_rows, columns=["YTID", "start_seconds", "end_seconds", "positive_labels"])

# Load ontology
with open(ont_path, "r", encoding="utf-8") as f:
    data = json.load(f)

df_ontology = pd.DataFrame(data)

# Create music labels
keywords_column_name = ["music", "musical", "song", "instrument", "singing"]
keywords_column_description = ["music", "musical", "song", "singing"]

pattern_column_name = "|".join(keywords_column_name)
pattern_column_description = "|".join(keywords_column_description)

name_contains = df_ontology["name"].str.lower().str.contains(pattern_column_name)
description_contains = df_ontology["description"].str.lower().str.contains(pattern_column_description, na=False)

df_ontology["is_music"] = (name_contains | description_contains).astype(int)

# Load class labels
df_class_labels_indices = pd.read_csv(labels_path)

# Merge ontology and labels
df_ontology_labels = pd.merge(df_class_labels_indices, df_ontology, left_on='mid', right_on='id', how='left')
df_ontology_labels = df_ontology_labels.drop(columns=['mid', 'display_name'])
id_labels_dict = df_ontology_labels.set_index('index')['id'].to_dict()
df_ontology_labels.set_index('index', inplace=True)

## Train, val data

In [5]:
# Get music IDs
music_ids = set(df_ontology_labels[df_ontology_labels["is_music"] == 1]["id"].astype(str))

# Create full dataset
full_dataset = create_dataset(
    tfrecord_dir=data_dir['raw']['bal_train'],
    music_ids=music_ids,
    batch_size=32
)

# Better way to get dataset size (without loading all data)
def get_dataset_size(dataset):
    return sum(1 for _ in dataset)

# Split dataset
dataset_size = get_dataset_size(full_dataset)
val_size = int(0.2 * dataset_size)
train_ds = full_dataset.skip(val_size)
val_ds = full_dataset.take(val_size)

# Convert TF Dataset to numpy arrays
def dataset_to_numpy(dataset):
    X, y = [], []
    for audio_emb, label in dataset.unbatch():
        X.append(audio_emb.numpy().flatten())  # Flatten to [seq_length * 128]
        y.append(label.numpy())
    return np.array(X), np.array(y)

X_train, y_train = dataset_to_numpy(train_ds)
X_val, y_val = dataset_to_numpy(val_ds)

# Normalization
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Dimensionality reduction
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled)
X_val_pca = pca.transform(X_val_scaled)


## DagsHub init and functions

In [6]:
# Initialize DagsHub
dagshub.init(repo_owner='felytz', repo_name='waveled', mlflow=True)
mlflow.set_tracking_uri('https://dagshub.com/felytz/waveled.mlflow')

def train_and_register_model(model, model_name, params, X_train, y_train, X_test, y_test, use_pca=False):
    """Train model with GridSearch and log to MLflow."""
    with mlflow.start_run(run_name=model_name):
        # Log tags
        mlflow.set_tag("model_type", model_name)
        mlflow.set_tag("dataset", "AudioSet")
        mlflow.set_tag("features", "PCA" if use_pca else "Original")
        
        # Grid Search
        grid = GridSearchCV(
            model, 
            params, 
            cv=3, 
            scoring='f1_weighted', 
            n_jobs=-1, 
            verbose=1
        )
        grid.fit(X_train, y_train)
        
        # Get best model
        best_model = grid.best_estimator_
        
        # Evaluation
        y_pred = best_model.predict(X_test)
        f1 = f1_score(y_test, y_pred, average='weighted')
        accuracy = accuracy_score(y_test, y_pred)
        
        # Log metrics
        mlflow.log_metric("f1_score", f1)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_params(grid.best_params_)
        
        # Log artifacts
        mlflow.sklearn.log_model(best_model, model_name)
        mlflow.log_dict(
            classification_report(y_test, y_pred, output_dict=True), 
            "classification_report.json"
        )
        
        # Log preprocessing models
        if use_pca:
            mlflow.sklearn.log_model(pca, "pca_model")
        mlflow.sklearn.log_model(scaler, "scaler_model")
        
        print(f"{model_name} - F1: {f1:.4f}, Accuracy: {accuracy:.4f}")

## Models definition, training, and register

In [7]:
# Model configurations
models_config = [
    {
        "model": LogisticRegression(class_weight='balanced', random_state=10, max_iter=1000),
        "name": "LogisticRegression_PCA",
        "params": {
            'C': [0.01, 0.1, 1, 10], 
            'penalty': ['l1', 'l2'],
            'solver': ['liblinear']
        },
        "use_pca": True
    },
    {
        "model": LinearSVC(class_weight='balanced', random_state=10, dual=False, max_iter=10000),
        "name": "LinearSVC_PCA",
        "params": {
            'C': [0.1, 1, 10],
            'penalty': ['l2'],
            'loss': ['squared_hinge']
        },
        "use_pca": True
    },
    {
        "model": RandomForestClassifier(class_weight='balanced', random_state=10),
        "name": "RandomForest",
        "params": {
            'n_estimators': [50, 100],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5]
        },
        "use_pca": False
    },
    {
        "model": XGBClassifier(
            scale_pos_weight=len(y_train[y_train == 0]) / len(y_train[y_train == 1]),
            random_state=10,
            eval_metric='logloss'
        ),
        "name": "XGBoost",
        "params": {
            'n_estimators': [50, 100],
            'learning_rate': [0.01, 0.1],
            'max_depth': [3, 6],
            'subsample': [0.8, 1.0]
        },
        "use_pca": False
    }
]

# Train and register all models
for config in models_config:
    train_data = X_train_pca if config["use_pca"] else X_train_scaled
    val_data = X_val_pca if config["use_pca"] else X_val_scaled
    
    train_and_register_model(
        model=config["model"],
        model_name=config["name"],
        params=config["params"],
        X_train=train_data,
        y_train=y_train,
        X_test=val_data,
        y_test=y_val,
        use_pca=config["use_pca"]
    )

Fitting 3 folds for each of 8 candidates, totalling 24 fits




LogisticRegression_PCA - F1: 0.8588, Accuracy: 0.8580
🏃 View run LogisticRegression_PCA at: https://dagshub.com/felytz/waveled.mlflow/#/experiments/0/runs/e0f0e4cad21b49c1b5624580b5cc9460
🧪 View experiment at: https://dagshub.com/felytz/waveled.mlflow/#/experiments/0
Fitting 3 folds for each of 3 candidates, totalling 9 fits




LinearSVC_PCA - F1: 0.8592, Accuracy: 0.8587
🏃 View run LinearSVC_PCA at: https://dagshub.com/felytz/waveled.mlflow/#/experiments/0/runs/34af28b1bf2c4d39ae95a5d25d17f706
🧪 View experiment at: https://dagshub.com/felytz/waveled.mlflow/#/experiments/0
Fitting 3 folds for each of 12 candidates, totalling 36 fits




RandomForest - F1: 0.8626, Accuracy: 0.8632
🏃 View run RandomForest at: https://dagshub.com/felytz/waveled.mlflow/#/experiments/0/runs/639a4d6c86144fbcab73884145ff9006
🧪 View experiment at: https://dagshub.com/felytz/waveled.mlflow/#/experiments/0
Fitting 3 folds for each of 16 candidates, totalling 48 fits




XGBoost - F1: 0.8737, Accuracy: 0.8736
🏃 View run XGBoost at: https://dagshub.com/felytz/waveled.mlflow/#/experiments/0/runs/95a82933d456494984644985dfa5d82b
🧪 View experiment at: https://dagshub.com/felytz/waveled.mlflow/#/experiments/0
