In [147]:
import pandas as pd
import numpy as np
import re
import os
import zipfile
from nltk.corpus import stopwords
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras_tuner import RandomSearch, HyperParameters, Objective
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, Layer
from tensorflow.keras.optimizers import Adam, RMSprop, SGD
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler, Callback
from transformers import BertTokenizer, TFBertModel, get_linear_schedule_with_warmup, WarmUp, AdamW, RobertaTokenizer, TFRobertaModel, RobertaModel, XLMRobertaTokenizer, XLMRobertaModel

In [6]:
# 解压 cleaned_lyrics.zip 文件
with zipfile.ZipFile('sampled.zip', 'r') as zip_ref:
    zip_ref.extractall('sampled')

# 获取所有歌词文件的路径
lyrics_files = {os.path.splitext(f)[0]: os.path.join('sampled', f) for f in os.listdir('sampled')}

# 读取 filtered_dataset.csv 文件
data = pd.read_csv('sampled_dataset.csv')

def read_lyrics(record_id):
    file_path = lyrics_files.get(str(record_id))
    if file_path and os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    return ''

# 读取歌词并添加到数据框中
data['lyrics'] = data['record_id'].apply(read_lyrics)


In [7]:
data

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,record_id,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,...,instrumentalness,liveness,valence,tempo,time_signature,track_genre,valence_bin,energy_bin,danceability_bin,lyrics
0,1458,2834,2834,20097,4b98LXC0QUWGBteJ5uwVQY,Doja Cat,Summer Music Festival Hits,Boss Bitch,0,134239,...,0.000000,0.2030,0.575,125.993,4,dance,1,2,2,mmm not tryna ah not tryna not tryna yeah not ...
1,1557,2967,2967,22816,58Z83tSbShyHxCwqTCE8M6,Goatwhore,Vengeful Ascension,"Under the Flesh, Into the Soul",21,273266,...,0.110000,0.1070,0.347,170.015,4,death-metal,1,2,0,world grave apathetic cold selfish prison cove...
2,425,616,616,3220,0AOmbw8AwDnwXhHC3OhdVB,Thousand Foot Krutch,The End Is Where We Begin,Courtesy Call,72,236898,...,0.000000,0.0822,0.445,164.079,4,alternative,1,1,1,hey comes danger club get started man not gonn...
3,989,2054,2054,14581,5Jaj6nLjHCizmcPddJLO3k,Blippi,"Blippi Tunes, Vol. 2: Machines (Music for Todd...",The Train Song,53,207428,...,0.000019,0.3250,0.662,139.895,4,children,2,1,2,choo choo comes train choo choo comes train ro...
4,383,562,562,3717,2oaK4JLVnmRGIO9ytBE1bt,Red Hot Chili Peppers,The Getaway,Dark Necessities,74,302000,...,0.019900,0.1100,0.197,91.959,4,alternative,0,2,2,comin light day got many moons deep play keep ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,658,1229,1229,8662,3hSy2AUoElgIk6fjhYnRH3,Elvin Bishop,Sure Feels Good: The Best Of Elvin Bishop,Fooled Around And Fell In Love,57,276933,...,0.080500,0.0953,0.610,113.463,3,blues,1,1,1,million girls love em leave em alone not care ...
596,5818,10682,10682,113479,5ELZpvTDGorz9BIE9zaBoZ,Tenth Avenue North,Followers,I Have This Hope,52,204800,...,0.000000,0.1890,0.110,108.009,4,world-music,0,1,1,walk great unknown questions come questions go...
597,265,426,426,2185,3mJV4kByjmgU3ubU7JPp9W,Marilyn Manson,Halloween 2022,You And Me And The Devil Makes 3,0,264266,...,0.834000,0.2030,0.399,128.021,4,alt-rock,1,2,1,like rolling stone hill hades want lie gonna l...
598,5031,9325,9325,94717,5xUpsNCW71S58c8TycsqNa,Ollie,Sunsets & Goodbyes,what if,39,173615,...,0.000627,0.0723,0.283,146.006,4,sad,0,0,2,yeah call one day everything yeah call next no...


In [9]:
# 使用 Tokenizer 处理文本
max_words = 5000
max_len = 100

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data['lyrics'])
sequences = tokenizer.texts_to_sequences(data['lyrics'])
X_lyrics = pad_sequences(sequences, maxlen=max_len)

# 准备标签
y_valence = to_categorical(data['valence_bin'].values)
y_energy = to_categorical(data['energy_bin'].values)
y_danceability = to_categorical(data['danceability_bin'].values)

# 拆分数据集
X_train_val, X_test, y_train_val_valence, y_test_valence, y_train_val_energy, y_test_energy, y_train_val_danceability, y_test_danceability = train_test_split(
    X_lyrics, y_valence, y_energy, y_danceability, test_size=0.2, random_state=42)

X_train, X_val, y_train_valence, y_val_valence, y_train_energy, y_val_energy, y_train_danceability, y_val_danceability = train_test_split(
    X_train_val, y_train_val_valence, y_train_val_energy, y_train_val_danceability, test_size=0.2, random_state=42)


In [10]:
X_train

array([[   9,   80,   77, ...,  718, 3619,   32],
       [  28,  344,  780, ...,  344,  780,   32],
       [  77,  609,  198, ...,  609,  198,   32],
       ...,
       [   1,    1,  433, ...,   28,   44,   32],
       [ 653, 1380,  591, ...,    4,  203,   32],
       [  58,  135,   13, ..., 1744,  313,   32]])

In [12]:
from keras_tuner import RandomSearch, HyperParameters, Objective
from keras.models import Model
from keras.layers import Input, Dense, Dropout, Embedding, Flatten
from keras.callbacks import EarlyStopping


# 构建模型函数
def build_model(hp):
    inputs = Input(shape=(max_len,))
    x = Embedding(input_dim=hp.Int('input_dim', min_value=1000, max_value=10000, step=1000),
                  output_dim=hp.Int('output_dim', min_value=32, max_value=128, step=32),
                  input_length=max_len)(inputs)
    x = Flatten()(x)

    num_layers = hp.Int('num_layers', min_value=1, max_value=5, step=1)
    for i in range(num_layers):
        if i == 0:
            x = Dense(units=hp.Int(f'units_layer{i+1}', min_value=32, max_value=512, step=32), activation='relu')(x)
        else:
            x = Dense(units=hp.Int(f'units_layer{i+1}', min_value=32, max_value=512, step=32), activation='relu')(x)
        x = Dropout(rate=hp.Float(f'dropout_layer{i+1}', min_value=0.0, max_value=0.5, step=0.1))(x)

    x = Dense(units=hp.Int('units_final', min_value=32, max_value=512, step=32),
              activation='relu',
              kernel_regularizer=tf.keras.regularizers.l2(hp.Choice('l2_regularization', values=[0.0, 1e-4, 1e-3])),
              kernel_initializer=hp.Choice('kernel_initializer', values=['glorot_uniform', 'he_normal']))(x)

    
    output_valence = Dense(y_valence.shape[1], activation='softmax', name='valence_output')(x)
    output_energy = Dense(y_energy.shape[1], activation='softmax', name='energy_output')(x)
    output_danceability = Dense(y_danceability.shape[1], activation='softmax', name='danceability_output')(x)
    
    model = Model(inputs=inputs, outputs=[output_valence, output_energy, output_danceability])

    optimizer_choice = hp.Choice('optimizer', values=['adam', 'rmsprop', 'sgd'])
    learning_rate = hp.Choice('learning_rate', values=[1e-3, 1e-4, 1e-5])

    if optimizer_choice == 'adam':
        optimizer = Adam(learning_rate=learning_rate)
    elif optimizer_choice == 'rmsprop':
        optimizer = RMSprop(learning_rate=learning_rate)
    else:
        optimizer = SGD(learning_rate=learning_rate)

    model.compile(optimizer=optimizer,
                  loss={'valence_output': 'categorical_crossentropy', 
                        'energy_output': 'categorical_crossentropy', 
                        'danceability_output': 'categorical_crossentropy'},
                  metrics={'valence_output': 'accuracy', 
                           'energy_output': 'accuracy', 
                           'danceability_output': 'accuracy'})
    return model

# 超参数调优
tuner = RandomSearch(
    build_model,
    objective=Objective('val_valence_output_accuracy', direction='max'),
    max_trials=10,
    executions_per_trial=1,
    directory='tuner_dir',
    project_name='dnn_mood_detection_600'
)

# 启动调优过程
tuner.search(X_train, [y_train_valence, y_train_energy, y_train_danceability], 
             epochs=20, 
             validation_data=(X_val, [y_val_valence, y_val_energy, y_val_danceability]), 
             callbacks=[EarlyStopping(patience=3)])

# 获取最佳模型
best_model = tuner.get_best_models(num_models=1)[0]
best_hyperparameters = tuner.get_best_hyperparameters(num_trials=1)[0]
print(best_hyperparameters.values)

# 评估模型

loss, valence_output_loss, energy_output_loss, danceability_output_loss, accuracy_valence, accuracy_energy, accuracy_danceability = best_model.evaluate(X_test, [y_test_valence, y_test_energy, y_test_danceability])
print(f'Test Loss: {loss}, valence_output_loss: {valence_output_loss}, energy_output_loss: {energy_output_loss}, danceability_output_loss: {danceability_output_loss}, Test Accuracy Valence: {accuracy_valence}, Test Accuracy Energy: {accuracy_energy}, Test Accuracy Danceability: {accuracy_danceability}')

Trial 10 Complete [00h 00m 03s]
val_valence_output_accuracy: 0.5416666865348816

Best val_valence_output_accuracy So Far: 0.5416666865348816
Total elapsed time: 00h 00m 21s


  saveable.load_own_variables(weights_store.get(inner_path))


{'input_dim': 7000, 'output_dim': 128, 'num_layers': 1, 'units_layer1': 32, 'dropout_layer1': 0.0, 'units_final': 256, 'l2_regularization': 0.0, 'kernel_initializer': 'glorot_uniform', 'optimizer': 'adam', 'learning_rate': 0.0001, 'units_layer2': 448, 'dropout_layer2': 0.0, 'units_layer3': 160, 'dropout_layer3': 0.4, 'units_layer4': 416, 'dropout_layer4': 0.30000000000000004, 'units_layer5': 384, 'dropout_layer5': 0.2}
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - danceability_output_accuracy: 0.4948 - energy_output_accuracy: 0.3417 - loss: 3.1083 - valence_output_accuracy: 0.3975  


ValueError: not enough values to unpack (expected 7, got 4)

In [13]:
metrics = best_model.evaluate(X_test, [y_test_valence, y_test_energy, y_test_danceability])
#print(f'Test Loss: {loss}, Test Accuracy Valence: {accuracy_valence}, Test Accuracy Energy: {accuracy_energy}, Test Accuracy Danceability: {accuracy_danceability}')
print(metrics)

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - danceability_output_accuracy: 0.4948 - energy_output_accuracy: 0.3417 - loss: 3.1083 - valence_output_accuracy: 0.3975 
[3.0829503536224365, 0.5, 0.375, 0.4000000059604645]


In [14]:
# 构建 CNN 模型函数
def build_cnn_model(hp):
    inputs = Input(shape=(max_len,))
    x = Embedding(input_dim=max_words, output_dim=hp.Int('embedding_output_dim', min_value=32, max_value=128, step=32), input_length=max_len)(inputs)
    x = tf.keras.layers.Conv1D(filters=hp.Int('filters', min_value=32, max_value=128, step=32), kernel_size=hp.Int('kernel_size', min_value=3, max_value=7, step=2), activation='relu')(x)
    x = tf.keras.layers.AveragePooling1D(pool_size=hp.Int('pool_size', min_value=2, max_value=5, step=1))(x)
    x = Flatten()(x)
    
    num_layers = hp.Int('num_layers', min_value=1, max_value=3, step=1)
    for i in range(num_layers):
        x = Dense(units=hp.Int(f'dense_units_{i+1}', min_value=32, max_value=512, step=32), activation='relu')(x)
        x = Dropout(rate=hp.Float(f'dropout_{i+1}', min_value=0.0, max_value=0.5, step=0.1))(x)
    
    output_valence = Dense(y_valence.shape[1], activation='softmax', name='valence_output')(x)
    output_energy = Dense(y_energy.shape[1], activation='softmax', name='energy_output')(x)
    output_danceability = Dense(y_danceability.shape[1], activation='softmax', name='danceability_output')(x)
    
    model = Model(inputs=inputs, outputs=[output_valence, output_energy, output_danceability])

    optimizer_choice = hp.Choice('optimizer', values=['adam', 'rmsprop', 'sgd'])
    learning_rate = hp.Choice('learning_rate', values=[1e-3, 1e-4, 1e-5])

    if optimizer_choice == 'adam':
        optimizer = Adam(learning_rate=learning_rate)
    elif optimizer_choice == 'rmsprop':
        optimizer = RMSprop(learning_rate=learning_rate)
    else:
        optimizer = SGD(learning_rate=learning_rate)

    model.compile(optimizer=optimizer,
                  loss={'valence_output': 'categorical_crossentropy', 
                        'energy_output': 'categorical_crossentropy', 
                        'danceability_output': 'categorical_crossentropy'},
                  metrics={'valence_output': 'accuracy', 
                           'energy_output': 'accuracy', 
                           'danceability_output': 'accuracy'})
    return model

# 超参数调优
tuner = RandomSearch(
    build_cnn_model,
    objective=Objective('val_valence_output_accuracy', direction='max'),
    max_trials=10,
    executions_per_trial=1,
    directory='tuner_dir',
    project_name='cnn_mood_detection_600'
)

# 启动调优过程
tuner.search(X_train, [y_train_valence, y_train_energy, y_train_danceability], 
             epochs=20, 
             validation_data=(X_val, [y_val_valence, y_val_energy, y_val_danceability]), 
             callbacks=[EarlyStopping(patience=3)])

# 获取最佳模型
best_model = tuner.get_best_models(num_models=1)[0]
best_hyperparameters = tuner.get_best_hyperparameters(num_trials=1)[0]
print(best_hyperparameters.values)

# 评估模型
loss, valence_output_loss, energy_output_loss, danceability_output_loss, accuracy_valence, accuracy_energy, accuracy_danceability = best_model.evaluate(X_test, [y_test_valence, y_test_energy, y_test_danceability])
print(f'Test Loss: {loss}, valence_output_loss: {valence_output_loss}, energy_output_loss: {energy_output_loss}, danceability_output_loss: {danceability_output_loss}, Test Accuracy Valence: {accuracy_valence}, Test Accuracy Energy: {accuracy_energy}, Test Accuracy Danceability: {accuracy_danceability}')

Trial 10 Complete [00h 00m 02s]
val_valence_output_accuracy: 0.4791666567325592

Best val_valence_output_accuracy So Far: 0.53125
Total elapsed time: 00h 00m 26s


  saveable.load_own_variables(weights_store.get(inner_path))


{'embedding_output_dim': 64, 'filters': 64, 'kernel_size': 3, 'pool_size': 5, 'num_layers': 3, 'dense_units_1': 96, 'dropout_1': 0.4, 'optimizer': 'rmsprop', 'learning_rate': 0.001, 'dense_units_2': 32, 'dropout_2': 0.0, 'dense_units_3': 32, 'dropout_3': 0.0}
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - danceability_output_accuracy: 0.4258 - energy_output_accuracy: 0.3546 - loss: 3.1556 - valence_output_accuracy: 0.4721  


ValueError: not enough values to unpack (expected 7, got 4)

In [16]:
# 拆分数据集
X_train_val, X_test, y_train_val_valence, y_test_valence, y_train_val_energy, y_test_energy, y_train_val_danceability, y_test_danceability = train_test_split(
    data['lyrics'], y_valence, y_energy, y_danceability, test_size=0.2, random_state=42)

X_train, X_val, y_train_valence, y_val_valence, y_train_energy, y_val_energy, y_train_danceability, y_val_danceability = train_test_split(
    X_train_val, y_train_val_valence, y_train_val_energy, y_train_val_danceability, test_size=0.2, random_state=42)

# 使用 BertTokenizer 和 TFBertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize(sentences, tokenizer, max_len=128):
    input_ids, attention_masks = [], []
    for sent in sentences:
        encoded = tokenizer.encode_plus(
            text=sent,
            add_special_tokens=True,
            max_length=max_len,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            truncation=True
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    return np.array(input_ids), np.array(attention_masks)

max_len = 128

X_train_input_ids, X_train_attention_masks = tokenize(X_train, tokenizer, max_len)
X_val_input_ids, X_val_attention_masks = tokenize(X_val, tokenizer, max_len)
X_test_input_ids, X_test_attention_masks = tokenize(X_test, tokenizer, max_len)


# BERT

In [125]:
import torch.nn as nn
import transformers
import torch
import optuna
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import LambdaLR
from sklearn import metrics
from transformers import BertForSequenceClassification, BertModel, get_linear_schedule_with_warmup, logging

In [72]:
X_train_input_ids

array([[  101,  2371,  5223, ...,  5223,  4125,   102],
       [  101,  2821,  2132, ...,  8072,  2823,   102],
       [  101,  2111,  2088, ...,  4536,  4536,   102],
       ...,
       [  101,  2092,  2387, ..., 12927,  2100,   102],
       [  101,  2034,  3058, ...,  5236,  4845,   102],
       [  101,  2379,  2203, ...,  5223,  8843,   102]])

In [96]:
num_labels = 3

class MultiLabelBERT(nn.Module):
    def __init__(self, model_name, num_labels_valence, num_labels_energy, num_labels_danceability, dropout_rate):
        super(MultiLabelBERT, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier_valence = nn.Linear(self.bert.config.hidden_size, num_labels_valence)
        self.classifier_energy = nn.Linear(self.bert.config.hidden_size, num_labels_energy)
        self.classifier_danceability = nn.Linear(self.bert.config.hidden_size, num_labels_danceability)
    
    def forward(self, input_ids, attention_mask, labels_valence=None, labels_energy=None, labels_danceability=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        pooled_output = self.dropout(outputs.pooler_output)
        
        logits_valence = self.classifier_valence(pooled_output)
        logits_energy = self.classifier_energy(pooled_output)
        logits_danceability = self.classifier_danceability(pooled_output)
        
        loss = None
        if labels_valence is not None and labels_energy is not None and labels_danceability is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss_valence = loss_fct(logits_valence, labels_valence)
            loss_energy = loss_fct(logits_energy, labels_energy)
            loss_danceability = loss_fct(logits_danceability, labels_danceability)
            loss = loss_valence + loss_energy + loss_danceability
        
        return (loss, logits_valence, logits_energy, logits_danceability)

In [95]:
class MultiLabelDataset(Dataset):
    def __init__(self, input_ids, attention_masks, labels_valence, labels_energy, labels_danceability):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.labels_valence = np.argmax(labels_valence, axis=1)
        self.labels_energy = np.argmax(labels_energy, axis=1)
        self.labels_danceability = np.argmax(labels_danceability, axis=1)

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        item = {
            'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
            'attention_mask': torch.tensor(self.attention_masks[idx], dtype=torch.long),
            'labels_valence': torch.tensor(self.labels_valence[idx], dtype=torch.long),
            'labels_energy': torch.tensor(self.labels_energy[idx], dtype=torch.long),
            'labels_danceability': torch.tensor(self.labels_danceability[idx], dtype=torch.long)
        }
        return item

In [94]:
def train_and_evaluate(params):
    lr = params['lr']
    num_epochs = int(params['num_epochs'])
    batch_size = int(params['batch_size'])
    weight_decay = params['weight_decay']
    dropout_rate = params['dropout_rate']
    
    torch.cuda.empty_cache()
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = MultiLabelBERT('bert-base-uncased', num_labels_valence=3, num_labels_energy=3, num_labels_danceability=3, dropout_rate=dropout_rate).to(device)
    
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    train_dataset = MultiLabelDataset(X_train_input_ids, X_train_attention_masks, y_train_valence, y_train_energy, y_train_danceability)
    val_dataset = MultiLabelDataset(X_val_input_ids, X_val_attention_masks, y_val_valence, y_val_energy, y_val_danceability)

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
    
    num_training_steps = len(train_dataloader) * num_epochs
    warmup_steps = int(0.1 * num_training_steps)
    
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=num_training_steps)

    best_val_loss = float('inf')
    best_val_accuracy = float(0)
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        train_predictions_valence = []
        train_predictions_energy = []
        train_predictions_danceability = []
        train_labels_valence = []
        train_labels_energy = []
        train_labels_danceability = []
        progress_bar = tqdm(train_dataloader, desc='Training', leave=False)

        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels_valence = batch['labels_valence'].to(device)
            labels_energy = batch['labels_energy'].to(device)
            labels_danceability = batch['labels_danceability'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels_valence=labels_valence, labels_energy=labels_energy, labels_danceability=labels_danceability)
            loss, logits_valence, logits_energy, logits_danceability = outputs
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()

            train_loss += loss.item()
            train_predictions_valence.extend(logits_valence.argmax(dim=-1).cpu().numpy())
            train_predictions_energy.extend(logits_energy.argmax(dim=-1).cpu().numpy())
            train_predictions_danceability.extend(logits_danceability.argmax(dim=-1).cpu().numpy())
            train_labels_valence.extend(labels_valence.cpu().numpy())
            train_labels_energy.extend(labels_energy.cpu().numpy())
            train_labels_danceability.extend(labels_danceability.cpu().numpy())
            progress_bar.set_postfix({'loss': loss.item()})

        train_accuracy_valence = metrics.accuracy_score(train_labels_valence, train_predictions_valence)
        train_accuracy_energy = metrics.accuracy_score(train_labels_energy, train_predictions_energy)
        train_accuracy_danceability = metrics.accuracy_score(train_labels_danceability, train_predictions_danceability)

        model.eval()
        val_loss = 0.0
        val_predictions_valence = []
        val_predictions_energy = []
        val_predictions_danceability = []
        val_labels_valence = []
        val_labels_energy = []
        val_labels_danceability = []
        progress_bar = tqdm(val_dataloader, desc='Validation', leave=False)

        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels_valence = batch['labels_valence'].to(device)
            labels_energy = batch['labels_energy'].to(device)
            labels_danceability = batch['labels_danceability'].to(device)

            with torch.no_grad():
                outputs = model(input_ids, attention_mask=attention_mask, labels_valence=labels_valence, labels_energy=labels_energy, labels_danceability=labels_danceability)
                loss, logits_valence, logits_energy, logits_danceability = outputs
                val_loss += loss.item()
                val_predictions_valence.extend(logits_valence.argmax(dim=-1).cpu().numpy())
                val_predictions_energy.extend(logits_energy.argmax(dim=-1).cpu().numpy())
                val_predictions_danceability.extend(logits_danceability.argmax(dim=-1).cpu().numpy())
                val_labels_valence.extend(labels_valence.cpu().numpy())
                val_labels_energy.extend(labels_energy.cpu().numpy())
                val_labels_danceability.extend(labels_danceability.cpu().numpy())
                progress_bar.set_postfix({'val_loss': loss.item()})

        avg_val_loss = val_loss / len(val_dataloader)
        val_accuracy_valence = metrics.accuracy_score(val_labels_valence, val_predictions_valence)
        val_accuracy_energy = metrics.accuracy_score(val_labels_energy, val_predictions_energy)
        val_accuracy_danceability = metrics.accuracy_score(val_labels_danceability, val_predictions_danceability)

        val_accuracy = (val_accuracy_valence + val_accuracy_energy + val_accuracy_danceability) / 3

        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            torch.save(model.state_dict(), 'best_model2.pt')
            
    model.load_state_dict(torch.load('best_model2.pt'))
    print(f'No. of Epoch: {epoch}; Validation accuracy: {val_accuracy * 100:.2f}%; train accuracies: valence {train_accuracy_valence * 100:.2f}%, energy {train_accuracy_energy * 100:.2f}%, danceability {train_accuracy_danceability * 100:.2f}%')
    return val_accuracy, avg_val_loss, train_accuracy_valence, train_accuracy_energy, train_accuracy_danceability, train_loss

In [157]:
def objective(trial):
    params = {
        'lr': trial.suggest_float('lr', 2e-5, 5e-5, log=True),
        'num_epochs': trial.suggest_int('num_epochs', 4, 10),
        'batch_size': trial.suggest_categorical('batch_size', [16, 32, 64]),
        'weight_decay': trial.suggest_float('weight_decay', 0.01, 0.3),
        'dropout_rate': trial.suggest_float('dropout_rate', 0.05, 0.3),
    }
    val_accuracy, avg_val_loss, train_accuracy_valence, train_accuracy_energy, train_accuracy_danceability, train_loss = train_and_evaluate(params)
    trial.set_user_attr("val_accuracy", val_accuracy)
    trial.set_user_attr("avg_val_loss", avg_val_loss)
    trial.set_user_attr("train_accuracy_valence", train_accuracy_valence)
    trial.set_user_attr("train_accuracy_energy", train_accuracy_energy)
    trial.set_user_attr("train_accuracy_danceability", train_accuracy_danceability)
    trial.set_user_attr("train_loss", train_loss)
    return avg_val_loss

In [145]:
logging.set_verbosity_error()
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)

print('Best hyperparameters: ', study.best_params)
print('Best value (negative validation loss): ', study.best_value)

[I 2024-05-21 02:53:19,416] A new study created in memory with name: no-name-33576725-569d-490e-830f-dd56489858ac


[W 2024-05-21 02:54:49,059] Trial 0 failed with parameters: {'lr': 3.004027098177885e-05, 'num_epochs': 6, 'batch_size': 64, 'l2_regularization': 0.07661651345787301, 'dropout_rate': 0.09132757403062867} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\Alex\anaconda3\envs\pt\Lib\site-packages\optuna\study\_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\Alex\AppData\Local\Temp\ipykernel_29292\701095809.py", line 9, in objective
    val_accuracy, avg_val_loss, train_accuracy_valence, train_accuracy_energy, train_accuracy_danceability, train_loss = modified_train_and_evaluate(params)
                                                                                                                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Alex\AppData\Local\Temp\ipykernel_29292\4064620243.py", line 57, in modified_train_and_evaluate
    train_

KeyboardInterrupt: 

## Owen original BERT

In [91]:
max_len = 128
class BertLayer(Layer):
    def __init__(self, **kwargs):
        super(BertLayer, self).__init__(**kwargs)
        self.bert_model = TFBertModel.from_pretrained('bert-base-uncased')

    def call(self, inputs):
        input_ids, attention_mask = inputs
        bert_output = self.bert_model([input_ids, attention_mask])
        cls_token = bert_output.last_hidden_state[:, 0, :]
        return cls_token

def build_bert_model():
    input_ids = Input(shape=(max_len,), dtype=tf.int32, name='input_ids')
    attention_mask = Input(shape=(max_len,), dtype=tf.int32, name='attention_mask')

    bert_layer = BertLayer()
    cls_token = bert_layer([input_ids, attention_mask])
    cls_token = Dropout(0.3)(cls_token)

    dense_valence = Dense(y_valence.shape[1], activation='softmax', name='valence_output', kernel_regularizer=tf.keras.regularizers.l2(0.01))(cls_token)
    dense_energy = Dense(y_energy.shape[1], activation='softmax', name='energy_output', kernel_regularizer=tf.keras.regularizers.l2(0.01))(cls_token)
    dense_danceability = Dense(y_danceability.shape[1], activation='softmax', name='danceability_output', kernel_regularizer=tf.keras.regularizers.l2(0.01))(cls_token)
    
    model = Model(inputs=[input_ids, attention_mask], outputs=[dense_valence, dense_energy, dense_danceability])
    return model

In [92]:
model = build_bert_model()
model.summary()

In [93]:
early_stopping = EarlyStopping(monitor='val_loss', patience=3)

# 设置学习率调度器
num_train_steps = len(X_train_input_ids) // 16 * 5  # 数据量 / batch_size * epochs
num_warmup_steps = num_train_steps // 10  # 通常设置为训练步骤的10%

optimizer = Adam(learning_rate=tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=2e-5,
    decay_steps=num_train_steps,
    end_learning_rate=0.0
))

model.compile(optimizer=optimizer,
              loss={'valence_output': 'categorical_crossentropy', 
                    'energy_output': 'categorical_crossentropy', 
                    'danceability_output': 'categorical_crossentropy'},
              metrics={'valence_output': 'accuracy', 
                       'energy_output': 'accuracy', 
                       'danceability_output': 'accuracy'})

history = model.fit(
    [X_train_input_ids, X_train_attention_masks],
    {'valence_output': y_train_valence, 'energy_output': y_train_energy, 'danceability_output': y_train_danceability},
    validation_data=([X_val_input_ids, X_val_attention_masks], {'valence_output': y_val_valence, 'energy_output': y_val_energy, 'danceability_output': y_val_danceability}),
    epochs=5,
    batch_size=16,
    callbacks=[early_stopping]
)

Epoch 1/5
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 561ms/step - danceability_output_accuracy: 0.3695 - energy_output_accuracy: 0.3343 - loss: 3.7649 - valence_output_accuracy: 0.3540 - val_danceability_output_accuracy: 0.4167 - val_energy_output_accuracy: 0.3542 - val_loss: 3.5277 - val_valence_output_accuracy: 0.3958
Epoch 2/5
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 523ms/step - danceability_output_accuracy: 0.3970 - energy_output_accuracy: 0.3431 - loss: 3.7302 - valence_output_accuracy: 0.3741 - val_danceability_output_accuracy: 0.4167 - val_energy_output_accuracy: 0.3333 - val_loss: 3.5124 - val_valence_output_accuracy: 0.3958
Epoch 3/5
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 531ms/step - danceability_output_accuracy: 0.3628 - energy_output_accuracy: 0.4341 - loss: 3.5981 - valence_output_accuracy: 0.4067 - val_danceability_output_accuracy: 0.4271 - val_energy_output_accuracy: 0.3333 - val_loss: 3.5001 - val_

In [None]:
loss, valence_output_loss, energy_output_loss, danceability_output_loss, accuracy_valence, accuracy_energy, accuracy_danceability = model.evaluate(
    [X_test_input_ids, X_test_attention_masks], 
    [y_test_valence, y_test_energy, y_test_danceability]
)

print(f'Test Loss: {loss}, valence_output_loss: {valence_output_loss}, energy_output_loss: {energy_output_loss}, danceability_output_loss: {danceability_output_loss}')
print(f'Test Accuracy Valence: {accuracy_valence}, Test Accuracy Energy: {accuracy_energy}, Test Accuracy Danceability: {accuracy_danceability}')

I have implemented lr and weight decay onto this, therefore it has warm up and decay now with l2 reg, and i only ran 5 epochs because its quite slow on my mac, I noticed the loss is still decreasing drastically, therefore i believe runing more epochs will eventually boost the acc by a lot, can you guys make it 10-15 epochs and test out whats going on at that. Thx, ill now push this version onto github.

# RoBERTa

## Baseline RoBERTa

In [148]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
max_len = 128

X_train_input_ids, X_train_attention_masks = tokenize(X_train, tokenizer, max_len)
X_val_input_ids, X_val_attention_masks = tokenize(X_val, tokenizer, max_len)
X_test_input_ids, X_test_attention_masks = tokenize(X_test, tokenizer, max_len)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [152]:
class RoBERTaLayer(Layer):
    def __init__(self, **kwargs):
        super(RoBERTaLayer, self).__init__(**kwargs)
        self.roberta_model = TFRobertaModel.from_pretrained('roberta-base')

    def call(self, inputs):
        input_ids, attention_mask = inputs
        roberta_output = self.roberta_model([input_ids, attention_mask])
        cls_token = roberta_output.last_hidden_state[:, 0, :]
        return cls_token

def build_bert_model():
    input_ids = Input(shape=(max_len,), dtype=tf.int32, name='input_ids')
    attention_mask = Input(shape=(max_len,), dtype=tf.int32, name='attention_mask')

    bert_layer = BertLayer()
    cls_token = bert_layer([input_ids, attention_mask])
    cls_token = Dropout(0.3)(cls_token)

    dense_valence = Dense(y_valence.shape[1], activation='softmax', name='valence_output', kernel_regularizer=tf.keras.regularizers.l2(0.01))(cls_token)
    dense_energy = Dense(y_energy.shape[1], activation='softmax', name='energy_output', kernel_regularizer=tf.keras.regularizers.l2(0.01))(cls_token)
    dense_danceability = Dense(y_danceability.shape[1], activation='softmax', name='danceability_output', kernel_regularizer=tf.keras.regularizers.l2(0.01))(cls_token)
    
    model = Model(inputs=[input_ids, attention_mask], outputs=[dense_valence, dense_energy, dense_danceability])
    return model

def build_roberta_model():
    input_ids = Input(shape=(max_len,), dtype=tf.int32, name='input_ids')
    attention_mask = Input(shape=(max_len,), dtype=tf.int32, name='attention_mask')
    
    roberta_layer = RoBERTaLayer()
    cls_token = roberta_layer([input_ids, attention_mask])
    cls_token = Dropout(0.3)(cls_token)

    dense_valence = Dense(y_valence.shape[1], activation='softmax', name='valence_output', kernel_regularizer=tf.keras.regularizers.l2(0.01))(cls_token)
    dense_energy = Dense(y_energy.shape[1], activation='softmax', name='energy_output', kernel_regularizer=tf.keras.regularizers.l2(0.01))(cls_token)
    dense_danceability = Dense(y_danceability.shape[1], activation='softmax', name='danceability_output', kernel_regularizer=tf.keras.regularizers.l2(0.01))(cls_token)
    
    model = Model(inputs=[input_ids, attention_mask], outputs=[dense_valence, dense_energy, dense_danceability])
    return model

def get_optimizer_and_scheduler(num_train_steps, num_warmup_steps):
    optimizer = Adam(learning_rate=2e-5, weight_decay=0.01)
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_train_steps
    )
    return optimizer, lr_scheduler

num_train_steps = len(X_train_input_ids) // 16 * 5
num_warmup_steps = num_train_steps // 10 

In [153]:
model = build_roberta_model()

In [154]:
early_stopping = EarlyStopping(monitor='val_loss', patience=3)

# 设置学习率调度器
num_train_steps = len(X_train_input_ids) // 16 * 5
num_warmup_steps = num_train_steps // 10 

optimizer = Adam(learning_rate=tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=2e-5,
    decay_steps=num_train_steps,
    end_learning_rate=0.0
))

model.compile(optimizer=optimizer,
              loss={'valence_output': 'categorical_crossentropy', 
                    'energy_output': 'categorical_crossentropy', 
                    'danceability_output': 'categorical_crossentropy'},
              metrics={'valence_output': 'accuracy', 
                       'energy_output': 'accuracy', 
                       'danceability_output': 'accuracy'})

history = model.fit(
    [X_train_input_ids, X_train_attention_masks],
    {'valence_output': y_train_valence, 'energy_output': y_train_energy, 'danceability_output': y_train_danceability},
    validation_data=([X_val_input_ids, X_val_attention_masks], {'valence_output': y_val_valence, 'energy_output': y_val_energy, 'danceability_output': y_val_danceability}),
    epochs=5,
    batch_size=16,
    callbacks=[early_stopping]
)

Epoch 1/5
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 586ms/step - danceability_output_accuracy: 0.3918 - energy_output_accuracy: 0.4023 - loss: 3.7120 - valence_output_accuracy: 0.3704 - val_danceability_output_accuracy: 0.2917 - val_energy_output_accuracy: 0.3438 - val_loss: 3.7568 - val_valence_output_accuracy: 0.3958
Epoch 2/5
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 551ms/step - danceability_output_accuracy: 0.3769 - energy_output_accuracy: 0.3743 - loss: 3.7413 - valence_output_accuracy: 0.4184 - val_danceability_output_accuracy: 0.2917 - val_energy_output_accuracy: 0.3438 - val_loss: 3.7448 - val_valence_output_accuracy: 0.3958
Epoch 3/5
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 529ms/step - danceability_output_accuracy: 0.4002 - energy_output_accuracy: 0.4204 - loss: 3.6848 - valence_output_accuracy: 0.3888 - val_danceability_output_accuracy: 0.2917 - val_energy_output_accuracy: 0.3438 - val_loss: 3.7368 - val_

ValueError: not enough values to unpack (expected 7, got 4)

In [None]:
roberta_loss, roberta_valence_output_loss, roberta_energy_output_loss, roberta_danceability_output_loss, roberta_accuracy_valence, roberta_accuracy_energy, roberta_accuracy_danceability = model.evaluate(
    [X_test_input_ids, X_test_attention_masks], 
    [y_test_valence, y_test_energy, y_test_danceability]
)


print(f'Test Loss: {roberta_loss}, valence_output_loss: {roberta_valence_output_loss}, energy_output_loss: {roberta_energy_output_loss}, danceability_output_loss: {roberta_accuracy_valence}')
print(f'Test Accuracy Valence: {roberta_accuracy_valence}, Test Accuracy Energy: {roberta_accuracy_energy}, Test Accuracy Danceability: {roberta_accuracy_danceability}')

## Hyperparameter tuning

In [168]:
class MultiLabelRoBERTa(torch.nn.Module):
    def __init__(self, model_name, num_labels_valence, num_labels_energy, num_labels_danceability, dropout_rate):
        super(MultiLabelRoBERTa, self).__init__()
        self.roberta = RobertaModel.from_pretrained(model_name)
        self.dropout = torch.nn.Dropout(dropout_rate)
        self.classifier_valence = torch.nn.Linear(self.roberta.config.hidden_size, num_labels_valence)
        self.classifier_energy = torch.nn.Linear(self.roberta.config.hidden_size, num_labels_energy)
        self.classifier_danceability = torch.nn.Linear(self.roberta.config.hidden_size, num_labels_danceability)
    
    def forward(self, input_ids, attention_mask, labels_valence=None, labels_energy=None, labels_danceability=None):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        cls_token = outputs[0][:, 0, :]
        cls_token = self.dropout(cls_token)
        
        logits_valence = self.classifier_valence(cls_token)
        logits_energy = self.classifier_energy(cls_token)
        logits_danceability = self.classifier_danceability(cls_token)
        
        loss = 0
        if labels_valence is not None and labels_energy is not None and labels_danceability is not None:
            loss_fct = torch.nn.CrossEntropyLoss()
            loss_valence = loss_fct(logits_valence, labels_valence)
            loss_energy = loss_fct(logits_energy, labels_energy)
            loss_danceability = loss_fct(logits_danceability, labels_danceability)
            loss = (loss_valence + loss_energy + loss_danceability) / 3
        
        return loss, logits_valence, logits_energy, logits_danceability

In [169]:
def train_and_evaluate_roberta(params):
    lr = params['lr']
    num_epochs = int(params['num_epochs'])
    batch_size = int(params['batch_size'])
    weight_decay = params['weight_decay']
    dropout_rate = params['dropout_rate']
    
    torch.cuda.empty_cache()
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = MultiLabelRoBERTa('roberta-base', num_labels_valence=3, num_labels_energy=3, num_labels_danceability=3, dropout_rate=dropout_rate).to(device)
    
    optimizer = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    train_dataset = MultiLabelDataset(X_train_input_ids, X_train_attention_masks, y_train_valence, y_train_energy, y_train_danceability)
    val_dataset = MultiLabelDataset(X_val_input_ids, X_val_attention_masks, y_val_valence, y_val_energy, y_val_danceability)

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
    
    num_training_steps = len(train_dataloader) * num_epochs
    warmup_steps = int(0.1 * num_training_steps)
    
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=num_training_steps)

    best_val_loss = float('inf')
    best_val_accuracy = float(0)
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        train_predictions_valence = []
        train_predictions_energy = []
        train_predictions_danceability = []
        train_labels_valence = []
        train_labels_energy = []
        train_labels_danceability = []
        progress_bar = tqdm(train_dataloader, desc='Training', leave=False)

        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels_valence = batch['labels_valence'].to(device)
            labels_energy = batch['labels_energy'].to(device)
            labels_danceability = batch['labels_danceability'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels_valence=labels_valence, labels_energy=labels_energy, labels_danceability=labels_danceability)
            loss, logits_valence, logits_energy, logits_danceability = outputs
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()

            train_loss += loss.item()
            train_predictions_valence.extend(logits_valence.argmax(dim=-1).cpu().numpy())
            train_predictions_energy.extend(logits_energy.argmax(dim=-1).cpu().numpy())
            train_predictions_danceability.extend(logits_danceability.argmax(dim=-1).cpu().numpy())
            train_labels_valence.extend(labels_valence.cpu().numpy())
            train_labels_energy.extend(labels_energy.cpu().numpy())
            train_labels_danceability.extend(labels_danceability.cpu().numpy())
            progress_bar.set_postfix({'loss': loss.item()})

        train_accuracy_valence = metrics.accuracy_score(train_labels_valence, train_predictions_valence)
        train_accuracy_energy = metrics.accuracy_score(train_labels_energy, train_predictions_energy)
        train_accuracy_danceability = metrics.accuracy_score(train_labels_danceability, train_predictions_danceability)

        model.eval()
        val_loss = 0.0
        val_predictions_valence = []
        val_predictions_energy = []
        val_predictions_danceability = []
        val_labels_valence = []
        val_labels_energy = []
        val_labels_danceability = []
        progress_bar = tqdm(val_dataloader, desc='Validation', leave=False)

        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels_valence = batch['labels_valence'].to(device)
            labels_energy = batch['labels_energy'].to(device)
            labels_danceability = batch['labels_danceability'].to(device)

            with torch.no_grad():
                outputs = model(input_ids, attention_mask=attention_mask, labels_valence=labels_valence, labels_energy=labels_energy, labels_danceability=labels_danceability)
                loss, logits_valence, logits_energy, logits_danceability = outputs
                val_loss += loss.item()
                val_predictions_valence.extend(logits_valence.argmax(dim=-1).cpu().numpy())
                val_predictions_energy.extend(logits_energy.argmax(dim=-1).cpu().numpy())
                val_predictions_danceability.extend(logits_danceability.argmax(dim=-1).cpu().numpy())
                val_labels_valence.extend(labels_valence.cpu().numpy())
                val_labels_energy.extend(labels_energy.cpu().numpy())
                val_labels_danceability.extend(labels_danceability.cpu().numpy())
                progress_bar.set_postfix({'val_loss': loss.item()})

        avg_val_loss = val_loss / len(val_dataloader)
        val_accuracy_valence = metrics.accuracy_score(val_labels_valence, val_predictions_valence)
        val_accuracy_energy = metrics.accuracy_score(val_labels_energy, val_predictions_energy)
        val_accuracy_danceability = metrics.accuracy_score(val_labels_danceability, val_predictions_danceability)

        val_accuracy = (val_accuracy_valence + val_accuracy_energy + val_accuracy_danceability) / 3

        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            torch.save(model.state_dict(), 'best_model2.pt')
            
    model.load_state_dict(torch.load('best_model2.pt'))
    print(f'No. of Epoch: {epoch}; Validation accuracy: {val_accuracy * 100:.2f}%; train accuracies: valence {train_accuracy_valence * 100:.2f}%, energy {train_accuracy_energy * 100:.2f}%, danceability {train_accuracy_danceability * 100:.2f}%')
    return val_accuracy, avg_val_loss, train_accuracy_valence, train_accuracy_energy, train_accuracy_danceability, train_loss

In [170]:
def objective_roberta(trial):
    params = {
        'lr': trial.suggest_float('lr', 2e-5, 5e-5, log=True),
        'num_epochs': trial.suggest_int('num_epochs', 4, 10),
        'batch_size': trial.suggest_categorical('batch_size', [16, 32, 64]),
        'weight_decay': trial.suggest_float('weight_decay', 0.01, 0.3),
        'dropout_rate': trial.suggest_float('dropout_rate', 0.05, 0.3),
    }
    val_accuracy, avg_val_loss, train_accuracy_valence, train_accuracy_energy, train_accuracy_danceability, train_loss = train_and_evaluate_roberta(params)
    trial.set_user_attr("val_accuracy", val_accuracy)
    trial.set_user_attr("avg_val_loss", avg_val_loss)
    trial.set_user_attr("train_accuracy_valence", train_accuracy_valence)
    trial.set_user_attr("train_accuracy_energy", train_accuracy_energy)
    trial.set_user_attr("train_accuracy_danceability", train_accuracy_danceability)
    trial.set_user_attr("train_loss", train_loss)
    return avg_val_loss

In [171]:
logging.set_verbosity_error()
study = optuna.create_study(direction='minimize')
study.optimize(objective_roberta, n_trials=10)

print('Best hyperparameters: ', study.best_params)
print('Best value (negative validation loss): ', study.best_value)

[I 2024-05-21 03:30:02,593] A new study created in memory with name: no-name-acfb86c7-63a1-4e71-a741-752f7f1351f6
[I 2024-05-21 03:31:10,816] Trial 0 finished with value: 1.2109988331794739 and parameters: {'lr': 4.384470994489393e-05, 'num_epochs': 10, 'batch_size': 16, 'weight_decay': 0.2860366644807206, 'dropout_rate': 0.23459909429263764}. Best is trial 0 with value: 1.2109988331794739.


No. of Epoch: 9; Validation accuracy: 50.69%; train accuracies: valence 91.15%, energy 96.61%, danceability 96.61%


[I 2024-05-21 03:31:53,847] Trial 1 finished with value: 1.0424423615137737 and parameters: {'lr': 2.0285129598819828e-05, 'num_epochs': 7, 'batch_size': 32, 'weight_decay': 0.2840274803138773, 'dropout_rate': 0.09074442945309126}. Best is trial 1 with value: 1.0424423615137737.


No. of Epoch: 6; Validation accuracy: 51.39%; train accuracies: valence 56.77%, energy 59.11%, danceability 61.46%


[I 2024-05-21 03:32:58,306] Trial 2 finished with value: 1.3006866077582042 and parameters: {'lr': 3.536790789689419e-05, 'num_epochs': 10, 'batch_size': 16, 'weight_decay': 0.08689906860285412, 'dropout_rate': 0.2237163346888606}. Best is trial 1 with value: 1.0424423615137737.


No. of Epoch: 9; Validation accuracy: 54.51%; train accuracies: valence 95.83%, energy 96.35%, danceability 97.14%


[I 2024-05-21 03:33:52,944] Trial 3 finished with value: 1.0459251801172893 and parameters: {'lr': 3.610123408214526e-05, 'num_epochs': 9, 'batch_size': 32, 'weight_decay': 0.18054326543648724, 'dropout_rate': 0.1693009320676429}. Best is trial 1 with value: 1.0424423615137737.


No. of Epoch: 8; Validation accuracy: 50.69%; train accuracies: valence 78.65%, energy 79.43%, danceability 84.38%


[I 2024-05-21 03:34:44,043] Trial 4 finished with value: 1.2035074432690938 and parameters: {'lr': 4.2946125666529956e-05, 'num_epochs': 8, 'batch_size': 16, 'weight_decay': 0.031957494834824955, 'dropout_rate': 0.23271782673791308}. Best is trial 1 with value: 1.0424423615137737.


No. of Epoch: 7; Validation accuracy: 48.61%; train accuracies: valence 84.11%, energy 88.28%, danceability 90.10%


[I 2024-05-21 03:35:32,446] Trial 5 finished with value: 1.0352118611335754 and parameters: {'lr': 2.5536352909221795e-05, 'num_epochs': 8, 'batch_size': 32, 'weight_decay': 0.06635864020785948, 'dropout_rate': 0.12413734556795548}. Best is trial 5 with value: 1.0352118611335754.


No. of Epoch: 7; Validation accuracy: 49.65%; train accuracies: valence 64.84%, energy 65.89%, danceability 75.78%


[I 2024-05-21 03:36:29,448] Trial 6 finished with value: 1.0600566864013672 and parameters: {'lr': 2.0316497489308853e-05, 'num_epochs': 9, 'batch_size': 16, 'weight_decay': 0.26850133740329657, 'dropout_rate': 0.22690355822248143}. Best is trial 5 with value: 1.0352118611335754.


No. of Epoch: 8; Validation accuracy: 52.08%; train accuracies: valence 79.17%, energy 84.38%, danceability 82.55%


[I 2024-05-21 03:50:48,609] Trial 7 finished with value: 1.0242444276809692 and parameters: {'lr': 2.832711467215106e-05, 'num_epochs': 9, 'batch_size': 64, 'weight_decay': 0.17436334634435374, 'dropout_rate': 0.059400665179551546}. Best is trial 7 with value: 1.0242444276809692.


No. of Epoch: 8; Validation accuracy: 45.49%; train accuracies: valence 53.91%, energy 57.03%, danceability 68.23%


[I 2024-05-21 03:51:44,111] Trial 8 finished with value: 1.3093683520952861 and parameters: {'lr': 3.368508129655881e-05, 'num_epochs': 10, 'batch_size': 16, 'weight_decay': 0.03044307001614459, 'dropout_rate': 0.12874849588960435}. Best is trial 7 with value: 1.0242444276809692.


No. of Epoch: 9; Validation accuracy: 51.39%; train accuracies: valence 95.83%, energy 97.66%, danceability 97.14%


[I 2024-05-21 03:52:31,647] Trial 9 finished with value: 1.0784907341003418 and parameters: {'lr': 4.3168364339364895e-05, 'num_epochs': 9, 'batch_size': 32, 'weight_decay': 0.2648664296201911, 'dropout_rate': 0.2877628419608818}. Best is trial 7 with value: 1.0242444276809692.


No. of Epoch: 8; Validation accuracy: 49.65%; train accuracies: valence 75.00%, energy 72.14%, danceability 76.82%
Best hyperparameters:  {'lr': 2.832711467215106e-05, 'num_epochs': 9, 'batch_size': 64, 'weight_decay': 0.17436334634435374, 'dropout_rate': 0.059400665179551546}
Best value (negative validation loss):  1.0242444276809692
