In [13]:
import pandas as pd
import numpy as np
import re
import os
import zipfile
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import tensorflow as tf
from keras_tuner import RandomSearch, HyperParameters, Objective
from keras.models import Model
from keras.layers import Input, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D
from keras.optimizers import Adam, RMSprop, SGD
from keras.callbacks import EarlyStopping, LearningRateScheduler, Callback
from transformers import BertTokenizer, TFBertModel, get_linear_schedule_with_warmup, WarmUp, AdamW, RobertaTokenizer, TFRobertaModel

In [6]:
# 解压 cleaned_lyrics.zip 文件
with zipfile.ZipFile('sampled.zip', 'r') as zip_ref:
    zip_ref.extractall('sampled')

# 获取所有歌词文件的路径
lyrics_files = {os.path.splitext(f)[0]: os.path.join('sampled', f) for f in os.listdir('sampled')}

# 读取 filtered_dataset.csv 文件
data = pd.read_csv('sampled_dataset.csv')

def read_lyrics(record_id):
    file_path = lyrics_files.get(str(record_id))
    if file_path and os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    return ''

# 读取歌词并添加到数据框中
data['lyrics'] = data['record_id'].apply(read_lyrics)


In [14]:
data

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,record_id,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,...,instrumentalness,liveness,valence,tempo,time_signature,track_genre,valence_bin,energy_bin,danceability_bin,lyrics
0,1458,2834,2834,20097,4b98LXC0QUWGBteJ5uwVQY,Doja Cat,Summer Music Festival Hits,Boss Bitch,0,134239,...,0.000000,0.2030,0.575,125.993,4,dance,1,2,2,mmm not tryna ah not tryna not tryna yeah not ...
1,1557,2967,2967,22816,58Z83tSbShyHxCwqTCE8M6,Goatwhore,Vengeful Ascension,"Under the Flesh, Into the Soul",21,273266,...,0.110000,0.1070,0.347,170.015,4,death-metal,1,2,0,world grave apathetic cold selfish prison cove...
2,425,616,616,3220,0AOmbw8AwDnwXhHC3OhdVB,Thousand Foot Krutch,The End Is Where We Begin,Courtesy Call,72,236898,...,0.000000,0.0822,0.445,164.079,4,alternative,1,1,1,hey comes danger club get started man not gonn...
3,989,2054,2054,14581,5Jaj6nLjHCizmcPddJLO3k,Blippi,"Blippi Tunes, Vol. 2: Machines (Music for Todd...",The Train Song,53,207428,...,0.000019,0.3250,0.662,139.895,4,children,2,1,2,choo choo comes train choo choo comes train ro...
4,383,562,562,3717,2oaK4JLVnmRGIO9ytBE1bt,Red Hot Chili Peppers,The Getaway,Dark Necessities,74,302000,...,0.019900,0.1100,0.197,91.959,4,alternative,0,2,2,comin light day got many moons deep play keep ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,658,1229,1229,8662,3hSy2AUoElgIk6fjhYnRH3,Elvin Bishop,Sure Feels Good: The Best Of Elvin Bishop,Fooled Around And Fell In Love,57,276933,...,0.080500,0.0953,0.610,113.463,3,blues,1,1,1,million girls love em leave em alone not care ...
596,5818,10682,10682,113479,5ELZpvTDGorz9BIE9zaBoZ,Tenth Avenue North,Followers,I Have This Hope,52,204800,...,0.000000,0.1890,0.110,108.009,4,world-music,0,1,1,walk great unknown questions come questions go...
597,265,426,426,2185,3mJV4kByjmgU3ubU7JPp9W,Marilyn Manson,Halloween 2022,You And Me And The Devil Makes 3,0,264266,...,0.834000,0.2030,0.399,128.021,4,alt-rock,1,2,1,like rolling stone hill hades want lie gonna l...
598,5031,9325,9325,94717,5xUpsNCW71S58c8TycsqNa,Ollie,Sunsets & Goodbyes,what if,39,173615,...,0.000627,0.0723,0.283,146.006,4,sad,0,0,2,yeah call one day everything yeah call next no...


In [15]:
len(lyrics_files)

600

In [16]:
data

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,record_id,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,...,instrumentalness,liveness,valence,tempo,time_signature,track_genre,valence_bin,energy_bin,danceability_bin,lyrics
0,1458,2834,2834,20097,4b98LXC0QUWGBteJ5uwVQY,Doja Cat,Summer Music Festival Hits,Boss Bitch,0,134239,...,0.000000,0.2030,0.575,125.993,4,dance,1,2,2,mmm not tryna ah not tryna not tryna yeah not ...
1,1557,2967,2967,22816,58Z83tSbShyHxCwqTCE8M6,Goatwhore,Vengeful Ascension,"Under the Flesh, Into the Soul",21,273266,...,0.110000,0.1070,0.347,170.015,4,death-metal,1,2,0,world grave apathetic cold selfish prison cove...
2,425,616,616,3220,0AOmbw8AwDnwXhHC3OhdVB,Thousand Foot Krutch,The End Is Where We Begin,Courtesy Call,72,236898,...,0.000000,0.0822,0.445,164.079,4,alternative,1,1,1,hey comes danger club get started man not gonn...
3,989,2054,2054,14581,5Jaj6nLjHCizmcPddJLO3k,Blippi,"Blippi Tunes, Vol. 2: Machines (Music for Todd...",The Train Song,53,207428,...,0.000019,0.3250,0.662,139.895,4,children,2,1,2,choo choo comes train choo choo comes train ro...
4,383,562,562,3717,2oaK4JLVnmRGIO9ytBE1bt,Red Hot Chili Peppers,The Getaway,Dark Necessities,74,302000,...,0.019900,0.1100,0.197,91.959,4,alternative,0,2,2,comin light day got many moons deep play keep ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,658,1229,1229,8662,3hSy2AUoElgIk6fjhYnRH3,Elvin Bishop,Sure Feels Good: The Best Of Elvin Bishop,Fooled Around And Fell In Love,57,276933,...,0.080500,0.0953,0.610,113.463,3,blues,1,1,1,million girls love em leave em alone not care ...
596,5818,10682,10682,113479,5ELZpvTDGorz9BIE9zaBoZ,Tenth Avenue North,Followers,I Have This Hope,52,204800,...,0.000000,0.1890,0.110,108.009,4,world-music,0,1,1,walk great unknown questions come questions go...
597,265,426,426,2185,3mJV4kByjmgU3ubU7JPp9W,Marilyn Manson,Halloween 2022,You And Me And The Devil Makes 3,0,264266,...,0.834000,0.2030,0.399,128.021,4,alt-rock,1,2,1,like rolling stone hill hades want lie gonna l...
598,5031,9325,9325,94717,5xUpsNCW71S58c8TycsqNa,Ollie,Sunsets & Goodbyes,what if,39,173615,...,0.000627,0.0723,0.283,146.006,4,sad,0,0,2,yeah call one day everything yeah call next no...


In [8]:
# 使用 Tokenizer 处理文本
max_words = 5000
max_len = 100

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data['lyrics'])
sequences = tokenizer.texts_to_sequences(data['lyrics'])
X_lyrics = pad_sequences(sequences, maxlen=max_len)

# 准备标签
y_valence = to_categorical(data['valence_bin'].values)
y_energy = to_categorical(data['energy_bin'].values)
y_danceability = to_categorical(data['danceability_bin'].values)

# 拆分数据集
X_train_val, X_test, y_train_val_valence, y_test_valence, y_train_val_energy, y_test_energy, y_train_val_danceability, y_test_danceability = train_test_split(
    X_lyrics, y_valence, y_energy, y_danceability, test_size=0.2, random_state=42)

X_train, X_val, y_train_valence, y_val_valence, y_train_energy, y_val_energy, y_train_danceability, y_val_danceability = train_test_split(
    X_train_val, y_train_val_valence, y_train_val_energy, y_train_val_danceability, test_size=0.2, random_state=42)


In [18]:
X_train

array([[   9,   80,   77, ...,  718, 3619,   32],
       [  28,  344,  780, ...,  344,  780,   32],
       [  77,  609,  198, ...,  609,  198,   32],
       ...,
       [   1,    1,  433, ...,   28,   44,   32],
       [ 653, 1380,  591, ...,    4,  203,   32],
       [  58,  135,   13, ..., 1744,  313,   32]], dtype=int32)

### DNN (Tuned)

In [19]:
from keras_tuner import RandomSearch, HyperParameters, Objective
from keras.models import Model
from keras.layers import Input, Dense, Dropout, Embedding, Flatten
from keras.callbacks import EarlyStopping


# 构建模型函数
def build_model(hp):
    inputs = Input(shape=(max_len,))
    x = Embedding(input_dim=hp.Int('input_dim', min_value=1000, max_value=10000, step=1000),
                  output_dim=hp.Int('output_dim', min_value=32, max_value=128, step=32),
                  input_length=max_len)(inputs)
    x = Flatten()(x)

    num_layers = hp.Int('num_layers', min_value=1, max_value=5, step=1)
    for i in range(num_layers):
        if i == 0:
            x = Dense(units=hp.Int(f'units_layer{i+1}', min_value=32, max_value=512, step=32), activation='relu')(x)
        else:
            x = Dense(units=hp.Int(f'units_layer{i+1}', min_value=32, max_value=512, step=32), activation='relu')(x)
        x = Dropout(rate=hp.Float(f'dropout_layer{i+1}', min_value=0.0, max_value=0.5, step=0.1))(x)

    x = Dense(units=hp.Int('units_final', min_value=32, max_value=512, step=32),
              activation='relu',
              kernel_regularizer=tf.keras.regularizers.l2(hp.Choice('l2_regularization', values=[0.0, 1e-4, 1e-3])),
              kernel_initializer=hp.Choice('kernel_initializer', values=['glorot_uniform', 'he_normal']))(x)

    
    output_valence = Dense(y_valence.shape[1], activation='softmax', name='valence_output')(x)
    output_energy = Dense(y_energy.shape[1], activation='softmax', name='energy_output')(x)
    output_danceability = Dense(y_danceability.shape[1], activation='softmax', name='danceability_output')(x)
    
    model = Model(inputs=inputs, outputs=[output_valence, output_energy, output_danceability])

    optimizer_choice = hp.Choice('optimizer', values=['adam', 'rmsprop', 'sgd'])
    learning_rate = hp.Choice('learning_rate', values=[1e-3, 1e-4, 1e-5])

    if optimizer_choice == 'adam':
        optimizer = Adam(learning_rate=learning_rate)
    elif optimizer_choice == 'rmsprop':
        optimizer = RMSprop(learning_rate=learning_rate)
    else:
        optimizer = SGD(learning_rate=learning_rate)

    model.compile(optimizer=optimizer,
                  loss={'valence_output': 'categorical_crossentropy', 
                        'energy_output': 'categorical_crossentropy', 
                        'danceability_output': 'categorical_crossentropy'},
                  metrics={'valence_output': 'accuracy', 
                           'energy_output': 'accuracy', 
                           'danceability_output': 'accuracy'})
    return model

# 超参数调优
tuner = RandomSearch(
    build_model,
    objective=Objective('val_valence_output_accuracy', direction='max'),
    max_trials=10,
    executions_per_trial=1,
    directory='tuner_dir',
    project_name='dnn_mood_detection_600'
)

# 启动调优过程
tuner.search(X_train, [y_train_valence, y_train_energy, y_train_danceability], 
             epochs=20, 
             validation_data=(X_val, [y_val_valence, y_val_energy, y_val_danceability]), 
             callbacks=[EarlyStopping(patience=3)])

# 获取最佳模型
best_model = tuner.get_best_models(num_models=1)[0]
best_hyperparameters = tuner.get_best_hyperparameters(num_trials=1)[0]
print(best_hyperparameters.values)

# 评估模型

loss, valence_output_loss, energy_output_loss, danceability_output_loss, accuracy_valence, accuracy_energy, accuracy_danceability = best_model.evaluate(X_test, [y_test_valence, y_test_energy, y_test_danceability])
print(f'Test Loss: {loss}, valence_output_loss: {valence_output_loss}, energy_output_loss: {energy_output_loss}, danceability_output_loss: {danceability_output_loss}, Test Accuracy Valence: {accuracy_valence}, Test Accuracy Energy: {accuracy_energy}, Test Accuracy Danceability: {accuracy_danceability}')

Trial 10 Complete [00h 00m 03s]
val_valence_output_accuracy: 0.5

Best val_valence_output_accuracy So Far: 0.5
Total elapsed time: 00h 00m 20s




{'input_dim': 6000, 'output_dim': 32, 'num_layers': 2, 'units_layer1': 320, 'dropout_layer1': 0.1, 'units_final': 320, 'l2_regularization': 0.0, 'kernel_initializer': 'he_normal', 'optimizer': 'adam', 'learning_rate': 1e-05, 'units_layer2': 160, 'dropout_layer2': 0.2, 'units_layer3': 256, 'dropout_layer3': 0.4, 'units_layer4': 288, 'dropout_layer4': 0.0}
Test Loss: 3.2663047313690186, valence_output_loss: 1.094298005104065, energy_output_loss: 1.0904593467712402, danceability_output_loss: 1.081547498703003, Test Accuracy Valence: 0.40833333134651184, Test Accuracy Energy: 0.44999998807907104, Test Accuracy Danceability: 0.4166666567325592


In [20]:
metrics = best_model.evaluate(X_test, [y_test_valence, y_test_energy, y_test_danceability])
#print(f'Test Loss: {loss}, Test Accuracy Valence: {accuracy_valence}, Test Accuracy Energy: {accuracy_energy}, Test Accuracy Danceability: {accuracy_danceability}')
print(metrics)

[3.2663047313690186, 1.094298005104065, 1.0904593467712402, 1.081547498703003, 0.40833333134651184, 0.44999998807907104, 0.4166666567325592]


### CNN (Tuned)

In [21]:
# 构建 CNN 模型函数
def build_cnn_model(hp):
    inputs = Input(shape=(max_len,))
    x = Embedding(input_dim=max_words, output_dim=hp.Int('embedding_output_dim', min_value=32, max_value=128, step=32), input_length=max_len)(inputs)
    x = tf.keras.layers.Conv1D(filters=hp.Int('filters', min_value=32, max_value=128, step=32), kernel_size=hp.Int('kernel_size', min_value=3, max_value=7, step=2), activation='relu')(x)
    x = tf.keras.layers.AveragePooling1D(pool_size=hp.Int('pool_size', min_value=2, max_value=5, step=1))(x)
    x = Flatten()(x)
    
    num_layers = hp.Int('num_layers', min_value=1, max_value=3, step=1)
    for i in range(num_layers):
        x = Dense(units=hp.Int(f'dense_units_{i+1}', min_value=32, max_value=512, step=32), activation='relu')(x)
        x = Dropout(rate=hp.Float(f'dropout_{i+1}', min_value=0.0, max_value=0.5, step=0.1))(x)
    
    output_valence = Dense(y_valence.shape[1], activation='softmax', name='valence_output')(x)
    output_energy = Dense(y_energy.shape[1], activation='softmax', name='energy_output')(x)
    output_danceability = Dense(y_danceability.shape[1], activation='softmax', name='danceability_output')(x)
    
    model = Model(inputs=inputs, outputs=[output_valence, output_energy, output_danceability])

    optimizer_choice = hp.Choice('optimizer', values=['adam', 'rmsprop', 'sgd'])
    learning_rate = hp.Choice('learning_rate', values=[1e-3, 1e-4, 1e-5])

    if optimizer_choice == 'adam':
        optimizer = Adam(learning_rate=learning_rate)
    elif optimizer_choice == 'rmsprop':
        optimizer = RMSprop(learning_rate=learning_rate)
    else:
        optimizer = SGD(learning_rate=learning_rate)

    model.compile(optimizer=optimizer,
                  loss={'valence_output': 'categorical_crossentropy', 
                        'energy_output': 'categorical_crossentropy', 
                        'danceability_output': 'categorical_crossentropy'},
                  metrics={'valence_output': 'accuracy', 
                           'energy_output': 'accuracy', 
                           'danceability_output': 'accuracy'})
    return model

# 超参数调优
tuner = RandomSearch(
    build_cnn_model,
    objective=Objective('val_valence_output_accuracy', direction='max'),
    max_trials=10,
    executions_per_trial=1,
    directory='tuner_dir',
    project_name='cnn_mood_detection_600'
)

# 启动调优过程
tuner.search(X_train, [y_train_valence, y_train_energy, y_train_danceability], 
             epochs=20, 
             validation_data=(X_val, [y_val_valence, y_val_energy, y_val_danceability]), 
             callbacks=[EarlyStopping(patience=3)])

# 获取最佳模型
best_model = tuner.get_best_models(num_models=1)[0]
best_hyperparameters = tuner.get_best_hyperparameters(num_trials=1)[0]
print(best_hyperparameters.values)

# 评估模型
loss, valence_output_loss, energy_output_loss, danceability_output_loss, accuracy_valence, accuracy_energy, accuracy_danceability = best_model.evaluate(X_test, [y_test_valence, y_test_energy, y_test_danceability])
print(f'Test Loss: {loss}, valence_output_loss: {valence_output_loss}, energy_output_loss: {energy_output_loss}, danceability_output_loss: {danceability_output_loss}, Test Accuracy Valence: {accuracy_valence}, Test Accuracy Energy: {accuracy_energy}, Test Accuracy Danceability: {accuracy_danceability}')

Trial 10 Complete [00h 00m 02s]
val_valence_output_accuracy: 0.5

Best val_valence_output_accuracy So Far: 0.5625
Total elapsed time: 00h 00m 20s




{'embedding_output_dim': 128, 'filters': 96, 'kernel_size': 7, 'pool_size': 2, 'num_layers': 1, 'dense_units_1': 192, 'dropout_1': 0.1, 'optimizer': 'adam', 'learning_rate': 0.001, 'dense_units_2': 192, 'dropout_2': 0.0, 'dense_units_3': 320, 'dropout_3': 0.30000000000000004}
Test Loss: 3.132678270339966, valence_output_loss: 1.0609853267669678, energy_output_loss: 1.0360546112060547, danceability_output_loss: 1.0356380939483643, Test Accuracy Valence: 0.4166666567325592, Test Accuracy Energy: 0.4583333432674408, Test Accuracy Danceability: 0.4583333432674408


### ADAM BERT (Untuned)

In [10]:
# 拆分数据集
X_train_val, X_test, y_train_val_valence, y_test_valence, y_train_val_energy, y_test_energy, y_train_val_danceability, y_test_danceability = train_test_split(
    data['lyrics'], y_valence, y_energy, y_danceability, test_size=0.2, random_state=42)

X_train, X_val, y_train_valence, y_val_valence, y_train_energy, y_val_energy, y_train_danceability, y_val_danceability = train_test_split(
    X_train_val, y_train_val_valence, y_train_val_energy, y_train_val_danceability, test_size=0.2, random_state=42)

# 使用 BertTokenizer 和 TFBertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize(sentences, tokenizer, max_len=128):
    input_ids, attention_masks = [], []
    for sent in sentences:
        encoded = tokenizer.encode_plus(
            text=sent,
            add_special_tokens=True,
            max_length=max_len,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            truncation=True
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    return np.array(input_ids), np.array(attention_masks)

max_len = 128

X_train_input_ids, X_train_attention_masks = tokenize(X_train, tokenizer, max_len)
X_val_input_ids, X_val_attention_masks = tokenize(X_val, tokenizer, max_len)
X_test_input_ids, X_test_attention_masks = tokenize(X_test, tokenizer, max_len)

# 构建BERT模型
def build_bert_model():
    input_ids = Input(shape=(max_len,), dtype=tf.int32, name='input_ids')
    attention_mask = Input(shape=(max_len,), dtype=tf.int32, name='attention_mask')
    
    bert_model = TFBertModel.from_pretrained('bert-base-uncased')
    bert_output = bert_model(input_ids, attention_mask=attention_mask)[0]
    cls_token = bert_output[:, 0, :]
    
    # 添加 Dropout 和 L2 正则化
    cls_token = Dropout(0.3)(cls_token)

    dense_valence = Dense(y_valence.shape[1], activation='softmax', name='valence_output', kernel_regularizer=tf.keras.regularizers.l2(0.01))(cls_token)
    dense_energy = Dense(y_energy.shape[1], activation='softmax', name='energy_output', kernel_regularizer=tf.keras.regularizers.l2(0.01))(cls_token)
    dense_danceability = Dense(y_danceability.shape[1], activation='softmax', name='danceability_output', kernel_regularizer=tf.keras.regularizers.l2(0.01))(cls_token)
    
    model = Model(inputs=[input_ids, attention_mask], outputs=[dense_valence, dense_energy, dense_danceability])
    return model

# 构建并训练模型
model = build_bert_model()

early_stopping = EarlyStopping(monitor='val_loss', patience=3)

# 设置学习率调度器
num_train_steps = len(X_train_input_ids) // 16 * 5  # 数据量 / batch_size * epochs
num_warmup_steps = num_train_steps // 10  # 通常设置为训练步骤的10%

optimizer = Adam(learning_rate=tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=2e-5,
    decay_steps=num_train_steps,
    end_learning_rate=0.0
))

model.compile(optimizer=optimizer,
              loss={'valence_output': 'categorical_crossentropy', 
                    'energy_output': 'categorical_crossentropy', 
                    'danceability_output': 'categorical_crossentropy'},
              metrics={'valence_output': 'accuracy', 
                       'energy_output': 'accuracy', 
                       'danceability_output': 'accuracy'})

history = model.fit(
    [X_train_input_ids, X_train_attention_masks],
    {'valence_output': y_train_valence, 'energy_output': y_train_energy, 'danceability_output': y_train_danceability},
    validation_data=([X_val_input_ids, X_val_attention_masks], {'valence_output': y_val_valence, 'energy_output': y_val_energy, 'danceability_output': y_val_danceability}),
    epochs=5,
    batch_size=16,
    callbacks=[early_stopping]
)

# 评估模型
loss, valence_output_loss, energy_output_loss, danceability_output_loss, accuracy_valence, accuracy_energy, accuracy_danceability = model.evaluate(
    [X_test_input_ids, X_test_attention_masks], 
    [y_test_valence, y_test_energy, y_test_danceability]
)

print(f'Test Loss: {loss}, valence_output_loss: {valence_output_loss}, energy_output_loss: {energy_output_loss}, danceability_output_loss: {danceability_output_loss}')
print(f'Test Accuracy Valence: {accuracy_valence}, Test Accuracy Energy: {accuracy_energy}, Test Accuracy Danceability: {accuracy_danceability}')

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Epoch 1/5








Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Loss: 3.170804738998413, valence_output_loss: 1.0291998386383057, energy_output_loss: 1.0383217334747314, danceability_output_loss: 0.9236698150634766
Test Accuracy Valence: 0.4583333432674408, Test Accuracy Energy: 0.46666666865348816, Test Accuracy Danceability: 0.5666666626930237


I have implemented lr and weight decay onto this, therefore it has warm up and decay now with l2 reg, and i only ran 5 epochs because its quite slow on my mac, I noticed the loss is still decreasing drastically, therefore i believe runing more epochs will eventually boost the acc by a lot, can you guys make it 10-15 epochs and test out whats going on at that. Thx, ill now push this version onto github.

### ADAM roBERTa (Untuned)

In [21]:
# 拆分数据集
X_train_val, X_test, y_train_val_valence, y_test_valence, y_train_val_energy, y_test_energy, y_train_val_danceability, y_test_danceability = train_test_split(
    data['lyrics'], y_valence, y_energy, y_danceability, test_size=0.2, random_state=42)

X_train, X_val, y_train_valence, y_val_valence, y_train_energy, y_val_energy, y_train_danceability, y_val_danceability = train_test_split(
    X_train_val, y_train_val_valence, y_train_val_energy, y_train_val_danceability, test_size=0.2, random_state=42)

# 使用 RobertaTokenizer 和 TFRobertaModel
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

def tokenize(sentences, tokenizer, max_len=128):
    input_ids, attention_masks = [], []
    for sent in sentences:
        encoded = tokenizer.encode_plus(
            text=sent,
            add_special_tokens=True,
            max_length=max_len,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            truncation=True
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    return np.array(input_ids), np.array(attention_masks)

max_len = 128

X_train_input_ids, X_train_attention_masks = tokenize(X_train, tokenizer, max_len)
X_val_input_ids, X_val_attention_masks = tokenize(X_val, tokenizer, max_len)
X_test_input_ids, X_test_attention_masks = tokenize(X_test, tokenizer, max_len)

# 构建RoBERTa模型
def build_roberta_model():
    input_ids = Input(shape=(max_len,), dtype=tf.int32, name='input_ids')
    attention_mask = Input(shape=(max_len,), dtype=tf.int32, name='attention_mask')
    
    roberta_model = TFRobertaModel.from_pretrained('roberta-base')
    roberta_output = roberta_model(input_ids, attention_mask=attention_mask)[0]
    cls_token = roberta_output[:, 0, :]
    
    # 添加 Dropout 和 L2 正则化
    cls_token = Dropout(0.3)(cls_token)

    dense_valence = Dense(y_valence.shape[1], activation='softmax', name='valence_output', kernel_regularizer=tf.keras.regularizers.l2(0.01))(cls_token)
    dense_energy = Dense(y_energy.shape[1], activation='softmax', name='energy_output', kernel_regularizer=tf.keras.regularizers.l2(0.01))(cls_token)
    dense_danceability = Dense(y_danceability.shape[1], activation='softmax', name='danceability_output', kernel_regularizer=tf.keras.regularizers.l2(0.01))(cls_token)
    
    model = Model(inputs=[input_ids, attention_mask], outputs=[dense_valence, dense_energy, dense_danceability])
    return model

# 设置学习率调度器
def get_optimizer_and_scheduler(num_train_steps, num_warmup_steps):
    optimizer = Adam(learning_rate=2e-5, weight_decay=0.01)
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_train_steps
    )
    return optimizer, lr_scheduler

num_train_steps = len(X_train_input_ids) // 16 * 5  # 数据量 / batch_size * epochs
num_warmup_steps = num_train_steps // 10  # 通常设置为训练步骤的10%

# 构建并训练BERT模型
model = build_roberta_model()

early_stopping = EarlyStopping(monitor='val_loss', patience=3)

# 设置学习率调度器
num_train_steps = len(X_train_input_ids) // 16 * 5  # 数据量 / batch_size * epochs
num_warmup_steps = num_train_steps // 10  # 通常设置为训练步骤的10%

optimizer = Adam(learning_rate=tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=2e-5,
    decay_steps=num_train_steps,
    end_learning_rate=0.0
))

model.compile(optimizer=optimizer,
              loss={'valence_output': 'categorical_crossentropy', 
                    'energy_output': 'categorical_crossentropy', 
                    'danceability_output': 'categorical_crossentropy'},
              metrics={'valence_output': 'accuracy', 
                       'energy_output': 'accuracy', 
                       'danceability_output': 'accuracy'})

history = model.fit(
    [X_train_input_ids, X_train_attention_masks],
    {'valence_output': y_train_valence, 'energy_output': y_train_energy, 'danceability_output': y_train_danceability},
    validation_data=([X_val_input_ids, X_val_attention_masks], {'valence_output': y_val_valence, 'energy_output': y_val_energy, 'danceability_output': y_val_danceability}),
    epochs=5,
    batch_size=16,
    callbacks=[early_stopping]
)

# 评估RoBERTa模型
roberta_loss, roberta_valence_output_loss, roberta_energy_output_loss, roberta_danceability_output_loss, roberta_accuracy_valence, roberta_accuracy_energy, roberta_accuracy_danceability = model.evaluate(
    [X_test_input_ids, X_test_attention_masks], 
    [y_test_valence, y_test_energy, y_test_danceability]
)


print(f'Test Loss: {roberta_loss}, valence_output_loss: {roberta_valence_output_loss}, energy_output_loss: {roberta_energy_output_loss}, danceability_output_loss: {roberta_accuracy_valence}')
print(f'Test Accuracy Valence: {roberta_accuracy_valence}, Test Accuracy Energy: {roberta_accuracy_energy}, Test Accuracy Danceability: {roberta_accuracy_danceability}')


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'roberta.embeddings.position_ids', 'lm_head.bias']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaModel were not initialized from the PyTorch model and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infe

Epoch 1/5




KeyboardInterrupt: 

Test Loss: 3.2314350605010986, valence_output_loss: 1.0374451875686646, energy_output_loss: 1.0355204343795776, danceability_output_loss: 0.5083333253860474
Test Accuracy Valence: 0.5083333253860474, Test Accuracy Energy: 0.44999998807907104, Test Accuracy Danceability: 0.5083333253860474
