In [1]:
import torch

# Report what PyTorch can see. torch.cuda.get_device_name() raises when no CUDA
# device is usable, so it is only queried after is_available() succeeds; the
# rest of the cell already falls back to the CPU in that case.
cuda_available = torch.cuda.is_available()
print("Number of GPU: ", torch.cuda.device_count())
if cuda_available:
    print("GPU Name: ", torch.cuda.get_device_name())
else:
    print("GPU Name: ", "N/A (CUDA not available)")

device = torch.device('cuda' if cuda_available else 'cpu')
print("Using device:", device)

# Quick smoke test: run a small matmul on the selected device and report
# whether the result lives on the GPU.
x = torch.rand(5, 5).to(device)
y = torch.rand(5, 5).to(device)
z = x @ y
print("Tensor on GPU:", z.is_cuda)

Number of GPU:  1
GPU Name:  NVIDIA GeForce RTX 3050 Laptop GPU
Using device: cuda
Tensor on GPU: True


In [None]:
import torch
import os
import pandas as pd
import re
import numpy as np
import pickle
import random
from tqdm import tqdm

import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import spacy
import nltk
from collections import Counter

In [None]:
# Make sure each NLTK resource is installed, downloading it only when the
# lookup fails. The package name passed to nltk.download() is the last path
# component of the resource identifier.
for resource_path in ('tokenizers/punkt', 'corpora/wordnet'):
    package_name = resource_path.split('/')[-1]
    try:
        nltk.data.find(resource_path)
    except LookupError:
        print(f"Downloading '{package_name}' NLTK package...")
        nltk.download(package_name)

## Separação dos dados

In [None]:
"""
Realiza o pré-processamento completo para um fluxo de pré-treinamento e
fine-tuning, tudo em uma única execução. O processo é dividido em três fases:
(1) Isola as letras do artista alvo em conjuntos exclusivos de treino,
validação e teste para evitar vazamento de dados. (2) Constrói um corpus geral
com as letras de todos os outros artistas (e as letras não exclusivas do
artista alvo), cria e salva um tokenizador geral e os datasets tokenizados
correspondentes. (3) Utiliza o tokenizador geral para processar e salvar os
datasets exclusivos do artista alvo, deixando-os prontos para o fine-tuning.
"""

CSV_FOLDER_PATH = 'csv'
CHUNK_SIZE = 10000

SEQUENCE_LENGTH = 100 

# Diretórios base de saída
BASE_OUTPUT_DIR_GENERAL = 'processed_data_general_char_split'
BASE_OUTPUT_DIR_ARTIST = 'processed_data_by_artist_char_split' 

TOKENIZER_SUBDIR = 'tokenizers'
DATA_SUBDIR = 'data'

TARGET_ARTIST = 'ArianaGrande' 
NUM_SONGS_TO_PROCESS = -1

TRAIN_SPLIT = 0.8
VAL_SPLIT = 0.1
TEST_SPLIT = 0.1 
RANDOM_SEED = 42

CLEAN_TARGET_ARTIST_NAME = TARGET_ARTIST.replace(" ", "_").replace(".", "").lower()

# Frase placeholder a ser removida
PLACEHOLDER_PHRASE = "lyrics for this song have yet to be released please check back once the song has been released"


# --- Funções Auxiliares ---
def clean_text(text):
    """Normalize a raw lyric into the character set used by the tokenizer.

    The text is lower-cased, every character other than ``a-z``, ``0-9``,
    whitespace and ``. , ! ? ; : ' -`` is dropped, and each run of whitespace
    (including newlines) collapses to a single space with the ends trimmed.

    Args:
        text: Raw lyric. Anything that is not a ``str`` (for example the NaN
            pandas yields for a missing cell) is treated as missing.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    lowered = text.lower()
    allowed_only = re.sub(r'[^a-z0-9\s.,!?;:\'\-]', '', lowered)
    return re.sub(r'\s+', ' ', allowed_only).strip()

def save_object(obj, path):
    """Serialize ``obj`` to ``path`` with pickle and report where it was written.

    Args:
        obj: Any picklable Python object.
        path: Destination file path; an existing file is overwritten.
    """
    with open(path, 'wb') as out_file:
        pickler = pickle.Pickler(out_file, protocol=pickle.HIGHEST_PROTOCOL)
        pickler.dump(obj)
    print(f"Objeto salvo em: {path}")

def load_object(path):
    """Load a pickled object from ``path``.

    Args:
        path: File previously written with pickle.

    Returns:
        The unpickled object, or ``None`` when ``path`` does not exist. A
        missing file is reported through the return value, not an exception.
    """
    if not os.path.exists(path):
        return None
    # Unpickling can execute arbitrary code: only load files this notebook wrote.
    with open(path, 'rb') as in_file:
        loaded = pickle.Unpickler(in_file).load()
    print(f"Objeto carregado de: {path}")
    return loaded

class CharacterTokenizer:
    """Character-level tokenizer that maps each character to a positive integer id.

    Id 0 is never assigned to a character. ``texts_to_sequences`` emits it for
    characters that were not seen by ``fit_on_texts``, and ``sequences_to_texts``
    decodes it (like any other unknown id) to the empty string. ``vocab_size``
    therefore equals the number of known characters plus one.
    """

    def __init__(self):
        self.char_to_int = {}
        self.int_to_char = {}
        self.vocab_size = 0

    def fit_on_texts(self, texts):
        """Build the vocabulary from a string or a list of strings.

        Characters are numbered from 1 in sorted order, so the mapping is
        deterministic for a given corpus. Any previous vocabulary is replaced.
        """
        corpus = "".join(texts) if isinstance(texts, list) else texts
        unique_chars = sorted(set(corpus))
        self.char_to_int = {char: idx for idx, char in enumerate(unique_chars, start=1)}
        self.int_to_char = {idx: char for char, idx in self.char_to_int.items()}
        self.vocab_size = len(self.char_to_int) + 1

    def texts_to_sequences(self, texts):
        """Encode a string to a list of ids, or a list of strings to a list of lists.

        Unknown characters become 0. Any other input type yields ``[]``.
        """
        if isinstance(texts, str):
            return [self.char_to_int.get(char, 0) for char in texts]
        if isinstance(texts, list):
            return [[self.char_to_int.get(char, 0) for char in s] for s in texts]
        return []

    def sequences_to_texts(self, sequences):
        """Decode one id sequence to a string, or a batch of sequences to a list of strings.

        The input is treated as a batch when its first element is itself a
        sequence (list, tuple, or an array with at least one dimension). An
        empty input decodes to ``""`` instead of raising ``IndexError``.
        """
        # len() rather than truthiness so that NumPy arrays are accepted too.
        if len(sequences) == 0:
            return ""
        first = sequences[0]
        is_batch = isinstance(first, (list, tuple)) or getattr(first, 'ndim', 0) >= 1
        if is_batch:
            return [self._decode(seq) for seq in sequences]
        return self._decode(sequences)

    def _decode(self, ids):
        """Join the characters for ``ids``, silently dropping unknown ids."""
        return "".join(self.int_to_char.get(i, '') for i in ids)

def create_sequences_from_lyrics_list(lyrics_list, tokenizer, sequence_length, description):
    """Turn lyrics into (input window, next character) training pairs.

    Each lyric is tokenized on its own and every run of ``sequence_length``
    consecutive ids becomes one input row, with the id that follows it as the
    target. Windows never span two lyrics. Lyrics shorter than
    ``sequence_length + 1`` tokens contribute no pairs.

    Args:
        lyrics_list: Iterable of lyric strings.
        tokenizer: Object exposing ``texts_to_sequences(str) -> list[int]``.
        sequence_length: Number of ids per input window.
        description: Label used in progress and summary messages.

    Returns:
        ``(X, y)`` as int32 arrays of shape ``(N, sequence_length)`` and
        ``(N,)``. When no lyric is long enough, ``N`` is 0 and the arrays keep
        the same dtype and rank, so callers and saved files stay consistent.
    """
    window_batches = []
    target_batches = []
    print(f"\n{description}: Criando sequências e tokenizando...")
    total_chars_in_subset = 0
    for lyric in tqdm(lyrics_list, desc=f"Processando {description} músicas"):
        tokenized_lyric = tokenizer.texts_to_sequences(lyric)
        total_chars_in_subset += len(tokenized_lyric)
        if len(tokenized_lyric) < sequence_length + 1:
            continue

        # A window of sequence_length + 1 ids holds the input and its target.
        # The strided view avoids materializing one Python list per window.
        tokens = np.asarray(tokenized_lyric, dtype=np.int32)
        windows = np.lib.stride_tricks.sliding_window_view(tokens, sequence_length + 1)
        window_batches.append(windows[:, :-1])
        target_batches.append(windows[:, -1])

    total_sequences = sum(len(batch) for batch in target_batches)
    print(f"Total de caracteres processados no {description}: {total_chars_in_subset}")
    print(f"Total de sequências criadas para {description}: {total_sequences}")

    if total_sequences == 0:
        print(f"Nenhuma sequência válida criada para {description}. Verifique o comprimento das letras ou SEQUENCE_LENGTH.")
        return (np.empty((0, sequence_length), dtype=np.int32),
                np.empty((0,), dtype=np.int32))

    # Concatenating the views copies the data once into contiguous arrays.
    return np.concatenate(window_batches), np.concatenate(target_batches)


# --- PHASE 1: TARGET-ARTIST PRE-PROCESSING (ISOLATES EXCLUSIVE TEST, TRAIN AND VALIDATION SETS) ---
# Reads the target artist's CSV, cleans every lyric, drops placeholders and
# exact duplicates, then shuffles with a fixed seed and splits at song level.
print("--- FASE 1: PRÉ-PROCESSAMENTO DO ARTISTA ALVO (PARA ISOLAR TESTE, TREINO E VALIDAÇÃO EXCLUSIVOS) ---")

ARTIST_LYRICS_CLEAN_ALL = []
TEST_LYRICS_ARTIST_FINAL = []
VAL_LYRICS_ARTIST_EXCLUSIVE = []
TRAIN_LYRICS_ARTIST_EXCLUSIVE = []

CSV_FILE_PATH_TARGET_ARTIST = os.path.join(CSV_FOLDER_PATH, f'{TARGET_ARTIST}.csv')

print(f"Coletando e limpando TODAS as letras de '{TARGET_ARTIST}' do arquivo '{CSV_FILE_PATH_TARGET_ARTIST}' para separação...")
try:
    df_target_artist_full = pd.read_csv(CSV_FILE_PATH_TARGET_ARTIST, usecols=['Lyric'])

    initial_lyrics_count_artist_all = len(df_target_artist_full)
    removed_lyrics_count_artist_all = 0
    duplicate_lyrics_count_artist_all = 0
    seen_artist_lyrics = set()

    for lyric_val in tqdm(df_target_artist_full['Lyric'], desc=f"Limpando e filtrando letras de {TARGET_ARTIST}"):
        cleaned_lyric = clean_text(lyric_val)

        if PLACEHOLDER_PHRASE in cleaned_lyric:
            removed_lyrics_count_artist_all += 1
            continue

        if not cleaned_lyric:
            continue

        # The same cleaned lyric appearing twice could be split between train
        # and test, which would leak test data; keep the first occurrence only.
        if cleaned_lyric in seen_artist_lyrics:
            duplicate_lyrics_count_artist_all += 1
            continue

        seen_artist_lyrics.add(cleaned_lyric)
        ARTIST_LYRICS_CLEAN_ALL.append(cleaned_lyric)

    print(f"Total de letras iniciais para '{TARGET_ARTIST}': {initial_lyrics_count_artist_all}")
    print(f"Total de letras removidas (placeholder) para '{TARGET_ARTIST}': {removed_lyrics_count_artist_all}")
    print(f"Total de letras duplicadas removidas para '{TARGET_ARTIST}': {duplicate_lyrics_count_artist_all}")
    print(f"Total de letras coletadas e limpas de '{TARGET_ARTIST}': {len(ARTIST_LYRICS_CLEAN_ALL)}")

    random.seed(RANDOM_SEED)
    random.shuffle(ARTIST_LYRICS_CLEAN_ALL)

    num_total_artist_lyrics = len(ARTIST_LYRICS_CLEAN_ALL)

    # Test and validation sizes are floored; training receives the remainder.
    num_test_artist_final = int(num_total_artist_lyrics * TEST_SPLIT)
    num_val_artist_exclusive = int(num_total_artist_lyrics * VAL_SPLIT)
    num_train_artist_exclusive = max(
        num_total_artist_lyrics - num_test_artist_final - num_val_artist_exclusive, 0
    )

    test_end = num_test_artist_final
    val_end = test_end + num_val_artist_exclusive
    train_end = val_end + num_train_artist_exclusive

    TEST_LYRICS_ARTIST_FINAL = ARTIST_LYRICS_CLEAN_ALL[:test_end]
    VAL_LYRICS_ARTIST_EXCLUSIVE = ARTIST_LYRICS_CLEAN_ALL[test_end:val_end]
    TRAIN_LYRICS_ARTIST_EXCLUSIVE = ARTIST_LYRICS_CLEAN_ALL[val_end:train_end]

    print(f"\nDivisão das músicas de '{TARGET_ARTIST}' (Exclusivas para Fine-tuning):")
    print(f"Músicas reservadas para TESTE FINAL: {len(TEST_LYRICS_ARTIST_FINAL)} músicas")
    print(f"Músicas reservadas para VALIDAÇÃO EXCLUSIVA: {len(VAL_LYRICS_ARTIST_EXCLUSIVE)} músicas")
    print(f"Músicas reservadas para TREINO EXCLUSIVO: {len(TRAIN_LYRICS_ARTIST_EXCLUSIVE)} músicas")

# Re-raise instead of calling exit(): inside a notebook exit() shuts the kernel
# down, while a raised exception stops the cell and keeps the session alive.
except FileNotFoundError:
    print(f"Erro: Arquivo CSV do artista alvo não encontrado em '{CSV_FILE_PATH_TARGET_ARTIST}'. Não será possível evitar vazamento de dados.")
    raise
except ValueError as e:
    print(f"Erro na separação de dados de '{TARGET_ARTIST}': {e}. Não será possível evitar vazamento de dados.")
    raise


# --- PHASE 2: PRE-PROCESSING FOR THE GENERAL MODEL (ALL ARTISTS, EXCLUDING THE TARGET ARTIST'S EXCLUSIVE DATA) ---
# Collects cleaned lyrics from every CSV, leaves out anything reserved for the
# target artist, splits the corpus, fits the tokenizer on the training split
# only, and saves tokenizer, vocabulary size and tokenized arrays to disk.
print("\n--- FASE 2: PRÉ-PROCESSAMENTO PARA O MODELO GERAL (Excluindo dados exclusivos do artista alvo) ---")

ALL_ARTISTS_LYRICS_FOR_GENERAL_MODEL = []
GENERAL_OUTPUT_DIR = os.path.join(BASE_OUTPUT_DIR_GENERAL)
os.makedirs(os.path.join(GENERAL_OUTPUT_DIR, TOKENIZER_SUBDIR), exist_ok=True)
os.makedirs(os.path.join(GENERAL_OUTPUT_DIR, DATA_SUBDIR), exist_ok=True)

EXCLUSIVE_ARTIST_DATA_SET = set(TEST_LYRICS_ARTIST_FINAL + VAL_LYRICS_ARTIST_EXCLUSIVE + TRAIN_LYRICS_ARTIST_EXCLUSIVE)

# os.listdir returns entries in arbitrary order; sorting keeps the corpus order,
# and therefore the seeded shuffle below, reproducible across machines.
for filename in sorted(os.listdir(CSV_FOLDER_PATH)):
    if not filename.endswith(".csv"):
        continue

    artist_name_current_file = filename.replace(".csv", "")
    csv_file_path_artist = os.path.join(CSV_FOLDER_PATH, filename)

    try:
        df_artist_current = pd.read_csv(csv_file_path_artist, usecols=['Lyric'])

        for lyric_val in tqdm(df_artist_current['Lyric'], desc=f"Coletando corpus geral de {artist_name_current_file}"):
            cleaned_lyric = clean_text(lyric_val)

            if PLACEHOLDER_PHRASE in cleaned_lyric:
                continue

            # Exclude reserved lyrics no matter which file they come from: the
            # same song can appear under another artist's CSV, and letting it
            # through would leak held-out data into the general corpus.
            if cleaned_lyric in EXCLUSIVE_ARTIST_DATA_SET:
                continue

            if cleaned_lyric:
                ALL_ARTISTS_LYRICS_FOR_GENERAL_MODEL.append(cleaned_lyric)

    except Exception as e:
        # Deliberate best effort: an unreadable file is reported and skipped.
        print(f"Erro ao coletar letras de '{artist_name_current_file}' para corpus geral: {e}. Pulando.")

print(f"\nTotal de letras no Corpus GERAL (APÓS exclusão de todos os dados exclusivos do artista alvo): {len(ALL_ARTISTS_LYRICS_FOR_GENERAL_MODEL)}")


random.seed(RANDOM_SEED)
random.shuffle(ALL_ARTISTS_LYRICS_FOR_GENERAL_MODEL)

# Split the general corpus for training the base model (song-level split).
num_total_lyrics_general = len(ALL_ARTISTS_LYRICS_FOR_GENERAL_MODEL)
num_train_general = int(num_total_lyrics_general * TRAIN_SPLIT)
num_val_general = int(num_total_lyrics_general * VAL_SPLIT)
num_test_general = num_total_lyrics_general - num_train_general - num_val_general

train_lyrics_general = ALL_ARTISTS_LYRICS_FOR_GENERAL_MODEL[:num_train_general]
val_lyrics_general = ALL_ARTISTS_LYRICS_FOR_GENERAL_MODEL[num_train_general : num_train_general + num_val_general]
test_lyrics_general = ALL_ARTISTS_LYRICS_FOR_GENERAL_MODEL[num_train_general + num_val_general :]

print(f"\nDivisão das músicas (Corpus GERAL FINAL):")
print(f"Treino Geral: {len(train_lyrics_general)} músicas")
print(f"Validação Geral: {len(val_lyrics_general)} músicas")
print(f"Teste Geral: {len(test_lyrics_general)} músicas")

# Create and fit the general tokenizer on the training lyrics only; characters
# that appear only in other splits are encoded as the unknown id 0.
lyrics_for_vocab_general = " ".join(train_lyrics_general)
print("\nPasso 4: Construindo vocabulário de caracteres GERAL com as letras de TREINO GERAL...")
tokenizer_general = CharacterTokenizer()
tokenizer_general.fit_on_texts(lyrics_for_vocab_general)
total_chars_general = tokenizer_general.vocab_size
print(f"Vocabulário de caracteres GERAL construído. Tamanho do vocabulário: {total_chars_general}")

TOKENIZER_SAVE_PATH_GENERAL = os.path.join(GENERAL_OUTPUT_DIR, TOKENIZER_SUBDIR, 'char_tokenizer_general.pkl')
save_object(tokenizer_general, TOKENIZER_SAVE_PATH_GENERAL)

# Build and tokenize the sequences for the GENERAL splits (train, val, test).
X_train_general, y_train_general = create_sequences_from_lyrics_list(train_lyrics_general, tokenizer_general, SEQUENCE_LENGTH, "Conjunto de Treino Geral")
X_val_general, y_val_general = create_sequences_from_lyrics_list(val_lyrics_general, tokenizer_general, SEQUENCE_LENGTH, "Conjunto de Validação Geral")
X_test_general, y_test_general = create_sequences_from_lyrics_list(test_lyrics_general, tokenizer_general, SEQUENCE_LENGTH, "Conjunto de Teste Geral")

print(f"\nShapes finais dos dados tokenizados (Corpus Geral):")
print(f"X_train_general shape: {X_train_general.shape}, y_train_general shape: {y_train_general.shape}")
print(f"X_val_general shape: {X_val_general.shape}, y_val_general shape: {y_val_general.shape}")
print(f"X_test_general shape: {X_test_general.shape}, y_test_general shape: {y_test_general.shape}")

# Save the tokenized GENERAL corpus.
general_data_dir_out = os.path.join(GENERAL_OUTPUT_DIR, DATA_SUBDIR)

output_file_x_train_general = os.path.join(general_data_dir_out, 'X_train_general_char.npy')
output_file_y_train_general = os.path.join(general_data_dir_out, 'y_train_general_char.npy')
np.save(output_file_x_train_general, X_train_general)
np.save(output_file_y_train_general, y_train_general)
print(f"Dados de TREINO GERAL salvos em {output_file_x_train_general} e {output_file_y_train_general}")

output_file_x_val_general = os.path.join(general_data_dir_out, 'X_val_general_char.npy')
output_file_y_val_general = os.path.join(general_data_dir_out, 'y_val_general_char.npy')
np.save(output_file_x_val_general, X_val_general)
np.save(output_file_y_val_general, y_val_general)
print(f"Dados de VALIDAÇÃO GERAL salvos em {output_file_x_val_general} e {output_file_y_val_general}")

output_file_x_test_general = os.path.join(general_data_dir_out, 'X_test_general_char.npy')
output_file_y_test_general = os.path.join(general_data_dir_out, 'y_test_general_char.npy')
np.save(output_file_x_test_general, X_test_general)
np.save(output_file_y_test_general, y_test_general)
print(f"Dados de TESTE GERAL salvos em {output_file_x_test_general} e {output_file_y_test_general}")

output_file_vocab_size_general = os.path.join(GENERAL_OUTPUT_DIR, TOKENIZER_SUBDIR, 'vocab_size_general_char.pkl')
save_object(total_chars_general, output_file_vocab_size_general)
print(f"Tamanho do vocabulário GERAL salvo em {output_file_vocab_size_general}")

print(f"\nPré-processamento para o Corpus GERAL (caractere a caractere) concluído.")


# --- PHASE 3: PRE-PROCESSING FOR THE TARGET ARTIST (FINE-TUNING) ---
# Tokenizes the target artist's exclusive splits with the general tokenizer so
# that ids match the pre-trained model, then saves the arrays to disk.
print("\n--- FASE 3: PRÉ-PROCESSAMENTO PARA O ARTISTA ALVO (FINE-TUNING) ---")

ARTIST_OUTPUT_DIR = os.path.join(BASE_OUTPUT_DIR_ARTIST, CLEAN_TARGET_ARTIST_NAME)
os.makedirs(os.path.join(ARTIST_OUTPUT_DIR, TOKENIZER_SUBDIR), exist_ok=True)
os.makedirs(os.path.join(ARTIST_OUTPUT_DIR, DATA_SUBDIR), exist_ok=True)

train_lyrics_artist = TRAIN_LYRICS_ARTIST_EXCLUSIVE
val_lyrics_artist = VAL_LYRICS_ARTIST_EXCLUSIVE
test_lyrics_artist = TEST_LYRICS_ARTIST_FINAL

print(f"\nDivisão das músicas (Artista Alvo '{TARGET_ARTIST}' para Fine-tuning):")
print(f"Treino Artista: {len(train_lyrics_artist)} músicas")
print(f"Validação Artista: {len(val_lyrics_artist)} músicas")
print(f"Teste Artista: {len(test_lyrics_artist)} músicas (Exclusivo, NÃO Visto no Treino Geral!)")

print(f"\nReutilizando o tokenizer geral ({TOKENIZER_SAVE_PATH_GENERAL}) para os dados do artista alvo...")
# load_object signals a missing file by returning None (it never raises
# FileNotFoundError), so the result is checked explicitly. Raising stops the
# cell without shutting the kernel down, unlike exit().
tokenizer_for_artist_data = load_object(TOKENIZER_SAVE_PATH_GENERAL)
if tokenizer_for_artist_data is None:
    raise FileNotFoundError("Erro: Tokenizer geral não encontrado. Não é possível processar dados do artista alvo. Por favor, rode a FASE 2 primeiro.")

X_train_artist, y_train_artist = create_sequences_from_lyrics_list(train_lyrics_artist, tokenizer_for_artist_data, SEQUENCE_LENGTH, "Conjunto de Treino Artista")
X_val_artist, y_val_artist = create_sequences_from_lyrics_list(val_lyrics_artist, tokenizer_for_artist_data, SEQUENCE_LENGTH, "Conjunto de Validação Artista")
X_test_artist, y_test_artist = create_sequences_from_lyrics_list(test_lyrics_artist, tokenizer_for_artist_data, SEQUENCE_LENGTH, "Conjunto de Teste Artista (Exclusivo)")

print(f"\nShapes finais dos dados tokenizados (Artista Alvo '{TARGET_ARTIST}'):")
print(f"X_train_artist shape: {X_train_artist.shape}, y_train_artist shape: {y_train_artist.shape}")
print(f"X_val_artist shape: {X_val_artist.shape}, y_val_artist shape: {y_val_artist.shape}")
print(f"X_test_artist shape: {X_test_artist.shape}, y_test_artist shape: {y_test_artist.shape}")

# Save the tokenized TARGET ARTIST data.
artist_data_dir_out = os.path.join(ARTIST_OUTPUT_DIR, DATA_SUBDIR)

output_file_x_train_artist = os.path.join(artist_data_dir_out, f'X_train_{CLEAN_TARGET_ARTIST_NAME}_char.npy')
output_file_y_train_artist = os.path.join(artist_data_dir_out, f'y_train_{CLEAN_TARGET_ARTIST_NAME}_char.npy')
np.save(output_file_x_train_artist, X_train_artist)
np.save(output_file_y_train_artist, y_train_artist)
print(f"Dados de TREINO do Artista salvos em {output_file_x_train_artist} e {output_file_y_train_artist}")

output_file_x_val_artist = os.path.join(artist_data_dir_out, f'X_val_{CLEAN_TARGET_ARTIST_NAME}_char.npy')
output_file_y_val_artist = os.path.join(artist_data_dir_out, f'y_val_{CLEAN_TARGET_ARTIST_NAME}_char.npy')
np.save(output_file_x_val_artist, X_val_artist)
np.save(output_file_y_val_artist, y_val_artist)
print(f"Dados de VALIDAÇÃO do Artista salvos em {output_file_x_val_artist} e {output_file_y_val_artist}")

output_file_x_test_artist = os.path.join(artist_data_dir_out, f'X_test_{CLEAN_TARGET_ARTIST_NAME}_char.npy')
output_file_y_test_artist = os.path.join(artist_data_dir_out, f'y_test_{CLEAN_TARGET_ARTIST_NAME}_char.npy')
np.save(output_file_x_test_artist, X_test_artist)
np.save(output_file_y_test_artist, y_test_artist)
print(f"Dados de TESTE do Artista salvos em {output_file_x_test_artist} e {output_file_y_test_artist}")

# The artist data uses the general vocabulary, so the general size is stored.
output_file_vocab_size_artist = os.path.join(ARTIST_OUTPUT_DIR, TOKENIZER_SUBDIR, f'vocab_size_{CLEAN_TARGET_ARTIST_NAME}_char.pkl')
save_object(total_chars_general, output_file_vocab_size_artist)
print(f"Tamanho do vocabulário GERAL (para o artista alvo) salvo em {output_file_vocab_size_artist}")


print(f"\nPré-processamento para o artista '{TARGET_ARTIST}' (caractere a caractere) concluído.")
print(f"Dados tokenizados para o artista salvo no diretório: {ARTIST_OUTPUT_DIR}")

## Treino

In [None]:
"""
Define as configurações para as fases de pré-treinamento e fine-tuning,
incluindo diretórios de modelo, épocas e taxas de aprendizado. Também define
as classes PyTorch essenciais: a classe 'LyricsDataset' para carregar os
dados tokenizados de forma eficiente e a arquitetura do modelo 'CharLSTM'.
"""
# --- Configurações de Treinamento e Modelo ---
MODEL_DIR_BASE_GENERAL = 'models_general_char_split'
MODEL_DIR_BASE_ARTIST_FINE_TUNED = 'models_by_artist_char_split_finetuned'
BATCH_SIZE = 128
EPOCHS_GENERAL = 5
EPOCHS_FINE_TUNE = 10
LEARNING_RATE_GENERAL = 0.001
LEARNING_RATE_FINE_TUNE = 0.0001
EMBEDDING_DIM = 256
HIDDEN_DIM = 256
N_LAYERS = 4
DROPOUT_RATE = 0.2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Usando dispositivo: {device}")

# --- Caminhos Específicos dos Modelos ---
GENERAL_DATA_DIR = os.path.join(BASE_OUTPUT_DIR_GENERAL)
ARTIST_DATA_DIR = os.path.join(BASE_OUTPUT_DIR_ARTIST, CLEAN_TARGET_ARTIST_NAME)
MODEL_DIR_GENERAL = os.path.join(MODEL_DIR_BASE_GENERAL)
MODEL_DIR_ARTIST_FINE_TUNED = os.path.join(MODEL_DIR_BASE_ARTIST_FINE_TUNED, CLEAN_TARGET_ARTIST_NAME)
os.makedirs(MODEL_DIR_GENERAL, exist_ok=True)
os.makedirs(MODEL_DIR_ARTIST_FINE_TUNED, exist_ok=True)


# --- Definições de Classes PyTorch ---
class LyricsDataset(Dataset):
    """Dataset over a pair of tokenized ``.npy`` files (inputs and targets).

    Both files are opened with ``mmap_mode='r'``, so rows are read from disk on
    access instead of being loaded into memory up front.
    """

    def __init__(self, x_tokens_path, y_tokens_path):
        """Memory-map the input array and the target array in read-only mode."""
        self.x_data, self.y_data = (
            np.load(tokens_path, mmap_mode='r')
            for tokens_path in (x_tokens_path, y_tokens_path)
        )

    def __len__(self):
        """Number of samples, taken from the input array."""
        return len(self.x_data)

    def __getitem__(self, idx):
        """Return ``(input_ids, target_id)`` for ``idx`` as ``torch.long`` tensors.

        ``torch.tensor`` copies the selected data, so the returned tensors do
        not alias the read-only memory map.
        """
        features = self.x_data[idx]
        target = self.y_data[idx]
        return (
            torch.tensor(features, dtype=torch.long),
            torch.tensor(target, dtype=torch.long),
        )

class CharLSTM(nn.Module):
    """Embedding -> stacked bidirectional LSTM -> linear head for next-character prediction.

    ``dropout_strategy`` selects where ``dropout_rate`` is applied:
    ``"between_lstm"`` enables the LSTM's inter-layer dropout (only when there
    is more than one layer), ``"before_fc"`` applies dropout to the features
    fed to the linear head, and any other value (default ``"none"``) disables
    dropout in the forward pass.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers, dropout_rate, dropout_strategy="none"):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.dropout_strategy = dropout_strategy
        self.dropout_rate = dropout_rate
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # Inter-layer dropout is only meaningful with at least two stacked layers.
        if dropout_strategy == "between_lstm" and n_layers > 1:
            lstm_dropout_param = dropout_rate
        else:
            lstm_dropout_param = 0.0
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=lstm_dropout_param,
            batch_first=True,
            bidirectional=True,
        )

        self.dropout_layer = nn.Dropout(dropout_rate)
        # Both directions are concatenated, hence twice the hidden size.
        self.fc = nn.Linear(hidden_dim * 2, vocab_size)

    def forward(self, text):
        """Return unnormalized scores over the vocabulary for each input sequence.

        Args:
            text: Integer token ids, batch first.

        Returns:
            A tensor with one row of ``vocab_size`` logits per sequence.
        """
        lstm_out, _ = self.lstm(self.embedding(text))
        # NOTE(review): at the last time step the reverse direction has only
        # processed the final token, so half of these features summarize a
        # single character. Confirm this is intended.
        last_step = lstm_out[:, -1, :]
        if self.dropout_strategy == "before_fc":
            last_step = self.dropout_layer(last_step)
        return self.fc(last_step)

In [None]:
"""
Carrega o corpus geral pré-processado e o tokenizador correspondente. Em
seguida, executa a primeira fase do treinamento: o pré-treinamento do modelo
CharLSTM no grande corpus de letras de múltiplos artistas. O modelo com a menor
perda de validação e seu histórico de treinamento são salvos no disco para a
subsequente fase de fine-tuning.
"""
try:
    tokenizer_general = load_object(os.path.join(GENERAL_DATA_DIR, TOKENIZER_SUBDIR, 'char_tokenizer_general.pkl'))
    total_chars_general = load_object(os.path.join(GENERAL_DATA_DIR, TOKENIZER_SUBDIR, 'vocab_size_general_char.pkl'))
    print(f"Vocabulário de caracteres geral carregado. Tamanho: {total_chars_general}")
except FileNotFoundError as e:
    print(f"Erro: Arquivos do tokenizer geral não encontrados. Detalhes: {e}.")
    exit()

print("\n--- Iniciando Fase 1: Treinamento do Modelo Geral ---")
X_train_general_path = os.path.join(GENERAL_DATA_DIR, DATA_SUBDIR, 'X_train_general_char.npy')
y_train_general_path = os.path.join(GENERAL_DATA_DIR, DATA_SUBDIR, 'y_train_general_char.npy')
X_val_general_path = os.path.join(GENERAL_DATA_DIR, DATA_SUBDIR, 'X_val_general_char.npy')
y_val_general_path = os.path.join(GENERAL_DATA_DIR, DATA_SUBDIR, 'y_val_general_char.npy')

if not all(os.path.exists(p) for p in [X_train_general_path, y_train_general_path, X_val_general_path, y_val_general_path]):
    print(f"Dados gerais de treino/validação não encontrados em {GENERAL_DATA_DIR}/{DATA_SUBDIR}. Execute o pré-processamento.")
    exit()

train_dataset_general = LyricsDataset(X_train_general_path, y_train_general_path)
val_dataset_general = LyricsDataset(X_val_general_path, y_val_general_path)
train_loader_general = DataLoader(train_dataset_general, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
val_loader_general = DataLoader(val_dataset_general, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
print(f"Dataset de treino geral: {len(train_dataset_general)} amostras.")
print(f"Dataset de validação geral: {len(val_dataset_general)} amostras.")

CURRENT_DROPOUT_STRATEGY_GENERAL = "none"

model_general = CharLSTM(
    vocab_size=total_chars_general,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    n_layers=N_LAYERS,
    dropout_rate=DROPOUT_RATE,
    dropout_strategy=CURRENT_DROPOUT_STRATEGY_GENERAL
).to(device)

print("\nModelo Geral (Base):", model_general, sep='\n')
criterion_general = nn.CrossEntropyLoss()
optimizer_general = optim.Adam(model_general.parameters(), lr=LEARNING_RATE_GENERAL)

best_val_loss_general = float('inf')
model_general_save_filepath = os.path.join(MODEL_DIR_GENERAL, f"model_weights_general_strategy_{CURRENT_DROPOUT_STRATEGY_GENERAL}_best.pt")
history_general = {'train_loss': [], 'val_loss': [], 'train_accuracy': [], 'val_accuracy': [], 'dropout_strategy': CURRENT_DROPOUT_STRATEGY_GENERAL}

print(f"Iniciando treinamento geral com estratégia de dropout: '{CURRENT_DROPOUT_STRATEGY_GENERAL}'...")
for epoch in range(EPOCHS_GENERAL):
    model_general.train()
    train_loss, correct_predictions, total_predictions = 0.0, 0, 0
    for inputs, labels in tqdm(train_loader_general, desc=f"Epoch {epoch+1}/{EPOCHS_GENERAL} [Train General]"):
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer_general.zero_grad()
        outputs = model_general(inputs)
        loss = criterion_general(outputs, labels)
        loss.backward()
        optimizer_general.step()
        train_loss += loss.item() * inputs.size(0)
        _, predicted = torch.max(outputs, 1)
        total_predictions += labels.size(0)
        correct_predictions += (predicted == labels).sum().item()
    
    history_general['train_loss'].append(train_loss / len(train_dataset_general))
    history_general['train_accuracy'].append(correct_predictions / total_predictions)

    model_general.eval()
    val_loss, correct_predictions, total_predictions = 0.0, 0, 0
    with torch.no_grad():
        for inputs, labels in tqdm(val_loader_general, desc=f"Epoch {epoch+1}/{EPOCHS_GENERAL} [Validation General]"):
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model_general(inputs)
            loss = criterion_general(outputs, labels)
            val_loss += loss.item() * inputs.size(0)
            _, predicted = torch.max(outputs, 1)
            total_predictions += labels.size(0)
            correct_predictions += (predicted == labels).sum().item()

    current_val_loss = val_loss / len(val_dataset_general)
    history_general['val_loss'].append(current_val_loss)
    history_general['val_accuracy'].append(correct_predictions / total_predictions)
    
    print(f"Epoch {epoch+1}/{EPOCHS_GENERAL}: Train Loss: {history_general['train_loss'][-1]:.4f}, Val Loss: {current_val_loss:.4f}")

    if current_val_loss < best_val_loss_general:
        best_val_loss_general = current_val_loss
        torch.save(model_general.state_dict(), model_general_save_filepath)
        print(f"Modelo Geral salvo: Melhor val_loss em {best_val_loss_general:.4f}")

history_general_filepath = os.path.join(MODEL_DIR_GENERAL, f"training_history_general_strategy_{CURRENT_DROPOUT_STRATEGY_GENERAL}.pkl")
with open(history_general_filepath, 'wb') as f:
    pickle.dump(history_general, f)

print(f"\nTreinamento geral concluído. Melhor modelo salvo em: {model_general_save_filepath}")

In [None]:
"""
Executa a segunda fase do treinamento: o fine-tuning. Carrega os datasets
exclusivos do artista alvo, inicializa uma nova arquitetura de modelo e
carrega os pesos do modelo geral pré-treinado na etapa anterior. Em seguida,
continua o treinamento (fine-tuning), geralmente com uma taxa de aprendizado
menor, usando apenas os dados do artista. Salva o modelo especializado
resultante e seu histórico de treinamento.
"""
print("\n--- Iniciando Fase 2: Fine-tuning para o Artista Alvo ---")

X_train_artist_path = os.path.join(ARTIST_DATA_DIR, DATA_SUBDIR, f'X_train_{CLEAN_TARGET_ARTIST_NAME}_char.npy')
y_train_artist_path = os.path.join(ARTIST_DATA_DIR, DATA_SUBDIR, f'y_train_{CLEAN_TARGET_ARTIST_NAME}_char.npy')
X_val_artist_path = os.path.join(ARTIST_DATA_DIR, DATA_SUBDIR, f'X_val_{CLEAN_TARGET_ARTIST_NAME}_char.npy')
y_val_artist_path = os.path.join(ARTIST_DATA_DIR, DATA_SUBDIR, f'y_val_{CLEAN_TARGET_ARTIST_NAME}_char.npy')

if not all(os.path.exists(p) for p in [X_train_artist_path, y_train_artist_path, X_val_artist_path, y_val_artist_path]):
    print(f"Dados do artista alvo não encontrados em {ARTIST_DATA_DIR}/{DATA_SUBDIR}. Execute o pré-processamento.")
    exit()

train_dataset_artist = LyricsDataset(X_train_artist_path, y_train_artist_path)
val_dataset_artist = LyricsDataset(X_val_artist_path, y_val_artist_path)
train_loader_artist = DataLoader(train_dataset_artist, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
val_loader_artist = DataLoader(val_dataset_artist, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
print(f"Dataset de treino do artista: {len(train_dataset_artist)} amostras.")
print(f"Dataset de validação do artista: {len(val_dataset_artist)} amostras.")

CURRENT_DROPOUT_STRATEGY_FINE_TUNE = "none"

model_fine_tune = CharLSTM(
    vocab_size=total_chars_general,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    n_layers=N_LAYERS,
    dropout_rate=DROPOUT_RATE,
    dropout_strategy=CURRENT_DROPOUT_STRATEGY_FINE_TUNE
).to(device)

model_general_save_filepath_for_fine_tune = os.path.join(MODEL_DIR_GENERAL, f"model_weights_general_strategy_{CURRENT_DROPOUT_STRATEGY_FINE_TUNE}_best.pt")
try:
    model_fine_tune.load_state_dict(torch.load(model_general_save_filepath_for_fine_tune, map_location=device))
    print(f"Modelo pré-treinado carregado de: {model_general_save_filepath_for_fine_tune}")
except FileNotFoundError:
    print(f"Erro: Modelo geral pré-treinado com estratégia '{CURRENT_DROPOUT_STRATEGY_FINE_TUNE}' não encontrado. Treine o modelo geral primeiro.")
    exit()

print("\nModelo para Fine-tuning (inicializado com pesos gerais):", model_fine_tune, sep='\n')
criterion_fine_tune = nn.CrossEntropyLoss()
optimizer_fine_tune = optim.Adam(model_fine_tune.parameters(), lr=LEARNING_RATE_FINE_TUNE)

best_val_loss_fine_tune = float('inf')
model_fine_tune_save_filepath = os.path.join(MODEL_DIR_ARTIST_FINE_TUNED, f"model_weights_{CLEAN_TARGET_ARTIST_NAME}_finetuned_strategy_{CURRENT_DROPOUT_STRATEGY_FINE_TUNE}_best.pt")
history_fine_tune = {'train_loss': [], 'val_loss': [], 'train_accuracy': [], 'val_accuracy': [], 'dropout_strategy': CURRENT_DROPOUT_STRATEGY_FINE_TUNE}

print(f"Iniciando fine-tuning para '{TARGET_ARTIST}' com estratégia de dropout: '{CURRENT_DROPOUT_STRATEGY_FINE_TUNE}'...")
for epoch in range(EPOCHS_FINE_TUNE):
    model_fine_tune.train()
    train_loss, correct_predictions, total_predictions = 0.0, 0, 0
    for inputs, labels in tqdm(train_loader_artist, desc=f"Epoch {epoch+1}/{EPOCHS_FINE_TUNE} [Fine-tune]"):
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer_fine_tune.zero_grad()
        outputs = model_fine_tune(inputs)
        loss = criterion_fine_tune(outputs, labels)
        loss.backward()
        optimizer_fine_tune.step()
        train_loss += loss.item() * inputs.size(0)
        _, predicted = torch.max(outputs, 1)
        total_predictions += labels.size(0)
        correct_predictions += (predicted == labels).sum().item()

    history_fine_tune['train_loss'].append(train_loss / len(train_dataset_artist))
    history_fine_tune['train_accuracy'].append(correct_predictions / total_predictions)
    
    model_fine_tune.eval()
    val_loss, correct_predictions, total_predictions = 0.0, 0, 0
    with torch.no_grad():
        for inputs, labels in tqdm(val_loader_artist, desc=f"Epoch {epoch+1}/{EPOCHS_FINE_TUNE} [Validation Fine-tune]"):
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model_fine_tune(inputs)
            loss = criterion_fine_tune(outputs, labels)
            val_loss += loss.item() * inputs.size(0)
            _, predicted = torch.max(outputs, 1)
            total_predictions += labels.size(0)
            correct_predictions += (predicted == labels).sum().item()

    current_val_loss = val_loss / len(val_dataset_artist)
    history_fine_tune['val_loss'].append(current_val_loss)
    history_fine_tune['val_accuracy'].append(correct_predictions / total_predictions)

    print(f"Epoch {epoch+1}/{EPOCHS_FINE_TUNE}: Train Loss: {history_fine_tune['train_loss'][-1]:.4f}, Val Loss: {current_val_loss:.4f}")

    if current_val_loss < best_val_loss_fine_tune:
        best_val_loss_fine_tune = current_val_loss
        torch.save(model_fine_tune.state_dict(), model_fine_tune_save_filepath)
        print(f"Modelo Fine-tuned salvo: Melhor val_loss em {best_val_loss_fine_tune:.4f}")

# Persist the metric history collected in `history_fine_tune` as a pickle file
# inside MODEL_DIR_ARTIST_FINE_TUNED; the file name embeds the cleaned artist
# name and the dropout strategy used for this run.
history_fine_tune_filepath = os.path.join(MODEL_DIR_ARTIST_FINE_TUNED, f"training_history_{CLEAN_TARGET_ARTIST_NAME}_finetuned_strategy_{CURRENT_DROPOUT_STRATEGY_FINE_TUNE}.pkl")
with open(history_fine_tune_filepath, 'wb') as f:
    pickle.dump(history_fine_tune, f)

# Report where the history and the best-validation-loss checkpoint were written.
print(f"\nHistórico de fine-tuning salvo em: {history_fine_tune_filepath}")
print(f"Fine-tuning para '{TARGET_ARTIST}' concluído. Melhor modelo salvo em: {model_fine_tune_save_filepath}")

Usando dispositivo: cuda
Objeto carregado de: processed_data_general_char_split\tokenizers\char_tokenizer_general.pkl
Objeto carregado de: processed_data_general_char_split\tokenizers\vocab_size_general_char.pkl
Vocabulário de caracteres GERAL carregado. Tamanho do vocabulário: 36

--- FASE 1: TREINAMENTO DO MODELO GERAL ---
Dataset de treino geral com 7797308 amostras, 60917 batches.
Dataset de validação geral com 1020540 amostras, 7973 batches.

Modelo Geral (Base):
CharLSTM(
  (embedding): Embedding(36, 256)
  (lstm): LSTM(256, 256, num_layers=4, batch_first=True, bidirectional=True)
  (dropout_layer): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=512, out_features=36, bias=True)
)
Iniciando treinamento GERAL com estratégia de dropout: 'none'...


Epoch 1/3 [Train General]: 100%|██████████| 60917/60917 [1:53:21<00:00,  8.96it/s]
Epoch 1/3 [Validation General]: 100%|██████████| 7973/7973 [04:56<00:00, 26.89it/s]


Epoch 1/3: Train Loss: 1.2711, Train Acc: 0.6024, Val Loss: 1.2332, Val Acc: 0.6149
Modelo Geral salvo: Melhor val_loss em 1.2332


Epoch 2/3 [Train General]: 100%|██████████| 60917/60917 [1:51:37<00:00,  9.10it/s]
Epoch 2/3 [Validation General]: 100%|██████████| 7973/7973 [04:57<00:00, 26.78it/s]


Epoch 2/3: Train Loss: 1.1536, Train Acc: 0.6386, Val Loss: 1.2172, Val Acc: 0.6209
Modelo Geral salvo: Melhor val_loss em 1.2172


Epoch 3/3 [Train General]: 100%|██████████| 60917/60917 [1:55:54<00:00,  8.76it/s]
Epoch 3/3 [Validation General]: 100%|██████████| 7973/7973 [05:00<00:00, 26.56it/s]


Epoch 3/3: Train Loss: 1.1402, Train Acc: 0.6430, Val Loss: 1.2172, Val Acc: 0.6205
Modelo Geral salvo: Melhor val_loss em 1.2172

Histórico de treinamento GERAL salvo em: models_general_char_split\training_history_general_strategy_none.pkl

Treinamento GERAL concluído. Melhor modelo salvo em: models_general_char_split\model_weights_general_strategy_none_best.pt

--- FASE 2: FINE-TUNING PARA O ARTISTA ALVO ---
Dataset de treino do artista com 368522 amostras, 2880 batches.
Dataset de validação do artista com 39919 amostras, 312 batches.
Modelo pré-treinado carregado de: models_general_char_split\model_weights_general_strategy_none_best.pt

Modelo para Fine-tuning (inicializado com pesos gerais):
CharLSTM(
  (embedding): Embedding(36, 256)
  (lstm): LSTM(256, 256, num_layers=4, batch_first=True, bidirectional=True)
  (dropout_layer): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=512, out_features=36, bias=True)
)
Iniciando FINE-TUNING para 'ArianaGrande' com estratégia de dro

Epoch 1/10 [Fine-tune]: 100%|██████████| 2880/2880 [04:54<00:00,  9.78it/s]
Epoch 1/10 [Validation Fine-tune]: 100%|██████████| 312/312 [00:11<00:00, 27.75it/s]


Epoch 1/10: Train Loss: 1.0320, Train Acc: 0.6822, Val Loss: 0.9359, Val Acc: 0.7104
Modelo Fine-tuned salvo: Melhor val_loss em 0.9359


Epoch 2/10 [Fine-tune]: 100%|██████████| 2880/2880 [04:48<00:00,  9.98it/s]
Epoch 2/10 [Validation Fine-tune]: 100%|██████████| 312/312 [00:11<00:00, 27.89it/s]


Epoch 2/10: Train Loss: 0.8600, Train Acc: 0.7390, Val Loss: 0.8641, Val Acc: 0.7358
Modelo Fine-tuned salvo: Melhor val_loss em 0.8641


Epoch 3/10 [Fine-tune]: 100%|██████████| 2880/2880 [04:49<00:00,  9.94it/s]
Epoch 3/10 [Validation Fine-tune]: 100%|██████████| 312/312 [00:11<00:00, 27.79it/s]


Epoch 3/10: Train Loss: 0.7524, Train Acc: 0.7757, Val Loss: 0.8114, Val Acc: 0.7544
Modelo Fine-tuned salvo: Melhor val_loss em 0.8114


Epoch 4/10 [Fine-tune]: 100%|██████████| 2880/2880 [04:50<00:00,  9.92it/s]
Epoch 4/10 [Validation Fine-tune]: 100%|██████████| 312/312 [00:11<00:00, 27.71it/s]


Epoch 4/10: Train Loss: 0.6673, Train Acc: 0.8044, Val Loss: 0.7686, Val Acc: 0.7715
Modelo Fine-tuned salvo: Melhor val_loss em 0.7686


Epoch 5/10 [Fine-tune]: 100%|██████████| 2880/2880 [04:50<00:00,  9.90it/s]
Epoch 5/10 [Validation Fine-tune]: 100%|██████████| 312/312 [00:11<00:00, 27.63it/s]


Epoch 5/10: Train Loss: 0.5960, Train Acc: 0.8282, Val Loss: 0.7330, Val Acc: 0.7843
Modelo Fine-tuned salvo: Melhor val_loss em 0.7330


Epoch 6/10 [Fine-tune]: 100%|██████████| 2880/2880 [04:49<00:00,  9.94it/s]
Epoch 6/10 [Validation Fine-tune]: 100%|██████████| 312/312 [00:11<00:00, 28.05it/s]


Epoch 6/10: Train Loss: 0.5340, Train Acc: 0.8490, Val Loss: 0.7037, Val Acc: 0.7950
Modelo Fine-tuned salvo: Melhor val_loss em 0.7037


Epoch 7/10 [Fine-tune]: 100%|██████████| 2880/2880 [04:49<00:00,  9.95it/s]
Epoch 7/10 [Validation Fine-tune]: 100%|██████████| 312/312 [00:11<00:00, 27.74it/s]


Epoch 7/10: Train Loss: 0.4791, Train Acc: 0.8674, Val Loss: 0.6797, Val Acc: 0.8042
Modelo Fine-tuned salvo: Melhor val_loss em 0.6797


Epoch 8/10 [Fine-tune]: 100%|██████████| 2880/2880 [04:50<00:00,  9.91it/s]
Epoch 8/10 [Validation Fine-tune]: 100%|██████████| 312/312 [00:11<00:00, 27.68it/s]


Epoch 8/10: Train Loss: 0.4296, Train Acc: 0.8831, Val Loss: 0.6602, Val Acc: 0.8110
Modelo Fine-tuned salvo: Melhor val_loss em 0.6602


Epoch 9/10 [Fine-tune]: 100%|██████████| 2880/2880 [04:49<00:00,  9.94it/s]
Epoch 9/10 [Validation Fine-tune]: 100%|██████████| 312/312 [00:11<00:00, 27.82it/s]


Epoch 9/10: Train Loss: 0.3848, Train Acc: 0.8974, Val Loss: 0.6410, Val Acc: 0.8199
Modelo Fine-tuned salvo: Melhor val_loss em 0.6410


Epoch 10/10 [Fine-tune]: 100%|██████████| 2880/2880 [04:49<00:00,  9.93it/s]
Epoch 10/10 [Validation Fine-tune]: 100%|██████████| 312/312 [00:11<00:00, 27.92it/s]


Epoch 10/10: Train Loss: 0.3443, Train Acc: 0.9101, Val Loss: 0.6273, Val Acc: 0.8264
Modelo Fine-tuned salvo: Melhor val_loss em 0.6273

Histórico de fine-tuning salvo em: models_by_artist_char_split_finetuned\arianagrande\training_history_arianagrande_finetuned_strategy_none.pkl

Fine-tuning para 'ArianaGrande' com estratégia 'none' concluído.
O melhor modelo fine-tuned foi salvo em: models_by_artist_char_split_finetuned\arianagrande\model_weights_arianagrande_finetuned_strategy_none_best.pt


## Teste

In [None]:
"""
Executa um fluxo de avaliação completo e autônomo para um modelo de geração de
texto fine-tuned. Este script define todas as configurações, funções e classes
necessárias, prepara os dados de avaliação (incluindo o vocabulário de referência
de todo o corpus e as letras de teste do artista alvo), carrega o modelo
fine-tuned treinado, e então executa uma avaliação música a música. Para cada
música no conjunto de teste, ele gera uma nova letra de comprimento equivalente e
calcula um conjunto de métricas de qualidade de texto (e.g., Distinct-N, TTR,
taxa de repetição). Finalmente, agrega e exibe as métricas médias, comparando o
desempenho do texto gerado com o das letras originais.
"""

def get_word_ngrams(text, n):
    """Return the word-level n-grams of `text` as a list of tuples.

    The text is lowercased and tokenised with `nltk.word_tokenize`. When the
    text has fewer than `n` tokens an empty list is returned.
    """
    tokens = nltk.word_tokenize(text.lower())
    has_enough_tokens = len(tokens) >= n
    return list(nltk.ngrams(tokens, n)) if has_enough_tokens else []

def calculate_distinct_n_ratio(text, n):
    """Return the Distinct-N ratio of `text`: unique word n-grams / total word n-grams.

    The n-grams come from `get_word_ngrams`; when there are none the ratio is 0.0.
    """
    all_ngrams = get_word_ngrams(text, n)
    if not all_ngrams:
        return 0.0
    distinct_count = len(set(all_ngrams))
    total_count = len(all_ngrams)
    return distinct_count / total_count

def calculate_text_quality_metrics_updated(text_to_analyze, reference_words_set):
    """Compute word-level diversity / repetition metrics for one text.

    The text is lowercased and tokenised once with `nltk.word_tokenize`; every
    metric is derived from that single token list (the previous implementation
    re-tokenised the same text for each Distinct-N ratio).

    Args:
        text_to_analyze: text to measure.
        reference_words_set: container of known words; only membership tests
            (`word in reference_words_set`) are performed on it.

    Returns:
        dict with the keys, in this order: 'Total Words', 'Unique Words',
        'TTR (Distinct-1 Ratio)', 'Repetition Rate (1-Distinct-1)',
        'Distinct-2 Ratio (Bigrams)', 'Repetition Rate (1-Distinct-2) (Bigrams)',
        'Distinct-3 Ratio (Trigrams)', 'Repetition Rate (1-Distinct-3) (Trigrams)',
        'Generated Unique Words Existing in Corpus',
        'Existence Rate (Generated Unique Words)'.
        When the text has no tokens every metric keeps its zero default.
    """
    words = nltk.word_tokenize(text_to_analyze.lower())
    metrics = {
        'Total Words': len(words), 'Unique Words': 0, 'TTR (Distinct-1 Ratio)': 0.0,
        'Repetition Rate (1-Distinct-1)': 0.0, 'Distinct-2 Ratio (Bigrams)': 0.0,
        'Repetition Rate (1-Distinct-2) (Bigrams)': 0.0, 'Distinct-3 Ratio (Trigrams)': 0.0,
        'Repetition Rate (1-Distinct-3) (Trigrams)': 0.0, 'Generated Unique Words Existing in Corpus': 0,
        'Existence Rate (Generated Unique Words)': 0.0
    }
    if not words:
        return metrics

    def _distinct_ratio(n):
        # Same contract as calculate_distinct_n_ratio, but reuses `words`:
        # 0.0 when there are fewer than n tokens, else unique n-grams / all n-grams.
        if len(words) < n:
            return 0.0
        ngrams = list(nltk.ngrams(words, n))
        return len(set(ngrams)) / len(ngrams) if ngrams else 0.0

    unique_words = set(words)
    metrics['Unique Words'] = len(unique_words)

    for n, ratio_key, repetition_key in (
        (1, 'TTR (Distinct-1 Ratio)', 'Repetition Rate (1-Distinct-1)'),
        (2, 'Distinct-2 Ratio (Bigrams)', 'Repetition Rate (1-Distinct-2) (Bigrams)'),
        (3, 'Distinct-3 Ratio (Trigrams)', 'Repetition Rate (1-Distinct-3) (Trigrams)'),
    ):
        metrics[ratio_key] = _distinct_ratio(n)
        # Note: a text too short to form any n-gram gets ratio 0.0 and therefore
        # repetition 1.0, exactly as before.
        metrics[repetition_key] = 1 - metrics[ratio_key]

    # Share of the text's distinct words that also appear in the reference set.
    existing_words_in_generated = sum(1 for word in unique_words if word in reference_words_set)
    metrics['Generated Unique Words Existing in Corpus'] = existing_words_in_generated
    # `unique_words` is non-empty here because `words` is non-empty.
    metrics['Existence Rate (Generated Unique Words)'] = existing_words_in_generated / len(unique_words)
    return metrics

def generate_text_by_word_count(model, tokenizer, seed_text, target_word_count, temperature=0.8, sequence_length=SEQUENCE_LENGTH, device=device, max_chars_per_word=25):
    """Generate text one character at a time until it holds `target_word_count` words.

    The seed is passed through `clean_text`, then left-padded with spaces or
    cut to its last `sequence_length` characters. Words already present in the
    processed seed count towards the target.

    Args:
        model: called as `model(input_tensor)`; its output (squeezed on dim 0)
            is used as scores over character ids.
        tokenizer: object providing `texts_to_sequences`, an `int_to_char` dict
            and `vocab_size`.
        seed_text: text used to start the generation.
        target_word_count: stop once `nltk.word_tokenize` counts this many words.
        temperature: `<= 0` picks the argmax; otherwise a character is sampled
            from `softmax(predictions / temperature)`.
        sequence_length: number of trailing characters fed to the model.
        device: device the input tensor is moved to.
        max_chars_per_word: only used for the hard cap on generated characters,
            `target_word_count * max_chars_per_word * 2`.

    Returns:
        The processed seed followed by every generated character.
    """
    model.eval()
    processed_seed_text = clean_text(seed_text)
    # Force the seed to exactly `sequence_length` characters.
    if len(processed_seed_text) < sequence_length:
        processed_seed_text = ' ' * (sequence_length - len(processed_seed_text)) + processed_seed_text
    else:
        processed_seed_text = processed_seed_text[-sequence_length:]
    
    generated_text = processed_seed_text
    current_word_count = len(nltk.word_tokenize(processed_seed_text))
    # The bar tracks words still missing, not characters.
    pbar = tqdm(total=max(0, target_word_count - current_word_count), desc="Generating words")
    # Safety cap so the loop ends even if the word count never reaches the target.
    max_chars_to_generate = target_word_count * max_chars_per_word * 2
    chars_generated = 0
    
    while current_word_count < target_word_count and chars_generated < max_chars_to_generate:
        current_sequence_str = generated_text[-sequence_length:]
        # presumably yields one integer id per character — TODO confirm against the tokenizer class
        tokenized_input = tokenizer.texts_to_sequences(current_sequence_str)
        # Stop when nothing could be encoded or every id is 0.
        if not tokenized_input or all(t == 0 for t in tokenized_input): break
        
        input_tensor = torch.tensor(tokenized_input, dtype=torch.long).unsqueeze(0).to(device)
        with torch.no_grad():
            predictions = model(input_tensor).squeeze(0)
        
        if temperature <= 0:
            next_char_id = torch.argmax(predictions, dim=-1).item()
        else:
            probabilities = torch.softmax(predictions / temperature, dim=-1)
            # Degenerate distribution: fall back to a uniformly random id in [1, vocab_size - 1].
            if probabilities.sum() == 0 or torch.isnan(probabilities).any():
                next_char_id = random.randint(1, tokenizer.vocab_size - 1)
            else:
                next_char_id = torch.multinomial(probabilities, 1).item()
        
        # Unknown ids map to '' (no text added) but still count towards the character cap.
        next_char = tokenizer.int_to_char.get(next_char_id, '')
        generated_text += next_char
        chars_generated += 1
        
        # The word count is refreshed only every 50 characters or right after
        # whitespace / one of '!?,.'; each refresh re-tokenises the whole text.
        # Note: an empty `next_char` also satisfies `next_char in '!?,.'`.
        if chars_generated % 50 == 0 or next_char.isspace() or next_char in '!?,.':
            new_word_count = len(nltk.word_tokenize(generated_text))
            if new_word_count > current_word_count:
                pbar.update(new_word_count - current_word_count)
                current_word_count = new_word_count
    pbar.close()
    return generated_text

# --- Evaluation data and resources ---
# Loads the general character tokenizer, builds the set of every word found in
# the CSV corpus (reference vocabulary for the existence-rate metric) and
# selects the target artist's test songs.
try:
    tokenizer_general = load_object(os.path.join(BASE_OUTPUT_DIR_GENERAL, TOKENIZER_SUBDIR, 'char_tokenizer_general.pkl'))
    total_chars_general = load_object(os.path.join(BASE_OUTPUT_DIR_GENERAL, TOKENIZER_SUBDIR, 'vocab_size_general_char.pkl'))
    print(f"General character vocabulary loaded. Size: {total_chars_general}")
    
    # Reference vocabulary: every token of every usable lyric in every CSV file.
    existing_words_in_corpus = set()
    for filename in os.listdir(CSV_FOLDER_PATH):
        if filename.endswith(".csv"):
            df_temp = pd.read_csv(os.path.join(CSV_FOLDER_PATH, filename), usecols=['Lyric'])
            for lyric_val in df_temp['Lyric']:
                cleaned_lyric = clean_text(lyric_val)
                if cleaned_lyric and PLACEHOLDER_PHRASE not in cleaned_lyric:
                    existing_words_in_corpus.update(nltk.word_tokenize(cleaned_lyric))
    print(f"Size of existing word vocabulary in general corpus: {len(existing_words_in_corpus)}")

    # Cleaned lyrics of the target artist, skipping empty and placeholder entries.
    ARTIST_LYRICS_CLEAN_ALL_FOR_PROMPTS = []
    df_target_artist_full = pd.read_csv(CSV_FILE_PATH_TARGET_ARTIST, usecols=['Lyric'])
    for lyric_val in df_target_artist_full['Lyric']:
        cleaned_lyric = clean_text(lyric_val)
        if cleaned_lyric and PLACEHOLDER_PHRASE not in cleaned_lyric:
            ARTIST_LYRICS_CLEAN_ALL_FOR_PROMPTS.append(cleaned_lyric)
    
    # Seeded shuffle, then the first TEST_SPLIT fraction becomes the test set.
    # NOTE(review): presumably mirrors the split made during preprocessing — confirm.
    random.seed(RANDOM_SEED)
    random.shuffle(ARTIST_LYRICS_CLEAN_ALL_FOR_PROMPTS)
    num_test_artist_final = int(len(ARTIST_LYRICS_CLEAN_ALL_FOR_PROMPTS) * TEST_SPLIT)
    TEST_LYRICS_ARTIST_FINAL_SONGS = ARTIST_LYRICS_CLEAN_ALL_FOR_PROMPTS[:num_test_artist_final]
    print(f"Total {len(TEST_LYRICS_ARTIST_FINAL_SONGS)} songs in artist's test set for evaluation.")

except Exception as e:
    print(f"Error preparing data: {e}")
    TEST_LYRICS_ARTIST_FINAL_SONGS = []
    existing_words_in_corpus = set()
    # Inside a Jupyter kernel `exit()` only requests a shutdown and does not
    # interrupt the running cell, so the code below would still execute.
    # Re-raising stops the cell here and keeps the original traceback.
    raise

# --- Load the fine-tuned model ---
# The dropout strategy selects both the checkpoint file name and the model
# configuration, so it is defined once and reused for both.
dropout_strategy_for_model = 'none'
MODEL_DIR_ARTIST_FINE_TUNED_SPECIFIC = os.path.join(MODEL_DIR_BASE_ARTIST_FINE_TUNED, CLEAN_TARGET_ARTIST_NAME)
MODEL_PATH = os.path.join(
    MODEL_DIR_ARTIST_FINE_TUNED_SPECIFIC,
    f'model_weights_{CLEAN_TARGET_ARTIST_NAME}_finetuned_strategy_{dropout_strategy_for_model}_best.pt'
)
print(f"\nAttempting to load fine-tuned model from: {MODEL_PATH}")
try:
    model_fine_tuned = CharLSTM(
        vocab_size=total_chars_general, embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM,
        n_layers=N_LAYERS, dropout_rate=DROPOUT_RATE, dropout_strategy=dropout_strategy_for_model
    ).to(device)
    model_fine_tuned.load_state_dict(torch.load(MODEL_PATH, map_location=device))
    model_fine_tuned.eval()
    print("Fine-tuned model loaded successfully.")
except FileNotFoundError:
    print(f"Error: Fine-tuned model not found at {MODEL_PATH}.")
    # `exit()` does not stop a running notebook cell: without re-raising, the
    # evaluation below would run on a freshly initialised, untrained model.
    raise
except Exception as e:
    print(f"An error occurred while loading the fine-tuned model: {e}")
    raise

# --- Song-by-song evaluation ---
# Label used only in the progress-bar description and report headings.
dropout_strategy_for_display = dropout_strategy_for_model
print(f"\n--- Avaliação de Geração (Modelo Fine-tuned com Dropout: {dropout_strategy_for_display}) ---")

# One metrics dict per test song; both lists are filled in the same order.
all_generated_lyrics_metrics = []
all_actual_lyrics_metrics = []
if not TEST_LYRICS_ARTIST_FINAL_SONGS:
    print("WARNING: No artist test songs available for generation.")
else:
    for original_lyric_text in tqdm(TEST_LYRICS_ARTIST_FINAL_SONGS, desc=f"Generating & Evaluating (Dropout: {dropout_strategy_for_display})"):
        # Ask for as many words as the original song has, with a floor of 20.
        target_word_count = max(len(nltk.word_tokenize(original_lyric_text)), 20)
        # Seed with the first SEQUENCE_LENGTH characters of the original song,
        # left-padded with spaces when the song is shorter than that.
        if len(original_lyric_text) < SEQUENCE_LENGTH:
            seed_text = ' ' * (SEQUENCE_LENGTH - len(original_lyric_text)) + original_lyric_text
        else:
            seed_text = original_lyric_text[:SEQUENCE_LENGTH]
        
        # The returned text starts with the (processed) seed, so the seed's
        # words are part of what is measured as "generated".
        generated_lyric_text = generate_text_by_word_count(
            model_fine_tuned, tokenizer_general, seed_text, target_word_count,
            temperature=0.8, sequence_length=SEQUENCE_LENGTH, device=device
        )
        
        all_generated_lyrics_metrics.append(calculate_text_quality_metrics_updated(generated_lyric_text, existing_words_in_corpus))
        all_actual_lyrics_metrics.append(calculate_text_quality_metrics_updated(original_lyric_text, existing_words_in_corpus))

    if all_generated_lyrics_metrics:
        # Unweighted mean of each metric over the songs.
        metric_keys = all_generated_lyrics_metrics[0].keys()
        avg_generated_metrics = {k: np.mean([d[k] for d in all_generated_lyrics_metrics]) for k in metric_keys}
        avg_actual_metrics = {k: np.mean([d[k] for d in all_actual_lyrics_metrics]) for k in metric_keys}
        
        print("\n" + "="*80)
        print("--- MÉDIAS DAS MÉTRICAS DE GERAÇÃO (Dropout: {}) ---".format(dropout_strategy_for_display))
        print("--- Letras Geradas (Média) ---")
        for key, value in avg_generated_metrics.items():
            print(f"{key}: {value:.4f}")
        
        print("\n--- Letras Originais do Teste (Média) ---")
        for key, value in avg_actual_metrics.items():
            print(f"{key}: {value:.4f}")
        print("="*80 + "\n")
    else:
        print("No metrics generated.")

Downloading 'wordnet' NLTK package...
Using device: cuda
Object loaded from: processed_data_general_char_split\tokenizers\char_tokenizer_general.pkl
Object loaded from: processed_data_general_char_split\tokenizers\vocab_size_general_char.pkl
General character vocabulary loaded. Vocabulary size: 36


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gabri\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


spaCy model 'en_core_web_sm' loaded.
Size of existing word vocabulary in general corpus (for metrics): 40909
Total 29 songs in the artist's test set for full lyric generation evaluation.

Attempting to load fine-tuned model from: models_by_artist_char_split_finetuned\arianagrande\arianagrande_char_lstm_fine_tuned_model.pth
Error: Fine-tuned model not found at models_by_artist_char_split_finetuned\arianagrande\arianagrande_char_lstm_fine_tuned_model.pth. Please ensure the fine-tuning script has been run and the model saved correctly.

--- Avaliação de Geração de Músicas (Música a Música) para 'ArianaGrande' (Modelo Fine-tuned com Dropout: none) ---


Generating words: 100%|██████████| 3/3 [00:02<00:00,  1.25it/s][00:00<?, ?it/s]
Generating words: 100%|██████████| 673/673 [06:56<00:00,  1.62it/s]02<01:07,  2.41s/it]
Generating words: 100%|██████████| 775/775 [07:05<00:00,  1.82it/s]58<1:50:42, 246.03s/it]
Generating words: 100%|██████████| 519/519 [05:13<00:00,  1.65it/s]04<2:22:06, 327.93s/it]
Generating words: 514it [06:02,  1.42it/s]14%|█▍        | 4/29 [19:18<2:14:17, 322.31s/it]
Generating words: 100%|██████████| 402/402 [05:24<00:00,  1.24it/s]20<2:14:46, 336.95s/it]
Generating words: 100%|██████████| 372/372 [05:05<00:00,  1.22it/s]45<2:07:30, 332.62s/it]
Generating words: 100%|██████████| 381/381 [04:49<00:00,  1.32it/s]50<1:58:41, 323.69s/it]
Generating words: 100%|██████████| 14/14 [00:09<00:00,  1.55it/s]0:40<1:49:29, 312.81s/it]
Generating words: 100%|██████████| 316/316 [03:38<00:00,  1.45it/s]49<1:12:37, 217.86s/it]
Generating words: 100%|██████████| 526/526 [05:20<00:00,  1.64it/s]:27<1:09:03, 218.09s/it]
Generating w


--- MÉDIAS DAS MÉTRICAS DE GERAÇÃO (Dropout: none) ---
--- Letras Geradas (Média) ---
Total Words (Avg): 344.52
Unique Words (Avg): 308.24
TTR (Distinct-1 Ratio) (Avg): 0.8900
Repetition Rate (1-Distinct-1) (Avg): 0.1100
Distinct-2 Ratio (Bigrams) (Avg): 0.9826
Repetition Rate (1-Distinct-2) (Bigrams) (Avg): 0.0174
Distinct-3 Ratio (Trigrams) (Avg): 0.9909
Repetition Rate (1-Distinct-3) (Trigrams) (Avg): 0.0091
Generated Unique Words Existing in Corpus (Avg): 33.07
Existence Rate (Generated Unique Words) (Avg): 0.18%
-----------------------------------------------------

--- Letras Originais do Teste (Média) ---
Total Words (Avg): 342.93
Unique Words (Avg): 91.79
TTR (Distinct-1 Ratio) (Avg): 0.3644
Repetition Rate (1-Distinct-1) (Avg): 0.6356
Distinct-2 Ratio (Bigrams) (Avg): 0.5176
Repetition Rate (1-Distinct-2) (Bigrams) (Avg): 0.4824
Distinct-3 Ratio (Trigrams) (Avg): 0.5837
Repetition Rate (1-Distinct-3) (Trigrams) (Avg): 0.4163
Actual Unique Words Existing in Corpus (Avg): 91.79




: 

In [None]:
def calculate_text_quality_metrics(text_to_analyze, reference_words_set):
    """Summarise vocabulary diversity and repetition of a text.

    Word-level figures use the lowercased text tokenised with
    `nltk.word_tokenize`; the character 3-gram repetition rate is computed on
    the text exactly as given (not lowercased).

    Returns a dict with the keys 'total_words', 'unique_words', 'ttr',
    'existing_words_count', 'existence_rate', 'repetition_rate_words' and
    'repetition_rate_3grams_char'. When the text yields no tokens, every value
    except 'total_words' stays at its zero default.
    """
    tokens = nltk.word_tokenize(text_to_analyze.lower())
    token_count = len(tokens)

    metrics = {
        'total_words': token_count,
        'unique_words': 0,
        'ttr': 0.0,
        'existing_words_count': 0,
        'existence_rate': 0.0,
        'repetition_rate_words': 0.0,
        'repetition_rate_3grams_char': 0.0,
    }
    if token_count == 0:
        return metrics

    vocabulary = set(tokens)
    vocabulary_size = len(vocabulary)  # > 0 because there is at least one token

    # Distinct words that are also present in the reference vocabulary.
    known_word_count = len([token for token in vocabulary if token in reference_words_set])

    # Sliding windows of three characters over the raw text.
    char_trigrams = [text_to_analyze[start:start + 3] for start in range(len(text_to_analyze) - 2)]
    trigram_total = len(char_trigrams)
    if trigram_total > 0:
        trigram_repetition = (trigram_total - len(set(char_trigrams))) / trigram_total
    else:
        trigram_repetition = 0.0

    metrics.update({
        'unique_words': vocabulary_size,
        'ttr': vocabulary_size / token_count,
        'existing_words_count': known_word_count,
        'existence_rate': known_word_count / vocabulary_size,
        # Proportion of tokens that repeat an already-seen word.
        'repetition_rate_words': (token_count - vocabulary_size) / token_count,
        'repetition_rate_3grams_char': trigram_repetition,
    })
    return metrics

# Load the general character tokenizer and its vocabulary size; both were
# written by the preprocessing step and are required by every evaluation below.
tokenizer_general = None
total_chars_general = 0

try:
    tokenizer_general = load_object(os.path.join(BASE_OUTPUT_DIR_GENERAL, TOKENIZER_SUBDIR, 'char_tokenizer_general.pkl'))
    total_chars_general = load_object(os.path.join(BASE_OUTPUT_DIR_GENERAL, TOKENIZER_SUBDIR, 'vocab_size_general_char.pkl'))
    print(f"General character vocabulary loaded. Vocabulary size: {total_chars_general}")
except FileNotFoundError as e:
    print(f"Error: General tokenizer files not found. Details: {e}. Please run preprocessing Phase 1.")
    # `exit()` does not interrupt a running notebook cell, so the rest of the
    # cell would continue with `tokenizer_general = None`. Re-raise instead.
    raise

# Load the spaCy pipeline "en_core_web_sm" (used below for POS tagging). If
# loading raises OSError, the model is downloaded and loaded again.
nlp = None
try:
    nlp = spacy.load("en_core_web_sm")
    print("spaCy model 'en_core_web_sm' loaded.")
except OSError:
    print("Downloading spaCy model 'en_core_web_sm'...")
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")
    print("spaCy model 'en_core_web_sm' downloaded and loaded.")

# Parallel lists filled below: prompts_for_eval[i] is the character context
# that precedes true_next_words[i]; true_next_pos_tags is filled later.
prompts_for_eval = []
true_next_words = []
true_next_pos_tags = []
# Every token seen in any corpus lyric; reference vocabulary for the metrics.
existing_words_in_corpus = set()

try:
    # 1) Collect every usable cleaned lyric from all CSV files in CSV_FOLDER_PATH.
    all_lyrics_for_vocab_building = []
    for filename in os.listdir(CSV_FOLDER_PATH):
        if filename.endswith(".csv"):
            csv_path = os.path.join(CSV_FOLDER_PATH, filename)
            df_temp = pd.read_csv(csv_path, usecols=['Lyric'])
            for lyric_val in df_temp['Lyric']:
                cleaned_lyric = clean_text(lyric_val)
                if cleaned_lyric and PLACEHOLDER_PHRASE not in cleaned_lyric:
                    all_lyrics_for_vocab_building.append(cleaned_lyric)
    
    for lyric in all_lyrics_for_vocab_building:
        words = nltk.word_tokenize(lyric)
        existing_words_in_corpus.update(words)
    
    print(f"Size of existing word vocabulary in general corpus (for metrics): {len(existing_words_in_corpus)}")

    # 2) Cleaned lyrics of the target artist, skipping placeholder and empty entries.
    ARTIST_LYRICS_CLEAN_ALL = []
    df_target_artist_full = pd.read_csv(CSV_FILE_PATH_TARGET_ARTIST, usecols=['Lyric'])
    for lyric_val in df_target_artist_full['Lyric']:
        cleaned_lyric = clean_text(lyric_val)
        # NOTE(review): the placeholder test runs before the emptiness test, so
        # this assumes clean_text always returns a string — confirm.
        if PLACEHOLDER_PHRASE in cleaned_lyric: continue
        if cleaned_lyric: ARTIST_LYRICS_CLEAN_ALL.append(cleaned_lyric)
    
    # Seeded shuffle, then the first TEST_SPLIT fraction is taken as the test set.
    random.seed(RANDOM_SEED)
    random.shuffle(ARTIST_LYRICS_CLEAN_ALL)
    
    num_test_artist_final = int(len(ARTIST_LYRICS_CLEAN_ALL) * TEST_SPLIT)
    TEST_LYRICS_ARTIST_FINAL_FOR_GEN = ARTIST_LYRICS_CLEAN_ALL[:num_test_artist_final]
    
    if not TEST_LYRICS_ARTIST_FINAL_FOR_GEN:
        raise ValueError("TEST_LYRICS_ARTIST_FINAL set is empty. Check collection and split in preprocessing.")
    
    # 3) One (prompt, target) pair per word of each test song, except the first word.
    print(f"\nGenerating prompts and targets for evaluation from {len(TEST_LYRICS_ARTIST_FINAL_FOR_GEN)} exclusive artist test songs...")
    for lyric_text in tqdm(TEST_LYRICS_ARTIST_FINAL_FOR_GEN, desc="Generating prompts and targets"):
        tokenized_full_lyric = nltk.word_tokenize(lyric_text)
        
        for target_word_idx in range(1, len(tokenized_full_lyric)):
            current_target_word = tokenized_full_lyric[target_word_idx]
            preceding_words_list = tokenized_full_lyric[:target_word_idx]
            # The context is rebuilt from tokens joined by single spaces and ends
            # with a space, so the model is asked to start a new word.
            context_string_before_target_word = " ".join(preceding_words_list) + " " 

            # Left-pad with spaces, or keep only the last SEQUENCE_LENGTH characters.
            if len(context_string_before_target_word) < SEQUENCE_LENGTH:
                final_prompt_chars = ' ' * (SEQUENCE_LENGTH - len(context_string_before_target_word)) + context_string_before_target_word
            else:
                final_prompt_chars = context_string_before_target_word[-SEQUENCE_LENGTH:]
            
            if current_target_word:
                prompts_for_eval.append(final_prompt_chars)
                true_next_words.append(current_target_word)
    
    print(f"Total prompts and targets generated for evaluation: {len(prompts_for_eval)}") 
    if not prompts_for_eval: 
        raise ValueError("Could not generate enough prompts for evaluation. Check test lyrics length and SEQUENCE_LENGTH.")
    
except Exception as e:
    # Best effort: report the problem and leave every shared container empty;
    # the code that consumes them checks for emptiness before using them.
    print(f"Error loading test data or preparing prompts: {e}")
    prompts_for_eval = []
    true_next_words = []
    true_next_pos_tags = [] 
    existing_words_in_corpus = set() 
    pass 

# Tag every target word with spaCy. Each word is passed to the pipeline as its
# own text, i.e. without the surrounding lyric as context.
if true_next_words and nlp:
    print("\nPerforming POS Tagging of true words...")
    # Number of words handed to nlp.pipe per progress-bar step.
    batch_size_spacy = 1000
    for i in tqdm(range(0, len(true_next_words), batch_size_spacy), desc="POS Tagging true words"):
        batch_words = true_next_words[i:i + batch_size_spacy]
        docs = list(nlp.pipe(batch_words))
        for doc in docs:
            # POS of the first token; "UNKNOWN" when the doc or that token is falsy.
            true_next_pos_tags.append(doc[0].pos_ if doc and doc[0] else "UNKNOWN")
else:
    print("No true words to perform POS Tagging or spaCy model not loaded. Skipping POS Tagging.")

Downloading 'wordnet' NLTK package...


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gabri\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Using device: cuda
Object loaded from: processed_data_general_char_split\tokenizers\char_tokenizer_general.pkl
Object loaded from: processed_data_general_char_split\tokenizers\vocab_size_general_char.pkl
General character vocabulary loaded. Vocabulary size: 36
spaCy model 'en_core_web_sm' loaded.
Size of existing word vocabulary in general corpus (for metrics): 40909

Generating prompts and targets for evaluation from 29 exclusive artist test songs...


Generating prompts and targets: 100%|██████████| 29/29 [00:00<00:00, 264.26it/s]


Total prompts and targets generated for evaluation: 9916

Performing POS Tagging of true words...


POS Tagging true words: 100%|██████████| 10/10 [00:16<00:00,  1.64s/it]


In [None]:
"""
Word-level evaluation of the fine-tuned model, focused on next-word prediction.

Loads the fine-tuned weights and iterates over the test prompts prepared in the
previous cell. For each prompt the next word is generated greedily
(temperature=0), tagged with spaCy and compared with the true word and its
POS tag. Prints the overall next-word and next-POS-tag accuracy; the
true-tag / predicted-tag counts are accumulated in `pos_confusion_matrix`
(they are not printed by this cell). Finally, the text-quality metrics (TTR,
etc.) of all generated words are compared with those of all true test words.
"""
print("\n--- Avaliação de Nível de Palavra para o Modelo Fine-Tuned ---")

CURRENT_DROPOUT_STRATEGY_FINE_TUNE = "none"
model_fine_tune_save_filepath = os.path.join(MODEL_DIR_BASE_ARTIST_FINE_TUNED, CLEAN_TARGET_ARTIST_NAME, f"model_weights_{CLEAN_TARGET_ARTIST_NAME}_finetuned_strategy_{CURRENT_DROPOUT_STRATEGY_FINE_TUNE}_best.pt")

try:
    model_fine_tune_eval = CharLSTM(
        vocab_size=total_chars_general,
        embedding_dim=EMBEDDING_DIM,
        hidden_dim=HIDDEN_DIM,
        n_layers=N_LAYERS,
        dropout_rate=DROPOUT_RATE,
        dropout_strategy=CURRENT_DROPOUT_STRATEGY_FINE_TUNE
    ).to(device)
    model_fine_tune_eval.load_state_dict(torch.load(model_fine_tune_save_filepath, map_location=device))
    model_fine_tune_eval.eval()
    print(f"Modelo fine-tuned carregado de: {model_fine_tune_save_filepath}")

    if not prompts_for_eval or not true_next_words or not nlp:
        print("\nAviso: Dados ou modelo spaCy insuficientes para avaliação de nível de palavra. Pulando.")
    else:
        correct_pos_predictions = 0
        correct_word_predictions = 0
        # pos_confusion_matrix[true_tag][generated_tag] -> number of prompts.
        pos_confusion_matrix = {}
        generated_words_list = []
        generated_pos_tags_list = []

        print(f"\nIniciando avaliação de coesão gramatical e acurácia para {len(prompts_for_eval)} prompts...")
        for i in tqdm(range(len(prompts_for_eval)), desc="Avaliando Próxima Palavra & POS Tag"):
            prompt = prompts_for_eval[i]
            true_word = true_next_words[i]
            true_pos = true_next_pos_tags[i]

            # Greedy decoding (temperature=0) until the text holds one more word
            # than the prompt; the last whitespace-separated token is the prediction.
            generated_text_for_prompt = generate_text_by_word_count(
                model_fine_tune_eval, tokenizer_general, prompt,
                len(nltk.word_tokenize(prompt)) + 1,
                temperature=0, sequence_length=SEQUENCE_LENGTH, device=device
            )
            # Guard against an all-whitespace result: indexing an empty split()
            # would raise IndexError and made the "NONE" case below unreachable.
            generated_tokens = generated_text_for_prompt.split()
            generated_word = generated_tokens[-1] if generated_tokens else ""
            generated_words_list.append(generated_word)

            if not generated_word:
                generated_pos = "NONE"
            else:
                # Run the spaCy pipeline once per word instead of once per check.
                generated_doc = nlp(generated_word)
                generated_pos = generated_doc[0].pos_ if generated_doc and generated_doc[0] else "UNKNOWN"
            generated_pos_tags_list.append(generated_pos)

            pos_confusion_matrix.setdefault(true_pos, {}).setdefault(generated_pos, 0)
            pos_confusion_matrix[true_pos][generated_pos] += 1
            if generated_pos == true_pos:
                correct_pos_predictions += 1

            # Lowercase BEFORE stripping: the pattern only keeps [a-z0-9], so
            # stripping first would delete uppercase letters instead of folding them.
            cleaned_true_word = re.sub(r'[^a-z0-9]', '', true_word.lower())
            cleaned_generated_word = re.sub(r'[^a-z0-9]', '', generated_word.lower())
            # Words that clean down to nothing (e.g. pure punctuation) never count as correct.
            if cleaned_generated_word == cleaned_true_word and cleaned_true_word:
                correct_word_predictions += 1

        total_predictions = len(prompts_for_eval)
        overall_pos_accuracy = (correct_pos_predictions / total_predictions) * 100 if total_predictions > 0 else 0.0
        overall_word_accuracy = (correct_word_predictions / total_predictions) * 100 if total_predictions > 0 else 0.0

        print(f"\nResultados de Coesão Gramatical (POS) & Acurácia da Próxima Palavra (Modelo Fine-tuned)")
        print(f"Acurácia Geral da Próxima Etiqueta POS: {overall_pos_accuracy:.2f}%")
        print(f"Acurácia Geral da Próxima Palavra: {overall_word_accuracy:.2f}%")

        print("\nMétricas para PALAVRAS GERADAS (tarefa de predição)")
        generated_metrics = calculate_text_quality_metrics(" ".join(generated_words_list), existing_words_in_corpus)
        for key, value in generated_metrics.items():
            print(f"  {key}: {value:.4f}")

        print("\nMétricas para PALAVRAS REAIS (do conjunto de teste)")
        actual_metrics = calculate_text_quality_metrics(" ".join(true_next_words), existing_words_in_corpus)
        for key, value in actual_metrics.items():
            print(f"  {key}: {value:.4f}")

except FileNotFoundError:
    print(f"Erro: Modelo fine-tuned não encontrado em '{model_fine_tune_save_filepath}'.")


--- PART 1: Word-Level Evaluations for Fine-tuned Model (ALL Test Prompts) ---
Fine-tuned model loaded from: models_by_artist_char_split_finetuned\arianagrande\model_weights_arianagrande_finetuned_strategy_none_best.pt

Starting grammatical cohesion and next-word accuracy evaluation for 9916 test prompts...


Evaluating Next Word & POS Tag: 100%|██████████| 9916/9916 [11:24<00:00, 14.50it/s]



--- Results: Grammatical Cohesion (POS) & Next Word Accuracy (Fine-tuned Model) ---
Total prompts evaluated: 9916
Correct POS Tag Predictions: 5037
Overall Next POS Tag Accuracy: 50.80%
Correct Next Word Predictions: 2968
Overall Next Word Prediction Accuracy: 29.93%

--- POS Tag Accuracy per True POS Tag ---
  When true POS is 'ADP': 33.56% (Correct: 200/596)
  When true POS is 'PROPN': 27.56% (Correct: 113/410)
  When true POS is 'PRON': 76.76% (Correct: 2120/2762)
  When true POS is 'VERB': 52.46% (Correct: 980/1868)
  When true POS is 'AUX': 44.47% (Correct: 213/479)
  When true POS is 'NOUN': 47.96% (Correct: 600/1251)
  When true POS is 'ADJ': 34.71% (Correct: 118/340)
  When true POS is 'INTJ': 38.13% (Correct: 159/417)
  When true POS is 'PUNCT': 2.56% (Correct: 6/234)
  When true POS is 'SCONJ': 30.69% (Correct: 58/189)
  When true POS is 'ADV': 41.28% (Correct: 251/608)
  When true POS is 'CCONJ': 33.88% (Correct: 82/242)
  When true POS is 'PART': 19.06% (Correct: 69/362)
 

In [None]:
# Compares the model's next-POS-tag accuracy with two trivial baselines derived
# from the distribution of the true POS tags.
if not true_next_pos_tags:
    print("A lista 'true_next_pos_tags' está vazia. Não é possível calcular baselines.")
else:
    pos_counts = Counter(true_next_pos_tags)
    total_tags = len(true_next_pos_tags)

    # --- Majority baseline: always predict the most frequent tag ---
    most_common_pos, most_common_count = pos_counts.most_common(1)[0]
    baseline_majority_accuracy = (most_common_count / total_tags) * 100
    print(f"Classe gramatical mais comum: '{most_common_pos}' ({most_common_count} ocorrências)")
    print(f"**Acurácia do Baseline (sempre prevendo a mais comum): {baseline_majority_accuracy:.2f}%**")

    # --- Weighted-random baseline: guess each tag with its empirical frequency ---
    # A tag with probability p is then guessed correctly with probability p * p,
    # so the expected accuracy is the sum of p**2 over all tags.
    expected_random_accuracy = 0.0
    print("\nProbabilidades e Contribuições para o Baseline Aleatório Ponderado:")
    for pos_tag, count in pos_counts.items():
        probability = count / total_tags
        contribution = probability ** 2
        expected_random_accuracy += contribution
        print(f"  '{pos_tag}': Probabilidade = {probability:.4f}, Contribuição = {contribution:.4f}")

    expected_random_accuracy_percent = expected_random_accuracy * 100
    print(f"**Acurácia do Baseline (chutar aleatoriamente com base na distribuição): {expected_random_accuracy_percent:.2f}%**")

    # --- Comparison with the model's measured accuracy ---
    # Use the accuracy computed by the word-level evaluation cell when it has
    # been run in this kernel; otherwise fall back to the value recorded from
    # an earlier run (50.80%).
    your_model_accuracy = globals().get('overall_pos_accuracy', 50.80)
    print(f"\nSua acurácia (Overall Next POS Tag Accuracy): {your_model_accuracy:.2f}%")

    if your_model_accuracy > baseline_majority_accuracy:
        print(f"O modelo ({your_model_accuracy:.2f}%) é melhor que o baseline de maioria ({baseline_majority_accuracy:.2f}%).")
    else:
        print(f"O modelo ({your_model_accuracy:.2f}%) é pior ou igual ao baseline de maioria ({baseline_majority_accuracy:.2f}%).")

    if your_model_accuracy > expected_random_accuracy_percent:
        print(f"O modelo ({your_model_accuracy:.2f}%) é melhor que o baseline aleatório ponderado ({expected_random_accuracy_percent:.2f}%).")
    else:
        print(f"O modelo ({your_model_accuracy:.2f}%) é pior ou igual ao baseline aleatório ponderado ({expected_random_accuracy_percent:.2f}%).")

Classe gramatical mais comum: 'PRON' (2762 ocorrências)
**Acurácia do Baseline (sempre prevendo a mais comum): 27.85%**

Probabilidades e Contribuições para o Baseline Aleatório Ponderado:
  'ADP': Probabilidade = 0.0601, Contribuição = 0.0036
  'PROPN': Probabilidade = 0.0413, Contribuição = 0.0017
  'PRON': Probabilidade = 0.2785, Contribuição = 0.0776
  'VERB': Probabilidade = 0.1884, Contribuição = 0.0355
  'AUX': Probabilidade = 0.0483, Contribuição = 0.0023
  'NOUN': Probabilidade = 0.1262, Contribuição = 0.0159
  'ADJ': Probabilidade = 0.0343, Contribuição = 0.0012
  'INTJ': Probabilidade = 0.0421, Contribuição = 0.0018
  'PUNCT': Probabilidade = 0.0236, Contribuição = 0.0006
  'SCONJ': Probabilidade = 0.0191, Contribuição = 0.0004
  'ADV': Probabilidade = 0.0613, Contribuição = 0.0038
  'CCONJ': Probabilidade = 0.0244, Contribuição = 0.0006
  'PART': Probabilidade = 0.0365, Contribuição = 0.0013
  'NUM': Probabilidade = 0.0127, Contribuição = 0.0002
  'X': Probabilidade = 0.003

In [None]:
"""
Evaluates the quality of long text generated by the fine-tuned model, focusing
on character-level metrics. Defines functions to generate long text and to
compute the character Type-Token Ratio (TTR) and the n-gram repetition rate.
Loads the fine-tuned model, generates a single long text sample and, finally,
computes the diversity and repetition metrics of the generated text and
compares them with those of the artist's real test set.
"""
print("\n--- Avaliação de Geração de Texto Longo (Nível de Caractere) ---")

def generate_long_text(model, tokenizer, seed_text, num_generate_chars=1000, temperature=0.8, sequence_length=SEQUENCE_LENGTH, device=device):
    """Extend ``seed_text`` one character at a time using ``model``.

    Args:
        model: Network called on a batch holding one tokenized window; it is
            switched to eval mode here.
        tokenizer: Object providing ``texts_to_sequences`` and an
            ``int_to_char`` mapping.
        seed_text: Starting text. It is used as-is (no cleaning or padding).
        num_generate_chars: Maximum number of sampling steps.
        temperature: Values ``<= 0`` pick the highest-scoring id; otherwise the
            scores are divided by it before softmax sampling.
        sequence_length: Number of trailing characters fed to the model.
        device: Device the input tensor is moved to.

    Returns:
        The seed followed by every character produced before the loop ended.
    """
    model.eval()
    text_so_far = seed_text
    for _ in tqdm(range(num_generate_chars), desc="Gerando texto longo"):
        window = text_so_far[-sequence_length:]
        token_ids = tokenizer.texts_to_sequences(window)
        # Stop when the tokenizer returns nothing, or only zeros
        # (presumably the padding/unknown id -- TODO confirm against the tokenizer).
        if not token_ids or all(token == 0 for token in token_ids):
            break

        with torch.no_grad():
            batch = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0).to(device)
            # assumes the model returns one row of scores per sequence -- TODO confirm
            scores = model(batch).squeeze(0)

        if temperature <= 0:
            chosen_id = torch.argmax(scores, dim=-1).item()
        else:
            distribution = torch.softmax(scores / temperature, dim=-1)
            chosen_id = torch.multinomial(distribution, 1).item()

        # Ids missing from the mapping contribute an empty string.
        text_so_far += tokenizer.int_to_char.get(chosen_id, '')
    return text_so_far

def calculate_char_ttr(text):
    """Return the character type-token ratio: distinct characters / total characters.

    Empty (or otherwise falsy) input yields 0.0.
    """
    if text:
        distinct_chars = set(text)
        return len(distinct_chars) / len(text)
    return 0.0

def calculate_ngram_repetition_rate(text, n=3):
    """Return the share of character n-grams in ``text`` that repeat an earlier one.

    The value is (number of n-gram windows - number of distinct n-grams) divided
    by the number of windows. Text shorter than ``n`` yields 0.0.
    """
    window_count = len(text) - n + 1
    if window_count <= 0:
        return 0.0
    distinct_ngrams = set()
    for start in range(window_count):
        distinct_ngrams.add(text[start:start + n])
    return (window_count - len(distinct_ngrams)) / window_count

CURRENT_DROPOUT_STRATEGY_FINE_TUNE = "none"
model_fine_tune_save_filepath = os.path.join(MODEL_DIR_BASE_ARTIST_FINE_TUNED, CLEAN_TARGET_ARTIST_NAME, f"model_weights_{CLEAN_TARGET_ARTIST_NAME}_finetuned_strategy_{CURRENT_DROPOUT_STRATEGY_FINE_TUNE}_best.pt")

# NOTE(review): this cell reads TEST_LYRICS_ARTIST_FINAL_FOR_GEN, tokenizer_general and
# total_chars_general. In the visible part of the notebook they are only assigned in a
# later cell -- confirm they are also defined earlier so Restart & Run All works.
try:
    model_fine_tune_eval_long_gen = CharLSTM(
        vocab_size=total_chars_general,
        embedding_dim=EMBEDDING_DIM,
        hidden_dim=HIDDEN_DIM,
        n_layers=N_LAYERS,
        dropout_rate=DROPOUT_RATE,
        dropout_strategy=CURRENT_DROPOUT_STRATEGY_FINE_TUNE
    ).to(device)
    model_fine_tune_eval_long_gen.load_state_dict(torch.load(model_fine_tune_save_filepath, map_location=device))
    model_fine_tune_eval_long_gen.eval()
    print(f"Modelo fine-tuned carregado de: {model_fine_tune_save_filepath}")

    NUM_CHARS_FOR_LONG_TEXT_GEN = 5000
    GENERATION_TEMPERATURE = 0.8
    print(f"\nGerando texto longo de {NUM_CHARS_FOR_LONG_TEXT_GEN} caracteres...")

    if not TEST_LYRICS_ARTIST_FINAL_FOR_GEN:
        print("Aviso: Nenhuma música de teste disponível para usar como seed.")
        generated_long_text_fine_tune = ""
    else:
        # Seed the global RNG so the same test lyric is picked on every run.
        random.seed(RANDOM_SEED)
        idx_for_seed = random.randint(0, len(TEST_LYRICS_ARTIST_FINAL_FOR_GEN) - 1)
        sample_seed_text = clean_text(TEST_LYRICS_ARTIST_FINAL_FOR_GEN[idx_for_seed])

        # Force the seed to exactly SEQUENCE_LENGTH characters: left-pad short
        # lyrics with spaces, keep only the tail of long ones.
        if len(sample_seed_text) < SEQUENCE_LENGTH:
            sample_seed_text = ' ' * (SEQUENCE_LENGTH - len(sample_seed_text)) + sample_seed_text
        else:
            sample_seed_text = sample_seed_text[-SEQUENCE_LENGTH:]

        print(f"Seed inicial: '{sample_seed_text}'")
        generated_long_text_fine_tune = generate_long_text(
            model_fine_tune_eval_long_gen, tokenizer_general, sample_seed_text,
            num_generate_chars=NUM_CHARS_FOR_LONG_TEXT_GEN,
            temperature=GENERATION_TEMPERATURE,
            sequence_length=SEQUENCE_LENGTH,
            device=device
        )
        print(f"Amostra Gerada (início): '{generated_long_text_fine_tune[:200]}...'")

        # Score only the characters the model produced: the returned text starts
        # with the seed, which is copied verbatim from the test set.
        generated_continuation = generated_long_text_fine_tune[len(sample_seed_text):]

        print("\nMétricas de Nível de Caractere (Texto Gerado)")
        print(f"  TTR de Caracteres: {calculate_char_ttr(generated_continuation):.4f}")
        print(f"  Taxa de Repetição de 3-gramas: {calculate_ngram_repetition_rate(generated_continuation, n=3):.4f}")

        print("\nMétricas de Nível de Caractere (Conjunto de Teste Real do Artista)")
        combined_actual_test_lyrics = " ".join(clean_text(lyric) for lyric in TEST_LYRICS_ARTIST_FINAL_FOR_GEN)
        # Both metrics depend strongly on text length (the character vocabulary is
        # tiny, so TTR shrinks roughly as 1/length). Compare like with like by
        # truncating the reference to the length of the generated continuation.
        reference_length = len(generated_continuation)
        reference_text = combined_actual_test_lyrics[:reference_length] if reference_length else combined_actual_test_lyrics
        print(f"  (Referência truncada para {len(reference_text)} caracteres, mesmo tamanho do texto gerado)")
        print(f"  TTR de Caracteres: {calculate_char_ttr(reference_text):.4f}")
        print(f"  Taxa de Repetição de 3-gramas: {calculate_ngram_repetition_rate(reference_text, n=3):.4f}")

except FileNotFoundError:
    print(f"Erro: Modelo fine-tuned não encontrado em '{model_fine_tune_save_filepath}'.")

In [None]:
"""
Defines and runs a standardized evaluation function that computes the core
performance metrics (loss, accuracy, perplexity). The function is designed to
be reusable: it loads the appropriate model and test set based on the given
model type ('Geral' or 'Fine-tuned'). Finally, the cell runs this evaluation
for both the general model and the fine-tuned model and prints their results.
"""
print("\n--- Avaliação de Perda e Acurácia (Modelos Geral e Fine-tuned) ---")

def run_accuracy_loss_evaluation(model_type, dropout_strategy_to_load, tokenizer_obj, device_obj):
    """Print test-set loss, next-character accuracy and perplexity for one saved model.

    Args:
        model_type: "Geral" or "Fine-tuned"; selects the weights file and the test
            arrays. Any other value prints a message and returns.
        dropout_strategy_to_load: Strategy name embedded in the weights filename and
            passed to ``CharLSTM``.
        tokenizer_obj: Kept in the signature for callers; not read by this function.
        device_obj: Device the model and each batch are moved to.

    Returns:
        None. Results, and any missing-file or empty-dataset notices, are printed.
    """
    print(f"\nAvaliando: Modelo {model_type} (Dropout: '{dropout_strategy_to_load}')")

    # Resolve where the weights and the test arrays live for this model type.
    if model_type == "Geral":
        weights_dir = MODEL_DIR_BASE_GENERAL
        weights_name = f"model_weights_general_strategy_{dropout_strategy_to_load}_best.pt"
        data_dir = os.path.join(BASE_OUTPUT_DIR_GENERAL, DATA_SUBDIR)
        X_test_path = os.path.join(data_dir, 'X_test_general_char.npy')
        y_test_path = os.path.join(data_dir, 'y_test_general_char.npy')
    elif model_type == "Fine-tuned":
        weights_dir = os.path.join(MODEL_DIR_BASE_ARTIST_FINE_TUNED, CLEAN_TARGET_ARTIST_NAME)
        weights_name = f"model_weights_{CLEAN_TARGET_ARTIST_NAME}_finetuned_strategy_{dropout_strategy_to_load}_best.pt"
        data_dir = os.path.join(BASE_OUTPUT_DIR_ARTIST, CLEAN_TARGET_ARTIST_NAME, DATA_SUBDIR)
        X_test_path = os.path.join(data_dir, f'X_test_{CLEAN_TARGET_ARTIST_NAME}_char.npy')
        y_test_path = os.path.join(data_dir, f'y_test_{CLEAN_TARGET_ARTIST_NAME}_char.npy')
    else:
        print(f"Tipo de modelo inválido: {model_type}")
        return

    weights_path = os.path.join(weights_dir, weights_name)
    if not os.path.exists(weights_path):
        print(f"  - Erro: Modelo não encontrado em '{weights_path}'.")
        return

    model = CharLSTM(
        vocab_size=total_chars_general,
        embedding_dim=EMBEDDING_DIM,
        hidden_dim=HIDDEN_DIM,
        n_layers=N_LAYERS,
        dropout_rate=DROPOUT_RATE,
        dropout_strategy=dropout_strategy_to_load,
    ).to(device_obj)
    saved_state = torch.load(weights_path, map_location=device_obj)
    model.load_state_dict(saved_state)
    model.eval()
    print(f"  - Modelo '{weights_name}' carregado.")

    test_files_present = os.path.exists(X_test_path) and os.path.exists(y_test_path)
    if not test_files_present:
        print(f"  - Aviso: Dados de teste não encontrados em '{X_test_path}'.")
        return

    test_dataset = LyricsDataset(X_test_path, y_test_path)
    if len(test_dataset) == 0:
        print(f"  - Aviso: Dataset de teste em '{X_test_path}' está vazio.")
        return

    test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)
    criterion = nn.CrossEntropyLoss()
    loss_sum = 0.0
    hits = 0
    seen = 0

    with torch.no_grad():
        for inputs, labels in tqdm(test_loader, desc=f"Calculando Loss/Acc ({model_type})"):
            inputs = inputs.to(device_obj)
            labels = labels.to(device_obj)
            scores = model(inputs)
            batch_loss = criterion(scores, labels).item()
            # The criterion averages over the batch; weight by batch size so the
            # final division by the sample count gives a per-sample mean.
            loss_sum += batch_loss * inputs.size(0)
            predicted = torch.max(scores, 1).indices
            hits += (predicted == labels).sum().item()
            seen += labels.size(0)

    if seen > 0:
        avg_loss = loss_sum / seen
        accuracy = hits / seen
    else:
        avg_loss = float('nan')
        accuracy = float('nan')
    perplexity = float('nan') if np.isnan(avg_loss) else np.exp(avg_loss)

    print(f"  - Perda (Cross-Entropy): {avg_loss:.4f}")
    print(f"  - Acurácia (Próximo Caractere): {accuracy:.4f}")
    print(f"  - Perplexidade: {perplexity:.4f}")

# Evaluate both models, each loaded with the 'none' dropout strategy.
for evaluated_model_type in ("Geral", "Fine-tuned"):
    run_accuracy_loss_evaluation(evaluated_model_type, "none", tokenizer_general, device)


--- PART 3: Loss and Accuracy Evaluation (General & Fine-tuned Models) ---

--- ACCURACY & LOSS EVALUATION: Geral Model (Dropout: 'none') ---
Model 'model_weights_general_strategy_none_best.pt' loaded.


Calculating Loss/Acc (Geral): 100%|██████████| 3709/3709 [03:58<00:00, 15.55it/s]



--- Results: Loss & Accuracy (Geral - Dropout: none) ---
Loss (Cross-Entropy): 1.1837
Accuracy (Next Character): 0.6324
Perplexity: 3.2664
-----------------------------------------------------

--- ACCURACY & LOSS EVALUATION: Fine-tuned Model (Dropout: 'none') ---
Model 'model_weights_arianagrande_finetuned_strategy_none_best.pt' loaded.


Calculating Loss/Acc (Fine-tuned): 100%|██████████| 165/165 [00:10<00:00, 15.88it/s]


--- Results: Loss & Accuracy (Fine-tuned - Dropout: none) ---
Loss (Cross-Entropy): 0.7528
Accuracy (Next Character): 0.7932
Perplexity: 2.1230
-----------------------------------------------------





## Comparação das letras geradas a partir de prompts do teste

In [None]:
"""
Performs a side-by-side qualitative comparison between the general model and
the fine-tuned model. Defines a text generation function, loads the tokenizer
and the weights of both models, and builds a list of prompts from the artist's
exclusive test set. Then, for several random examples, prints the prompt, the
real continuation of the song and the continuations generated by each of the
two models, allowing a visual analysis of the improvement obtained with
fine-tuning.
"""
def generate_text_char_level(model, tokenizer, seed_text, num_generate=500, temperature=0.8, sequence_length=SEQUENCE_LENGTH, device=device):
    """Generate text character by character, starting from a cleaned seed.

    The seed is passed through ``clean_text`` and then forced to exactly
    ``sequence_length`` characters (left-padded with spaces, or truncated to its
    tail).

    Args:
        model: Network called on a batch holding one tokenized window; it is
            switched to eval mode here.
        tokenizer: Object providing ``texts_to_sequences`` and an
            ``int_to_char`` mapping.
        seed_text: Raw prompt text.
        num_generate: Maximum number of characters to append.
        temperature: Values ``<= 0`` select the highest-scoring character
            (greedy decoding), matching ``generate_text`` and
            ``generate_long_text``. Previously such values silently fell back
            to sampling at temperature 1. Positive values divide the scores
            before softmax sampling.
        sequence_length: Number of trailing characters fed to the model.
        device: Device the input tensor is moved to.

    Returns:
        The normalized seed followed by the generated characters.
    """
    model.eval()
    # rjust pads short seeds on the left; the slice keeps the tail of long ones.
    processed_seed_text = clean_text(seed_text).rjust(sequence_length)[-sequence_length:]

    generated_text = processed_seed_text
    for _ in range(num_generate):
        current_sequence_str = generated_text[-sequence_length:]
        tokenized_input = tokenizer.texts_to_sequences(current_sequence_str)
        # Stop when the tokenizer returns nothing, or only zeros
        # (presumably the padding/unknown id -- TODO confirm against the tokenizer).
        if not tokenized_input or all(t == 0 for t in tokenized_input):
            break

        input_tensor = torch.tensor(tokenized_input, dtype=torch.long).unsqueeze(0).to(device)
        with torch.no_grad():
            # assumes the model returns one row of scores per sequence -- TODO confirm
            predictions = model(input_tensor).squeeze(0)

        if temperature <= 0:
            # Greedy decoding: dividing by a non-positive temperature is not meaningful.
            next_char_id = torch.argmax(predictions, dim=-1).item()
        else:
            probabilities = torch.softmax(predictions / temperature, dim=-1)
            next_char_id = torch.multinomial(probabilities, 1).item()

        # Ids missing from the mapping contribute an empty string.
        generated_text += tokenizer.int_to_char.get(next_char_id, '')
    return generated_text

try:
    tokenizer_general = load_object(os.path.join(BASE_OUTPUT_DIR_GENERAL, TOKENIZER_SUBDIR, 'char_tokenizer_general.pkl'))
    total_chars_general = load_object(os.path.join(BASE_OUTPUT_DIR_GENERAL, TOKENIZER_SUBDIR, 'vocab_size_general_char.pkl'))
    print(f"Vocabulário de caracteres geral carregado. Tamanho: {total_chars_general}")
except FileNotFoundError as e:
    print(f"Erro: Arquivos do tokenizer geral não encontrados. {e}")
    # Re-raise instead of calling exit(): in a notebook, exit() asks the kernel to
    # shut down rather than cleanly stopping the cell at this line. Raising stops
    # the cell here and keeps the kernel alive.
    raise

MODEL_DROPOUT_STRATEGY_GENERAL = "none"
MODEL_SAVE_FILEPATH_GENERAL = os.path.join(MODEL_DIR_BASE_GENERAL, f"model_weights_general_strategy_{MODEL_DROPOUT_STRATEGY_GENERAL}_best.pt")
model_general = CharLSTM(total_chars_general, EMBEDDING_DIM, HIDDEN_DIM, N_LAYERS, DROPOUT_RATE, MODEL_DROPOUT_STRATEGY_GENERAL).to(device)
try:
    model_general.load_state_dict(torch.load(MODEL_SAVE_FILEPATH_GENERAL, map_location=device))
    model_general.eval()
    print(f"Modelo Geral carregado de: {MODEL_SAVE_FILEPATH_GENERAL}")
except FileNotFoundError:
    # A missing model is tolerated: the comparison below skips it.
    print(f"Aviso: Modelo Geral não encontrado em '{MODEL_SAVE_FILEPATH_GENERAL}'. Será pulado na comparação.")
    model_general = None

MODEL_DROPOUT_STRATEGY_FINE_TUNE = "none"
MODEL_SAVE_FILEPATH_FINE_TUNE = os.path.join(MODEL_DIR_BASE_ARTIST_FINE_TUNED, CLEAN_TARGET_ARTIST_NAME, f"model_weights_{CLEAN_TARGET_ARTIST_NAME}_finetuned_strategy_{MODEL_DROPOUT_STRATEGY_FINE_TUNE}_best.pt")
model_fine_tuned = CharLSTM(total_chars_general, EMBEDDING_DIM, HIDDEN_DIM, N_LAYERS, DROPOUT_RATE, MODEL_DROPOUT_STRATEGY_FINE_TUNE).to(device)
try:
    model_fine_tuned.load_state_dict(torch.load(MODEL_SAVE_FILEPATH_FINE_TUNE, map_location=device))
    model_fine_tuned.eval()
    print(f"Modelo Fine-tuned carregado de: {MODEL_SAVE_FILEPATH_FINE_TUNE}")
except FileNotFoundError:
    print(f"Aviso: Modelo Fine-tuned não encontrado em '{MODEL_SAVE_FILEPATH_FINE_TUNE}'. Será pulado na comparação.")
    model_fine_tuned = None

# Rebuild the artist's test lyrics: clean every lyric, drop empty/placeholder ones,
# shuffle with a fixed seed and keep the first TEST_SPLIT fraction.
# NOTE(review): presumably this mirrors the split made during preprocessing; if the
# two ever diverge, prompts could come from training lyrics -- verify.
try:
    ARTIST_LYRICS_CLEAN_ALL = []
    df_target_artist_full = pd.read_csv(CSV_FILE_PATH_TARGET_ARTIST, usecols=['Lyric'])
    for lyric_val in df_target_artist_full['Lyric']:
        cleaned_lyric = clean_text(lyric_val)
        if cleaned_lyric and PLACEHOLDER_PHRASE not in cleaned_lyric:
            ARTIST_LYRICS_CLEAN_ALL.append(cleaned_lyric)

    random.seed(RANDOM_SEED)
    random.shuffle(ARTIST_LYRICS_CLEAN_ALL)
    num_test_artist_final = int(len(ARTIST_LYRICS_CLEAN_ALL) * TEST_SPLIT)
    TEST_LYRICS_ARTIST_FINAL_FOR_GEN = ARTIST_LYRICS_CLEAN_ALL[:num_test_artist_final]
    print(f"\nColetadas {len(TEST_LYRICS_ARTIST_FINAL_FOR_GEN)} músicas de '{TARGET_ARTIST}' para seleção de prompts.")
except Exception as e:
    # Best effort: report the problem and continue with no prompts.
    print(f"Erro ao carregar dados de teste do artista: {e}")
    TEST_LYRICS_ARTIST_FINAL_FOR_GEN = []

# --- Example generation and comparison ---
NUM_EXAMPLES = 10
NUM_CHARS_TO_GENERATE_PER_EXAMPLE = 250
GENERATION_TEMPERATURE = 0.7

print("\n--- Comparação de Geração de Texto (Modelo Geral vs. Fine-tuned) ---")
if not TEST_LYRICS_ARTIST_FINAL_FOR_GEN:
    print("Não há músicas de teste para gerar prompts. Impossível prosseguir.")
else:
    selected_lyric_indices = random.sample(range(len(TEST_LYRICS_ARTIST_FINAL_FOR_GEN)), min(NUM_EXAMPLES, len(TEST_LYRICS_ARTIST_FINAL_FOR_GEN)))
    for i, lyric_idx in enumerate(selected_lyric_indices):
        original_lyric = TEST_LYRICS_ARTIST_FINAL_FOR_GEN[lyric_idx]

        # Pick a random window that leaves at least 20 real characters after the
        # prompt; lyrics too short for that start at position 0.
        min_len_required = SEQUENCE_LENGTH + 20
        if len(original_lyric) < min_len_required:
            prompt_start_index = 0
        else:
            prompt_start_index = random.randint(0, len(original_lyric) - min_len_required)

        prompt_end_index = prompt_start_index + SEQUENCE_LENGTH
        prompt = original_lyric[prompt_start_index:prompt_end_index]
        real_continuation = original_lyric[prompt_end_index:prompt_end_index + NUM_CHARS_TO_GENERATE_PER_EXAMPLE]

        print(f"\n--- Exemplo {i + 1} " + "="*65)
        print(f"PROMPT DE ENTRADA:\n'{prompt}'")
        print("\nCONTINUAÇÃO REAL:")
        print(f"'{real_continuation}'")

        # Explicit None checks: a model is skipped only when its weights were not found.
        if model_general is not None:
            generated_general = generate_text_char_level(model_general, tokenizer_general, prompt, NUM_CHARS_TO_GENERATE_PER_EXAMPLE, GENERATION_TEMPERATURE)
            print("\nGERADO PELO MODELO GERAL:")
            # The generator returns seed + continuation; print only the continuation.
            print(f"'{generated_general[SEQUENCE_LENGTH:]}'")

        if model_fine_tuned is not None:
            generated_fine_tuned = generate_text_char_level(model_fine_tuned, tokenizer_general, prompt, NUM_CHARS_TO_GENERATE_PER_EXAMPLE, GENERATION_TEMPERATURE)
            print("\nGERADO PELO MODELO FINE-TUNED:")
            print(f"'{generated_fine_tuned[SEQUENCE_LENGTH:]}'")
        print("="*80)

Usando dispositivo: cuda
Objeto carregado de: processed_data_general_char_split\tokenizers\char_tokenizer_general.pkl
Objeto carregado de: processed_data_general_char_split\tokenizers\vocab_size_general_char.pkl
Vocabulário de caracteres GERAL carregado. Tamanho do vocabulário: 36
Modelo Geral carregado de: models_general_char_split\model_weights_general_strategy_none_best.pt
Modelo Fine-tuned carregado de: models_by_artist_char_split_finetuned\arianagrande\model_weights_arianagrande_finetuned_strategy_none_best.pt

Coletadas 29 músicas do artista 'ArianaGrande' para seleção de prompts.

--- Comparação de Geração de Texto (Modelo Geral vs. Fine-tuned) ---
Artista Alvo: ArianaGrande
Quantidade de Exemplos: 10
Caracteres Gerados por Modelo: 100
Temperatura de Geração: 0

--- Exemplo 1 ---
PROMPT DE ENTRADA (100 caracteres):
'is the part when i say i don't want ya i'm stronger than i've been before this is the part when i br'

CONTINUAÇÃO REAL:
'eak free 'cause i can't resist it no more d

## Geração do próximo caractere

In [None]:
"""
Directly compares next-character prediction between the general model and the
fine-tuned model. The cell loads the tokenizer, both trained models and the
artist's exclusive test set. It then selects random examples from this test
set and, for each one, prints the prompt, the actual next character and each
model's prediction, allowing a qualitative and quantitative analysis of the
improvement in character prediction after fine-tuning.
"""
print("\n--- Comparação de Predição de Próximo Caractere ---")
try:
    tokenizer_general = load_object(os.path.join(BASE_OUTPUT_DIR_GENERAL, TOKENIZER_SUBDIR, 'char_tokenizer_general.pkl'))
    total_chars_general = load_object(os.path.join(BASE_OUTPUT_DIR_GENERAL, TOKENIZER_SUBDIR, 'vocab_size_general_char.pkl'))
    print(f"Vocabulário de caracteres geral carregado. Tamanho: {total_chars_general}")
except FileNotFoundError as e:
    print(f"Erro: Arquivos do tokenizer geral não encontrados. {e}")
    # Re-raise instead of calling exit(): in a notebook, exit() asks the kernel to
    # shut down rather than cleanly stopping the cell at this line. Raising stops
    # the cell here and keeps the kernel alive.
    raise

MODEL_DROPOUT_STRATEGY = "none" # Strategy used for both models
MODEL_SAVE_FILEPATH_GENERAL = os.path.join(MODEL_DIR_BASE_GENERAL, f"model_weights_general_strategy_{MODEL_DROPOUT_STRATEGY}_best.pt")
MODEL_SAVE_FILEPATH_FINE_TUNE = os.path.join(MODEL_DIR_BASE_ARTIST_FINE_TUNED, CLEAN_TARGET_ARTIST_NAME, f"model_weights_{CLEAN_TARGET_ARTIST_NAME}_finetuned_strategy_{MODEL_DROPOUT_STRATEGY}_best.pt")

model_general = CharLSTM(total_chars_general, EMBEDDING_DIM, HIDDEN_DIM, N_LAYERS, DROPOUT_RATE, MODEL_DROPOUT_STRATEGY).to(device)
try:
    model_general.load_state_dict(torch.load(MODEL_SAVE_FILEPATH_GENERAL, map_location=device))
    model_general.eval()
    print(f"Modelo Geral carregado de: {MODEL_SAVE_FILEPATH_GENERAL}")
except FileNotFoundError:
    # A missing model is tolerated; its column shows "[Não Carregado]" below.
    print(f"Aviso: Modelo Geral não encontrado em '{MODEL_SAVE_FILEPATH_GENERAL}'.")
    model_general = None

model_fine_tuned = CharLSTM(total_chars_general, EMBEDDING_DIM, HIDDEN_DIM, N_LAYERS, DROPOUT_RATE, MODEL_DROPOUT_STRATEGY).to(device)
try:
    model_fine_tuned.load_state_dict(torch.load(MODEL_SAVE_FILEPATH_FINE_TUNE, map_location=device))
    model_fine_tuned.eval()
    print(f"Modelo Fine-tuned carregado de: {MODEL_SAVE_FILEPATH_FINE_TUNE}")
except FileNotFoundError:
    print(f"Aviso: Modelo Fine-tuned não encontrado em '{MODEL_SAVE_FILEPATH_FINE_TUNE}'.")
    model_fine_tuned = None

X_test_artist_path = os.path.join(BASE_OUTPUT_DIR_ARTIST, CLEAN_TARGET_ARTIST_NAME, DATA_SUBDIR, f'X_test_{CLEAN_TARGET_ARTIST_NAME}_char.npy')
y_test_artist_path = os.path.join(BASE_OUTPUT_DIR_ARTIST, CLEAN_TARGET_ARTIST_NAME, DATA_SUBDIR, f'y_test_{CLEAN_TARGET_ARTIST_NAME}_char.npy')
try:
    test_dataset_artist = LyricsDataset(X_test_artist_path, y_test_artist_path)
    print(f"Dataset de teste do artista carregado com {len(test_dataset_artist)} amostras.")
except FileNotFoundError as e:
    print(f"Erro: Dados de teste do artista não encontrados. {e}")
    test_dataset_artist = None

# Explicit checks instead of relying on object truthiness: the comparison needs a
# loaded, non-empty dataset and at least one loaded model.
has_test_data = test_dataset_artist is not None and len(test_dataset_artist) > 0
has_any_model = model_general is not None or model_fine_tuned is not None

if has_test_data and has_any_model:
    num_examples_to_show = 10
    random_indices = random.sample(range(len(test_dataset_artist)), min(num_examples_to_show, len(test_dataset_artist)))

    with torch.no_grad():
        for i, idx in enumerate(random_indices):
            input_sequence, actual_next_char_id = test_dataset_artist[idx]
            # Add a batch dimension of 1 before calling the models.
            input_sequence_batch = input_sequence.unsqueeze(0).to(device)

            predicted_char_general = "[Não Carregado]"
            if model_general is not None:
                output_general = model_general(input_sequence_batch)
                _, predicted_id_general = torch.max(output_general, 1)
                predicted_char_general = tokenizer_general.sequences_to_texts([predicted_id_general.item()])

            predicted_char_fine_tuned = "[Não Carregado]"
            if model_fine_tuned is not None:
                output_fine_tuned = model_fine_tuned(input_sequence_batch)
                _, predicted_id_fine_tuned = torch.max(output_fine_tuned, 1)
                predicted_char_fine_tuned = tokenizer_general.sequences_to_texts([predicted_id_fine_tuned.item()])

            input_text = tokenizer_general.sequences_to_texts(input_sequence.tolist())
            actual_char = tokenizer_general.sequences_to_texts([actual_next_char_id.item()])

            print(f"\n--- Exemplo {i+1} ---")
            print(f"  Prompt (últimos 40 chars): '...{input_text[-40:]}'")
            print(f"  Caracter Real:                '{actual_char}'")
            print(f"  Previsto (Modelo Geral):      '{predicted_char_general}' {'(Acertou)' if actual_char == predicted_char_general else ''}")
            print(f"  Previsto (Modelo Fine-tuned): '{predicted_char_fine_tuned}' {'(Acertou)' if actual_char == predicted_char_fine_tuned else ''}")

    print("\nComparação de predição de caracteres concluída.")
else:
    print("\nComparação não pôde ser executada devido à falta do dataset de teste ou de ambos os modelos.")

Usando dispositivo: cuda
Objeto carregado de: processed_data_general_char_split\tokenizers\char_tokenizer_general.pkl
Objeto carregado de: processed_data_general_char_split\tokenizers\vocab_size_general_char.pkl
Vocabulário de caracteres GERAL carregado. Tamanho do vocabulário: 36
Modelo Geral carregado de: models_general_char_split\model_weights_general_strategy_none_best.pt
Modelo Fine-tuned carregado de: models_by_artist_char_split_finetuned\arianagrande\model_weights_arianagrande_finetuned_strategy_none_best.pt
Dataset de teste do artista carregado com 42190 amostras.

--- Comparação de Predições de Próximo Caractere (10 Exemplos) ---
Usando prompts do conjunto de TESTE EXCLUSIVO do artista: ArianaGrande

--- Exemplo 1 ---
  Input (últimos 30 caracteres do prompt): 'ss stole you wouldn't let anyb'
  Caracter Real Seguinte:         'o'
  Previsto (Modelo Geral):        'o' (Acertou)
  Previsto (Modelo Fine-tuned):   'o' (Acertou)
------------------------------

--- Exemplo 2 ---
  I

## Geração de letras

In [None]:
"""
Defines a text generation function and uses it to create a new song lyric
from an initial prompt. This cell assumes that the models and the tokenizer
are available to be loaded and that the base settings are defined. It loads
the fine-tuned model, sets a prompt and generation parameters (temperature,
length) and prints the resulting text.
"""
def generate_text(model, tokenizer, seed_text, num_generate=500, temperature=0.8, sequence_length=SEQUENCE_LENGTH, device=device):
    """Generate text one character at a time from a seed, with temperature-controlled sampling.

    The seed is cleaned with ``clean_text`` and then forced to exactly
    ``sequence_length`` characters (left-padded with spaces, or cut to its tail).
    Progress is shown with a tqdm bar and the normalized seed is printed first.

    Args:
        model: Network called on a batch holding one tokenized window.
        tokenizer: Object providing ``texts_to_sequences`` and ``int_to_char``.
        seed_text: Raw prompt text.
        num_generate: Maximum number of characters to append.
        temperature: ``<= 0`` picks the highest-scoring character; otherwise
            scores are divided by it before softmax sampling.
        sequence_length: Number of trailing characters fed to the model.
        device: Device the input tensor is moved to.

    Returns:
        The normalized seed followed by the generated characters.
    """
    model.eval()

    # rjust pads short seeds on the left; the slice keeps the tail of long ones.
    normalized_seed = clean_text(seed_text).rjust(sequence_length)[-sequence_length:]

    lyric = normalized_seed
    print(f"Iniciando geração com seed: '{lyric}'")
    print("-" * 50)

    for _ in tqdm(range(num_generate), desc="Gerando caracteres"):
        window = lyric[-sequence_length:]
        token_ids = tokenizer.texts_to_sequences(window)

        # Usable input means at least one token, and not every token equal to 0
        # (presumably the padding/unknown id -- TODO confirm against the tokenizer).
        input_is_usable = bool(token_ids) and any(token != 0 for token in token_ids)
        if not input_is_usable:
            print("\n[Aviso: Sequência de input inválida, encerrando geração.]")
            break

        with torch.no_grad():
            batch = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0).to(device)
            # assumes the model returns one row of scores per sequence -- TODO confirm
            scores = model(batch).squeeze(0)

        if temperature <= 0:
            chosen_id = torch.argmax(scores, dim=-1).item()
        else:
            distribution = torch.softmax(scores / temperature, dim=-1)
            chosen_id = torch.multinomial(distribution, 1).item()

        # Ids missing from the mapping contribute an empty string.
        lyric += tokenizer.int_to_char.get(chosen_id, '')
    return lyric

TOKENIZER_PATH_GENERAL = os.path.join(BASE_OUTPUT_DIR_GENERAL, TOKENIZER_SUBDIR, 'char_tokenizer_general.pkl')
VOCAB_SIZE_PATH_GENERAL = os.path.join(BASE_OUTPUT_DIR_GENERAL, TOKENIZER_SUBDIR, 'vocab_size_general_char.pkl')
try:
    tokenizer = load_object(TOKENIZER_PATH_GENERAL)
    total_chars = load_object(VOCAB_SIZE_PATH_GENERAL)
    print(f"Vocabulário geral carregado. Tamanho: {total_chars}")
except FileNotFoundError as e:
    print(e)
    # Re-raise instead of calling exit(): in a notebook, exit() asks the kernel to
    # shut down rather than cleanly stopping the cell at this line. Raising stops
    # the cell here and keeps the kernel alive.
    raise

MODEL_DROPOUT_STRATEGY = "none"
MODEL_SAVE_FILEPATH_FINE_TUNED = os.path.join(MODEL_DIR_BASE_ARTIST_FINE_TUNED, CLEAN_TARGET_ARTIST_NAME, f"model_weights_{CLEAN_TARGET_ARTIST_NAME}_finetuned_strategy_{MODEL_DROPOUT_STRATEGY}_best.pt")

if not os.path.exists(MODEL_SAVE_FILEPATH_FINE_TUNED):
    print(f"Erro: Modelo fine-tuned não encontrado em '{MODEL_SAVE_FILEPATH_FINE_TUNED}'.")
    # Stop the cell with an exception (see the note above about exit()).
    raise FileNotFoundError(MODEL_SAVE_FILEPATH_FINE_TUNED)

model = CharLSTM(
    vocab_size=total_chars,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    n_layers=N_LAYERS,
    dropout_rate=DROPOUT_RATE,
    dropout_strategy=MODEL_DROPOUT_STRATEGY
).to(device)

model.load_state_dict(torch.load(MODEL_SAVE_FILEPATH_FINE_TUNED, map_location=device))
print(f"Modelo fine-tuned de '{TARGET_ARTIST}' carregado de: {MODEL_SAVE_FILEPATH_FINE_TUNED}")

prompt_inicial = "don't want nobody else around me just need you right here you're like the only thing that i see it's"
num_chars_to_generate = 500
generation_temperature = 0.7

print("\n--- INICIANDO GERAÇÃO DE LETRA DE MÚSICA ---")
generated_lyric = generate_text(model, tokenizer, prompt_inicial, num_chars_to_generate, generation_temperature)

print("\n--- TEXTO GERADO ---")
# generate_text returns seed + continuation; show only the newly generated part.
print(generated_lyric[SEQUENCE_LENGTH:])
print("=" * 80)

Usando dispositivo: cuda
Objeto carregado de: processed_data_general_char_split\tokenizers\char_tokenizer_general.pkl
Objeto carregado de: processed_data_general_char_split\tokenizers\vocab_size_general_char.pkl
Vocabulário de caracteres GERAL carregado. Tamanho do vocabulário: 36
Modelo fine-tuned de 'ArianaGrande' carregado de: models_by_artist_char_split_finetuned\arianagrande\model_weights_arianagrande_finetuned_strategy_none_best.pt

--- INICIANDO GERAÇÃO DE LETRA DE MÚSICA ---
Artista: ArianaGrande
Prompt: 'don't want nobody else around me just need you right here you're like the only thing that i see it's'
Caracteres a gerar: 120
Temperatura: 0
Iniciando geração com seed (ajustada para 100 caracteres): 'don't want nobody else around me just need you right here you're like the only thing that i see it's'
--------------------------------------------------


Gerando caracteres: 100%|██████████| 120/120 [00:02<00:00, 58.31it/s]


--- APENAS O TEXTO NOVO GERADO ---
 love i just want to break your heart right back yeah all this time i was blind running 'round telling everybody my baby



