# Mục tiêu là sử dụng mô hình seq2seq để classification tất cả các nghệ sĩ đã biết (đã label) 
+ sau khi train xong thì loại bỏ FC cuối để lấy đc embedding vector của các tác giả

## Import thư viện 

In [35]:
import numpy as np 
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
import re
from sklearn.model_selection import train_test_split
from itables import show  

# 1. Load data

In [36]:
vocab_df = pd.read_csv("../exps/Preproccessed/exp2_vocab.csv")  # columns: word,id
vocab = dict(zip(vocab_df["Artist Name"], vocab_df['Class']))
vocab_size = len(vocab) + 1  # +1 for unknown token
train_df = pd.read_csv("../exps/Preproccessed/exp2_NamesLabeling_Train.csv")  # columns: text,label
test_df  = pd.read_csv("../data/test.csv")

In [37]:

# -----------------------------
# 1. Load vocab
# -----------------------------
vocab_df = pd.read_csv("../exps/Preproccessed/exp2_vocab.csv")  # columns: word,id
vocab = dict(zip(vocab_df["Artist Name"], vocab_df['Class']))
vocab_size = len(vocab) + 1  # +1 for unknown token

# -----------------------------
# 2. Load train/test
# -----------------------------
train_df = pd.read_csv("../exps/Preproccessed/exp2_NamesLabeling_Train.csv")  # columns: text,label
test_df  = pd.read_csv("../data/test.csv")


# -----------------------------
# 3. Tokenizer for comma-separated text
# -----------------------------
def text_to_ids(text, vocab):
    # Split by comma, strip spaces, lowercase if needed
    tokens = [tok.strip() for tok in re.split(r',\s*', text)]
    ids = [vocab.get(tok, 0) for tok in tokens]  # unknown token -> 0
    return ids

train_df['seq'] = train_df['Artist Name'].apply(lambda x: text_to_ids(x, vocab))
test_df['seq']  = test_df['Artist Name'].apply(lambda x: text_to_ids(x, vocab))

# -----------------------------
# 4. Encode labels to integers
# -----------------------------
le = LabelEncoder()
train_df['Class'] = le.fit_transform(train_df['Class'])

# -----------------------------
# 5. Prepare sequences and Class
# -----------------------------
train_sequences = train_df['seq'].tolist()
train_Class    = train_df['Class'].tolist()

test_sequences = test_df['seq'].tolist()


X_train, X_val, y_train, y_val = train_test_split(
    train_sequences, train_Class, test_size=0.2, random_state=42, shuffle=True
)

# -----------------------------
# 6. PyTorch Dataset
# -----------------------------
class TextDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return torch.tensor(self.X[idx], dtype=torch.long), torch.tensor(self.y[idx], dtype=torch.long)

def collate_fn(batch):
    sequences, labels = zip(*batch)
    padded = nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=0)
    return padded, torch.tensor(labels)

train_dataset = TextDataset(train_sequences, train_Class)
val_dataset   = TextDataset(X_val, y_val)
# test_dataset  = TextDataset(test_sequences, test_labels)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)
val_loader   = DataLoader(val_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)
# test_loader  = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)

# -----------------------------
# 7. Define LSTM model
# -----------------------------
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, num_layers=1, dropout=0.2):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0
        )
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x, return_embedding=False):
        x = self.embedding(x)
        out, (h, c) = self.lstm(x)
        last_hidden = h[-1]  # shape: (batch, hidden_dim)
        if return_embedding:
            return last_hidden  # return vector instead of logits
        logits = self.fc(last_hidden)
        return logits


# -----------------------------
# 8. Training setup
# -----------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"

model = LSTMClassifier(
    vocab_size=vocab_size,
    embed_dim=128,
    hidden_dim=16,
    num_classes=11,
    num_layers=2
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# -----------------------------
# 9. Training loop
# -----------------------------
EPOCHS = 10

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        logits = model(X_batch)
        loss = criterion(logits, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{EPOCHS}, loss={total_loss:.4f}")

# -----------------------------
# 10. Evaluation
# -----------------------------
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for X_batch, y_batch in val_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        logits = model(X_batch)
        preds = torch.argmax(logits, dim=1)
        correct += (preds == y_batch).sum().item()
        total += len(y_batch)

print("Test Accuracy:", correct / total)


Epoch 1/10, loss=304.7251
Epoch 2/10, loss=174.2586
Epoch 3/10, loss=133.8014
Epoch 4/10, loss=123.7112
Epoch 5/10, loss=120.0811
Epoch 6/10, loss=114.7362
Epoch 7/10, loss=114.5876
Epoch 8/10, loss=113.3978
Epoch 9/10, loss=112.3976
Epoch 10/10, loss=112.2584
Test Accuracy: 0.8231292517006803


In [38]:
def get_embeddings_ordered(sequences, model, device, batch_size=64):
    # Tạo Dataset tạm thời
    # Lưu ý: Labels để None vì ta chỉ cần X để lấy vector, không cần tính loss
    dataset = TextDataset(sequences, [0]*len(sequences)) 
    
    # QUAN TRỌNG NHẤT: shuffle=False để giữ đúng thứ tự index
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
    
    model.eval()
    embeddings = []
    
    with torch.no_grad():
        for X_batch, _ in loader:
            X_batch = X_batch.to(device)
            # return_embedding=True để lấy vector
            batch_emb = model(X_batch, return_embedding=True) 
            embeddings.append(batch_emb.cpu())
            
    # Nối các batch lại thành 1 tensor lớn
    return torch.cat(embeddings, dim=0).numpy()

In [39]:
# 2. Load lại File Gốc (Target Files)
# Chúng ta cần trích xuất đặc trưng cho file gốc mà bạn muốn ghép cột vào
# (Dù train trên file clean, nhưng predict phải chạy trên file gốc)
full_train_target = pd.read_csv("../data/train.csv")
full_test_target  = pd.read_csv("../data/test.csv")

print(f"Số dòng file gốc cần ghép: Train={len(full_train_target)}, Test={len(full_test_target)}")

# 3. Chuẩn bị Sequence cho file gốc
# Dùng vocab đã có để chuyển tên thành số
full_train_target['seq'] = full_train_target['Artist Name'].apply(lambda x: text_to_ids(x, vocab))
full_test_target['seq']  = full_test_target['Artist Name'].apply(lambda x: text_to_ids(x, vocab))

# 4. Chạy trích xuất
print("Đang trích xuất Train Embeddings...")
train_emb_data = get_embeddings_ordered(full_train_target['seq'].tolist(), model, device)

print("Đang trích xuất Test Embeddings...")
test_emb_data  = get_embeddings_ordered(full_test_target['seq'].tolist(), model, device)

# 5. Lưu kết quả
# Tạo tên cột động: emb_0, emb_1...
emb_dim = train_emb_data.shape[1]
col_names = [f"artist_emb_{i}" for i in range(emb_dim)]

df_train_emb = pd.DataFrame(train_emb_data, columns=col_names)
df_test_emb  = pd.DataFrame(test_emb_data, columns=col_names)

# Kiểm tra lần cuối
if len(df_train_emb) == len(full_train_target):
    print("✅ Check OK: Số lượng dòng khớp nhau tuyệt đối.")
    
    # Lưu file
    df_train_emb.to_csv("../exps/Preproccessed/train_embeddings_lstm.csv", index=False)
    df_test_emb.to_csv("../exps/Preproccessed/test_embeddings_lstm.csv", index=False)
    print("Đã lưu file thành công tại ../exps/Preproccessed/")
else:
    print(f"Lệch dòng! Gốc: {len(full_train_target)}, Emb: {len(df_train_emb)}")

# 6. (Tùy chọn) Ghép thử luôn để xem
train_final = pd.concat([full_train_target, df_train_emb], axis=1)
show(train_final.head())

Số dòng file gốc cần ghép: Train=14396, Test=3600
Đang trích xuất Train Embeddings...
Đang trích xuất Test Embeddings...
✅ Check OK: Số lượng dòng khớp nhau tuyệt đối.
Đã lưu file thành công tại ../exps/Preproccessed/


0
Loading ITables v2.5.2 from the internet...  (need help?)
