In [1]:
import pandas as pd

In [2]:
df = pd.read_json('./data/data.json', orient='records')
df.head()

Unnamed: 0,name,cmc,colours,type,subtype,power,toughness,loyalty,tokens
0,+Two Mace,0.125,"[1, 0, 0, 0, 0]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.0,"[1, 96, 0, 40, 186, 96, 114, 114, 10, 170, 12,..."
1,Aarakocra Sneak,0.25,"[0, 1, 0, 0, 0]","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.055556,0.2,0.0,"[1, 51, 31, 3, 0, 23, 4, 431, 9, 627, 2]"
2,"Aatchik, Emerald Radian",0.375,"[0, 0, 1, 0, 1]","[0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0, 1.0,...",0.166667,0.15,0.0,"[1, 31, 52, 23, 65, 5, 131, 162, 442, 0, 44, 4..."
3,Abaddon the Despoiler,0.3125,"[0, 1, 1, 1, 0]","[0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0]","[0.0, 0.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.277778,0.25,0.0,"[1, 126, 151, 11, 15, 30, 4, 29, 34, 11, 53, 2..."
4,Abandoned Campground,0.0,"[1, 1, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.0,"[1, 3, 61, 23, 74, 159, 5, 37, 88, 688, 16, 12..."


In [3]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import pandas as pd
import numpy as np

class MTGCardDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe
        
    def __len__(self):
        return len(self.dataframe)
    
    def __getitem__(self, idx):
        # Get a single row from the dataframe
        row = self.dataframe.iloc[idx]
        
        # Extract token indices (for the RNN)
        original_tokens = row['tokens']
        shifted_tokens = [t + 1 for t in original_tokens]  # Add 1 to all indices Allows for padding
        tokens = torch.tensor(shifted_tokens, dtype=torch.long)
        
        # Extract numerical features (for the FC network)
        numerical_features = []
        
        # Add CMC (converted mana cost)
        numerical_features.append(row['cmc'])
        
        # Add other numerical features
        numerical_features.extend(row['colours'])
        numerical_features.extend(row['type'])
        numerical_features.extend(row['subtype'])
        numerical_features.append(row['power'])
        numerical_features.append(row['toughness'])
        numerical_features.append(row['loyalty'])
        
        # Convert to tensor
        numerical_features = torch.tensor(numerical_features, dtype=torch.float)
        
        return {
            'tokens': tokens,
            'numerical_features': numerical_features,
            'name': row['name']
        }

dataset = MTGCardDataset(df)  # df is your pandas dataframe

In [4]:
dataset[0]

{'tokens': tensor([  2,  97,   1,  41, 187,  97, 115, 115,  11, 171,  13,  14,   1,   5,
          15,  97, 100,  34,   6, 110,   3]),
 'numerical_features': tensor([ 0.1250,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  1.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000, -1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000]),
 'name': '+Two Mace'}

In [5]:
def collate_mtg_cards(batch):
    """
    Custom collate function to handle variable-length token sequences.
    """
    # Extract each element from the batch
    names = [item['name'] for item in batch]
    token_sequences = [item['tokens'] for item in batch]
    numerical_features = [item['numerical_features'] for item in batch]
    
    # Get lengths of each sequence for packing
    lengths = torch.tensor([len(seq) for seq in token_sequences], dtype=torch.long)
    
    # Sort sequences by length in descending order for efficient packing
    sorted_indices = torch.argsort(lengths, descending=True)
    sorted_lengths = lengths[sorted_indices]
    sorted_token_sequences = [token_sequences[i] for i in sorted_indices]
    sorted_numerical_features = [numerical_features[i] for i in sorted_indices]
    sorted_names = [names[i] for i in sorted_indices]
    
    # Pad token sequences
    padded_tokens = pad_sequence(sorted_token_sequences, batch_first=True, padding_value=0)
    
    # Stack numerical features
    stacked_numerical_features = torch.stack(sorted_numerical_features)
    
    return {
        'tokens': padded_tokens,
        'token_lengths': sorted_lengths,
        'numerical_features': stacked_numerical_features,
        'names': sorted_names
    }




In [6]:
from sklearn.model_selection import train_test_split

# Assuming df is your DataFrame
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# Now create your datasets
train_dataset = MTGCardDataset(train_df)
val_dataset = MTGCardDataset(val_df)

# Create your dataloaders
train_dataloader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_mtg_cards
)

val_dataloader = DataLoader(
    val_dataset,
    batch_size=32,
    shuffle=False,
    collate_fn=collate_mtg_cards
)

In [7]:
from gensim.models import Word2Vec
import numpy as np
import torch.nn as nn

def load_word2vec_embedding(model_path, padding_idx=0):
    word2vec_model = Word2Vec.load(model_path)
    
    # Get embedding dimension from the model
    embedding_dim = word2vec_model.vector_size
    vocab_size = len(word2vec_model.wv)
    
    # Initialize embedding matrix with zeros
    # Add 1 to vocab_size to account for padding token
    embedding_matrix = np.zeros((vocab_size + 1, embedding_dim))
    
    # Fill the embedding matrix with word vectors
    for i, word in enumerate(word2vec_model.wv.index_to_key):
        # Add 1 to index to reserve index 0 for padding
        embedding_matrix[i + 1] = word2vec_model.wv[word]
    
    # Convert to torch tensor
    embedding_weights = torch.FloatTensor(embedding_matrix)
    
    # Create embedding layer initialized with pre-trained weights
    embedding_layer = nn.Embedding.from_pretrained(
        embedding_weights,
        padding_idx=padding_idx,
        freeze=False  # Set to True if you don't want to fine-tune the embeddings
    )
    
    return embedding_layer, vocab_size + 1  


In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class MTGCardAutoencoderWithTextDecoding(nn.Module):
    def __init__(self, vocab_size, embedding_layer, hidden_dim, numerical_dim, latent_dim):
        super(MTGCardAutoencoderWithTextDecoding, self).__init__()
        
        # Use pre-trained embedding layer
        self.embedding = embedding_layer
        embedding_dim = embedding_layer.embedding_dim
        
        # --- ENCODER COMPONENTS ---
        # RNN for encoding token sequences
        self.encoder_rnn = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        
        # FC network for processing numerical features
        self.fc_numerical = nn.Sequential(
            nn.Linear(numerical_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU()
        )
        
        # Combined encoder to latent space
        self.fc_combined = nn.Linear(hidden_dim * 2, latent_dim)
        
        # --- DECODER COMPONENTS ---
        # Decoder for numerical features
        self.numerical_decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, numerical_dim),
            nn.Tanh()  # For normalized numerical features
        )
        
        # Decoder for token sequences
        self.decoder_initial = nn.Linear(latent_dim, hidden_dim)
        self.decoder_rnn = nn.GRU(embedding_dim + latent_dim, hidden_dim, batch_first=True)
        self.decoder_fc = nn.Linear(hidden_dim, vocab_size)
        
        # Store for later use
        self.vocab_size = vocab_size
        self.latent_dim = latent_dim
        self.hidden_dim = hidden_dim
        
    def encode(self, tokens, token_lengths, numerical_features):
        # Process token sequences with RNN
        embedded = self.embedding(tokens)
        
        # Pack padded sequence
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, token_lengths, batch_first=True
        )
        
        _, rnn_hidden = self.encoder_rnn(packed)
        rnn_hidden = rnn_hidden.squeeze(0)  # Shape: [batch_size, hidden_dim]
        
        # Process numerical features
        numerical_encoding = self.fc_numerical(numerical_features)
        
        # Combine both encodings
        combined = torch.cat([rnn_hidden, numerical_encoding], dim=1)
        latent = self.fc_combined(combined)
        
        return latent
    
    def decode_numerical(self, latent):
        # Decode numerical features
        return self.numerical_decoder(latent)
    
    def decode_tokens(self, latent, tokens, teacher_forcing_ratio=0.5):
        batch_size = latent.size(0)
        max_len = tokens.size(1)
        
        # Initialize tensor to store outputs
        outputs = torch.zeros(batch_size, max_len, self.vocab_size).to(latent.device)
        
        # Initialize decoder hidden state from latent vector
        hidden = self.decoder_initial(latent).unsqueeze(0)  # Add sequence dimension
        
        # First input is the special start token (we'll use zeros for simplicity)
        decoder_input = torch.zeros(batch_size, 1, self.embedding.embedding_dim).to(latent.device)
        
        # Expand latent vector to be concatenated with each input
        expanded_latent = latent.unsqueeze(1).expand(-1, max_len, -1)
        
        for t in range(max_len):
            # Concatenate decoder input with latent vector for each time step
            decoder_input_with_latent = torch.cat([decoder_input, latent.unsqueeze(1)], dim=2)
            
            # Feed through RNN
            output, hidden = self.decoder_rnn(decoder_input_with_latent, hidden)
            
            # Project to vocabulary space
            output = self.decoder_fc(output.squeeze(1))
            outputs[:, t] = output
            
            # Teacher forcing: use ground truth as next input with probability
            use_teacher_forcing = (torch.rand(1).item() < teacher_forcing_ratio)
            
            if use_teacher_forcing and t < max_len - 1:
                # Use actual next token from target
                decoder_input = self.embedding(tokens[:, t+1].unsqueeze(1))
            else:
                # Use own prediction
                top1 = output.max(1)[1]
                decoder_input = self.embedding(top1.unsqueeze(1))
        
        return outputs
    
    def forward(self, tokens, token_lengths, numerical_features, teacher_forcing_ratio=0.5):
        # Encode input to latent space
        latent = self.encode(tokens, token_lengths, numerical_features)
        
        # Decode numerical features
        reconstructed_numerical = self.decode_numerical(latent)
        
        # Decode token sequence
        reconstructed_tokens = self.decode_tokens(latent, tokens, teacher_forcing_ratio)
        
        return reconstructed_numerical, reconstructed_tokens, latent



In [9]:
# Initialize the model
hidden_dim = 128
numerical_dim = len(dataset[0]['numerical_features'])
latent_dim = 64  # Desired embedding dimension

embedding_layer, vocab_size = load_word2vec_embedding("./models/word2vec.model")

# Training setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MTGCardAutoencoderWithTextDecoding(vocab_size, embedding_layer, hidden_dim, numerical_dim, latent_dim).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Loss functions
numerical_criterion = nn.MSELoss()
token_criterion = nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding tokens (index 0)
alpha = 0.5  # Balance between numerical and token losses

## Evaluation

def evaluate(model, val_dataloader, numerical_criterion, token_criterion, device):
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            # Extract inputs from batch
            tokens = batch['tokens'].to(device)
            token_lengths = batch['token_lengths']
            numerical_features = batch['numerical_features'].to(device)
            
            # Forward pass
            reconstructed_numerical, reconstructed_tokens, latent = model(
                tokens, token_lengths, numerical_features, teacher_forcing_ratio=0.5
            )
            
            # Compute numerical reconstruction loss
            numerical_loss = numerical_criterion(reconstructed_numerical, numerical_features)
            
            # Compute token reconstruction loss
            token_preds = reconstructed_tokens.view(-1, vocab_size)
            token_targets = tokens.view(-1)
            token_loss = token_criterion(token_preds, token_targets)
            
            # Combine losses
            loss = alpha * numerical_loss + (1 - alpha) * token_loss
            val_loss += loss.item()

    return val_loss / len(val_dataloader)


In [10]:
# Training loop
num_epochs = 50
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    numerical_losses = 0
    token_losses = 0
    
    for batch in train_dataloader:
        tokens = batch['tokens'].to(device)
        token_lengths = batch['token_lengths']
        numerical_features = batch['numerical_features'].to(device)
        
        # Forward pass
        reconstructed_numerical, reconstructed_tokens, latent = model(
            tokens, token_lengths, numerical_features, teacher_forcing_ratio=0.5
        )
        
        # Compute numerical reconstruction loss
        numerical_loss = numerical_criterion(reconstructed_numerical, numerical_features)
        
        # Compute token reconstruction loss
        # Reshape predictions to [batch_size*seq_len, vocab_size]
        token_preds = reconstructed_tokens.view(-1, vocab_size)
        # Reshape targets to [batch_size*seq_len]
        token_targets = tokens.view(-1)
        token_loss = token_criterion(token_preds, token_targets)
        
        # Combine losses        
        loss = alpha * numerical_loss + (1 - alpha) * token_loss
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        numerical_losses += numerical_loss.item()
        token_losses += token_loss.item()
    
    val_loss = evaluate(model, val_dataloader, numerical_criterion, token_criterion, device)

    print(f"\nEpoch [{epoch+1}/{num_epochs}], "
          f"Train Loss: {total_loss/len(train_dataloader):.6f} - ("
          f"Numerical: {numerical_losses/len(train_dataloader):.6f}, "
          f"Token: {token_losses/len(train_dataloader):.6f}), "
          f" Val Loss:{val_loss:.6f}")



Epoch [1/50], Train Loss: 1.884842 - (Numerical: 0.034641, Token: 3.735042),  Val Loss:1.373214

Epoch [2/50], Train Loss: 1.168256 - (Numerical: 0.016346, Token: 2.320165),  Val Loss:1.052724

Epoch [3/50], Train Loss: 0.944260 - (Numerical: 0.011963, Token: 1.876557),  Val Loss:0.911036

Epoch [4/50], Train Loss: 0.843249 - (Numerical: 0.009874, Token: 1.676625),  Val Loss:0.783415

Epoch [5/50], Train Loss: 0.778206 - (Numerical: 0.008637, Token: 1.547774),  Val Loss:0.777831

Epoch [6/50], Train Loss: 0.721446 - (Numerical: 0.007860, Token: 1.435031),  Val Loss:0.750574

Epoch [7/50], Train Loss: 0.694269 - (Numerical: 0.007215, Token: 1.381322),  Val Loss:0.703223

Epoch [8/50], Train Loss: 0.660767 - (Numerical: 0.006624, Token: 1.314911),  Val Loss:0.676384

Epoch [9/50], Train Loss: 0.631860 - (Numerical: 0.006307, Token: 1.257413),  Val Loss:0.657009

Epoch [10/50], Train Loss: 0.602680 - (Numerical: 0.005794, Token: 1.199566),  Val Loss:0.617204

Epoch [11/50], Train Loss: 0

In [11]:
torch.save(model.state_dict(), "./models/AE_weights.pt")

In [12]:
# Generate embeddings for all cards
model.eval()
all_embeddings = []
all_names = []

dataset = MTGCardDataset(df)  # df is your pandas dataframe

dataloader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_mtg_cards
)

with torch.no_grad():
    for batch in dataloader:
        tokens = batch['tokens'].to(device)
        token_lengths = batch['token_lengths']
        numerical_features = batch['numerical_features'].to(device)
        names = batch['names']
        
        _, _, latent = model(tokens, token_lengths, numerical_features)
        
        all_embeddings.append(latent.cpu().numpy())
        all_names.extend(names)

all_embeddings = np.vstack(all_embeddings)

In [None]:
import os
import json
from datetime import datetime

def save_embeddings(embeddings, names, directory="saved_embeddings"):
    """
    Save embeddings and corresponding card names using various methods
    
    Args:
        embeddings: NumPy array of embeddings
        names: List of card names
        directory: Directory to save files in
    """
    os.makedirs(directory, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{directory}/mtg_embeddings_{timestamp}.json"
    
    data = {
        'embeddings': embeddings.tolist(),
        'names': names,
        'metadata': {
            'date': timestamp,
            'embedding_dim': embeddings.shape[1],
            'num_cards': len(names)
        }
    }
    
    with open(filename, 'w') as f:
        json.dump(data, f)
    print(f"Saved embeddings and names to {filename}")
    return filename

save_path = save_embeddings(all_embeddings, all_names)


Saved embeddings and names to saved_embeddings/mtg_embeddings_20250728_153743.json
