In [2]:
import ast
import json
import os
from collections import Counter, defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# import lightning as pl
from datasets import load_dataset
from scipy.stats import norm, multivariate_normal
from tqdm import tqdm

seed = 42
np.random.seed(seed)
# Seed torch as well so weight init, dropout masks and latent sampling are repeatable.
torch.manual_seed(seed)
# pl.seed_everything(seed)

# `device` is referenced by the model, training and checkpoint-loading cells,
# so it has to be defined for the notebook to run on a fresh kernel.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

## Define Tag Categories

Define all possible tags for each category based on the dataset.

In [3]:
# Load the category -> tags taxonomy. The context manager closes the file handle
# (the bare json.load(open(...)) form leaves it open until garbage collection).
with open("../data/concepts_to_tags.json", "r", encoding="utf-8") as taxonomy_file:
    TAXONOMY = json.load(taxonomy_file)

CATEGORIES = list(TAXONOMY.keys())

# Reverse map for easy lookup (tag -> category).
# If a tag is listed under several categories, the last category wins.
TAG_TO_CATEGORY = {tag: cat for cat, tags in TAXONOMY.items() for tag in tags}


In [4]:
# Lookup tables for the flat multi-hot layout: categories are laid out one after
# another, and each category owns the half-open index range stored in cat_ranges.
tag_to_idx = {}   # tag -> global position
idx_to_tag = {}   # global position -> (category, tag)
cat_ranges = {}   # category -> (start, end), end exclusive

current_idx = 0
for cat in CATEGORIES:
    cat_tags = list(TAXONOMY[cat])
    for position, tag in enumerate(cat_tags, start=current_idx):
        tag_to_idx[tag] = position
        idx_to_tag[position] = (cat, tag)
    cat_ranges[cat] = (current_idx, current_idx + len(cat_tags))
    current_idx += len(cat_tags)

TOTAL_INPUT_DIM = current_idx
print(f"Total Input Dimension: {TOTAL_INPUT_DIM}")

Total Input Dimension: 400


## Prepare dataset

In [4]:
def process_data_multilabel(df: pd.DataFrame) -> np.ndarray:
    """
    Creates a Multi-Hot vector for every song.
    Example: [0, 1, 0, 1, 1, ...] where 1 means the tag is present.

    Args:
        df: Frame with an 'aspect_list' column. Each cell is either the string
            repr of a list of tags (parsed with ast.literal_eval) or an
            already-parsed list of tags.

    Returns:
        float32 array of shape (n_kept, TOTAL_INPUT_DIM). Rows with no tag in
        `tag_to_idx` are dropped, so n_kept can be smaller than len(df). When
        nothing is kept the result is still 2-D, with shape (0, TOTAL_INPUT_DIM).
    """
    vectors = []

    # Iterate the column directly; iterrows() builds a Series per row for no benefit.
    for raw in df['aspect_list']:
        tags = ast.literal_eval(raw) if isinstance(raw, str) else raw
        # Tags are matched case-insensitively against the (lower-case) taxonomy.
        hits = [tag_to_idx[t] for t in (tag.lower() for tag in tags) if t in tag_to_idx]

        # Only keep records that have at least one valid tag
        if not hits:
            continue

        vector = np.zeros(TOTAL_INPUT_DIM, dtype=np.float32)
        vector[hits] = 1.0
        vectors.append(vector)

    if not vectors:
        # np.array([]) would be 1-D; keep the feature axis so downstream code can rely on it.
        return np.zeros((0, TOTAL_INPUT_DIM), dtype=np.float32)
    return np.stack(vectors)

In [5]:
# Load the "train" split of google/MusicCaps via datasets.load_dataset and convert it to pandas.
# NOTE(review): the name `df` is rebound to the MTG-Jamendo CSV in the "Generate tags" section.
df = load_dataset("google/MusicCaps", split="train").to_pandas()

In [6]:
# Multi-hot training matrix; rows without any taxonomy tag are dropped by process_data_multilabel.
data = process_data_multilabel(df)
print(f"Processed data shape: {data.shape}")

Processed data shape: (5163, 400)


## VAE

In [7]:
class MultiLabelVAE(nn.Module):
    """Denoising VAE over multi-hot tag vectors.

    One hidden layer on each side: the encoder maps the input to the mean and
    log-variance of a diagonal Gaussian, the decoder maps a latent sample back
    to one sigmoid probability per input dimension.

    Args:
        input_dim: Length of the multi-hot input vector.
        latent_dim: Size of the latent code.
        hidden_dim: Width of the hidden layer in encoder and decoder.
    """

    def __init__(self, input_dim, latent_dim=32, hidden_dim=128):
        super().__init__()

        # Encoder: shared hidden layer, then two heads (mean and log-variance)
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2_mu = nn.Linear(hidden_dim, latent_dim)
        self.fc2_logvar = nn.Linear(hidden_dim, latent_dim)

        # Decoder
        self.fc3 = nn.Linear(latent_dim, hidden_dim)
        self.fc4 = nn.Linear(hidden_dim, input_dim)

        # Input corruption for the "denoising" part; only forward() applies it
        self.input_dropout = nn.Dropout(p=0.3)

    def encode(self, x):
        """Return (mu, logvar) of the approximate posterior for input x."""
        hidden = torch.relu(self.fc1(x))
        mu = self.fc2_mu(hidden)
        logvar = self.fc2_logvar(hidden)
        return mu, logvar

    def reparameterize(self, mu, logvar):
        """Draw z = mu + eps * std with eps ~ N(0, I) (reparameterization trick)."""
        std = (0.5 * logvar).exp()
        noise = torch.randn_like(std)
        return noise * std + mu

    def decode(self, z, temperature=1.0):
        """Map latent z to per-tag probabilities; logits are divided by temperature before the sigmoid."""
        hidden = torch.relu(self.fc3(z))
        scaled_logits = self.fc4(hidden) / temperature
        return scaled_logits.sigmoid()

    def forward(self, x, temperature=1.0):
        """Corrupt the input with dropout, encode, sample, decode.

        Returns:
            (recon, mu, logvar): reconstructed probabilities and posterior parameters.
        """
        # Dropping inputs during training forces the model to learn tag correlations
        mu, logvar = self.encode(self.input_dropout(x))
        latent = self.reparameterize(mu, logvar)
        return self.decode(latent, temperature=temperature), mu, logvar

In [8]:
# Hyperparameters
# Model size: the input width follows the taxonomy, latent/hidden widths feed MultiLabelVAE.
input_dim = TOTAL_INPUT_DIM
latent_dim = 32
hidden_dim = 128
# Optimisation settings used by the DataLoader, the training loop and Adam.
batch_size = 64
num_epochs = 300
learning_rate = 1e-3

In [9]:
# Model, Optimizer, Loss Function
# Build the network and move it to the selected device
model = MultiLabelVAE(input_dim, latent_dim, hidden_dim)
model = model.to(device)

# Adam over every trainable parameter
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Reconstruction term: binary cross-entropy summed (not averaged) over the batch
bce_loss_fn = nn.BCELoss(reduction='sum')

In [10]:
def vae_loss(recon_x, x, mu, logvar):
    """Summed VAE objective: reconstruction BCE plus KL divergence to N(0, I).

    Args:
        recon_x: Reconstructed probabilities.
        x: Target multi-hot vectors.
        mu, logvar: Mean and log-variance of the approximate posterior.

    Returns:
        Scalar tensor, BCE + KLD, both summed over the batch.
    """
    reconstruction = bce_loss_fn(recon_x, x)
    # Closed-form KL( N(mu, sigma^2) || N(0, 1) ), summed over batch and latent dims
    kl_terms = 1 + logvar - mu.pow(2) - logvar.exp()
    divergence = -0.5 * kl_terms.sum()
    return reconstruction + divergence

In [11]:
# Prepare DataLoader
# Wrap the multi-hot matrix in a single-tensor dataset; the training loop reads batch[0].
# NOTE(review): `dataset` is rebound to the Hugging Face dataset in the final cell.
dataset = torch.utils.data.TensorDataset(torch.tensor(data))
dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [12]:
# Train the VAE. The logged loss is the summed BCE + KLD divided by the number
# of training records, i.e. the average loss per sample.
model.train()
for epoch in tqdm(range(num_epochs), desc="Training VAE"):
    total_loss = 0.0
    for batch in dataloader:
        inputs = batch[0].to(device)
        optimizer.zero_grad()
        recon_batch, mu, logvar = model(inputs)
        loss = vae_loss(recon_batch, inputs, mu, logvar)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(dataloader.dataset)
    if (epoch + 1) % 10 == 0:
        # tqdm.write prints without breaking the progress bar into fragments,
        # which a bare print() inside the loop does.
        tqdm.write(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

# Save the trained model; create the target directory first because torch.save
# does not create missing parent directories.
os.makedirs("../models", exist_ok=True)
torch.save(model.state_dict(), "../models/multilabel_vae.pth")

Training VAE:   4%|▎         | 11/300 [00:01<00:43,  6.69it/s]

Epoch [10/300], Loss: 21.9355


Training VAE:   7%|▋         | 21/300 [00:03<00:40,  6.91it/s]

Epoch [20/300], Loss: 21.2515


Training VAE:  10%|█         | 31/300 [00:04<00:39,  6.79it/s]

Epoch [30/300], Loss: 20.4310


Training VAE:  14%|█▎        | 41/300 [00:06<00:37,  6.95it/s]

Epoch [40/300], Loss: 19.9257


Training VAE:  17%|█▋        | 51/300 [00:07<00:35,  6.96it/s]

Epoch [50/300], Loss: 19.5690


Training VAE:  20%|██        | 61/300 [00:09<00:33,  7.05it/s]

Epoch [60/300], Loss: 19.1366


Training VAE:  24%|██▎       | 71/300 [00:10<00:32,  6.94it/s]

Epoch [70/300], Loss: 18.7781


Training VAE:  27%|██▋       | 81/300 [00:12<00:32,  6.78it/s]

Epoch [80/300], Loss: 18.5538


Training VAE:  30%|███       | 91/300 [00:13<00:30,  6.85it/s]

Epoch [90/300], Loss: 18.4329


Training VAE:  34%|███▎      | 101/300 [00:15<00:29,  6.77it/s]

Epoch [100/300], Loss: 18.3067


Training VAE:  37%|███▋      | 111/300 [00:16<00:27,  6.89it/s]

Epoch [110/300], Loss: 18.2259


Training VAE:  40%|████      | 121/300 [00:17<00:26,  6.81it/s]

Epoch [120/300], Loss: 18.1251


Training VAE:  44%|████▎     | 131/300 [00:19<00:24,  6.82it/s]

Epoch [130/300], Loss: 18.0294


Training VAE:  47%|████▋     | 141/300 [00:20<00:23,  6.89it/s]

Epoch [140/300], Loss: 18.0016


Training VAE:  50%|█████     | 151/300 [00:22<00:21,  6.88it/s]

Epoch [150/300], Loss: 17.9815


Training VAE:  54%|█████▎    | 161/300 [00:23<00:21,  6.34it/s]

Epoch [160/300], Loss: 17.9792


Training VAE:  57%|█████▋    | 171/300 [00:25<00:18,  6.85it/s]

Epoch [170/300], Loss: 17.8423


Training VAE:  60%|██████    | 181/300 [00:26<00:17,  6.88it/s]

Epoch [180/300], Loss: 17.7778


Training VAE:  64%|██████▎   | 191/300 [00:28<00:16,  6.79it/s]

Epoch [190/300], Loss: 17.8484


Training VAE:  67%|██████▋   | 201/300 [00:29<00:14,  6.73it/s]

Epoch [200/300], Loss: 17.7574


Training VAE:  70%|███████   | 211/300 [00:31<00:12,  7.08it/s]

Epoch [210/300], Loss: 17.7585


Training VAE:  74%|███████▎  | 221/300 [00:32<00:11,  6.68it/s]

Epoch [220/300], Loss: 17.7318


Training VAE:  77%|███████▋  | 231/300 [00:34<00:11,  6.10it/s]

Epoch [230/300], Loss: 17.6845


Training VAE:  80%|████████  | 241/300 [00:35<00:08,  6.90it/s]

Epoch [240/300], Loss: 17.6235


Training VAE:  84%|████████▎ | 251/300 [00:37<00:07,  6.74it/s]

Epoch [250/300], Loss: 17.6622


Training VAE:  87%|████████▋ | 261/300 [00:38<00:05,  6.56it/s]

Epoch [260/300], Loss: 17.6389


Training VAE:  90%|█████████ | 271/300 [00:40<00:04,  6.90it/s]

Epoch [270/300], Loss: 17.6376


Training VAE:  94%|█████████▎| 281/300 [00:41<00:02,  6.92it/s]

Epoch [280/300], Loss: 17.6013


Training VAE:  97%|█████████▋| 291/300 [00:43<00:01,  6.65it/s]

Epoch [290/300], Loss: 17.5909


Training VAE: 100%|██████████| 300/300 [00:44<00:00,  6.74it/s]

Epoch [300/300], Loss: 17.5829





## Generate tags

In [5]:
# MTG-Jamendo tags: the list-valued columns are stored as string reprs in the CSV,
# so each one is parsed back into a Python list.
df = pd.read_csv("../data/mtg_jamendo/autotagging_top50tags_processed_cleaned.csv")
for list_column in ('aspect_list', 'instrument_tags', 'genre_tags', 'mood_tags'):
    df[list_column] = df[list_column].apply(ast.literal_eval)
df

Unnamed: 0,id,tags,genre_tags,mood_tags,instrument_tags,aspect_list
0,track_0007391,"['genre---electronic', 'genre---pop', 'instrum...","[electronic, pop]",[emotional],"[bass, drums, guitar, keyboard]","[drums, bass, guitar, electronic, emotional, p..."
1,track_0015161,"['genre---instrumentalpop', 'genre---pop', 'ge...","[pop, rock]",[emotional],"[bass, drums]","[drums, bass, rock, emotional, pop]"
2,track_0015166,"['genre---dance', 'genre---electronic', 'genre...","[dance, electronic, pop, techno]",[emotional],[bass],"[bass, electronic, dance, techno, emotional, pop]"
3,track_0015167,"['genre---chillout', 'genre---easylistening', ...","[electronic, pop]",[emotional],"[bass, violin]","[bass, electronic, emotional, pop, violin]"
4,track_0015169,"['genre---electronic', 'genre---instrumentalpo...","[electronic, pop]",[emotional],"[bass, drums]","[drums, bass, electronic, emotional, pop]"
...,...,...,...,...,...,...
2036,track_1420702,"['genre---dance', 'genre---easylistening', 'ge...",[dance],"[funk, happy]","[bass, drums, keyboard]","[drums, bass, dance, funk, keyboard, happy]"
2037,track_1420704,"['genre---dance', 'genre---easylistening', 'in...",[dance],[happy],"[bass, drums, keyboard]","[drums, bass, dance, keyboard, happy]"
2038,track_1420705,"['genre---dance', 'genre---easylistening', 'in...",[dance],[happy],"[bass, drums, keyboard]","[drums, bass, dance, keyboard, happy]"
2039,track_1420706,"['genre---dance', 'genre---easylistening', 'in...",[dance],[happy],"[bass, drums, keyboard]","[drums, bass, dance, keyboard, happy]"


In [14]:
# Rebuild the architecture with the same sizes used for training, then restore the saved weights.
model = MultiLabelVAE(input_dim, latent_dim, hidden_dim).to(device)
# NOTE(review): torch.load unpickles the file; only load checkpoints you produced yourself.
model.load_state_dict(torch.load("../models/multilabel_vae.pth", map_location=device))
# Switch to eval mode for generation (the trailing expression also displays the module summary).
model.eval()

MultiLabelVAE(
  (fc1): Linear(in_features=400, out_features=128, bias=True)
  (fc2_mu): Linear(in_features=128, out_features=32, bias=True)
  (fc2_logvar): Linear(in_features=128, out_features=32, bias=True)
  (fc3): Linear(in_features=32, out_features=128, bias=True)
  (fc4): Linear(in_features=128, out_features=400, bias=True)
  (input_dropout): Dropout(p=0.3, inplace=False)
)

In [39]:
def generate_tags(model, seed_tags, requests, temperature=1.0):
    """
    Suggest additional tags per category, conditioned on a set of seed tags.

    Args:
        model: Trained VAE exposing encode / reparameterize / decode.
        seed_tags: List of tags we ALREADY have (e.g. ['rock', 'guitar']).
            Tags missing from the taxonomy are reported and ignored.
        requests: Dict of how many tags we want per category
            (e.g. {'instrument': 2, 'mood': 1}). Counts <= 0 yield an empty list.
        temperature: Forwarded to model.decode.

    Returns:
        Dict mapping f"generated_{category}_tags" to a list of at most `count`
        tag names, best first, never containing a seed tag. A list can be
        shorter than requested when the category runs out of non-seed tags.

    NOTE(review): decode() applies sigmoid(logits / temperature), which is
    monotonic for temperature > 0, so for a given latent sample the top-k
    ranking below does not depend on `temperature`; variation between calls
    comes from the latent sampling in reparameterize(). Confirm whether
    temperature-dependent sampling was intended.
    """
    model.eval()
    device = next(model.parameters()).device
    seed_set = set(seed_tags)

    # 1. Build the input vector from the seeds
    input_vec = torch.zeros(1, TOTAL_INPUT_DIM, device=device)
    for tag in seed_tags:
        if tag in tag_to_idx:
            input_vec[0, tag_to_idx[tag]] = 1.0
        else:
            print(f"Warning: Seed tag '{tag}' not in taxonomy.")

    results = {}
    with torch.no_grad():
        # 2. Encode to get the latent code. encode() is called directly, so the
        #    input dropout used in forward() is not applied to the seeds.
        mu, logvar = model.encode(input_vec)
        z = model.reparameterize(mu, logvar)

        # 3. Decode to get probabilities for every tag, shape [TOTAL_INPUT_DIM]
        probs = model.decode(z, temperature=temperature)[0]

        # 4. Extract the top tags for each requested category
        for category, count in requests.items():
            count = int(count)  # counts may arrive as numpy integers
            result_key = f"generated_{category}_tags"
            if count <= 0:
                results[result_key] = []
                continue

            start, end = cat_ranges[category]
            cat_probs = probs[start:end]

            # Ask for enough candidates to survive filtering out every seed,
            # but never more than the category holds (topk raises otherwise).
            k = min(end - start, count + len(seed_set))
            _, top_k_indices = torch.topk(cat_probs, k=k)

            # Convert slice-local indices back to global indices, then to tag names
            found_tags = []
            for local_idx in top_k_indices.tolist():
                tag_name = idx_to_tag[start + local_idx][1]
                # Don't return tags we already provided as seeds
                if tag_name in seed_set:
                    continue
                found_tags.append(tag_name)
                if len(found_tags) == count:
                    break

            results[result_key] = found_tags

    return results

In [None]:
def generate_df(idx: int, tags_per_category: dict[str, int], temperature=1.0):
    """
    Build one generated record for the track at position `idx` of the global `df`.

    One existing tag per category is drawn at random as a seed; the global
    `model` then proposes the remaining tags through generate_tags().

    Args:
        idx: Positional row index into `df`.
        tags_per_category: Requested number of tags per category. The dict is
            copied, so the caller's object is left untouched.
        temperature: Forwarded to generate_tags().

    Returns:
        Single-row DataFrame with the track id, the original aspect list, the
        combined (seed + generated) aspect list and one generated_<category>_tags
        column per entry returned by generate_tags().
    """
    row = df.iloc[idx]
    # Work on a copy: decrementing the caller's dict would leak state between calls.
    remaining = dict(tags_per_category)

    seed_tags = []
    for category in ['genre', 'instrument', 'mood', 'tempo']:
        # Not every source frame carries every <category>_tags column; treat a
        # missing column as "no tags available" instead of raising KeyError.
        available = row.get(f"{category}_tags", [])
        # NOTE(review): `> 1` means a category with exactly one tag never
        # contributes a seed; confirm this is intended rather than `> 0`.
        if len(available) > 1:
            # str() turns numpy's np.str_ into a plain str so the list serialises cleanly.
            seed_tags.append(str(np.random.choice(available)))
            # The seed fills one of the requested slots for this category.
            remaining[category] = remaining.get(category, 1) - 1

    generated_tags = generate_tags(model, seed_tags, remaining, temperature=temperature)
    _generated_tags = []
    for gtags in generated_tags.values():
        _generated_tags.extend(gtags)

    res_entry = {
        'id': row['id'],
        'original_aspect_list': row['aspect_list'],
        'aspect_list': seed_tags + _generated_tags,
        **generated_tags
    }
    return pd.DataFrame([res_entry])

In [6]:
# NOTE(review): this rebinds CATEGORIES, which was first set from the taxonomy keys.
# Re-running the index-building cell after this one would iterate this list instead.
# The order here must match the row order of the correlation matrix in
# generate_synthetic_correlated_data (tempo, genre, mood, instrument).
CATEGORIES = [
    "tempo",
    "genre",
    "mood",
    "instrument"
]
N_CATEGORIES = len(CATEGORIES)
# One synthetic count vector is drawn per row of the currently loaded `df`.
N_SAMPLES_TO_GENERATE = len(df)

# --- 1. SYNTHETIC DATA GENERATION (Replace with your actual data) ---
# We simulate a dataset where tag counts are discrete and correlated.
# Max counts are defined for simulation purposes.
# Upper bound on the number of tags drawn per category.
MAX_COUNTS = {
    "tempo": 4,
    "genre": 4, 
    "mood": 5, 
    "instrument": 6, 
}
# Target mean tag count per category (feeds the log-normal-shaped marginal).
MEANS = {
    "tempo": 1.24,
    "genre": 1.46, 
    "mood": 1.71, 
    "instrument": 2.48,
}
# Target variance of the tag count per category.
VARIANCES = {
    "tempo": 0.6,
    "genre": 0.87,
    "mood": 1.09,
    "instrument": 1.37,
}

In [9]:
def generate_synthetic_correlated_data(n_records):
    """
    Draw correlated, discrete per-category tag counts with a Gaussian copula.

    Correlated standard-normal samples are mapped to uniform quantiles and then
    pushed through the inverse CDF of a discretised log-normal-shaped
    distribution on 1..MAX_COUNTS[cat], parameterised by MEANS and VARIANCES.

    Args:
        n_records: Number of count vectors to draw.

    Returns:
        (data, correlation_matrix): `data` is an int array of shape
        (n_records, N_CATEGORIES) with columns ordered like CATEGORIES;
        `correlation_matrix` is the latent Gaussian correlation matrix used.
    """
    print("--- 1. Generating Synthetic Data ---")

    # Latent correlation matrix; rows/columns follow CATEGORIES order.
    # NOTE(review): the original literal was not symmetric (genre->mood 0.3 but
    # mood->genre 0.12), which is not a valid correlation matrix. The
    # upper-triangle value 0.3 is used for both entries here; confirm it
    # against the measured statistic.
    correlation_matrix = np.array([
        [1.0, 0.2, 0.1, 0.03],    # Tempo
        [0.2, 1.0, 0.3, -0.01],   # Genre
        [0.1, 0.3, 1.0, -0.05],   # Mood
        [0.03, -0.01, -0.05, 1.0]  # Instrument
    ])

    # Generate correlated continuous data (Multivariate Normal).
    # rvs() squeezes the sample axis for a single draw, so restore the 2-D shape.
    mean = np.zeros(N_CATEGORIES)
    z_continuous = np.reshape(
        multivariate_normal.rvs(mean=mean, cov=correlation_matrix, size=n_records),
        (n_records, N_CATEGORIES),
    )

    data = np.zeros((n_records, N_CATEGORIES), dtype=int)

    # Transform continuous data into discrete counts based on desired marginals
    for i, cat in enumerate(CATEGORIES):
        max_c = MAX_COUNTS[cat]
        # Log-normal parameters matching the requested mean and variance
        mu = np.log(MEANS[cat]**2 / np.sqrt(MEANS[cat]**2 + VARIANCES[cat]))
        sigma = np.sqrt(np.log(1 + VARIANCES[cat] / MEANS[cat]**2))
        # Evaluate the log-normal density on 1..max_c and renormalise to a pmf
        x = np.arange(1, max_c + 1)
        p = (1 / (x * sigma * np.sqrt(2 * np.pi)))
        p *= np.exp(- (np.log(x) - mu)**2 / (2 * sigma**2))
        p /= p.sum()  # Normalize to sum to 1

        # Standard-normal samples -> uniform quantiles
        uniform_quantiles = norm.cdf(z_continuous[:, i])

        # Inverse-CDF lookup: the bin edges are the cumulative pmf without the last point
        counts = np.digitize(uniform_quantiles, np.cumsum(p[:-1])) + 1
        data[:, i] = np.clip(counts, 1, max_c)

    print(f"Synthetic Data Shape: {data.shape}")
    if n_records > 1:
        # A correlation estimate needs at least two records.
        print(f"Calculated Correlation of Synthetic Data:\n{np.corrcoef(data.T).round(2)}")
    return data, correlation_matrix

In [10]:
# Draw one tag-count vector per track; the returned correlation matrix is not needed here.
# NOTE(review): this rebinds `data`, which held the multi-hot training matrix in the VAE section.
data, _ = generate_synthetic_correlated_data(N_SAMPLES_TO_GENERATE)
print(data)

--- 1. Generating Synthetic Data ---
Synthetic Data Shape: (2041, 4)
Calculated Correlation of Synthetic Data:
[[ 1.    0.14  0.09 -0.01]
 [ 0.14  1.    0.21  0.  ]
 [ 0.09  0.21  1.   -0.06]
 [-0.01  0.   -0.06  1.  ]]
[[2 2 4 6]
 [2 1 2 2]
 [1 1 2 2]
 ...
 [2 1 1 2]
 [1 1 1 1]
 [1 2 2 3]]


In [None]:
temperatures = [1.0, 1.5, 2, 2.5, 3]

# Collect the single-row frames and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop copies all previous rows each time.
generated_frames = []

for temp in tqdm(temperatures):
    for idx in tqdm(range(N_SAMPLES_TO_GENERATE), leave=False):
        # Columns of `data` follow CATEGORIES order: tempo, genre, mood, instrument.
        # int() converts numpy integers to plain Python ints.
        num_tags_for_category = {
            "tempo": int(data[idx, 0]),
            "genre": int(data[idx, 1]),
            "mood": int(data[idx, 2]),
            "instrument": int(data[idx, 3]),
        }
        temp_df = generate_df(idx, tags_per_category=num_tags_for_category, temperature=temp)
        temp_df['temperature'] = temp
        generated_frames.append(temp_df)

# pd.concat raises on an empty list, so fall back to an empty frame.
res_df = pd.concat(generated_frames, ignore_index=True) if generated_frames else pd.DataFrame()
res_df

100%|██████████| 5/5 [00:15<00:00,  3.10s/it]


Unnamed: 0,id,original_aspect_list,aspect_list,generated_tempo_tags,generated_genre_tags,generated_mood_tags,generated_instrument_tags,temperature
0,track_0007391,"[drums, bass, guitar, electronic, emotional, p...","[electronic, guitar, emotional, slow tempo, ca...",[slow tempo],[],"[calming, passionate]",[flat male vocal],0.5
1,track_0015161,"[drums, bass, rock, emotional, pop]","[rock, bass, emotional, medium tempo, intense,...",[medium tempo],[],"[intense, epic, dramatic]","[electric guitar, orchestra]",0.5
2,track_0015166,"[bass, electronic, dance, techno, emotional, pop]","[pop, bass, emotional, medium tempo, movie mus...",[medium tempo],[movie music],[dramatic],"[electric guitar, no singer, electronic drums]",0.5
3,track_0015167,"[bass, electronic, emotional, pop, violin]","[pop, violin, emotional, medium tempo, happy]",[medium tempo],[],[happy],[],0.5
4,track_0015169,"[drums, bass, electronic, emotional, pop]","[electronic, bass, emotional, uptempo, acousti...","[uptempo, acoustic rhythm guitar chords]","[classical, jazz]",[passionate],[flat male vocal],0.5
...,...,...,...,...,...,...,...,...
10200,track_1420702,"[drums, bass, dance, funk, keyboard, happy]","[dance, drums, happy, upbeat, movie music, pla...",[upbeat],[movie music],[playful],"[percussion, no singer]",1.5
10201,track_1420704,"[drums, bass, dance, keyboard, happy]","[dance, bass, happy, upbeat, pop, fun, claps]",[upbeat],[pop],[fun],[claps],1.5
10202,track_1420705,"[drums, bass, dance, keyboard, happy]","[dance, drums, happy, fast tempo, country musi...",[fast tempo],"[country music, folk music]","[cheerful, playful, festive]",[],1.5
10203,track_1420706,"[drums, bass, dance, keyboard, happy]","[dance, drums, happy, medium tempo, country mu...",[medium tempo],[country music],[playful],[electric guitar],1.5


In [None]:
# Sort aspect list column and deduplicate tag combinations
res_df['aspect_list'] = res_df['aspect_list'].apply(lambda x: sorted(set(x)))
# Lists are unhashable, so drop_duplicates(subset=['aspect_list']) raises TypeError.
# Compare on an equivalent tuple key instead and keep the first occurrence.
is_repeated_combo = res_df['aspect_list'].apply(tuple).duplicated()
res_df = res_df.loc[~is_repeated_combo].reset_index(drop=True)
res_df

In [None]:
# Add surrogate key based on track_id and temperature
import hashlib
def generate_surrogate_key(track_id: str, temperature: float) -> str:
    """Return the MD5 hex digest of "<track_id>_<temperature>".

    The digest is used as an opaque identifier, not for security.
    """
    digest = hashlib.md5()
    digest.update(f"{track_id}_{temperature}".encode())
    return digest.hexdigest()

# Replace the track id with a hash of (track id, temperature), so the same track
# generated at different temperatures gets distinct ids.
res_df['surrogate_key'] = res_df.apply(
    lambda record: generate_surrogate_key(record['id'], record['temperature']),
    axis=1,
)
res_df = res_df.drop(columns=['id']).rename(columns={'surrogate_key': 'id'})
res_df

## Push to Hugging Face Hub

In [47]:
from sklearn.model_selection import train_test_split

# 90 % train; the remaining 10 % is halved into validation and test (5 % each).
df_train, df_holdout = train_test_split(res_df, test_size=0.1, random_state=42)
df_valid, df_test = train_test_split(df_holdout, test_size=0.5, random_state=42)

In [48]:
from pathlib import Path

# Create output directory
output_dir = Path("../data/vae_mtg_tags")
output_dir.mkdir(parents=True, exist_ok=True)

# One CSV per split, named after the split
for split_name, split_df in (("train", df_train), ("validation", df_valid), ("test", df_test)):
    split_df.to_csv(output_dir / f"{split_name}.csv", index=False)

# Plus a single file holding every split, in train/validation/test order
all_df = pd.concat([df_train, df_valid, df_test])
all_df.to_csv(output_dir / "all.csv", index=False)

In [None]:
# Map each split name to the CSV written for it, reload them as a dataset
# and upload it as a private repository.
data_files = {
    split_name: str(output_dir / f"{split_name}.csv")
    for split_name in ("train", "validation", "test")
}
dataset = load_dataset("csv", data_files=data_files)
dataset.push_to_hub("bsienkiewicz/mtg_vae_tags_dataset", private=True)

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

README.md:   0%|          | 0.00/813 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/bsienkiewicz/mtg_causal_tags_dataset/commit/c389f6cc328f38da916c0f4400f9b3467d8c25e9', commit_message='Upload dataset', commit_description='', oid='c389f6cc328f38da916c0f4400f9b3467d8c25e9', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/bsienkiewicz/mtg_causal_tags_dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='bsienkiewicz/mtg_causal_tags_dataset'), pr_revision=None, pr_num=None)