In [1]:
# Libraries needed for data preparation
import pandas as pd
import numpy as np

# Download the dataset and put it in subfolder called data
datapath = "train_only_dialogue_window_1.csv"
df = pd.read_csv(datapath)
df = df[["text", "label"]]

# Show the data
df.head()

Unnamed: 0,text,label
0,\nThe following is a fragment of a conversatio...,probing
1,\nThe following is a fragment of a conversatio...,generic
2,\nThe following is a fragment of a conversatio...,probing
3,\nThe following is a fragment of a conversatio...,probing
4,\nThe following is a fragment of a conversatio...,telling


In [2]:
print('Total number of news: {}'.format(len(df)))
print(40*'-')
print('Split by category:')
print(df["label"].value_counts())
print(40*'-')
nr_categories = len(df["label"].unique())
print("Number of categories: {n}".format(n=nr_categories))

Total number of news: 12646
----------------------------------------
Split by category:
label
focus      5334
probing    3005
telling    2428
generic    1879
Name: count, dtype: int64
----------------------------------------
Number of categories: 4


In [3]:
X = df['text']
y=np.unique(df['label'], return_inverse=True)[1]
print(y)

[2 1 2 ... 2 3 3]


In [4]:
import transformers
tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [5]:
import torch

X_list=X.to_list()
X_pt = tokenizer(X_list, padding='max_length', max_length = 512, truncation=True, return_tensors='pt')["input_ids"]

y_list=y.tolist()
y_pt = torch.Tensor(y_list).long()

In [6]:
datapath_test = "test_only_dialogue_window_1.csv"
df_test = pd.read_csv(datapath_test)
df_test = df_test[["text", "label"]]

X_test = df_test['text']
y_test=np.unique(df_test['label'], return_inverse=True)[1]

X_list_test=X_test.to_list()
X_pt_test = tokenizer(X_list_test, padding='max_length', max_length = 512, truncation=True, return_tensors='pt')["input_ids"]

y_list_test=y_test.tolist()
y_pt_test = torch.Tensor(y_list_test).long()

In [7]:
# Convert data to torch dataset

X_pt_train = X_pt
y_pt_train = y_pt
from torch.utils.data import Dataset, DataLoader
class BBCNewsDataset(Dataset):
    """Custom-built BBC News dataset"""

    def __init__(self, X, y):
        """
        Args:
            X, y as Torch tensors
        """
        self.X_train = X
        self.y_train = y
        

    def __len__(self):
        return len(self.y_train)

    def __getitem__(self, idx):
        return self.X_train[idx], self.y_train[idx]# Get train and test data in form of Dataset class
train_data_pt = BBCNewsDataset(X=X_pt_train, y=y_pt_train)
test_data_pt = BBCNewsDataset(X=X_pt_test, y=y_pt_test)

In [8]:
# Get train and test data in form of Dataloader class
train_loader_pt = DataLoader(train_data_pt, batch_size=50, shuffle=True)
test_loader_pt = DataLoader(test_data_pt, batch_size=50, shuffle=True)

In [9]:
config = transformers.DistilBertConfig(dropout=0.2, attention_dropout=0.2)
dbert_pt = transformers.DistilBertModel.from_pretrained('distilbert-base-uncased', config=config)

In [10]:
# Let's create a sample of size 5 from the training data
sample = X_pt_train[0:5]
print('Object type: ', type(dbert_pt(sample)))
print('Output format (shape): ',dbert_pt(sample)[0].shape)
print('Output used as input for the classifier (shape): ', dbert_pt(sample)[0][:,0,:].shape)

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Object type:  <class 'transformers.modeling_outputs.BaseModelOutput'>
Output format (shape):  torch.Size([5, 512, 768])
Output used as input for the classifier (shape):  torch.Size([5, 768])


In [11]:
from torch import nn
# Get cpu or gpu device for training.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

class DistilBertClassification(nn.Module):
    def __init__(self):
        super(DistilBertClassification, self).__init__()
        self.dbert = dbert_pt
        self.dropout = nn.Dropout(p=0.2)
        self.linear1 = nn.Linear(768,64)
        self.ReLu = nn.ReLU()
        self.linear2 = nn.Linear(64,5)

    def forward(self, x):
        x = self.dbert(input_ids=x)
        x = x["last_hidden_state"][:,0,:]
        x = self.dropout(x)
        x = self.linear1(x)
        x = self.ReLu(x)
        logits = self.linear2(x)
        # No need for a softmax, because it is already included in the CrossEntropyLoss
        return logits

model_pt = DistilBertClassification().to(device)

Using cuda device


In [12]:
for param in model_pt.dbert.parameters():
    param.requires_grad = False

In [13]:
total_params = sum(p.numel() for p in model_pt.parameters())
total_params_trainable = sum(p.numel() for p in model_pt.parameters() if p.requires_grad)
print("Number of parameters: ", total_params)
print("Number of trainable parameters: ", total_params_trainable)

Number of parameters:  66412421
Number of trainable parameters:  49541


In [14]:
epochs = 5
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_pt.parameters())

from tqdm import tqdm
# Define the dictionary "history" that will collect key performance indicators during training
history = {}
history["epoch"]=[]
history["train_loss"]=[]
history["valid_loss"]=[]
history["train_accuracy"]=[]
history["valid_accuracy"]=[]

from datetime import datetime
# Measure time for training
start_time = datetime.now()

# Loop on epochs
for e in range(epochs):
    
    # Set mode in train mode
    model_pt.train()
    
    train_loss = 0.0
    train_accuracy = []
    
    # Loop on batches
    for X, y in tqdm(train_loader_pt):
        # Get prediction & loss
        prediction = model_pt(X.to(device))
        loss = criterion(prediction, y.to(device))
        
        # Adjust the parameters of the model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
        
        prediction_index = prediction.argmax(axis=1)
        accuracy = (prediction_index==y.to(device))
        train_accuracy += accuracy
    
    train_accuracy = (sum(train_accuracy) / len(train_accuracy)).item()
    
    # Calculate the loss on the test data after each epoch
    # Set mode to evaluation (by opposition to training)
    model_pt.eval()
    valid_loss = 0.0
    valid_accuracy = []
    for X, y in tqdm(test_loader_pt):
        
        prediction = model_pt(X.to(device))
        loss = criterion(prediction, y.to(device))

        valid_loss += loss.item()
        
        prediction_index = prediction.argmax(axis=1)
        accuracy = (prediction_index==y.to(device))
        valid_accuracy += accuracy
    valid_accuracy = (sum(valid_accuracy) / len(valid_accuracy)).item()
    
    # Populate history
    history["epoch"].append(e+1)
    history["train_loss"].append(train_loss / len(train_loader_pt))
    history["valid_loss"].append(valid_loss / len(test_loader_pt))
    history["train_accuracy"].append(train_accuracy)
    history["valid_accuracy"].append(valid_accuracy)    
        
    print(f'Epoch {e+1} \t\t Training Loss: {train_loss / len(train_loader_pt) :10.3f} \t\t Validation Loss: {valid_loss / len(test_loader_pt) :10.3f}')
    print(f'\t\t Training Accuracy: {train_accuracy :10.3%} \t\t Validation Accuracy: {valid_accuracy :10.3%}')
    
# Measure time for training
end_time = datetime.now()
training_time_pt = (end_time - start_time).total_seconds()

  0%|          | 0/253 [00:04<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 76.00 MiB. GPU 0 has a total capacty of 23.62 GiB of which 86.19 MiB is free. Process 2627555 has 22.36 GiB memory in use. Including non-PyTorch memory, this process has 774.00 MiB memory in use. Of the allocated memory 490.85 MiB is allocated by PyTorch, and 85.15 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
from matplotlib import pyplot as plt

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
ax[0].set(title='Loss')
ax[0].plot(history['train_loss'], label='Training')
ax[0].plot(history['valid_loss'], label='Validation')
ax[0].legend(loc="upper right")

ax[1].set(title='Accuracy')
ax[1].plot(history['train_accuracy'], label='Training')
ax[1].plot(history['valid_accuracy'], label='Validation')
ax[1].legend(loc="lower right")

In [None]:
accuracy_pt = history['valid_accuracy'][-1]
print('Accuracy Training data: {:.1%}'.format(history['train_accuracy'][-1]))
print('Accuracy Test data: {:.1%}'.format(history['valid_accuracy'][-1]))
print('Training time: {:.1f}s (or {:.1f} minutes)'.format(training_time_pt, training_time_pt/60))

In [None]:
# Save only the parameters of the model but not the model itself, and get it back
torch.save(model_pt.state_dict(), 'PyModel_window_1.sd')
model_reloaded = DistilBertClassification()
model_reloaded.load_state_dict(torch.load('PyModel_window_1.sd'))
model_reloaded.eval()

In [None]:
# Save the entire model, and get it back
torch.save(model_pt, 'PyModelComplete_window_1.pt')
model_reloaded2 = torch.load('PyModelComplete_window_1.pt')
model_reloaded2.eval()

In [None]:
from sklearn.metrics import classification_report

prediction = model_pt(X_pt_test[:5].to(device)).argmax(axis=1)
report = classification_report(y_pt_test[:5], prediction.detach().cpu().numpy())

In [None]:
print(report)

In [1]:
from datasets import load_dataset
dataset = load_dataset("csv", data_files={"train": "./data/train_window_1.csv", "test": "./data/test_window_1.csv"})
dataset

DatasetDict({
    train: Dataset({
        features: ['index', 'text', 'label'],
        num_rows: 12646
    })
    test: Dataset({
        features: ['index', 'text', 'label'],
        num_rows: 3100
    })
})

In [2]:
import argparse
import os

import evaluate
import numpy as np
import torch
from transformers import (RobertaForMultipleChoice, RobertaTokenizer, Trainer,
                          TrainingArguments, XLMRobertaForMultipleChoice,
                          XLMRobertaTokenizer, RobertaForSequenceClassification)

In [3]:
model_name = "roberta-base"
logging_dir = "./logs/roberta_base/window_1"

print(f"Training {model_name} on {dataset}")

metric = evaluate.load("accuracy")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

if model_name == "roberta-base":
    tokenizer = RobertaTokenizer.from_pretrained(model_name)
elif model_name == "xlm-roberta-base":
    tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
else:
    tokenizer = RobertaTokenizer.from_pretrained(model_name)
    print("Using the default roberta tokenizer, be careful")

Training roberta-base on DatasetDict({
    train: Dataset({
        features: ['index', 'text', 'label'],
        num_rows: 12646
    })
    test: Dataset({
        features: ['index', 'text', 'label'],
        num_rows: 3100
    })
})


In [6]:
def preprocess_function(examples):
    texts, labels = examples["text"], examples["label"]

    # Tokenize premises and choices
    # Note that we provide both choices together as multiple_choices_inputs
    multiple_choices_inputs = []
    for text in texts:
        multiple_choices_inputs.append(tokenizer.encode_plus( \
            text, max_length=512, padding='max_length', \
            truncation=True))

    # RoBERTa expects a list of all first choices and a list of all second 
    # choices, hence we restructure the inputs
    input_ids = [x['input_ids'] for x in multiple_choices_inputs]
    attention_masks = [x['attention_mask'] for x in multiple_choices_inputs]

    labels = np.unique(labels, return_inverse=True)[1]
    print(type(torch.tensor(attention_masks).view(-1, 512)), torch.tensor(attention_masks).view(-1, 512).dtype)

    # Restructure inputs to match the expected format for RobertaForMultipleChoice
    features = {
        'input_ids': torch.tensor(input_ids).view(-1, 512),
        'attention_mask': torch.tensor(attention_masks).view(-1, 512),
        'labels': torch.tensor(labels)
    }
    return features


In [7]:
# Map the preprocessing function over the dataset
tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/12646 [00:00<?, ? examples/s]

<class 'torch.Tensor'> torch.int64
<class 'torch.Tensor'> torch.int64
<class 'torch.Tensor'> torch.int64
<class 'torch.Tensor'> torch.int64
<class 'torch.Tensor'> torch.int64
<class 'torch.Tensor'> torch.int64
<class 'torch.Tensor'> torch.int64
<class 'torch.Tensor'> torch.int64
<class 'torch.Tensor'> torch.int64
<class 'torch.Tensor'> torch.int64
<class 'torch.Tensor'> torch.int64
<class 'torch.Tensor'> torch.int64
<class 'torch.Tensor'> torch.int64


Map:   0%|          | 0/3100 [00:00<?, ? examples/s]

<class 'torch.Tensor'> torch.int64
<class 'torch.Tensor'> torch.int64
<class 'torch.Tensor'> torch.int64
<class 'torch.Tensor'> torch.int64


In [9]:
tokenized_datasets = tokenized_datasets.remove_columns(["index", "text", "label"])

In [10]:
type(tokenized_datasets["test"]["input_ids"])

list

In [11]:
if model_name == "roberta-base":
    model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=4).to(device)
elif model_name == "xlm-roberta-base":
    model = XLMRobertaForMultipleChoice.from_pretrained(model_name).to(device)
else:
    model = RobertaForMultipleChoice.from_pretrained(model_name).to(device)
    print("Using the default roberta, be careful")

output_dir = f"{logging_dir}/{model_name}"

# Define the training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=2,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    warmup_ratio=0.07, 
    weight_decay=0.01,
    learning_rate=1e-6, 
    logging_dir='./logs',
    logging_steps=100,
    save_steps=100,
    evaluation_strategy="steps",
    eval_steps=100,
    report_to = "wandb"
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    compute_metrics=compute_metrics,
)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
tokenized_datasets['train']

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 12646
})

In [13]:
# Train the model
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mdaha_kot[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/63220 [00:00<?, ?it/s]

{'loss': 1.4076, 'learning_rate': 2.2593764121102575e-08, 'epoch': 0.03}


  0%|          | 0/388 [00:00<?, ?it/s]

{'eval_loss': 1.4059182405471802, 'eval_accuracy': 0.1406451612903226, 'eval_runtime': 17.1024, 'eval_samples_per_second': 181.262, 'eval_steps_per_second': 22.687, 'epoch': 0.03}
{'loss': 1.4051, 'learning_rate': 4.518752824220515e-08, 'epoch': 0.06}


  0%|          | 0/388 [00:00<?, ?it/s]

{'eval_loss': 1.4051610231399536, 'eval_accuracy': 0.1406451612903226, 'eval_runtime': 16.8076, 'eval_samples_per_second': 184.44, 'eval_steps_per_second': 23.085, 'epoch': 0.06}
{'loss': 1.4108, 'learning_rate': 6.778129236330772e-08, 'epoch': 0.09}


  0%|          | 0/388 [00:00<?, ?it/s]

{'eval_loss': 1.403479814529419, 'eval_accuracy': 0.1406451612903226, 'eval_runtime': 16.7911, 'eval_samples_per_second': 184.621, 'eval_steps_per_second': 23.107, 'epoch': 0.09}
{'loss': 1.4039, 'learning_rate': 9.03750564844103e-08, 'epoch': 0.13}


  0%|          | 0/388 [00:00<?, ?it/s]

{'eval_loss': 1.4012691974639893, 'eval_accuracy': 0.1406451612903226, 'eval_runtime': 16.9846, 'eval_samples_per_second': 182.518, 'eval_steps_per_second': 22.844, 'epoch': 0.13}
{'loss': 1.4024, 'learning_rate': 1.1296882060551286e-07, 'epoch': 0.16}


  0%|          | 0/388 [00:00<?, ?it/s]

{'eval_loss': 1.3992811441421509, 'eval_accuracy': 0.1406451612903226, 'eval_runtime': 16.8946, 'eval_samples_per_second': 183.491, 'eval_steps_per_second': 22.966, 'epoch': 0.16}
{'loss': 1.3905, 'learning_rate': 1.3556258472661544e-07, 'epoch': 0.19}


  0%|          | 0/388 [00:00<?, ?it/s]

{'eval_loss': 1.3963583707809448, 'eval_accuracy': 0.1406451612903226, 'eval_runtime': 16.8416, 'eval_samples_per_second': 184.068, 'eval_steps_per_second': 23.038, 'epoch': 0.19}
{'loss': 1.3906, 'learning_rate': 1.5815634884771803e-07, 'epoch': 0.22}


  0%|          | 0/388 [00:00<?, ?it/s]

{'eval_loss': 1.3933305740356445, 'eval_accuracy': 0.1406451612903226, 'eval_runtime': 16.8207, 'eval_samples_per_second': 184.297, 'eval_steps_per_second': 23.067, 'epoch': 0.22}
{'loss': 1.3834, 'learning_rate': 1.807501129688206e-07, 'epoch': 0.25}


  0%|          | 0/388 [00:00<?, ?it/s]

{'eval_loss': 1.3890458345413208, 'eval_accuracy': 0.1406451612903226, 'eval_runtime': 16.8255, 'eval_samples_per_second': 184.244, 'eval_steps_per_second': 23.06, 'epoch': 0.25}
{'loss': 1.3923, 'learning_rate': 2.033438770899232e-07, 'epoch': 0.28}


  0%|          | 0/388 [00:00<?, ?it/s]

{'eval_loss': 1.3852674961090088, 'eval_accuracy': 0.14193548387096774, 'eval_runtime': 16.7491, 'eval_samples_per_second': 185.085, 'eval_steps_per_second': 23.165, 'epoch': 0.28}
{'loss': 1.3842, 'learning_rate': 2.2593764121102573e-07, 'epoch': 0.32}


  0%|          | 0/388 [00:00<?, ?it/s]

{'eval_loss': 1.3810272216796875, 'eval_accuracy': 0.23806451612903226, 'eval_runtime': 16.7439, 'eval_samples_per_second': 185.142, 'eval_steps_per_second': 23.173, 'epoch': 0.32}
{'loss': 1.3729, 'learning_rate': 2.485314053321283e-07, 'epoch': 0.35}


  0%|          | 0/388 [00:00<?, ?it/s]

{'eval_loss': 1.3758118152618408, 'eval_accuracy': 0.3841935483870968, 'eval_runtime': 16.7436, 'eval_samples_per_second': 185.145, 'eval_steps_per_second': 23.173, 'epoch': 0.35}
{'loss': 1.3675, 'learning_rate': 2.711251694532309e-07, 'epoch': 0.38}


  0%|          | 0/388 [00:00<?, ?it/s]

{'eval_loss': 1.3687165975570679, 'eval_accuracy': 0.38451612903225807, 'eval_runtime': 16.813, 'eval_samples_per_second': 184.381, 'eval_steps_per_second': 23.077, 'epoch': 0.38}
{'loss': 1.3509, 'learning_rate': 2.9371893357433345e-07, 'epoch': 0.41}


  0%|          | 0/388 [00:00<?, ?it/s]

{'eval_loss': 1.3577444553375244, 'eval_accuracy': 0.38451612903225807, 'eval_runtime': 16.8747, 'eval_samples_per_second': 183.707, 'eval_steps_per_second': 22.993, 'epoch': 0.41}
{'loss': 1.338, 'learning_rate': 3.1631269769543607e-07, 'epoch': 0.44}


  0%|          | 0/388 [00:00<?, ?it/s]

{'eval_loss': 1.342713475227356, 'eval_accuracy': 0.38451612903225807, 'eval_runtime': 16.8894, 'eval_samples_per_second': 183.548, 'eval_steps_per_second': 22.973, 'epoch': 0.44}


In [None]:
tokenized_datasets['train']

Dataset({
    features: ['index', 'text', 'label', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 12646
})

In [None]:





# Path where the checkpoints are saved
checkpoints_path = output_dir
checkpoints = [os.path.join(checkpoints_path, name) \
                for name in os.listdir(checkpoints_path) \
                if name.startswith("checkpoint")]

# Placeholder for the best performance
best_performance = 0.0
best_checkpoint = None

for checkpoint in checkpoints:
    # Load the model from checkpoint
    if model_name == "roberta-base":
        model = RobertaForMultipleChoice.from_pretrained(checkpoint).to(device)
    elif model_name == "xlm-roberta-base":
        model = XLMRobertaForMultipleChoice.from_pretrained(checkpoint).to(device)
    else:
        model = RobertaForMultipleChoice.from_pretrained(checkpoint).to(device)
        print("Using the default roberta, be careful")

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=TrainingArguments(
            output_dir=output_dir,
            per_device_eval_batch_size=1,  # Adjust as necessary
        ),
        compute_metrics=compute_metrics,
    )

    # Evaluate the model
    eval_results = trainer.evaluate(tokenized_datasets['validation'])

    # Assuming 'accuracy' is your metric of interest
    print(eval_results)
    performance = eval_results["eval_accuracy"]

    # Update the best checkpoint if current model is better
    if performance > best_performance:
        best_performance = performance
        best_checkpoint = checkpoint

print(f"Best checkpoint: {best_checkpoint} with Eval Loss: {best_performance}")

if best_checkpoint:
    print(f"Best checkpoint: {best_checkpoint} with Eval Loss: {best_performance}")

    # Load the best model
    if model_name == "roberta-base":
        best_model = RobertaForMultipleChoice.from_pretrained(best_checkpoint).to(device)
    elif model_name == "xlm-roberta-base":
        best_model = XLMRobertaForMultipleChoice.from_pretrained(best_checkpoint).to(device)
    else:
        best_model = RobertaForMultipleChoice.from_pretrained(best_checkpoint).to(device)
        print("Using the default roberta, be careful")

    # Directly save the best model to the desired directory
    best_model.save_pretrained(f"{output_dir}/best_{best_checkpoint}")

    # If you want to save the tokenizer as well
    tokenizer.save_pretrained(f"{output_dir}/best_{best_checkpoint}")

    # Optional: Evaluate the best model again for confirmation, using the Trainer
    trainer = Trainer(
        model=best_model,
        args=TrainingArguments(
            output_dir=f'./{output_dir}/best',  # Ensure this matches where you're saving the model
            per_device_eval_batch_size=8,
        ),
        compute_metrics=compute_metrics,
    )

    eval_results = trainer.evaluate(tokenized_datasets['test'])
    print("Final Evaluation on Best Model:", eval_results)
else:
    print("No best checkpoint identified.")


Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mdaha_kot[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/63220 [00:00<?, ?it/s]

ValueError: too many dimensions 'str'