In [70]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import os
import sys

EXP_NAME = "downstream_tasks"
PROJECT_ROOT = "/home/jovyan/vmeshchaninov/LatentDiffusion"
EXP_RESULT_PATH = f"{PROJECT_ROOT}/experiments/results/{EXP_NAME}/"
os.makedirs(EXP_RESULT_PATH, exist_ok=True)
sys.path.append(PROJECT_ROOT)
os.environ["PROJECT_ROOT"] = PROJECT_ROOT

In [3]:
import torch
import hydra
from omegaconf import DictConfig, OmegaConf
from hydra.core.global_hydra import GlobalHydra

from diffusion_trainer import DiffusionTrainer

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Reset Hydra to avoid conflicts if already initialized
GlobalHydra.instance().clear()

# Initialize Hydra and load config manually
hydra.initialize(config_path="../conf", version_base=None)  # Set path to your configs

# Load the configuration
cfg = hydra.compose(config_name="config")  # Replace with your main config file



In [5]:
# Setup the config
cfg.ddp.enabled = False

num_latents = 128

cfg.encoder.latent.num_latents = num_latents
cfg.decoder.latent.num_latents = num_latents
cfg.encoder.embedding.max_position_embeddings = 512
cfg.decoder.embedding.max_position_embeddings = 512

cfg.project.name = 'latent-diffusion-article-large-v1.0'

cfg.diffusion.dynamic.d = 2

diff_prefix_folder = "/home/jovyan/vmeshchaninov/LatentDiffusion/checkpoints"
autoencoder_prefix_folder = "/home/jovyan/vmeshchaninov/LatentDiffusion/checkpoints"

diff_prefix_folder = "/home/jovyan/vmeshchaninov/LatentDiffusion/checkpoints"
diffusion_checkpoints = f"diffusion-openwebtext-512-{num_latents}-d=2-final/200000.pth"
autoencoder_checkpoints = f"autoencoder-num_latents={num_latents}-openwebtext-512-final-512/200000.pth"

cfg.diffusion.model.load_checkpoint = os.path.join(diff_prefix_folder, diffusion_checkpoints)
cfg.autoencoder.model.load_checkpoint = os.path.join(autoencoder_prefix_folder, autoencoder_checkpoints)
cfg.training = ""

diffusion_trainer = DiffusionTrainer(cfg)
diffusion_trainer.restore_checkpoint()

Checkpotint /home/jovyan/vmeshchaninov/LatentDiffusion/checkpoints/autoencoder-num_latents=128-openwebtext-512-final-512/200000.pth loaded
Score estimator parameters: 129801216
Checkpoint /home/jovyan/vmeshchaninov/LatentDiffusion/checkpoints/diffusion-openwebtext-512-128-d=2-final/200000.pth is restored


In [14]:
_ = diffusion_trainer.score_estimator.eval()

In [78]:
_ = diffusion_trainer.autoencoder.decoder.cuda()

In [20]:
def prepare_samples(dataset):
    all_samples = {
        "input_text": [],
        "gold": [],
        "context": [],
    }

    for sample in dataset:
        ctx = sample['ctx']
        endings = sample['endings']
        gold = sample['label']

        all_samples["context"].append(ctx)
        all_samples["input_text"].append([])
        all_samples["gold"].append(gold)

        for ending in endings:
            input_text = ctx + " " + ending
            all_samples["input_text"][-1].append(input_text)
    return all_samples

In [251]:
def prepare_samples(dataset):
    all_samples = {
        "input_text": [],
        "gold": [],
        "context": [],
    }

    for sample in dataset:
        ctx = sample['ctx']
        endings = sample['endings']
        gold = sample['label']

        all_samples["context"].append(ctx)
        all_samples["gold"].append(gold)

        input_text = f"Context: {ctx}. You need to choose the most appropriate ending. Your answer should be a number from 0 to 3. These are possible endings: "

        for i, ending in enumerate(endings):
            input_text = input_text + f"{i}: {ending};"
        
        input_text = input_text + "Right answer is: [MASK]. Choose answer from 0, 1, 2, 3."
        all_samples["input_text"].append(input_text)
    return all_samples

In [19]:
from typing import List
from utils import seed_everything

## mse loss

In [60]:
def make_choice(diffusion_trainer, input_text: List[List[str]], context: List[str]):
    flat_input_text = []
    end_indices = [0]

    for texts in input_text:
        flat_input_text.extend(texts)
        end_indices.append(len(flat_input_text))

    total_loss = torch.zeros(len(flat_input_text))


    with torch.no_grad(), torch.autocast(device_type='cuda', dtype=torch.bfloat16):
        tokenized_texts = diffusion_trainer.tokenizer(
            flat_input_text,
            add_special_tokens=diffusion_trainer.cfg.tokenizer.add_special_tokens,
            padding=diffusion_trainer.cfg.tokenizer.padding,
            truncation=diffusion_trainer.cfg.tokenizer.truncation,
            max_length=diffusion_trainer.cfg.dataset.max_sequence_len,
            return_tensors=diffusion_trainer.cfg.tokenizer.return_tensors,
            return_attention_mask=diffusion_trainer.cfg.tokenizer.return_attention_mask,
            return_token_type_ids=diffusion_trainer.cfg.tokenizer.return_token_type_ids,
        )
        tokenized_texts = tokenized_texts.to(diffusion_trainer.device)

        latent, _ = diffusion_trainer.autoencoder.get_latent(tokenized_texts, bert_output_masking=False)
        latent = diffusion_trainer.autoencoder.normalize_latent(latent)

        t = 0.5
        input_t = torch.ones(latent.shape[0], device=latent.device) * t

        seed_everything(42)
        for _ in range(3):
            x_t = diffusion_trainer.dynamic.marginal(latent, input_t)["x_t"]

            x_0_self_cond = torch.zeros_like(x_t)
            x_0 = diffusion_trainer.score_estimator(
                x_t=x_t,
                time_t=input_t,
                x_0_self_cond=latent.detach().clone()
            )

            loss_x_0 = torch.mean(torch.square(latent - x_0), dim=[1, 2]).cpu().detach()
            total_loss += loss_x_0
    # unflatten predictions
    probabilities = []
    predictions = []
    for i in range(len(end_indices) - 1):
        answers = -total_loss[end_indices[i]:end_indices[i+1]]
        probs = torch.softmax(answers, dim=0)
        predictions.append(probs.argmax())
        probabilities.append(probs.max())
    return predictions, probabilities


## similarity

In [26]:
def make_choice(diffusion_trainer, input_text: List[List[str]], context: List[str]):
    flat_input_text = []
    end_indices = [0]

    for texts in input_text:
        flat_input_text.extend(texts)
        end_indices.append(len(flat_input_text))

    flat_context = []
    for i, ctx in enumerate(context):
        for _ in range(end_indices[i], end_indices[i+1]):
            flat_context.append(ctx)
    
    with torch.no_grad(), torch.autocast(device_type='cuda', dtype=torch.bfloat16):
        tokenized_context = diffusion_trainer.tokenizer(
            flat_context,
            add_special_tokens=diffusion_trainer.cfg.tokenizer.add_special_tokens,
            padding=diffusion_trainer.cfg.tokenizer.padding,
            truncation=diffusion_trainer.cfg.tokenizer.truncation,
            max_length=diffusion_trainer.cfg.dataset.max_sequence_len,
            return_tensors=diffusion_trainer.cfg.tokenizer.return_tensors,
            return_attention_mask=diffusion_trainer.cfg.tokenizer.return_attention_mask,
            return_token_type_ids=diffusion_trainer.cfg.tokenizer.return_token_type_ids,
        )
        tokenized_context = tokenized_context.to(diffusion_trainer.device)

        latent_context, _ = diffusion_trainer.autoencoder.get_latent(tokenized_context, bert_output_masking=False)
        latent_context = diffusion_trainer.autoencoder.normalize_latent(latent_context)


    with torch.no_grad(), torch.autocast(device_type='cuda', dtype=torch.bfloat16):
        tokenized_texts = diffusion_trainer.tokenizer(
            flat_input_text,
            add_special_tokens=diffusion_trainer.cfg.tokenizer.add_special_tokens,
            padding=diffusion_trainer.cfg.tokenizer.padding,
            truncation=diffusion_trainer.cfg.tokenizer.truncation,
            max_length=diffusion_trainer.cfg.dataset.max_sequence_len,
            return_tensors=diffusion_trainer.cfg.tokenizer.return_tensors,
            return_attention_mask=diffusion_trainer.cfg.tokenizer.return_attention_mask,
            return_token_type_ids=diffusion_trainer.cfg.tokenizer.return_token_type_ids,
        )
        tokenized_texts = tokenized_texts.to(diffusion_trainer.device)

        latent, _ = diffusion_trainer.autoencoder.get_latent(tokenized_texts, bert_output_masking=False)
        latent = diffusion_trainer.autoencoder.normalize_latent(latent)



        # t = 0.5
        # input_t = torch.ones(latent.shape[0], device=latent.device) * t

        # seed_everything(42)
        # x_t = diffusion_trainer.dynamic.marginal(latent, input_t)["x_t"]

        # x_0_self_cond = torch.zeros_like(x_t)
        # x_0 = diffusion_trainer.score_estimator(
        #     x_t=x_t,
        #     time_t=input_t,
        #     x_0_self_cond=x_0_self_cond
        # )

    loss_x_0 = torch.mean(torch.square(latent - latent_context), dim=[1, 2]).cpu().detach()

    # unflatten predictions
    probabilities = []
    predictions = []
    for i in range(len(end_indices) - 1):
        answers = -loss_x_0[end_indices[i]:end_indices[i+1]]
        probs = torch.softmax(answers, dim=0)
        predictions.append(probs.argmax())
        probabilities.append(probs.max())
    return predictions, probabilities


## long context

In [231]:
def make_choice(diffusion_trainer, input_text: List[str], context: List[str]):
    tok_ids = [121, 122, 123, 124]

    predictions = []

    probabilities = []
    pred_texts = []
    with torch.no_grad(), torch.autocast(device_type='cuda', dtype=torch.bfloat16):
        tokenized_texts = diffusion_trainer.tokenizer(
            input_text,
            add_special_tokens=diffusion_trainer.cfg.tokenizer.add_special_tokens,
            padding=diffusion_trainer.cfg.tokenizer.padding,
            truncation=diffusion_trainer.cfg.tokenizer.truncation,
            max_length=diffusion_trainer.cfg.dataset.max_sequence_len,
            return_tensors=diffusion_trainer.cfg.tokenizer.return_tensors,
            return_attention_mask=diffusion_trainer.cfg.tokenizer.return_attention_mask,
            return_token_type_ids=diffusion_trainer.cfg.tokenizer.return_token_type_ids,
        )
        tokenized_texts = tokenized_texts.to(diffusion_trainer.device)
        
        mask_indices = []
        for i in range(len(tokenized_texts["input_ids"])):
            for j in range(len(tokenized_texts["input_ids"][i])):
                if tokenized_texts["input_ids"][i][j] == diffusion_trainer.tokenizer.mask_token_id:
                    mask_indices.append(j)
                    break

        latent, _ = diffusion_trainer.autoencoder.get_latent(tokenized_texts, bert_output_masking=False)
        latent = diffusion_trainer.autoencoder.normalize_latent(latent)

        t = 0.5
        input_t = torch.ones(latent.shape[0], device=latent.device) * t

        seed_everything(42)
        
        x_t = diffusion_trainer.dynamic.marginal(latent, input_t)["x_t"]

        x_0_self_cond = torch.zeros_like(x_t)
        x_0 = diffusion_trainer.score_estimator(
            x_t=x_t,
            time_t=input_t,
            x_0_self_cond=x_0_self_cond.detach().clone()
        )
        
        encoder_latent = diffusion_trainer.autoencoder.denormalize_latent(x_0)
        pred_logits = diffusion_trainer.autoencoder.decoder(encoder_latent)
        

        for b_ind in range(len(pred_logits)):
            answer_probs = torch.softmax(pred_logits[b_ind, mask_indices[b_ind]], dim=0)
            answer_probs = answer_probs[tok_ids]
            # print(answer_probs)
            answer = answer_probs.argmax()
            predictions.append(answer)

        text, _ = diffusion_trainer.sample_from_logits(pred_logits)
        pred_texts.extend(text)
        

    # unflatten predictions
    # probabilities = []
    # predictions = []
    # for i in range(len(end_indices) - 1):
    #     answers = -total_loss[end_indices[i]:end_indices[i+1]]
    #     probs = torch.softmax(answers, dim=0)
    #     predictions.append(probs.argmax())
    #     probabilities.append(probs.max())
    return predictions, pred_texts


In [252]:
from tqdm import tqdm

def check_model(model, samples):
    texts = []
    probabilities = []
    predictions = []
    pred_texts = []
    batch_size = 1000

    for i in tqdm(range(0, len(samples["input_text"]), batch_size)):
        input_texts = samples["input_text"][i:i+batch_size]
        context = samples["context"][i:i+batch_size]
        sample_predictions = make_choice(model, input_texts, context)
        # texts.extend(sample_predictions)
        predictions.extend(sample_predictions[0])
        pred_texts.extend(sample_predictions[1])

    targets = [int(g) for g in samples["gold"]]
    correct = 0
    for pred, target in zip(predictions, targets):
        if pred.item() == target:
            correct += 1

    accuracy = correct / len(predictions)
    print(f"Accuracy: {accuracy:.4f}")
    return predictions, targets, pred_texts

# Hellaswag

In [219]:
from datasets import load_dataset

dataset = load_dataset("Rowan/hellaswag", split='validation')

In [255]:
dataset_1000 = dataset.select(range(1000))

In [256]:
# Prepare samples for Hellaswag
hellaswag_samples = prepare_samples(dataset_1000)

# Get predictions from the model
hellaswag_output = check_model(diffusion_trainer, hellaswag_samples)


100%|██████████| 1/1 [00:08<00:00,  8.09s/it]

Accuracy: 0.2680





In [257]:
for p, g in zip(hellaswag_output[0], hellaswag_output[1]):
    print(f"Prediction: {p}, Gold: {g}")

Prediction: 3, Gold: 3
Prediction: 0, Gold: 3
Prediction: 0, Gold: 2
Prediction: 3, Gold: 2
Prediction: 0, Gold: 1
Prediction: 0, Gold: 1
Prediction: 2, Gold: 2
Prediction: 0, Gold: 0
Prediction: 2, Gold: 1
Prediction: 2, Gold: 1
Prediction: 2, Gold: 3
Prediction: 3, Gold: 3
Prediction: 0, Gold: 2
Prediction: 3, Gold: 2
Prediction: 0, Gold: 0
Prediction: 0, Gold: 3
Prediction: 1, Gold: 2
Prediction: 1, Gold: 0
Prediction: 0, Gold: 1
Prediction: 1, Gold: 1
Prediction: 0, Gold: 1
Prediction: 0, Gold: 0
Prediction: 3, Gold: 3
Prediction: 3, Gold: 3
Prediction: 3, Gold: 0
Prediction: 0, Gold: 3
Prediction: 0, Gold: 0
Prediction: 2, Gold: 3
Prediction: 0, Gold: 1
Prediction: 3, Gold: 3
Prediction: 2, Gold: 1
Prediction: 0, Gold: 0
Prediction: 2, Gold: 1
Prediction: 1, Gold: 2
Prediction: 0, Gold: 0
Prediction: 3, Gold: 0
Prediction: 3, Gold: 3
Prediction: 3, Gold: 2
Prediction: 2, Gold: 2
Prediction: 3, Gold: 1
Prediction: 1, Gold: 2
Prediction: 1, Gold: 2
Prediction: 1, Gold: 3
Prediction:

In [200]:
ind = 0
hellaswag_samples["input_text"][ind]

"Context: A man is sitting on a roof. he. You need to choose the most appropriate ending. Your answer should be a number from 0 to 3. These are possible endings:\n 0: is using wrap to wrap a pair of skis.\n1: is ripping level tiles off.\n2: is holding a rubik's cube.\n3: starts pulling up roofing on a roof.\nYour answer: [MASK]"

In [201]:
hellaswag_output[2][ind]

"Context : A man is sitting on a roof. he. You need to choose the most appropriate ending. Your answer should be a number from 0 to 3. These are possible endings : 0 : is using wrap to wrap a pair of skis. 1 : is ripping level tiles off. 2 : is holding a rubik's cube. 3 : starts pulling up roofing on a roof. Your answer : …"

In [118]:
hellaswag_samples["gold"][0]    

'3'