# ReadMe
For best results, view this notebook in a light-mode window.

In [2]:
from peft import PeftModel, PeftConfig
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch.nn.functional as F
import wandb
import huggingface_hub
from rich.console import Console
from rich.text import Text
import numpy as np


# 1. Log in to the clients

In [3]:

import os
from dotenv import load_dotenv

load_dotenv()

# Credentials are read from the environment (or from a local .env file picked
# up by load_dotenv above). Set HF_TOKEN and wandb_api_key there instead of
# writing tokens into the notebook.
hf_token = os.getenv("HF_TOKEN")
wandb_token = os.getenv("wandb_api_key")

# Log in to each client exactly once (the Hugging Face login used to run twice).
wandb.login(key=wandb_token)
huggingface_hub.login(token=hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/erik/.cache/huggingface/token
Login successful


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mbode-karl-erik[0m ([33merikbodedev[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/erik/.netrc


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/erik/.cache/huggingface/token
Login successful


In [4]:
# load model and tokenizer
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Checkpoint whose tokenizer is used throughout the notebook.
model_id = "google/gemma-2b"

# 4-bit NF4 quantisation settings with bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token, padding_side="right")


# Load fine-tuned models


In [5]:
# Subjective
# Base checkpoint for the subjective model, loaded with 4-bit NF4 quantisation
# and bfloat16 compute.
model_id_opinion = "google/gemma-2b"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# A dedicated tokenizer is not loaded here (kept for reference):
# tokenizer_opinion = AutoTokenizer.from_pretrained(
#     model_id_opinion, token=hf_token, padding_side="right"
# )
model_raw_opinion = AutoModelForCausalLM.from_pretrained(
    model_id_opinion,
    quantization_config=bnb_config,
    device_map={"": 0},
    token=hf_token,
)

# Adapter selection: keep exactly one `adapter_name_subjective` assignment active.

# VoxPopuli
# adapter_name_subjective = "ErikBode/train_VP_cluster_more_tokens"

# Cornell
adapter_name_subjective = "ErikBode/train_cornel_subj_cluster_hpoparams"

# Fetch the adapter configuration, then attach the adapter to the quantised base model.
config = PeftConfig.from_pretrained(adapter_name_subjective)
model_subjective = PeftModel.from_pretrained(
    model_raw_opinion,
    adapter_name_subjective,
    config=config,
)
# Optional: merge the adapter weights into the base model.
# model_subjective = model_subjective.merge_and_unload()

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# Objective
# Base checkpoint for the objective model, loaded with 4-bit NF4 quantisation
# and bfloat16 compute.
model_id_objective = "google/gemma-2b"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# A dedicated tokenizer is not loaded here (kept for reference):
# tokenizer_objective = AutoTokenizer.from_pretrained(
#     model_id_objective, token=hf_token, padding_side="right"
# )
model_raw_objective = AutoModelForCausalLM.from_pretrained(
    model_id_objective,
    quantization_config=bnb_config,
    device_map={"": 0},
    token=hf_token,
)

# Adapter selection: keep exactly one `adapter_name_objective` assignment active.

# CommonVoice
# adapter_name_objective = "ErikBode/train_CV_cluster_more_tokens"

# Arxiv
adapter_name_objective = "ErikBode/workingtrain_AX_cluster_more_tokens"

# Cornell
# adapter_name_objective = "ErikBode/train_cornel_obj_cluster_hpoparams"

# Fetch the adapter configuration, then attach the adapter to the quantised base model.
config_objective = PeftConfig.from_pretrained(adapter_name_objective)
model_objective = PeftModel.from_pretrained(
    model_raw_objective,
    adapter_name_objective,
    config=config_objective,
)
# Optional: merge the adapter weights into the base model.
# model_objective = model_objective.merge_and_unload()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

# 2. Preparation

## 2.1 Token Probabilities and Difference

In [7]:
input_text = "An unseasonably early hurricane is gathering strength as it bears down on the Caribbean, stoking fears it could cause nine-foot waves and catastrophic damage."

device = "cuda:0"
inputs = tokenizer(input_text, return_tensors="pt").to(device)

# Decode the ids again so the printout shows the text/tokens as the tokenizer sees them.
input_text = tokenizer.decode(inputs['input_ids'].tolist()[0], skip_special_tokens=True)
input_token = tokenizer.convert_ids_to_tokens(inputs['input_ids'].tolist()[0], skip_special_tokens=True)
separator = '-' * 100
print(f"Input:\n{separator}\n{input_text}\n{separator}\n{input_token}\n\n\n{'=' * 100}\n\n")

# Forward pass through both fine-tuned models; gradients are not needed for scoring.
with torch.no_grad():
    outputs_sub = model_subjective(**inputs)
    outputs_ob = model_objective(**inputs)
logits_sub = outputs_sub.logits
logits_ob = outputs_ob.logits

# Logits at position i score token i+1: drop the last position and flatten
# to one row of vocabulary scores per predicted token.
shift_logits_sub = logits_sub[:, :-1, :].contiguous().view(-1, logits_sub.size(-1))
shift_logits_ob = logits_ob[:, :-1, :].contiguous().view(-1, logits_ob.size(-1))

# Matching targets: every token except the first one.
shift_labels = inputs["input_ids"][:, 1:].view(-1).contiguous()

# Full next-token distributions for each model.
prob_sub = F.softmax(shift_logits_sub, dim=-1)
prob_ob = F.softmax(shift_logits_ob, dim=-1)

# Probability each model assigned to the token that actually came next.
label_index = shift_labels.unsqueeze(-1)
probs_labels_sub = prob_sub.gather(1, label_index).squeeze(-1)
probs_labels_ob = prob_ob.gather(1, label_index).squeeze(-1)

# Positive: the objective model finds the token more likely; negative: the subjective one does.
token_diff = probs_labels_ob - probs_labels_sub

print(f"{'Token:':16s}|{'  Subjective:':14s}|{'  Objective:':14s}|{'  Difference:':14s}\n{'-' * 62}")
table_rows = zip(shift_labels, probs_labels_sub, probs_labels_ob, token_diff)
for token, p_sub, p_ob, diff in table_rows:
    token_str = "'" + tokenizer.decode([token]) + "'"
    print(f"{token_str:16s}|{p_sub:14.10f}|{p_ob:14.10f}|{diff:14.10f}")


Input:
----------------------------------------------------------------------------------------------------
An unseasonably early hurricane is gathering strength as it bears down on the Caribbean, stoking fears it could cause nine-foot waves and catastrophic damage.
----------------------------------------------------------------------------------------------------
['An', '▁un', 'season', 'ably', '▁early', '▁hurricane', '▁is', '▁gathering', '▁strength', '▁as', '▁it', '▁bears', '▁down', '▁on', '▁the', '▁Caribbean', ',', '▁sto', 'king', '▁fears', '▁it', '▁could', '▁cause', '▁nine', '-', 'foot', '▁waves', '▁and', '▁catastrophic', '▁damage', '.']




Token:          |  Subjective: |  Objective:  |  Difference: 
--------------------------------------------------------------
'An'            |  0.0000153185|  0.0000013099| -0.0000140086
' un'           |  0.0205385666|  0.0023014611| -0.0182371065
'season'        |  0.0006688284|  0.0001082025| -0.0005606259
'ably'          |  0.3745705187|  

## 2.2 Token Colouring

In [8]:
from rich.console import Console
from rich.text import Text


# Gain applied to the probability difference so that small gaps become visible
color_multiplier = 4

colored_text = Text()

# First Token (might be needed later)
#colored_text.append(tokenizer.decode(inputs['input_ids'].tolist()[0][0]))

# Amplify, then limit to [-1, 1] so the linear colour mapping below stays within 0..255.
scaled_token_diff = torch.clamp(token_diff * color_multiplier, min=-1, max=1)

for token, diff in zip(shift_labels, scaled_token_diff):
    # Linear map from [-1, 1] onto 0..255.
    scaling = int((diff.item() + 1) / 2 * 255)
    # rgb_color = (180, scaling, 0)   # original colormap
    red, green, blue = 255 - scaling, 150, scaling  # alternative colormap
    rgb_color = (red, green, blue)
    token_str = tokenizer.decode([token])
    colored_text.append(token_str, style=f"rgb({red},{green},{blue}) ")

console = Console()
console.print(colored_text)


# 3. Make it compact: wrap the steps above in a single function

In [9]:
device = "cuda:0"


def _next_token_probabilities(model, inputs, shift_labels):
    """Return the probability `model` assigns to each token in `shift_labels`.

    `shift_labels` holds the input ids without the first token, flattened.
    Logits at position i score token i+1, so the last position's logits are
    dropped to line the predictions up with those labels.
    """
    with torch.no_grad():
        logits = model(**inputs).logits

    # reshape() copies only when needed, so this also works for
    # non-contiguous slices (e.g. batch size > 1).
    shift_logits = logits[:, :-1, :].reshape(-1, logits.size(-1))
    probs = F.softmax(shift_logits, dim=-1)
    return probs.gather(1, shift_labels.unsqueeze(-1)).squeeze(-1)


def _diff_to_rgb(diff, steepness, neutral_threshold):
    """Map a scaled probability difference (a float in [-1, 1]) to an (r, g, b) tuple.

    Differences smaller in magnitude than `neutral_threshold` are drawn in
    black. Otherwise a sigmoid with the given `steepness` pushes the colour
    towards blue for positive values and towards red for negative values.
    """
    if abs(diff) < neutral_threshold:
        return (0, 0, 0)
    scaling = int(1 / (1 + np.exp(-diff * steepness)) * 255)
    return (255 - scaling, 150, scaling)


def generate_colored_text(input_text, model_subjective=model_subjective, name_opinion=adapter_name_subjective, model_objective=model_objective, name_objective=adapter_name_objective, tokenizer=tokenizer,
                           device="cuda:0", verbose=False, save_path=None,
                           color_multiplier=4, sigmoid_steepness=10, neutral_threshold=0.1):
    """Print `input_text` with each token coloured by which model finds it more likely.

    For every token after the first, the probability assigned by the objective
    model minus the probability assigned by the subjective model is computed,
    multiplied by `color_multiplier`, clamped to [-1, 1] and turned into a colour.

    NOTE: the default models, adapter names and tokenizer are captured when this
    cell is executed. Re-run the cell after loading different adapters, or pass
    them explicitly.

    Args:
        input_text: Text to analyse.
        model_subjective: Model scoring the text as "subjective".
        name_opinion: Name of the subjective adapter (only used in the log line).
        model_objective: Model scoring the text as "objective".
        name_objective: Name of the objective adapter (only used in the log line).
        tokenizer: Tokenizer shared by both models.
        device: Device the tokenized inputs are moved to.
        verbose: If True, also print the input tokens and a per-token probability table.
        save_path: If given, the rendered output is additionally exported as HTML to this path.
        color_multiplier: Gain applied to the probability difference before clamping.
        sigmoid_steepness: Steepness of the sigmoid used for the colour mapping.
        neutral_threshold: Scaled differences below this magnitude are drawn in black.

    Returns:
        None. The coloured text is printed to the console.
    """
    # Use the last path component so names without a '/' do not raise IndexError.
    subjective_label = name_opinion.split('/')[-1]
    objective_label = name_objective.split('/')[-1]
    print(f"using model  {subjective_label} as subjective model and {objective_label} as objective model")

    inputs = tokenizer(input_text, return_tensors="pt").to(device)

    if verbose:
        input_id_list = inputs['input_ids'].tolist()[0]
        input_text = tokenizer.decode(input_id_list, skip_special_tokens=True)
        input_token = tokenizer.convert_ids_to_tokens(input_id_list, skip_special_tokens=True)
        print(f"Input:\n{'-' * 100}\n{input_text}\n{'-' * 100}\n{input_token}\n\n\n{'=' * 100}\n\n")

    # Targets: every token except the first, which has no preceding context to
    # be predicted from and is therefore not rendered.
    shift_labels = inputs["input_ids"][:, 1:].reshape(-1)

    probs_labels_sub = _next_token_probabilities(model_subjective, inputs, shift_labels)
    probs_labels_ob = _next_token_probabilities(model_objective, inputs, shift_labels)

    # Positive: the objective model finds the token more likely; negative: the subjective one does.
    token_diff = probs_labels_ob - probs_labels_sub

    label_ids = shift_labels.tolist()

    if verbose:
        print(f"{'Token:':16s}|{'  Subjective:':14s}|{'  Objective:':14s}|{'  Difference:':14s}\n{'-' * 62}")
        table_rows = zip(label_ids, probs_labels_sub.tolist(), probs_labels_ob.tolist(), token_diff.tolist())
        for token_id, p_sub, p_ob, diff in table_rows:
            token_str = "'" + tokenizer.decode([token_id]) + "'"
            print(f"{token_str:16s}|{p_sub:14.10f}|{p_ob:14.10f}|{diff:14.10f}")

    # Amplify the differences, then clamp so the colour mapping sees values in [-1, 1].
    scaled_token_diff = torch.clamp(token_diff * color_multiplier, min=-1, max=1)

    # Token-wise colouring. (An alternative that was tried and left disabled:
    # colour the whole text with one colour derived from the k most extreme tokens.)
    colored_text = Text()
    for token_id, diff in zip(label_ids, scaled_token_diff.tolist()):
        red, green, blue = _diff_to_rgb(diff, sigmoid_steepness, neutral_threshold)
        colored_text.append(tokenizer.decode([token_id]), style=f"rgb({red},{green},{blue}) ")

    # Recording is only needed when the output will be exported afterwards.
    console = Console(record=save_path is not None)
    console.print(colored_text)

    if save_path is not None:
        # Export to HTML
        console.save_html(save_path)

In [11]:
# generate_colored_text(input_text, verbose=False, save_path="output.html")

In [12]:
# Sample sentences to colour; the commented-out entry is currently disabled.
text_samples = [
    # "Biden’s performance was so far below acceptable that the Democratic party should be ashamed of itself for allowing him to stand for re-election.",
    "On Capitol Hill, some Democratic lawmakers openly acknowledged that Mr. Biden’s performance was a disaster, while other leaders offered only terse signs of support and hoped that the focus would turn back to Mr. Trump’s lies.",
    "Biden’s performance was so far below acceptable that the Democratic party should be ashamed of itself for allowing him to stand for re-election.",
    "A range of despairing Democrats began to reconsider their nominee after his rough debate showing, but there was no agreement on how, or whether, to urge him to step off the ticket.",
    "Hurricane Beryl, which slammed into Texas on Monday after wreaking havoc in the Caribbean, was supercharged by “absolutely crazy” ocean temperatures that are likely to fuel further violent storms in the coming months, scientists have warned.",
    "An unseasonably early hurricane is gathering strength as it bears down on the Caribbean, stoking fears it could cause nine-foot waves and catastrophic damage.",
]

# Render every sample with the currently loaded subjective/objective models.
for text in text_samples:
    generate_colored_text(
        text,
        model_objective=model_objective,
        name_opinion=adapter_name_subjective,
        model_subjective=model_subjective,
        name_objective=adapter_name_objective,
        tokenizer=tokenizer,
        verbose=False,
    )


using model  train_cornel_subj_cluster_hpoparams_stop as subjective model and workingtrain_AX_cluster_more_tokens as objective model


using model  train_cornel_subj_cluster_hpoparams_stop as subjective model and workingtrain_AX_cluster_more_tokens as objective model


using model  train_cornel_subj_cluster_hpoparams_stop as subjective model and workingtrain_AX_cluster_more_tokens as objective model


using model  train_cornel_subj_cluster_hpoparams_stop as subjective model and workingtrain_AX_cluster_more_tokens as objective model


using model  train_cornel_subj_cluster_hpoparams_stop as subjective model and workingtrain_AX_cluster_more_tokens as objective model


In [18]:
# Single ad-hoc example, rendered with the function's default models and tokenizer.
temp_text = "The election between Donald Trump and Joe Biden in the United States is characterized by significant voter turnout, extensive use of mail-in ballots, and numerous legal challenges."
generate_colored_text(
    temp_text,
    verbose=False,
    # save_path="output.html",
)

using model  train_cornel_subj_cluster_hpoparams_stop as subjective model and workingtrain_AX_cluster_more_tokens as objective model
