In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import (
    BeamSearchScorer,
    LogitsProcessor,
    LogitsProcessorList,
    StoppingCriteriaList,
    MaxLengthCriteria
)
from math import *
import types
import copy
import warnings
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union

from transformers.generation.beam_search import BeamScorer, BeamSearchScorer, ConstrainedBeamSearchScorer
from transformers.generation.utils import BeamSearchDecoderOnlyOutput, BeamSearchEncoderDecoderOutput
from transformers.generation.stopping_criteria import validate_stopping_criteria

import gc

In [2]:
tokenizer_path = "/cognitive_comp/lincong/models/llama2_13B_sft/0905_step18k_hf"
model_path = tokenizer_path

tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
model = AutoModelForCausalLM.from_pretrained(model_path).cuda()
model.eval()
pass

Loading checkpoint shards:   0%|          | 0/28 [00:00<?, ?it/s]

In [16]:
def beam_search_with_CD(
        self,
        input_ids: torch.LongTensor,
        contrastive_ids: torch.LongTensor,
        beam_scorer: BeamScorer,
        logits_processor: LogitsProcessorList,
        contrastive_temperature: float = 1.0,
        stopping_criteria: Optional[StoppingCriteriaList] = None,
        max_length: Optional[int] = None,
        pad_token_id: Optional[int] = None,
        eos_token_id: Optional[Union[int, List[int]]] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_scores: Optional[bool] = None,
        return_dict_in_generate: Optional[bool] = None,
        synced_gpus: bool = False,
        **model_kwargs,
    ):
        r"""
        Generates sequences of token ids for models with a language modeling head using **beam search decoding** and
        can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.

        <Tip warning={true}>

        In most cases, you do not need to call [`~generation.GenerationMixin.beam_search`] directly. Use generate()
        instead. For an overview of generation strategies and code examples, check the [following
        guide](../generation_strategies).

        </Tip>

        Parameters:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                The sequence used as a prompt for the generation.
            beam_scorer (`BeamScorer`):
                An derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and
                sorted during generation. For more information, the documentation of [`BeamScorer`] should be read.
            logits_processor (`LogitsProcessorList`, *optional*):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
                used to modify the prediction scores of the language modeling head applied at each generation step.
            stopping_criteria (`StoppingCriteriaList`, *optional*):
                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
                used to tell if the generation loop should stop.
            max_length (`int`, *optional*, defaults to 20):
                **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated
                tokens. The maximum length of the sequence to be generated.
            pad_token_id (`int`, *optional*):
                The id of the *padding* token.
            eos_token_id (`Union[int, List[int]]`, *optional*):
                The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more details.
            output_hidden_states (`bool`, *optional*, defaults to `False`):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more details.
            output_scores (`bool`, *optional*, defaults to `False`):
                Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
            synced_gpus (`bool`, *optional*, defaults to `False`):
                Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
            model_kwargs:
                Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
                an encoder-decoder model the kwargs should include `encoder_outputs`.

        Return:
            [`generation.BeamSearchDecoderOnlyOutput`], [`~generation.BeamSearchEncoderDecoderOutput`] or
            `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
            [`~generation.BeamSearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
            `return_dict_in_generate=True` or a [`~generation.BeamSearchEncoderDecoderOutput`] if
            `model.config.is_encoder_decoder=True`.


        Examples:

        ```python
        >>> from transformers import (
        ...     AutoTokenizer,
        ...     AutoModelForSeq2SeqLM,
        ...     LogitsProcessorList,
        ...     MinLengthLogitsProcessor,
        ...     BeamSearchScorer,
        ... )
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
        >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

        >>> encoder_input_str = "translate English to German: How old are you?"
        >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids


        >>> # lets run beam search using 3 beams
        >>> num_beams = 3
        >>> # define decoder start token ids
        >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long)
        >>> input_ids = input_ids * model.config.decoder_start_token_id

        >>> # add encoder_outputs to model keyword arguments
        >>> model_kwargs = {
        ...     "encoder_outputs": model.get_encoder()(
        ...         encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True
        ...     )
        ... }

        >>> # instantiate beam scorer
        >>> beam_scorer = BeamSearchScorer(
        ...     batch_size=1,
        ...     num_beams=num_beams,
        ...     device=model.device,
        ... )

        >>> # instantiate logits processors
        >>> logits_processor = LogitsProcessorList(
        ...     [
        ...         MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id),
        ...     ]
        ... )

        >>> outputs = model.beam_search(input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs)

        >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
        ['Wie alt bist du?']
        ```"""
        # init values
        logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
        stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
        if max_length is not None:
            warnings.warn(
                "`max_length` is deprecated in this function, use"
                " `stopping_criteria=StoppingCriteriaList(MaxLengthCriteria(max_length=max_length))` instead.",
                UserWarning,
            )
            stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
        if len(stopping_criteria) == 0:
            warnings.warn("You don't have defined any stopping_criteria, this will likely loop forever", UserWarning)
        pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id
        eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id
        if isinstance(eos_token_id, int):
            eos_token_id = [eos_token_id]
        output_scores = output_scores if output_scores is not None else self.generation_config.output_scores
        output_attentions = (
            output_attentions if output_attentions is not None else self.generation_config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states
        )
        return_dict_in_generate = (
            return_dict_in_generate
            if return_dict_in_generate is not None
            else self.generation_config.return_dict_in_generate
        )

        batch_size = len(beam_scorer._beam_hyps)
        num_beams = beam_scorer.num_beams

        batch_beam_size, cur_len = input_ids.shape
        batch_beam_size_contrastive, cur_len_contrastive = contrastive_ids.shape
        assert batch_beam_size == batch_beam_size_contrastive, f"Contrastive input size {batch_beam_size_contrastive} does not match input size {batch_beam_size}."
    
        if num_beams * batch_size != batch_beam_size:
            raise ValueError(
                f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}."
            )

        # init attention / hidden states / scores tuples
        scores = () if (return_dict_in_generate and output_scores) else None
        beam_indices = (
            tuple(() for _ in range(batch_beam_size)) if (return_dict_in_generate and output_scores) else None
        )
        decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
        cross_attentions = () if (return_dict_in_generate and output_attentions) else None
        decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None

        # if model is an encoder-decoder, retrieve encoder attention weights and hidden states
        if return_dict_in_generate and self.config.is_encoder_decoder:
            encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
            encoder_hidden_states = (
                model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
            )

        model_kwargs_contrastive = copy.deepcopy(model_kwargs)
    
        # initialise score of first beam with 0 and the rest with -1e9. This makes sure that only tokens
        # of the first beam are considered to avoid sampling the exact same tokens across all beams.
        beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device)
        beam_scores[:, 1:] = -1e9
        beam_scores = beam_scores.view((batch_size * num_beams,))

        this_peer_finished = False  # used by synced_gpus only
        while True:
            if synced_gpus:
                # Under synced_gpus the `forward` call must continue until all gpus complete their sequence.
                # The following logic allows an early break if all peers finished generating their sequence
                this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device)
                # send 0.0 if we finished, 1.0 otherwise
                dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM)
                # did all peers finish? the reduced sum will be 0.0 then
                if this_peer_finished_flag.item() == 0.0:
                    break

            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
            outputs = self(
                **model_inputs,
                return_dict=True,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
            )

            model_inputs_contrastive = self.prepare_inputs_for_generation(contrastive_ids, **model_kwargs_contrastive)
            contrastive_outputs = self(
                **model_inputs_contrastive,
                return_dict=True,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
            )    

            if synced_gpus and this_peer_finished:
                cur_len = cur_len + 1
                cur_len_contrastive = cur_len_contrastive + 1
                continue  # don't waste resources running the code we don't need

            next_token_scores = outputs.logits[:, -1, :]
            next_token_scores_contrastive = contrastive_outputs.logits[:, -1, :]
            # hack: adjust tokens for Marian. For Marian we have to make sure that the `pad_token_id`
            # cannot be generated both before and after the `nn.functional.log_softmax` operation.
            next_token_scores = self.adjust_logits_during_generation(next_token_scores, cur_len=cur_len)
            next_token_scores_contrastive = self.adjust_logits_during_generation(next_token_scores_contrastive, cur_len=cur_len_contrastive)

            next_token_scores = next_token_scores / contrastive_temperature
            
            next_token_scores = nn.functional.log_softmax(
                next_token_scores, dim=-1
            )  # (batch_size * num_beams, vocab_size)
            next_token_scores_contrastive = nn.functional.log_softmax(
                next_token_scores_contrastive, dim=-1
            )  # (batch_size * num_beams, vocab_size)

            next_token_scores_processed = logits_processor(next_token_scores_contrastive, next_token_scores)
            next_token_scores = next_token_scores_processed + beam_scores[:, None].expand_as(next_token_scores_processed)
            
            # Store scores, attentions and hidden_states when required
            if return_dict_in_generate:
                if output_scores:
                    scores += (next_token_scores_processed,)
                if output_attentions:
                    decoder_attentions += (
                        (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,)
                    )
                    if self.config.is_encoder_decoder:
                        cross_attentions += (outputs.cross_attentions,)

                if output_hidden_states:
                    decoder_hidden_states += (
                        (outputs.decoder_hidden_states,)
                        if self.config.is_encoder_decoder
                        else (outputs.hidden_states,)
                    )

            # reshape for beam search
            vocab_size = next_token_scores.shape[-1]
            next_token_scores = next_token_scores.view(batch_size, num_beams * vocab_size)

            # Sample 2 next tokens for each beam (so we have some spare tokens and match output of beam search)
            next_token_scores, next_tokens = torch.topk(
                next_token_scores, 2 * num_beams, dim=1, largest=True, sorted=True
            )
            # # Sample 1 + len(eos_token_id) next tokens for each beam so we have at least 1 non eos token per beam.
            # n_eos_tokens = len(eos_token_id) if eos_token_id else 0
            # next_token_scores, next_tokens = torch.topk(
            #     next_token_scores, max(2, 1 + n_eos_tokens) * num_beams, dim=1, largest=True, sorted=True
            # )

            next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor")
            next_tokens = next_tokens % vocab_size

            # stateless
            beam_outputs = beam_scorer.process(
                input_ids,
                next_token_scores,
                next_tokens,
                next_indices,
                pad_token_id=pad_token_id,
                eos_token_id=eos_token_id,
                beam_indices=beam_indices,
            )

            beam_scores = beam_outputs["next_beam_scores"]
            beam_next_tokens = beam_outputs["next_beam_tokens"]
            beam_idx = beam_outputs["next_beam_indices"]

            input_ids = torch.cat([input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1)
            contrastive_ids = torch.cat([contrastive_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1)

            model_kwargs = self._update_model_kwargs_for_generation(
                outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
            )
            if model_kwargs["past_key_values"] is not None:
                model_kwargs["past_key_values"] = self._reorder_cache(model_kwargs["past_key_values"], beam_idx)
            
            model_kwargs_contrastive = self._update_model_kwargs_for_generation(
                contrastive_outputs, model_kwargs_contrastive, is_encoder_decoder=self.config.is_encoder_decoder
            )
            if model_kwargs_contrastive["past_key_values"] is not None:
                model_kwargs_contrastive["past_key_values"] = self._reorder_cache(model_kwargs_contrastive["past_key_values"], beam_idx)

            if return_dict_in_generate and output_scores:
                beam_indices = tuple((beam_indices[beam_idx[i]] + (beam_idx[i],) for i in range(len(beam_indices))))

            # increase cur_len
            cur_len = cur_len + 1
            cur_len_contrastive = cur_len_contrastive + 1

            if beam_scorer.is_done or stopping_criteria(input_ids, scores):
                if not synced_gpus:
                    break
                else:
                    this_peer_finished = True

        sequence_outputs = beam_scorer.finalize(
            input_ids,
            beam_scores,
            next_tokens,
            next_indices,
            pad_token_id=pad_token_id,
            eos_token_id=eos_token_id,
            max_length=stopping_criteria.max_length,
            beam_indices=beam_indices,
        )

        if return_dict_in_generate:
            if not output_scores:
                sequence_outputs["sequence_scores"] = None

            if self.config.is_encoder_decoder:
                return BeamSearchEncoderDecoderOutput(
                    sequences=sequence_outputs["sequences"],
                    sequences_scores=sequence_outputs["sequence_scores"],
                    scores=scores,
                    beam_indices=sequence_outputs["beam_indices"],
                    encoder_attentions=encoder_attentions,
                    encoder_hidden_states=encoder_hidden_states,
                    decoder_attentions=decoder_attentions,
                    cross_attentions=cross_attentions,
                    decoder_hidden_states=decoder_hidden_states,
                )
            else:
                return BeamSearchDecoderOnlyOutput(
                    sequences=sequence_outputs["sequences"],
                    sequences_scores=sequence_outputs["sequence_scores"],
                    scores=scores,
                    beam_indices=sequence_outputs["beam_indices"],
                    attentions=decoder_attentions,
                    hidden_states=decoder_hidden_states,
                )
        else:
            return sequence_outputs["sequences"]

model.beam_search_with_CD = types.MethodType(beam_search_with_CD, model)

In [5]:
def _verbose(*tensors):
    steps = ["'base'", "'prpt'", "'ctst'"]
    for i, tensor in enumerate(*tensors):
        for id, (values, indices) in enumerate(zip(*torch.sort(tensor, dim=-1, descending=True))):
            print(f'Step {steps[i]} Beam {id}', tokenizer.decode(indices[:10]), values[:10].cpu().detach().numpy().tolist())   
    print()

class ContrastiveLogits(LogitsProcessor):
    def __init__(self, apc=0.1, bound=None, verbose=True):
        super().__init__()
        self.apc = apc
        self.bound = bound
        self.verbose = verbose
    
    def __call__(self, ref_scores, scores):
        mask = ref_scores < log(self.apc) + torch.max(ref_scores, dim=-1).values.reshape(-1, 1)
        if self.bound is not None:
            mask = torch.logical_or(mask, ref_scores < log(self.bound) + scores)

        ref_scores = ref_scores.masked_fill(mask, -float("inf"))
        processed_scores = ref_scores - scores

        if self.verbose:
            _verbose([scores, ref_scores, processed_scores])
            
        return processed_scores

class MixedLogits(LogitsProcessor):
    def __init__(self, alpha=0.9, apc=0.1, verbose=True):
        super().__init__()
        self.alpha = alpha
        self.apc = apc
        self.verbose = verbose

    def __call__(self, ref_scores, scores):
        mask = ref_scores < log(self.apc) + torch.max(ref_scores, dim=-1).values.reshape(-1, 1)
        ref_scores = ref_scores.masked_fill(mask, -float("inf"))
        processed_scores = self.alpha * ref_scores + (1 - self.alpha) * scores

        if self.verbose:
            _verbose([scores, ref_scores, processed_scores])
            
        return processed_scores

class RectifiedLogits(LogitsProcessor):
    def __init__(self, gamma=4, beta=1, apc=0.1, verbose=True):
        self.gamma = gamma
        self.beta = beta
        self.apc = apc
        self.verbose = verbose

    def __call__(self, ref_scores, scores):
        mask = ref_scores < log(self.apc) + torch.max(ref_scores, dim=-1).values.reshape(-1, 1)
        ref_scores = ref_scores.masked_fill(mask, -float("inf"))
        processed_scores = F.sigmoid(self.gamma * (ref_scores - scores) + self.beta)

        if self.verbose:
            _verbose([scores, ref_scores, processed_scores])
            
        return processed_scores   

In [17]:
def CD_beams(input, ref_input=None, in_beams=3, out_beams=1, logits_processor=None, T=1.0, max_length=100, **model_kwargs):
    
    input_ids = tokenizer(input, return_tensors='pt').input_ids
    
    beam_scorer = BeamSearchScorer(
        batch_size=input_ids.shape[0],
        num_beams=in_beams,
        num_beam_hyps_to_keep=out_beams,
        device=model.device,
    )

    if ref_input is None:
        generated = model.beam_search(
            torch.cat([input_ids] * in_beams).cuda(),
            beam_scorer=beam_scorer,
            logits_processor=None,
            stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)]),
            **model_kwargs
        )
    else:
        ref_input_ids = tokenizer(ref_input, return_tensors='pt').input_ids
        generated = model.beam_search_with_CD(
            torch.cat([input_ids] * in_beams).cuda(),
            torch.cat([ref_input_ids] * in_beams).cuda(),
            beam_scorer=beam_scorer,
            logits_processor=logits_processor,
            contrastive_temperature=T, # 2.0 is nice
            stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)]),
            **model_kwargs
        )
    
    for index, output_tokenized in enumerate(generated):
        output = tokenizer.decode(output_tokenized)
        print(f'beam {index}: {output}')

    torch.cuda.empty_cache()
    gc.collect()


In [24]:
prompt = '<Human Round-1>:请续写这句话：今天老师生病了，没来上课，\n<Assistant Round-1>:'

inputs = tokenizer(prompt, return_tensors="pt").input_ids.cuda()

outputs = model.generate(inputs, num_beams=3, max_new_tokens=50)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

['<Human Round-1>:请续写这句话：今天老师生病了，没来上课，\n<Assistant Round-1>: 我们班的同学都很想念老师，希望老师早日康复，回到我们身边。']

In [23]:
prompt = '<Human Round-1>:请用高兴的语气续写这句话：今天老师生病了，没来上课，\n<Assistant Round-1>:'

inputs = tokenizer(prompt, return_tensors="pt").input_ids.cuda()

outputs = model.generate(inputs, num_beams=3, max_new_tokens=50)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

['<Human Round-1>:请用高兴的语气续写这句话：今天老师生病了，没来上课，\n<Assistant Round-1>: 同学们都很开心，因为终于可以不用听老师的唠叨了。但是，我们也希望老师能早日康复，回到我们身边。']

In [29]:
CD_beams('<Human Round-1>:请续写这句话：今天老师生病了，没来上课，\n<Assistant Round-1>:', in_beams=3, out_beams=1, logits_processor=None, use_cache=True)

beam 0: <s> <Human Round-1>:请续写这句话：今天老师生病了，没来上课，
<Assistant Round-1>: 我们班的同学都很想念老师，希望老师早日康复，回到我们身边。</s>


In [30]:
CD_beams('<Human Round-1>:请用高兴的语气续写这句话：今天老师生病了，没来上课，\n<Assistant Round-1>:', in_beams=3, out_beams=1, logits_processor=None, use_cache=True)

beam 0: <s> <Human Round-1>:请用高兴的语气续写这句话：今天老师生病了，没来上课，
<Assistant Round-1>: 同学们都很开心，因为终于可以不用听老师的唠叨了。但是，我们也希望老师能早日康复，回到我们身边。</s>


In [32]:
CD_beams('<Human Round-1>:请续写这句话：今天老师生病了，没来上课，\n<Assistant Round-1>:', 'null', in_beams=2, out_beams=1, logits_processor=None, use_cache=True)

beam 0: <s> <Human Round-1>:请续写这句话：今天老师生病了，没来上课，
<Assistant Round-1>: 所以我们自习了一节课。</s>


In [33]:
CD_beams('<Human Round-1>:请用高兴的语气续写这句话：今天老师生病了，没来上课，\n<Assistant Round-1>:', 'null', in_beams=2, out_beams=1, logits_processor=None, use_cache=True)

beam 0: <s> <Human Round-1>:请用高兴的语气续写这句话：今天老师生病了，没来上课，
<Assistant Round-1>: 同学们都很开心，因为终于可以不用听老师的唠叨了。但是，我们也希望老师能早日康复，回到我们身边。</s>


In [32]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Fri Oct 13 12:09:01 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.141.03   Driver Version: 470.141.03   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  On   | 00000000:16:00.0 Off |                    0 |
| N/A   33C    P0    67W / 400W |  62472MiB / 81251MiB |      0%      Default |
|                               |            

In [10]:
# raw beam search

CD_beams(
    input='<Human Round-1>:请续写这句话：今天老师生病了，没来上课，\n<Assistant Round-1>:', 
    ref_input=None, 
    logits_processor=None, 
    T=1.0,
    in_beams=3,
    out_beams=1,
)

beam 0: <s> <Human Round-1>:请续写这句话：今天老师生病了，没来上课，
<Assistant Round-1>: 我们班的同学都很想念老师，希望老师早日康复，回到我们身边。</s>


In [12]:
# ablation

CD_beams(
    input='<Human Round-1>:请用高兴的语气续写这句话：今天老师生病了，没来上课，\n<Assistant Round-1>:', 
    ref_input=None, 
    logits_processor=None, 
    T=1.0,
    in_beams=3,
    out_beams=1,
)

beam 0: <s> <Human Round-1>:请用高兴的语气续写这句话：今天老师生病了，没来上课，
<Assistant Round-1>: 同学们都很开心，因为终于可以不用听老师的唠叨了。但是，我们也希望老师能早日康复，回到我们身边。</s>


In [14]:
# contrastive decoding against prompted model w/o temperature

CD_beams(
    input='<Human Round-1>:请续写这句话：今天老师生病了，没来上课，\n<Assistant Round-1>:', 
    ref_input='<Human Round-1>:请用高兴的语气续写这句话：今天老师生病了，没来上课，\n<Assistant Round-1>:', 
    logits_processor=ContrastiveLogits(apc=0.05, bound=None, verbose=True), 
    T=3,
    in_beams=2,
    out_beams=1,
)

Step 'base' Beam 0  so “ A but we ( — The L [-4.838179588317871, -7.213222503662109, -7.22791051864624, -7.284914493560791, -7.73336124420166, -7.797381401062012, -7.918823719024658, -8.051321983337402, -8.087226867675781, -8.098796844482422]
Step 'base' Beam 1  so “ A but we ( — The L [-4.838179588317871, -7.213222503662109, -7.22791051864624, -7.284914493560791, -7.73336124420166, -7.797381401062012, -7.918823719024658, -8.051321983337402, -8.087226867675781, -8.098796844482422]
Step 'prpt' Beam 0 <unk><s></s>  [-0.006629378534853458, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf]
Step 'prpt' Beam 1 <unk><s></s>  [-0.006629378534853458, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf]
Step 'ctst' Beam 0 <unk><s></s>  [4.831550121307373, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf]
Step 'ctst' Beam 1 <unk><s></s>  [4.831550121307373, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf]

Step 'base' Beam 0 我所同小但由没你大于 [-5.641993999481201, -5.825

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 79.35 GiB total capacity; 76.45 GiB already allocated; 11.19 MiB free; 78.03 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [11]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Fri Oct 13 11:39:07 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.141.03   Driver Version: 470.141.03   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  On   | 00000000:16:00.0 Off |                    0 |
| N/A   33C    P0    67W / 400W |  69348MiB / 81251MiB |      0%      Default |
|                               |            

In [None]:
# contrastive decoding against prompted model w/o temperature

CD_beams(
    input='<Human Round-1>:请续写这句话：今天老师生病了，没来上课，\n<Assistant Round-1>:', 
    ref_input='<Human Round-1>:请用高兴的语气续写这句话：今天老师生病了，没来上课，\n<Assistant Round-1>:', 
    logits_processor=ContrastiveLogits(apc=0.1, bound=None, verbose=False), 
    T=1.0,
    in_beams=3,
    out_beams=1,
)

In [None]:
# contrastive decoding against prompted model w/o temperature

CD_beams(
    input='<Human Round-1>:请续写这句话：今天老师生病了，没来上课，\n<Assistant Round-1>:', 
    ref_input='<Human Round-1>:请用高兴的语气续写这句话：今天老师生病了，没来上课，\n<Assistant Round-1>:', 
    logits_processor=ContrastiveLogits(apc=0.2, bound=None, verbose=False), 
    T=1.0,
    in_beams=3,
    out_beams=1,
)

In [None]:
# contrastive decoding against prompted model w/o temperature

CD_beams(
    input='<Human Round-1>:请续写这句话：今天老师生病了，没来上课，\n<Assistant Round-1>:', 
    ref_input='<Human Round-1>:请用高兴的语气续写这句话：今天老师生病了，没来上课，\n<Assistant Round-1>:', 
    logits_processor=ContrastiveLogits(apc=0.05, bound=None, verbose=False), 
    T=2.0,
    in_beams=3,
    out_beams=1,
)

In [None]:
# contrastive decoding against prompted model w/o temperature

CD_beams(
    input='<Human Round-1>:请续写这句话：今天老师生病了，没来上课，\n<Assistant Round-1>:', 
    ref_input='<Human Round-1>:请用高兴的语气续写这句话：今天老师生病了，没来上课，\n<Assistant Round-1>:', 
    logits_processor=ContrastiveLogits(apc=0.05, bound=None, verbose=False), 
    T=3.0,
    in_beams=3,
    out_beams=1,
)