In [2]:
import torch
from torch import nn
import torch.nn.functional as F
import transformers
from transformers import AutoTokenizer,AutoConfig,AutoModel,AutoModelForCausalLM
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from IPython.display import Image
# default: 100
mpl.rcParams['figure.dpi'] = 150
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

  from .autonotebook import tqdm as notebook_tqdm


'cuda'

In [3]:
model_ckpt = "/root/autodl-fs/qwen1.5_0.5B_chat"

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForCausalLM.from_pretrained(model_ckpt).to(device)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


In [4]:
sample_text = 'A long long time ago, '

model_inputs = tokenizer(sample_text, return_tensors='pt')
model_inputs

{'input_ids': tensor([[  32, 1293, 1293,  882, 4134,   11,  220]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}

In [5]:
input_ids = model_inputs['input_ids'].to(device)
print(input_ids)

input_ids[0]

tensor([[  32, 1293, 1293,  882, 4134,   11,  220]], device='cuda:0')


tensor([  32, 1293, 1293,  882, 4134,   11,  220], device='cuda:0')

In [6]:
tokenizer.decode(input_ids[0])

'A long long time ago, '

In [19]:
sorted_ids = torch.argsort(torch.softmax(model(input_ids).logits[0, -1, :], dim=-1), dim=-1, descending=True)
sorted_ids[None, 0, None].shape

torch.Size([1, 1])

In [18]:
n = 10 # 生成的词数量
choices = 5 # 
iterations = []

with torch.no_grad():
    for _ in range(n):
        iteration = {}
        iteration["input"] = tokenizer.decode(input_ids[0])
        output = model(input_ids)
    
        last_token_logits = output.logits[0,-1,:]
        
        # print(last_token_logits.shape) torch.Size([151936])
        # 只有一个维度 ~= dim=-1 只取最后一个维度
        last_token_probs = torch.softmax(last_token_logits,dim=-1)
        sorted_ids = torch.argsort(last_token_probs,dim=-1,descending=True)
        for choice_idx in range(choices):
            token_id = sorted_ids[choice_idx]
            token_prob = last_token_probs[token_id].cpu().numpy()
            token_choice = f"{tokenizer.decode(token_id)}({100*token_prob:.2f}%)"
            iteration[f'choice {choice_idx+1}'] = token_choice
            
        print('before append input_ids.shape', input_ids.shape)
        input_ids = torch.cat([input_ids, sorted_ids[None, 0, None]], dim=-1)
        print('after append input_ids.shape', input_ids.shape)
        
        iterations.append(iteration)
            
            
        
        

before append input_ids.shape torch.Size([1, 17])
after append input_ids.shape torch.Size([1, 18])
before append input_ids.shape torch.Size([1, 18])
after append input_ids.shape torch.Size([1, 19])
before append input_ids.shape torch.Size([1, 19])
after append input_ids.shape torch.Size([1, 20])
before append input_ids.shape torch.Size([1, 20])
after append input_ids.shape torch.Size([1, 21])
before append input_ids.shape torch.Size([1, 21])
after append input_ids.shape torch.Size([1, 22])
before append input_ids.shape torch.Size([1, 22])
after append input_ids.shape torch.Size([1, 23])
before append input_ids.shape torch.Size([1, 23])
after append input_ids.shape torch.Size([1, 24])
before append input_ids.shape torch.Size([1, 24])
after append input_ids.shape torch.Size([1, 25])
before append input_ids.shape torch.Size([1, 25])
after append input_ids.shape torch.Size([1, 26])
before append input_ids.shape torch.Size([1, 26])
after append input_ids.shape torch.Size([1, 27])


In [16]:
iterations[-1]

{'input': 'A long long time ago,  a group of people  decided to travel to',
 'choice 1': ' a(43.56%)',
 'choice 2': ' the(23.48%)',
 'choice 3': ' (8.41%)',
 'choice 4': ' another(5.29%)',
 'choice 5': ' an(3.04%)'}

In [15]:
pd.DataFrame(iterations)

Unnamed: 0,input,choice 1,choice 2,choice 3,choice 4,choice 5
0,"A long long time ago,",a(15.83%),in(13.61%),there(9.16%),((3.70%),A(3.34%)
1,"A long long time ago, a",group(22.58%),young(11.94%),very(8.00%),great(4.93%),person(2.37%)
2,"A long long time ago, a group",of(96.09%),(1.59%),called(0.81%),<|im_end|>(0.14%),named(0.12%)
3,"A long long time ago, a group of",people(31.19%),(8.96%),friends(6.81%),young(4.63%),humans(2.84%)
4,"A long long time ago, a group of people",(11.09%),from(8.82%),who(7.21%),called(6.61%),came(6.46%)
5,"A long long time ago, a group of people",decided(12.72%),lived(10.80%),were(8.62%),came(8.42%),had(5.42%)
6,"A long long time ago, a group of people decided",to(77.71%),(17.27%),(1.97%),<|im_end|>(0.73%),that(0.49%)
7,"A long long time ago, a group of people deci...",travel(26.40%),build(20.37%),go(16.30%),(9.63%),start(2.93%)
8,"A long long time ago, a group of people deci...",to(59.79%),(10.66%),across(5.94%),around(4.51%),from(1.90%)
9,"A long long time ago, a group of people deci...",a(43.56%),the(23.48%),(8.41%),another(5.29%),an(3.04%)


In [27]:
n_steps = 2
choice_per_step = 5
iterations = []

with torch.no_grad():
    for _ in range(n_steps):
        iteration = {}
        iteration["input"] = tokenizer.decode(input_ids[0])
        output = model(input_ids)
        # print(output)
        last_token_logits = output.logits[0, -1, :] # 取出生成的token对应的分数[0,-1,:]
        # print(last_token_logits)
        last_token_probs = torch.softmax(last_token_logits,dim=-1)
        print(last_token_probs.shape)
        # torch.argsort()返回只是排序后的值所对应原输入input的下标，即torch.sort()返回的indices
        sorted_ids = torch.argsort(last_token_probs,dim=-1,descending = True)
        print(sorted_ids.shape)
        for choice_idx in range(choice_per_step):
            token_id = sorted_ids[choice_idx]
            print(token_id)
            token_prob = last_token_probs[token_id].cpu().numpy()
            token_choice = f'{tokenizer.decode(token_id)}({100*token_prob:.2f}%)'
            iteration[f'choice {choice_idx+1}'] = token_choice
            
        # append
        print('before append input_ids.shape', input_ids.shape)
        input_ids = torch.cat([input_ids, sorted_ids[None, 0, None]], dim=-1)
        print('after append input_ids.shape', input_ids.shape)
        
        iterations.append(iteration)
            


torch.Size([151936])
torch.Size([151936])
tensor(4792, device='cuda:0')
tensor(271, device='cuda:0')
tensor(2121, device='cuda:0')
tensor(785, device='cuda:0')
tensor(3966, device='cuda:0')
before append input_ids.shape torch.Size([1, 25])
after append input_ids.shape torch.Size([1, 26])
torch.Size([151936])
torch.Size([151936])
tensor(594, device='cuda:0')
tensor(10362, device='cuda:0')
tensor(374, device='cuda:0')
tensor(748, device='cuda:0')
tensor(5112, device='cuda:0')
before append input_ids.shape torch.Size([1, 26])
after append input_ids.shape torch.Size([1, 27])


In [23]:
print(iterations)

[{'input': 'A long long time ago,  a group of people  decided to travel to a faraway land.', 'choice 1': '<|im_end|>(41.36%)', 'choice 2': ' They(21.93%)', 'choice 3': ' (11.06%)', 'choice 4': ' The(5.97%)', 'choice 5': ' \n\n(2.87%)'}, {'input': 'A long long time ago,  a group of people  decided to travel to a faraway land.<|im_end|>', 'choice 1': '\n(100.00%)', 'choice 2': '<|im_start|>(0.00%)', 'choice 3': '\r\n(0.00%)', 'choice 4': '，\n(0.00%)', 'choice 5': ' \n(0.00%)'}]


In [14]:
import pandas as pd

In [25]:
pd.DataFrame(iterations)

Unnamed: 0,input,choice 1,choice 2,choice 3,choice 4,choice 5
0,"A long long time ago, a group of people deci...",<|im_end|>(41.36%),They(21.93%),(11.06%),The(5.97%),\n\n(2.87%)
1,"A long long time ago, a group of people deci...",\n(100.00%),<|im_start|>(0.00%),\r\n(0.00%),，\n(0.00%),\n(0.00%)
