In [103]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "/Users/frankdzzz/models/Qwen2.5-3B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    force_download=True,
    torch_dtype="auto",
    device_map="auto"
)  
tokenizer = AutoTokenizer.from_pretrained(model_name)


Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.20s/it]
Some parameters are on the meta device because they were offloaded to the disk.


In [70]:
prompt = "Give me a short introduction to large language model."
messages = [
    {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=512
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

In [71]:
response

'A large language model (LLM) is a type of artificial intelligence that can generate human-like text based on the patterns and statistical probabilities learned from vast amounts of textual data. These models are trained on enormous datasets, such as books, articles, websites, and other written sources, allowing them to understand context, grammar, and semantics deeply.\n\nKey features of large language models include:\n\n1. **Training**: They are typically trained using techniques like deep learning, where neural networks learn to predict the next word in a sequence based on previous words. This process involves understanding the relationships between words and phrases within a given context.\n\n2. **Generative Power**: Once trained, these models can take an input prompt and generate coherent text that is often indistinguishable from human-written content. They can answer questions, write stories, compose emails, and perform a wide range of tasks.\n\n3. **Context Awareness**: Large la

'\n \n\n    \n\n  \n\r\n\n'

In [74]:
model.device

device(type='mps', index=0)

In [162]:
import torch 
input_text = "<|im_start|>system\nAnswer only with 'Yes' or 'No'<|im_end|>\n<|im_start|>user\nIs Paris the capital of France<|im_end|>\n<|im_start|>assistant\n"
inputs = tokenizer(input_text, return_tensors="pt")
input_ids = inputs["input_ids"].to(model.device)
attention_mask = inputs["attention_mask"].to(model.device)  # 确保 attention_mask 定义

with torch.inference_mode():
    response = model.generate(
        input_ids,
        attention_mask=attention_mask,
        output_logits=True,
        return_dict_in_generate=True,
        do_sample=False,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=1,
    )
    

In [163]:
response

GenerateDecoderOnlyOutput(sequences=tensor([[151644,   8948,    198,  16141,   1172,    448,    364,   9454,      6,
            476,    364,   2753,      6, 151645,    198, 151644,    872,    198,
           3872,  12095,    279,   6722,    315,   9625, 151645,    198, 151644,
          77091,    198,   2753]], device='mps:0'), scores=None, logits=(tensor([[ 7.5625, 10.0625, 13.3750,  ...,  2.0469,  2.0469,  2.0469]],
       device='mps:0'),), attentions=None, hidden_states=None, past_key_values=DynamicCache(layers=[DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, D

In [164]:
response.logits

(tensor([[ 7.5625, 10.0625, 13.3750,  ...,  2.0469,  2.0469,  2.0469]],
        device='mps:0'),)

In [174]:
logits = response.logits[0][0].squeeze(0).cpu().numpy()  

In [166]:
# check token generated
tokens = tokenizer.decode(response.sequences[0])
tokens

"<|im_start|>system\nAnswer only with 'Yes' or 'No'<|im_end|>\n<|im_start|>user\nIs Paris the capital of France<|im_end|>\n<|im_start|>assistant\nNo"

In [180]:
import numpy as np
token_id = np.argmax(logits)

In [181]:
token = tokenizer.decode(token_id)
token

'No'

In [110]:
import torch
logits_tensor = response.logits[0][0].squeeze(0)
top5_ids = torch.topk(logits_tensor, 5).indices.tolist()
top5_tokens = tokenizer.decode(top5_ids)
print(f"Top 5 IDs: {top5_ids}")
print(f"Top 5 Tokens: {top5_tokens}")

Top 5 IDs: [2753, 9454, 2308, 97976, 8996]
Top 5 Tokens: NoYes No-NoNO


In [186]:
logits = response.logits[0][0].squeeze(0).cpu().numpy()  # shape -> [vocab_size]

In [None]:
logits 

array([ 7.5625  , 10.0625  , 13.375   , ...,  2.046875,  2.046875,
        2.046875], shape=(151936,), dtype=float32)

In [182]:
logits[2753]

np.float32(44.5)

In [None]:
import numpy as np
np.argmax(logits)

np.int64(2753)

In [120]:
tokenizer.decode([2753])

'No'

In [None]:
answer = response.sequences[0]

In [93]:
answer[-1]

tensor(20412, device='mps:0')

In [None]:
first_generated_token_id

In [66]:
first_logits= response.logits[0].squeeze(0).cpu().numpy()
first_logits

array([7.03125   , 7.6875    , 5.28125   , ..., 0.73828125, 0.73828125,
       0.73828125], shape=(151936,), dtype=float32)

In [63]:
first_generated_token_id

np.int64(20412)

In [64]:
tokenizer.decode([20412])

'是'

In [42]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch

model_name = "Qwen/Qwen2.5-1.5B-Instruct"

# 加载模型和 tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # 或 "auto"
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 创建 pipeline
text_gen = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto"
)

# 测试 prompt
prompt = "Give me a short introduction to large language models."

# 生成文本
output = text_gen(prompt, max_new_tokens=100, do_sample=True, temperature=0.7)

print(output[0]['generated_text'])

`torch_dtype` is deprecated! Use `dtype` instead!
Device set to use mps


Give me a short introduction to large language models.强奸强奸强奸强奸强奸强奸强奸强奸强奸强奸强奸强奸强奸强奸强奸强奸强奸强奸 ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculationIENTATION hydration hydrationresponsesassertCountassertCountIENTATIONIENTATIONIENTATION stringByAppendingString hydration hydrationIHIH，《 ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejaculation ejac

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
188796.20s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


zsh:1: command not found: pip


In [None]:
# src/evaluator/scoring.py
# This file is to implement scoring functions for pred-vs-gt evaluations
# src/scoring.py
import pandas as pd
import numpy as np

def calculate_rejection_metrics(df, metric_name,):
    """
    计算拒识准确率。
    metric_name: 'logprob' 或 'logtoku'
    """
    # Rank from most certain to least certain
    # logprob and logtoku: the greater number means the more confident llm is about its prediction closer to 0
    
    df_sorted = df.sort_values(by=metric_name, ascending=False).reset_index(drop=True)  # from smallest to largest   
    
    total_samples = len(df_sorted)
    results = []

    # 计算不同百分位下的准确率 (从 0% （no rejection) 拒绝到 95%拒绝)
    # 这里的 i 代表“保留前 i% 的样本”
    for percentile in range(0, 100, 5):
        keep_ratio = (100 - percentile) / 100
        num_keep = max(1, int(total_samples * keep_ratio))

        subset = df_sorted.head(num_keep)
        accuracy = subset['is_correct'].mean()
        
        results.append({
            "percentile": 100 - percentile,  # keep ratio
            "accuracy": accuracy
        })
    
    return pd.DataFrame(results)

In [237]:
#src/evaluator/plotting.py
####################     
#This file is to implement plotting functions for evaluation results

# src/plotting.py
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

def plot_uncertainty_heatmap(prob_results, toku_results, save_path="results/rejection_heatmap.png"):
    """
    绘制不确定性拒绝结果的热力图
    
    参数:
        prob_results: dict, 包含 'percentile' 和 'accuracy' 键
        toku_results: dict, 包含 'percentile' 和 'accuracy' 键
        save_path: str, 保存路径
    """
    # 提取数据
    percentiles = prob_results['percentile']
    prob_accuracy = prob_results['accuracy']
    toku_accuracy = toku_results['accuracy']
    
    # 将数据组织成二维数组 (行: percentile, 列: metrics_accuracy)
    data = np.column_stack([prob_accuracy, toku_accuracy])
    
    # 创建图形
    plt.figure(figsize=(10, 12))
    
    # 绘制热力图
    ax = sns.heatmap(data[::-1], 
                     annot=True,  # 显示数值
                     fmt='.3f',   # 数值格式：保留3位小数
                     cmap='YlGnBu',  # 颜色方案 (黄绿蓝)
                     yticklabels=percentiles[::-1],  # y轴标签 from 100 to 5%
                     xticklabels=['acc_w_rejection_prob', 'acc_w_rejection_uncertainty_2'],  # x轴标签
                     cbar_kws={'label': ''},  # 颜色条
                     vmin=np.min(data), 
                     vmax=1.0,  # 设置颜色范围
                     linewidths=0.5,  # 网格线宽度
                     linecolor='white')  # 网格线颜色
    
    # 设置标签
    plt.ylabel('percentile', fontsize=12)
    plt.xlabel('')
    
    # 设置标题
    plt.title('Accuracy vs. Rejection Rate (Heatmap)', fontsize=14, pad=20)
    
    # 调整x轴标签
    plt.xticks(rotation=0, ha='center')
    
    # 调整布局
    plt.tight_layout()
    
    # 保存图片
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    print(f"Heatmap saved to {save_path}")
    
    # 显示图形
    plt.show()




ValueError: Key backend: 'module://matplotlib_inline.backend_inline' is not a valid value for backend; supported values are ['gtk3agg', 'gtk3cairo', 'gtk4agg', 'gtk4cairo', 'macosx', 'nbagg', 'notebook', 'qtagg', 'qtcairo', 'qt5agg', 'qt5cairo', 'tkagg', 'tkcairo', 'webagg', 'wx', 'wxagg', 'wxcairo', 'agg', 'cairo', 'pdf', 'pgf', 'ps', 'svg', 'template']

In [286]:
import numpy as np
import pandas as pd

df = pd.read_csv("results/boolq_results.csv")


In [292]:
df["logtoku"].describe()

count    2000.000000
mean       -0.015706
std         0.000664
min        -0.018054
25%        -0.016189
50%        -0.015800
75%        -0.015260
max        -0.013974
Name: logtoku, dtype: float64

In [293]:
df["logprob"].describe()

count    2.000000e+03
mean    -1.692777e-02
std      7.455978e-02
min     -6.473705e-01
25%     -4.169432e-05
50%     -2.384186e-07
75%      0.000000e+00
max      0.000000e+00
Name: logprob, dtype: float64

In [300]:
df_sorted_2 = df.sort_values(by="logtoku", ascending=False).reset_index(drop=True)
df_sorted_2

Unnamed: 0,is_correct,logprob,logtoku
0,True,-5.623774e-01,-0.013974
1,False,-2.191842e-01,-0.013980
2,False,-9.163713e-02,-0.014012
3,True,-2.450537e-01,-0.014025
4,False,-2.888999e-01,-0.014025
...,...,...,...
1995,True,-3.576279e-07,-0.017400
1996,False,-1.192093e-07,-0.017522
1997,False,-2.384186e-07,-0.017591
1998,True,-1.311303e-06,-0.017652


In [298]:
df_sorted_1 = df.sort_values(by='logprob', ascending=False).reset_index(drop=True)  # from smallest to largest   
df_sorted_1

Unnamed: 0,is_correct,logprob,logtoku
0,True,0.000000,-0.016420
1,True,0.000000,-0.015928
2,True,0.000000,-0.015954
3,True,0.000000,-0.015755
4,True,0.000000,-0.015794
...,...,...,...
1995,True,-0.589742,-0.014705
1996,False,-0.618071,-0.014363
1997,True,-0.647370,-0.014953
1998,False,-0.647370,-0.014053


In [307]:
 #3.1 logprob df
logprob_df = calculate_rejection_metrics(df, metric_name='logprob',) # the greater logprob means more confident
#3.2 logtoku df
logtoku_df = calculate_rejection_metrics(df, metric_name='logtoku',) #  the smaller logtoku means more uncertain


In [308]:
logtoku_df

Unnamed: 0,percentile,accuracy
0,1,0.7305
1,-4,0.726842
2,-9,0.721667
3,-14,0.715294
4,-19,0.709375
5,-24,0.706667
6,-29,0.702857
7,-34,0.696923
8,-39,0.694167
9,-44,0.687273


In [296]:
logprob_df

Unnamed: 0,percentile,accuracy
0,0,0.7305
1,5,0.742632
2,10,0.757222
3,15,0.758824
4,20,0.768125
5,25,0.768667
6,30,0.767143
7,35,0.771538
8,40,0.775
9,45,0.771818


In [228]:
accuracy = subset['is_correct']

In [231]:
accuracy

0    False
1     True
2    False
3    False
4     True
5    False
6     True
7     True
8     True
9    False
Name: is_correct, dtype: bool