# In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# 1. Load the tokenizer and causal-LM weights from a local (relative) directory.
model_path = "Qwen2.5-0.5B-Instruct/"
tokenizer = AutoTokenizer.from_pretrained(model_path)
# device_map="cuda:0" requests placement of the model on CUDA device 0.
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="cuda:0")

# 2. Helper that extracts a hidden-state vector for a piece of text
def extract_hidden_state(text, model, tokenizer):
    """Return the last-layer hidden state of the final token of ``text``.

    Args:
        text: Input handed to ``tokenizer``.
        model: Callable model exposing ``.device``. It is invoked with
            ``output_hidden_states=True`` and its result must expose a
            ``hidden_states`` sequence.
        tokenizer: Callable that returns PyTorch inputs supporting
            ``.to(device)`` and ``**`` unpacking.

    Returns:
        A CPU tensor equal to ``hidden_states[-1][0, -1, :]``: last layer,
        first batch item, last sequence position.
    """
    encoded = tokenizer(text, return_tensors="pt")
    encoded = encoded.to(model.device)

    # Pure inference: skip autograd bookkeeping.
    with torch.no_grad():
        final_layer = model(**encoded, output_hidden_states=True).hidden_states[-1]

    # First batch item, last position, full feature vector.
    return final_layer[0, -1, :].cpu()

# # 3. 文本分类示例
# def text_classification_example():
#     # 准备数据
#     texts = ["这是一个积极的评论", "这个产品太糟糕了", "价格合理，质量一般"]
#     labels = [1, 0, 1]  # 1=积极, 0=消极
    
#     # 提取隐藏状态
#     hidden_states = [extract_hidden_state(text, model, tokenizer) for text in texts]
#     hidden_states_tensor = torch.stack(hidden_states)
    
#     # 简单线性分类器
#     classifier = torch.nn.Linear(hidden_states_tensor.shape[1], 2)
    
#     # 训练分类器...
#     # 使用分类器进行预测...
def extract_sentence_embedding(text, model, tokenizer):
    """Return a mean-pooled sentence embedding from the last hidden layer.

    The last-layer hidden states of the first batch item are averaged over
    every position whose token id is not in ``tokenizer.all_special_ids``.
    If every position holds a special token, the plain mean over all
    positions is returned instead.

    Args:
        text: Input handed to ``tokenizer``.
        model: Callable model exposing ``.device``. It is invoked with
            ``output_hidden_states=True`` and its result must expose a
            ``hidden_states`` sequence.
        tokenizer: Callable returning PyTorch inputs (with ``input_ids``)
            that support ``.to(device)``; must expose ``all_special_ids``.

    Returns:
        A CPU tensor holding the pooled embedding.
    """
    # NOTE(review): a second definition with this same name appears later in
    # this file and is the one bound when the example at the bottom runs.
    inputs = tokenizer(text, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)

    # Last layer, first batch item: one row of features per token position.
    token_states = outputs.hidden_states[-1][0]
    input_ids = inputs.input_ids[0]

    # Compare every token id against every special id in one broadcast
    # operation instead of looping over the special ids in Python. A position
    # is kept only if it differs from all of them; with no special ids the
    # reduction is over an empty axis and keeps every position.
    special_ids = torch.tensor(
        tokenizer.all_special_ids, dtype=input_ids.dtype, device=input_ids.device
    )
    keep = (input_ids.unsqueeze(-1) != special_ids).all(dim=-1)
    mask = keep.float()

    kept_count = mask.sum().item()
    if kept_count > 0:
        # Zero out special positions, then average over the kept ones only.
        pooled = (token_states * mask.unsqueeze(-1)).sum(dim=0) / kept_count
    else:
        # Nothing left after masking: fall back to the unmasked mean so we
        # never divide by zero.
        pooled = token_states.mean(dim=0)

    return pooled.cpu()

# 4. Similarity-matching example
def similarity_matching_example():
    """Print the cosine similarity between a fixed query and each candidate.

    Embeddings are produced by ``extract_sentence_embedding`` using the
    module-level ``model`` and ``tokenizer``. Nothing is returned; one line
    per candidate is printed with the score formatted to four decimals.
    """
    query = "我肯定。我我说话，你能听懂吗？"
    candidates = [
        "这个可以。缓一缓再还吗？",
        "你什么意思啊？",
        "你是不是听不懂我说话",
        "我是说。",
        "你能听到我说话吗",
        "我听不懂。",
    ]

    cosine = torch.nn.functional.cosine_similarity

    # Embed the query first, then every candidate, before scoring anything.
    query_vec = extract_sentence_embedding(query, model, tokenizer)
    candidate_vecs = [
        extract_sentence_embedding(sentence, model, tokenizer)
        for sentence in candidates
    ]

    # Both operands are 1-D vectors, so the similarity is taken along dim 0.
    scores = [cosine(query_vec, vec, dim=0) for vec in candidate_vecs]

    # Report each candidate alongside its score.
    for sentence, score in zip(candidates, scores):
        print(f"与'{sentence}'的相似度: {score.item():.4f}")

def extract_sentence_embedding(text, model, tokenizer):
    """Return a mean-pooled sentence embedding from the last hidden layer.

    The last-layer hidden states of the first batch item are averaged over
    every position whose token id is not in ``tokenizer.all_special_ids``.
    If every position holds a special token, the plain mean over all
    positions is returned instead.

    Args:
        text: Input handed to ``tokenizer``.
        model: Callable model exposing ``.device``. It is invoked with
            ``output_hidden_states=True`` and its result must expose a
            ``hidden_states`` sequence.
        tokenizer: Callable returning PyTorch inputs (with ``input_ids``)
            that support ``.to(device)``; must expose ``all_special_ids``.

    Returns:
        A CPU tensor holding the pooled embedding.
    """
    # NOTE(review): a function with this same name is also defined earlier in
    # this file; this later definition replaces it and is the one in effect
    # when the example below is called. Consider keeping only one copy.
    inputs = tokenizer(text, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)

    # Last layer, first batch item: one row of features per token position.
    token_states = outputs.hidden_states[-1][0]
    input_ids = inputs.input_ids[0]

    # Compare every token id against every special id in one broadcast
    # operation instead of looping over the special ids in Python. A position
    # is kept only if it differs from all of them; with no special ids the
    # reduction is over an empty axis and keeps every position.
    special_ids = torch.tensor(
        tokenizer.all_special_ids, dtype=input_ids.dtype, device=input_ids.device
    )
    keep = (input_ids.unsqueeze(-1) != special_ids).all(dim=-1)
    mask = keep.float()

    kept_count = mask.sum().item()
    if kept_count > 0:
        # Zero out special positions, then average over the kept ones only.
        pooled = (token_states * mask.unsqueeze(-1)).sum(dim=0) / kept_count
    else:
        # Nothing left after masking: fall back to the unmasked mean so we
        # never divide by zero.
        pooled = token_states.mean(dim=0)

    return pooled.cpu()

# Run the demo; it uses the module-level `model` and `tokenizer` loaded at the top of this file.
similarity_matching_example()