In [5]:
import logging

import torch
from torch import nn
from transformers import AutoModelForSequenceClassification, PreTrainedModel, TrainingArguments, AutoTokenizer
from transformers.modeling_outputs import SequenceClassifierOutput

In [19]:
model = AutoModelForSequenceClassification.from_pretrained("/mnt/bn/data-tns-live-llm/leon/experiments/llm/fcbank/reranker_group30_batch2_v100", torch_dtype=torch.float16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("/mnt/bn/data-tns-live-llm/leon/experiments/llm/fcbank/reranker_group30_batch2_v100")

In [42]:
import torch
from transformers import AutoModelForSequenceClassification

# 假设 model 是已经加载的模型
# model = AutoModelForSequenceClassification.from_pretrained(...)

# 假设 group_size 是你的模型处理的样本数量
group_size = 15  # 根据实际情况设置
batch_size = 2

# 假设你已经有一个文本序列和对应的标签
text = "这是一个示例文本。"
label = 1  # 假设标签是1

# 使用模型的分词器对文本进行编码
encoded_input = tokenizer(text, padding='max_length', truncation=True, max_length=512, return_tensors='pt')

# 调整编码后的输入以匹配 group_size
input_ids = encoded_input['input_ids'].repeat_interleave(batch_size*group_size, dim=0).cuda()
attention_mask = encoded_input['attention_mask'].repeat_interleave(batch_size*group_size, dim=0).cuda()

print(type(input_ids))

# 创建 batch 字典
batch = {
    'input_ids': input_ids,
    'attention_mask': attention_mask,
}

labels = torch.tensor([label]*batch_size, dtype=torch.long).cuda()

<class 'torch.Tensor'>


In [52]:
def get_embedding(input_ids, attention_mask, model=model, tokenizer=tokenizer):
    hidden_state = model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True).hidden_states[-1].cpu()
    attention_mask = attention_mask.cpu()
    seq_lengths = attention_mask.sum(dim=1)
    embeddings = []
    for seq_len, seq_emb in zip(seq_lengths, hidden_state):
        valid_emb = seq_emb[:seq_len]
        embeddings.append(torch.mean(valid_emb, dim=0))

    embedding = torch.stack(embeddings)
    return embedding

In [44]:
def forward(batch):
    cross_entropy = nn.CrossEntropyLoss(reduction='mean')
    embeddings = get_embedding(**batch)
    loss = batchloss(embeddings)
    return loss

In [54]:
results = get_embedding(**batch)
print(results)
print(results.shape)

tensor([[-0.2715, -0.0060, -0.6865,  ..., -0.9995, -0.5493,  0.4402],
        [-0.2715, -0.0060, -0.6865,  ..., -0.9995, -0.5493,  0.4402],
        [-0.2715, -0.0060, -0.6865,  ..., -0.9995, -0.5493,  0.4402],
        ...,
        [-0.2715, -0.0060, -0.6865,  ..., -0.9995, -0.5493,  0.4402],
        [-0.2715, -0.0060, -0.6865,  ..., -0.9995, -0.5493,  0.4402],
        [-0.2715, -0.0060, -0.6865,  ..., -0.9995, -0.5493,  0.4402]],
       dtype=torch.float16, grad_fn=<StackBackward0>)
torch.Size([30, 768])


In [56]:
pred = results.view(batch_size, group_size, -1)
print(pred.shape)

torch.Size([2, 15, 768])


In [64]:
import torch
import torch.nn.functional as F

def infoNCELoss(anchor, positive, negatives, temperature=1):
    # 计算所有样本的相似度
    pos_similarity = F.cosine_similarity(anchor, positive, dim=-1)
    # 将anchor重复到与负样本相同数量的维度，以便计算
    neg_similarity = F.cosine_similarity(anchor, negatives, dim=-1)
    # 合并正样本和负样本的相似度
    all_similarity = torch.cat([pos_similarity, neg_similarity])
    # 应用温度缩放
    all_similarity /= temperature
    # 计算InfoNCE损失
    loss = - torch.log(torch.exp(pos_similarity)/torch.sum(torch.exp(all_similarity)))
    return loss.mean()

def batchloss(embeddings):
    # 遍历每个batch计算损失
    losses = []
    for i in range(embeddings.size(0)):
        # anchor embeddings
        anchor = embeddings[i, 0].unsqueeze(0)  # [1, 768]
        # positive embeddings
        positive = embeddings[i, 1].unsqueeze(0)  # [1, 768]
        # 除了anchor和positive之外的所有embeddings作为负样本
        negatives = embeddings[i, 2:]  # [13, 768]
        # 计算当前batch的InfoNCE损失
        loss = infoNCELoss(anchor, positive, negatives)
        losses.append(loss)
    # 计算整个batch的平均损失
    batch_loss = torch.mean(torch.stack(losses))
    return batch_loss

In [65]:
# 假设 embeddings 是一个形状为 [batch, group_size, embedding_len] 的张量
embeddings = torch.randn(2, 15, 768)  # 示例数据
print(batchloss(embeddings))

tensor(2.6431)


# 去掉模型的分类头

In [8]:
from transformers import XLMRobertaForSequenceClassification, AutoModel, AutoTokenizer
import torch
model = AutoModel.from_pretrained('/mnt/bn/data-tns-live-llm/leon/experiments/llm/fcbank/xlmr/models--FacebookAI--xlm-roberta-base/snapshots/e73636d4f797dec63c3081bb6ed5c7b0bb3f2089/', torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained("/mnt/bn/data-tns-live-llm/leon/experiments/llm/fcbank/xlmr/models--FacebookAI--xlm-roberta-base/snapshots/e73636d4f797dec63c3081bb6ed5c7b0bb3f2089/")
# print(type(model.modules()))
print(model)
# print(model.roberta)
# print(model.classifier)

XLMRobertaModel(
  (embeddings): XLMRobertaEmbeddings(
    (word_embeddings): Embedding(250002, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): XLMRobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x XLMRobertaLayer(
        (attention): XLMRobertaAttention(
          (self): XLMRobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): XLMRobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=Tru

In [78]:
del model.classifier
print(model)

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768,

In [None]:
from sentence_transformers import SentenceTransformer, models