In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import LlamaTokenizer, LlamaModel

# Load the pretrained LLaMA 2 model and tokenizer.
tokenizer = LlamaTokenizer.from_pretrained('LLaMA-2')
model = LlamaModel.from_pretrained('LLaMA-2')

# LLaMA tokenizers normally come without a padding token, and the tokenizer
# refuses `padding=True` when none is set; reuse EOS so batches can be padded.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Register '[EMB]' as one special token. Without this it is split into
# sub-word pieces and `convert_tokens_to_ids('[EMB]')` resolves to the unknown
# token, so the compression position can never be located in `input_ids`.
if tokenizer.add_tokens(['[EMB]'], special_tokens=True) > 0:
    # New vocabulary entries need matching rows in the embedding matrix.
    model.resize_token_embeddings(len(tokenizer))

# Hyperparameters
batch_size = 32
learning_rate = 1e-4
num_epochs = 10
temperature = 0.07

# Build the note-compression prompt
def build_prompt(note):
    """Render the compression prompt for a single note.

    Args:
        note: Mapping that provides 'title' and 'content' entries.

    Returns:
        The prompt string: the fixed instruction, the note rendered as a
        dict-style literal, and the "compression word" suffix holding the
        '[EMB]' placeholder, all wrapped in '[BOS]' / '[EOS]' markers.

    Raises:
        KeyError: If 'title' or 'content' is missing from ``note``.
    """
    task = "Extract the note information in json format, compress it into one word for recommendation."
    # Doubled braces are literal braces; the two `{}` slots take the values.
    fields = "{{'title': '{}', 'content': '{}'}}".format(note['title'], note['content'])
    return "[BOS]{} {} The compression word is: '[EMB]'.[EOS]".format(task, fields)

# Generative contrastive learning task
class GenerativeContrastiveLearning(nn.Module):
    """Turn the hidden state at each prompt's '[EMB]' token into a note embedding.

    Args:
        model: Backbone called as ``model(input_ids, attention_mask=...)``. It
            must expose ``config.hidden_size`` and return an object with a
            ``last_hidden_state`` attribute.
        emb_token_id: Vocabulary id of the '[EMB]' token. When omitted, the id
            is looked up on the module-level ``tokenizer`` at forward time,
            which is what ``GenerativeContrastiveLearning(model)`` relied on
            before this parameter existed.
    """

    def __init__(self, model, emb_token_id=None):
        super().__init__()
        self.model = model
        self.fc = nn.Linear(model.config.hidden_size, model.config.hidden_size)
        self.emb_token_id = emb_token_id

    def _resolve_emb_token_id(self):
        """Return the '[EMB]' id, failing loudly if the vocabulary lacks it."""
        if self.emb_token_id is not None:
            return self.emb_token_id
        emb_token_id = tokenizer.convert_tokens_to_ids('[EMB]')
        # An unregistered token resolves to the unknown-token id; matching on
        # that id would select the wrong positions (or none at all).
        if emb_token_id is None or emb_token_id == getattr(tokenizer, 'unk_token_id', None):
            raise ValueError(
                "'[EMB]' is not in the tokenizer vocabulary; add it as a "
                "special token and resize the model embeddings"
            )
        return emb_token_id

    def forward(self, input_ids, attention_mask):
        """Return one projected '[EMB]' vector per input row.

        Args:
            input_ids: 2-D tensor of token ids, one row per prompt.
            attention_mask: Mask forwarded unchanged to the backbone.

        Returns:
            Tensor with one row per input row and ``hidden_size`` columns.

        Raises:
            ValueError: If any row contains no '[EMB]' token (for example
                because truncation removed it).
        """
        # Resolve first so a vocabulary problem surfaces before the backbone runs.
        emb_token_id = self._resolve_emb_token_id()
        outputs = self.model(input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state

        is_emb = input_ids == emb_token_id
        # 1-based positions, so a row maximum of 0 means "no '[EMB]' here".
        positions = torch.arange(1, input_ids.size(1) + 1, device=input_ids.device)
        last_emb_pos = (is_emb.long() * positions).max(dim=1).values
        if bool((last_emb_pos == 0).any()):
            raise ValueError("every input sequence must contain the '[EMB]' token")

        # Use the last occurrence: the prompt's own placeholder follows the note
        # text, so a literal '[EMB]' inside the note cannot shift the output rows.
        rows = torch.arange(input_ids.size(0), device=input_ids.device)
        emb_vectors = hidden_states[rows, last_emb_pos - 1]
        return self.fc(emb_vectors)

# Collaborative supervised fine-tuning task
class CollaborativeSupervisedFineTuning(nn.Module):
    """Classify each sequence from one pooled backbone hidden state.

    Args:
        model: Backbone called as ``model(input_ids, attention_mask=...)``. It
            must expose ``config.hidden_size`` and return an object with a
            ``last_hidden_state`` attribute.
        num_labels: Number of target classes. When omitted, a module-level
            ``num_labels`` is used, so the original one-argument call keeps
            working wherever that global exists.
        cls_token_id: Id of a classification token to pool on. When omitted,
            ``cls_token_id`` of the module-level ``tokenizer`` is used if it
            is set; otherwise the last attended token of each row is pooled.

    Raises:
        ValueError: If the number of labels cannot be determined.
    """

    def __init__(self, model, num_labels=None, cls_token_id=None):
        super().__init__()
        # The parameters shadow the globals of the same purpose, so the
        # backward-compatible fallbacks go through the module namespace.
        module_globals = globals()
        if num_labels is None:
            num_labels = module_globals.get('num_labels')
        if num_labels is None:
            raise ValueError(
                "num_labels must be passed explicitly or defined at module level"
            )
        if cls_token_id is None:
            cls_token_id = getattr(module_globals.get('tokenizer'), 'cls_token_id', None)

        self.model = model
        self.classifier = nn.Linear(model.config.hidden_size, num_labels)
        self.cls_token_id = cls_token_id

    def forward(self, input_ids, attention_mask):
        """Return classification logits for the pooled hidden states.

        Args:
            input_ids: 2-D tensor of token ids, one row per sequence.
            attention_mask: Tensor of the same shape, non-zero on real tokens;
                forwarded to the backbone and used for pooling when no
                classification token id is configured.

        Returns:
            Logits with ``num_labels`` columns: one row per matched
            classification token, or one row per sequence when pooling on
            the last attended token.
        """
        outputs = self.model(input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state

        if self.cls_token_id is not None:
            cls_token_idx = (input_ids == self.cls_token_id).nonzero(as_tuple=True)
            pooled = hidden_states[cls_token_idx]
        else:
            # No CLS token is configured: pool on the last attended position
            # of every row, which is correct for left- and right-padded
            # batches alike. A fully masked row yields index -1, i.e. the
            # final position.
            positions = torch.arange(1, input_ids.size(1) + 1, device=input_ids.device)
            last_idx = (attention_mask.long() * positions).max(dim=1).values - 1
            rows = torch.arange(input_ids.size(0), device=input_ids.device)
            pooled = hidden_states[rows, last_idx]

        return self.classifier(pooled)

# Loss function
def contrastive_loss(embeddings, positive_pairs, negative_pairs, temperature):
    """Pairwise contrastive loss over cosine similarities.

    For every (positive pair, negative pair) at the same index this computes
    ``-log(exp(p / t) / (exp(p / t) + exp(n / t)))``, where ``p`` and ``n``
    are the cosine similarities of the positive and the negative pair.

    Args:
        embeddings: 2-D tensor of embeddings, indexed by row.
        positive_pairs: Integer tensor whose columns 0 and 1 hold the row
            indices of each positive pair.
        negative_pairs: Integer tensor of the same layout for negative pairs;
            its similarities must broadcast against the positive ones.
        temperature: Scaling applied to the similarities.

    Returns:
        The mean loss as a scalar tensor.
    """
    pos_sim = torch.cosine_similarity(embeddings[positive_pairs[:, 0]], embeddings[positive_pairs[:, 1]])
    neg_sim = torch.cosine_similarity(embeddings[negative_pairs[:, 0]], embeddings[negative_pairs[:, 1]])
    # -log(e^a / (e^a + e^b)) == log(1 + e^(b - a)) == softplus(b - a).
    # Same value, but it never forms exp(sim / temperature), which overflows
    # to inf (and then inf / inf = nan) for small temperatures or fp16.
    loss = torch.nn.functional.softplus((neg_sim - pos_sim) / temperature)
    return loss.mean()

# Data loading and preprocessing
def preprocess_data(notes):
    """Tokenize the prompts of all notes into one padded batch.

    The prompts are encoded in a single tokenizer call so that
    ``padding=True`` pads every row to the longest prompt. Encoding them one
    at a time pads nothing, and concatenating the results fails as soon as
    two prompts differ in token length.

    Args:
        notes: Sequence of note mappings accepted by ``build_prompt``.

    Returns:
        A pair ``(input_ids, attention_mask)`` of tensors with one row per
        note.

    Raises:
        ValueError: If ``notes`` is empty.
    """
    prompts = [build_prompt(note) for note in notes]
    if not prompts:
        raise ValueError("preprocess_data() needs at least one note")
    encoded = tokenizer(prompts, return_tensors='pt', padding=True, truncation=True)
    return encoded['input_ids'], encoded['attention_mask']

# Load the data
notes = [
    dict(title='Note 1', content='Content of note 1'),
    dict(title='Note 2', content='Content of note 2'),
    # Append further note records here
]

# Tokenize every note prompt; the result is the (ids, masks) pair.
encoded_notes = preprocess_data(notes)
input_ids, attention_masks = encoded_notes

# Model initialisation
gcl_model = GenerativeContrastiveLearning(model)
csft_model = CollaborativeSupervisedFineTuning(model)

# Optimizer
# Both wrappers hold the same `model`, so its weights are yielded by both
# `parameters()` iterators. Keep each tensor once (by identity, in first-seen
# order): PyTorch warns about duplicate parameters in a param group, and a
# duplicated tensor would be visited twice in every optimizer step.
trainable_params = list({
    id(param): param
    for param in list(gcl_model.parameters()) + list(csft_model.parameters())
}.values())
optimizer = optim.Adam(trainable_params, lr=learning_rate)

# Training loop
# NOTE(review): `positive_pairs`, `negative_pairs` and `labels` are not defined
# in the visible part of this file; they must exist before this loop runs.
criterion = nn.CrossEntropyLoss()  # stateless, so one instance serves every epoch
for epoch in range(num_epochs):
    for task_model in (gcl_model, csft_model):
        task_model.train()

    # Forward pass through both task heads on the full batch
    embeddings = gcl_model(input_ids, attention_mask=attention_masks)
    logits = csft_model(input_ids, attention_mask=attention_masks)

    # Joint objective: contrastive loss plus classification loss
    gcl_loss = contrastive_loss(embeddings, positive_pairs, negative_pairs, temperature)
    csft_loss = criterion(logits, labels)  # labels are the ground-truth classes
    loss = gcl_loss + csft_loss

    # Backward pass and parameter update
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print("Epoch {}/{}, Loss: {}".format(epoch + 1, num_epochs, loss.item()))

# Save the models: one state-dict checkpoint per task wrapper
for trained_model, checkpoint_path in (
    (gcl_model, 'gcl_model.pth'),
    (csft_model, 'csft_model.pth'),
):
    torch.save(trained_model.state_dict(), checkpoint_path)
