In [21]:
from transformers import AutoTokenizer, AutoModel
import torch
import os
import random
import torch
from torch.utils.data import Dataset
import tqdm

In [22]:
# 加载 BERT 模型和分词器
tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
model = AutoModel.from_pretrained('bert-base-chinese')



In [23]:
# 预测
text = "我爱北京天安门"
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)

In [24]:
print(output.last_hidden_state.shape[:])

torch.Size([1, 9, 768])


In [25]:
class KnowledgeGraphDataset(Dataset):
    def __init__(self, file_path):
        self.file_path = file_path
        self.triples = self.load_triples(self.file_path)
        self.entities = self.get_entities(self.triples)
        self.positivetriples=self.loadpositivesample(self.file_path)
        self.negativetriples=self.loadnegativesample(self.file_path,self.entities)
        self.relations = self.get_relations(self.triples)
    def loadpositivesample(self,file_path):
        with open(file_path,'r',encoding='utf-8') as f:
            triples=[]
            for line in f:
                triples.append(line)
        return triples
    
    def load_triples(self, file_path):
        triples = []
        with open(file_path, 'r') as file:
            for line in file:
                a = line.split(',')
                triples.append(a)
        return triples
    
    def get_entities(self, triples):
        entities = set()
        for triple in triples:
            entities.add(triple[0])
            entities.add(triple[2])
        return list(entities)

    def get_relations(self, triples):
        relations = set()
        for triple in triples:
            relations.add(triple[1])
        return list(relations)
    
    def loadnegativesample(self,file_path,entities):
        with open(file_path,'r',encoding='utf-8') as f:
            triples=[]
            for line in f:
                a = line.split(',')
                a[2] = random.choice(entities)
                line = a[0]+','+a[1]+','+a[2]
                triples.append(line)
        return triples
    
    def __len__(self):
        return len(self.triples)

    def __getitem__(self, idx):
        positivetriple = self.positivetriples[idx]
        negativetriple = self.negativetriples[idx]
        return positivetriple ,negativetriple

dataset = KnowledgeGraphDataset('triples.txt')
positive_triple, negative_triples = dataset[0]
print(f"Positive triple: {positive_triple}")
print(f"Negative triples: {negative_triples}")

Positive triple: [ ' 歼 - 1 6 战 机 ' ,   ' 名 称 ' ,   ' 歼 - 1 6 战 机 ' ]

Negative triples: [ ' 歼 - 1 6 战 机 ' ,   ' 名 称 ' ,   ' 3 8 . 1 千 克 ' ]



In [26]:
class Model(torch.nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.fc = torch.nn.Linear(768, 1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self,cls):
        output = self.fc(cls)
        output = self.sigmoid(output)
        return output
# class Model(torch.nn.Module):
#     def __init__(self,triples):
#         super(Model, self).__init__()
#         self.bert = AutoModel.from_pretrained('bert-base-chinese')
#         self.fc = torch.nn.Linear(768, 1)
#         self.triples = triples

#     def forward(self):
#         sentences = [f"{triple[0]} {triple[1]} {triple[2]}" for triple in self.triples]
#         output = self.bert(sentences)
#         cls = output.last_hidden_state[:,0,:]
#         cls.sequeeze(1)
#         output = self.fc(cls)
#         return output


# model=Model(triples)
# output = model.forward()
# print(output.shape[:])

In [27]:
def trainbertfc(dataset, epoch=1, batch_size=32, learning_rate=0.001):
    model = Model()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = torch.nn.BCELoss()
    for _ in range(epoch):
        totalloss = 0
        for i in (range(0, len(dataset), batch_size)):
            batch = [dataset[j] for j in range(i, min(i+batch_size, len(dataset)))]
            tokenizer=AutoTokenizer.from_pretrained('bert-base-chinese')
            bert = AutoModel.from_pretrained('bert-base-chinese')
            batch_labels = []
            outputlist = []
            for triple in batch:
                pos,neg=triple[0],triple[1]
                batch_labels.append(1)
                batch_labels.append(0)
                pos=','.join(pos)
                neg=','.join(neg)
                posencoded_input = tokenizer(pos, return_tensors='pt')
                negencoded_input = tokenizer(neg, return_tensors='pt')
                posoutput = bert(**posencoded_input)
                negoutput = bert(**negencoded_input)
                cls1=posoutput.pooler_output
                cls2=negoutput.pooler_output
                output1 = model(cls1)
                output2 = model(cls2)
                outputlist.append(output1)
                outputlist.append(output2)
            batch_labels = torch.tensor(batch_labels, dtype=torch.float32,requires_grad=True)
            outputlist = torch.tensor(outputlist, dtype=torch.float32,requires_grad=True)
            optimizer.zero_grad()
            loss = criterion(outputlist, batch_labels)
            loss.backward()
            optimizer.step()

            totalloss += loss.item()
        print(f"Epoch {epoch+1}/{epoch}, Average Loss: {totalloss/len(dataset):.4f}")
    return model

In [28]:
file_path = 'triples.txt'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dataset = KnowledgeGraphDataset(file_path)
model = trainbertfc(dataset)



Epoch 2/1, Average Loss: 0.1452


In [29]:
def testmodel(model, dataset,batch_size=32):
    model.eval()
    for i in (range(0, len(dataset), batch_size)):
        batch = [dataset[j] for j in range(i, min(i+batch_size, len(dataset)))]
        tokenizer=AutoTokenizer.from_pretrained('bert-base-chinese')
        tokenizer=AutoTokenizer.from_pretrained('bert-base-chinese')
        bert = AutoModel.from_pretrained('bert-base-chinese')
        batch_labels = []
        outputlist = []
        for triple in batch:
            pos,neg=triple[0],triple[1]
            batch_labels.append(1)
            batch_labels.append(0)
            pos=','.join(pos)
            print(pos)
            neg=','.join(neg)
            print(neg)
            posencoded_input = tokenizer(pos, return_tensors='pt')
            negencoded_input = tokenizer(neg, return_tensors='pt')
            posoutput = bert(**posencoded_input)
            negoutput = bert(**negencoded_input)
            cls1=posoutput.pooler_output
            cls2=negoutput.pooler_output
            output1 = model(cls1)
            output2 = model(cls2)
            outputlist.append(output1)
            outputlist.append(output2)
        batch_labels = torch.tensor(batch_labels, dtype=torch.float32,requires_grad=False)
        # print(batch_labels)
        outputlist = torch.tensor(outputlist, dtype=torch.float32,requires_grad=False)
        acc = sum(batch_labels==outputlist)
        # print(outputlist)
        # distance=torch.nn.PairwiseDistance(p=2)
        # loss = distance(outputlist, batch_labels)
        acc+=acc
    # averagetestloss = totalloss/len(dataset)
    return acc/len(dataset*2)

In [30]:
file_path = 'testtriples.txt'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dataset = KnowledgeGraphDataset(file_path)
print(testmodel(model,dataset))



[, ,', ,G, ,a, ,m, ,e, ,c, ,o, ,c, ,k, ,“, ,斗, ,鸡, ,”, ,式, ,战, ,斗, ,机, ,', ,,, , , ,', ,首, ,飞, ,时, ,间, ,', ,,, , , ,', ,1, ,9, ,2, ,5, ,年, ,2, ,月, ,', ,],

[, ,', ,G, ,a, ,m, ,e, ,c, ,o, ,c, ,k, ,“, ,斗, ,鸡, ,”, ,式, ,战, ,斗, ,机, ,', ,,, , , ,', ,首, ,飞, ,时, ,间, ,', ,,, , , ,', ,单, ,发, ,', ,],

[, ,', ,G, ,a, ,m, ,e, ,c, ,o, ,c, ,k, ,“, ,斗, ,鸡, ,”, ,式, ,战, ,斗, ,机, ,', ,,, , , ,', ,研, ,发, ,单, ,位, ,', ,,, , , ,', ,英, ,国, ,哥, ,士, ,达, ,公, ,司, ,', ,],

[, ,', ,G, ,a, ,m, ,e, ,c, ,o, ,c, ,k, ,“, ,斗, ,鸡, ,”, ,式, ,战, ,斗, ,机, ,', ,,, , , ,', ,研, ,发, ,单, ,位, ,', ,,, , , ,', ,单, ,发, ,', ,],

[, ,', ,G, ,a, ,m, ,e, ,c, ,o, ,c, ,k, ,“, ,斗, ,鸡, ,”, ,式, ,战, ,斗, ,机, ,', ,,, , , ,', ,气, ,动, ,布, ,局, ,', ,,, , , ,', ,双, ,翼, ,', ,],

[, ,', ,G, ,a, ,m, ,e, ,c, ,o, ,c, ,k, ,“, ,斗, ,鸡, ,”, ,式, ,战, ,斗, ,机, ,', ,,, , , ,', ,气, ,动, ,布, ,局, ,', ,,, , , ,', ,单, ,发, ,', ,],

[, ,', ,G, ,a, ,m, ,e, ,c, ,o, ,c, ,k, ,“, ,斗, ,鸡, ,”, ,式, ,战, ,斗, ,机, ,', ,,, , , ,', ,发, ,动, ,机, ,数, ,量, ,', ,,, , , ,', ,单, ,发, ,', ,],

[, ,