In [1]:
from transformers import BertTokenizer, BertModel
import torch
 
# Load Bert and Tokenizer
from transformers import AutoTokenizer, AutoModel
# Single source of truth for the checkpoint id, so the tokenizer and the model
# can never be loaded from two different checkpoints by accident.
MODEL_NAME = "SIKU-BERT/sikubert"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
 
def get_word_embedding(sentence):
    """Return one contextual vector per token of ``sentence``.

    The sentence is split with the module-level ``tokenizer``, wrapped in
    ``[CLS]`` / ``[SEP]`` markers, and passed through the module-level
    ``model`` as a batch of one.

    Args:
        sentence: Text to embed.

    Returns:
        A 3-D tensor whose first dimension is 1 (the single-sentence batch)
        and whose second dimension has one entry per token of ``sentence``;
        the ``[CLS]`` and ``[SEP]`` positions are removed. The tensor is
        detached from autograd (``requires_grad`` is False).
    """
    # Add the special markers by hand, then map every token to its vocabulary id.
    tokens = ['[CLS]'] + tokenizer.tokenize(sentence) + ['[SEP]']
    # The extra list level creates the batch dimension of size 1.
    input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])

    # Inference only: without no_grad() every call would record an autograd
    # graph and keep all intermediate activations alive for no benefit.
    with torch.no_grad():
        outputs = model(input_ids)

    # outputs[0] holds the per-position vectors; slice off the first and last
    # positions, which belong to [CLS] and [SEP].
    embedding = outputs[0]
    return embedding[:, 1:-1, :]

  return self.fget.__get__(instance, owner)()


In [2]:
# Example sentence pair and the per-token embeddings of each sentence.
sen1, sen2 = '红梅映雪千家瑞', '赤县迎春百业新'
sen1_em, sen2_em = get_word_embedding(sen1), get_word_embedding(sen2)

In [3]:
def _cls_embedding(sentence):
    """Return the vector at the ``[CLS]`` position of ``sentence`` (batch of one)."""
    # Add the special markers by hand, then map every token to its vocabulary id.
    tokens = ['[CLS]'] + tokenizer.tokenize(sentence) + ['[SEP]']
    # The extra list level creates the batch dimension of size 1.
    input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])

    # Inference only: skip autograd bookkeeping so no graph is built or kept.
    with torch.no_grad():
        outputs = model(input_ids)

    # outputs[0] holds the per-position vectors; position 0 is [CLS].
    return outputs[0][:, 0, :]


def compare_sentence(sentence1, sentence2):
    """Score how close two sentences are using their ``[CLS]`` vectors.

    Each sentence is encoded independently with the module-level
    ``tokenizer`` and ``model``; the vector at the ``[CLS]`` position is used
    as the sentence representation.

    Args:
        sentence1: First text to compare.
        sentence2: Second text to compare.

    Returns:
        A ``(similarity, distance)`` tuple of Python floats: the cosine
        similarity and the Euclidean (p=2) distance between the two
        sentence vectors.
    """
    sentence_embedding1 = _cls_embedding(sentence1)
    sentence_embedding2 = _cls_embedding(sentence2)

    # Euclidean distance between the two sentence vectors.
    euclidean_distance = torch.nn.PairwiseDistance(p=2)
    distance = euclidean_distance(sentence_embedding1, sentence_embedding2)

    # Cosine similarity along the feature dimension.
    cos = torch.nn.CosineSimilarity(dim=1, eps=1e-6)
    similarity = cos(sentence_embedding1, sentence_embedding2)

    # Each result has exactly one element (batch of one), so .item() is valid.
    return similarity.item(), distance.item()
 

In [4]:
# Score one sentence pair; the cell displays (cosine similarity, Euclidean distance).
sen1, sen2 = '红梅映雪千家瑞', '赤县迎春百业新'
compare_sentence(sen1, sen2)

(0.752055823802948, 15.432188034057617)

In [5]:
# Score a second sentence pair; the cell displays (cosine similarity, Euclidean distance).
sen3, sen4 = '黄河东去流不息', '庭前落叶扫无痕'
compare_sentence(sen3, sen4)

(0.6171375513076782, 19.113195419311523)

In [6]:
# Compare two four-character strings that share their last two characters;
# the cell displays (cosine similarity, Euclidean distance).
compare_sentence('咏物想家','怀古想家')

(0.810288667678833, 13.56978988647461)

In [7]:
# Compare only the final three characters of the sen1 / sen2 example pair;
# the cell displays (cosine similarity, Euclidean distance).
compare_sentence('千家瑞', '百业新')

(0.6739263534545898, 17.448453903198242)

In [8]:
# Paired sentences, aligned by index: sen_in[i] is compared with sen_out[i].
sen_in = [
    '红梅映雪千家瑞',
    '花明柳媚春光好',
    '水无两点不成冰',
    '庄生梦蝶知无我',
    '秋风送爽花正艳',
    '黄河东去流不息',
    '浪遏飞舟留客住',
    '黄莺日日盼新岁',
]
sen_out = [
    '赤县迎春百业新',
    '大江南北庆丰收',
    '王不出头谁是主',
    '晏子分桃为有他',
    '雨打屋檐人未归',
    '庭前落叶扫无痕',
    '风吹垂柳赋情来',
    '绿柳枝枝辞旧年',
]
# Result accumulators: s collects similarities, d collects distances.
s, d = [], []

In [9]:
# Score every (sen_in[i], sen_out[i]) pair.
# s and d are rebuilt from scratch here, so re-running this cell cannot append
# a second copy of the scores and break the DataFrame built from them.
if len(sen_in) != len(sen_out):
    raise ValueError("sen_in and sen_out must have the same length")

pair_scores = [
    compare_sentence(sen1, sen2) for sen1, sen2 in zip(sen_in, sen_out)
]
s = [similarity for similarity, _distance in pair_scores]  # cosine similarities
d = [distance for _similarity, distance in pair_scores]    # Euclidean distances

In [10]:
import pandas as pd
# One row per sentence pair: both sentences plus their similarity and distance.
data = {
    'in': sen_in,
    'out': sen_out,
    'Similarity': s,
    'Distance': d,
}
df = pd.DataFrame(data=data)

In [11]:
# Display the sentence-pair table (columns: in, out, Similarity, Distance).
df

Unnamed: 0,in,out,Similarity,Distance
0,红梅映雪千家瑞,赤县迎春百业新,0.752056,15.432188
1,花明柳媚春光好,大江南北庆丰收,0.663895,18.013247
2,水无两点不成冰,王不出头谁是主,0.61825,19.17234
3,庄生梦蝶知无我,晏子分桃为有他,0.731698,15.838715
4,秋风送爽花正艳,雨打屋檐人未归,0.661324,17.981579
5,黄河东去流不息,庭前落叶扫无痕,0.617138,19.113195
6,浪遏飞舟留客住,风吹垂柳赋情来,0.7012,16.777468
7,黄莺日日盼新岁,绿柳枝枝辞旧年,0.758124,15.078329


In [20]:
# Four-character labels, aligned by index with each other (and later placed
# row-by-row next to sen_in / sen_out).
# NOTE(review): presumably each label is a theme + mood tag for the sentence — confirm.
label_in = [
    '咏物想家',
    '咏物想家',
    '咏物喜悦',
    '怀古喜悦',
    '送别想家',
    '思乡想家',
    '咏物喜悦',
    '咏物喜悦',
]
label_out = [
    '怀古喜悦',
    '思乡喜悦',
    '战争喜悦',
    '怀古失意',
    '田园想家',
    '咏物喜悦',
    '咏物喜悦',
    '咏物想家',
]
# Accumulator for the label-pair cosine similarities.
cos = []

In [21]:
# Cosine similarity of every (label_in[i], label_out[i]) pair.
# cos is rebuilt from scratch here, so re-running this cell cannot append a
# second copy of the scores and break the DataFrame built from them.
if len(label_in) != len(label_out):
    raise ValueError("label_in and label_out must have the same length")

# compare_sentence returns (similarity, distance); only the similarity is kept.
cos = [
    compare_sentence(label1, label2)[0]
    for label1, label2 in zip(label_in, label_out)
]


In [23]:
# Rebuild the table with the labels attached. Here 'Similarity' holds the
# label-pair scores from `cos`, and `df` replaces the earlier sentence-score table.
data = {
    'in': sen_in,
    'out': sen_out,
    'label_in': label_in,
    'label_out': label_out,
    'Similarity': cos,
}
df = pd.DataFrame(data=data)
df

Unnamed: 0,in,out,label_in,label_out,Similarity
0,红梅映雪千家瑞,赤县迎春百业新,咏物想家,怀古喜悦,0.766967
1,花明柳媚春光好,大江南北庆丰收,咏物想家,思乡喜悦,0.72152
2,水无两点不成冰,王不出头谁是主,咏物喜悦,战争喜悦,0.752221
3,庄生梦蝶知无我,晏子分桃为有他,怀古喜悦,怀古失意,0.894446
4,秋风送爽花正艳,雨打屋檐人未归,送别想家,田园想家,0.757544
5,黄河东去流不息,庭前落叶扫无痕,思乡想家,咏物喜悦,0.704952
6,浪遏飞舟留客住,风吹垂柳赋情来,咏物喜悦,咏物喜悦,1.0
7,黄莺日日盼新岁,绿柳枝枝辞旧年,咏物喜悦,咏物想家,0.850157


In [24]:
import os
# Write the current table to output.csv on the user's Desktop.
desktop = os.path.join(os.path.expanduser("~"), "Desktop")
# Not every machine has a Desktop folder in the home directory; create it if
# needed so the save below cannot fail on a missing directory.
os.makedirs(desktop, exist_ok=True)
file_path = os.path.join(desktop, "output.csv")

# Save the DataFrame to a CSV file; index=False leaves out the row-index column.
df.to_csv(file_path, index=False)