## How to build a sentence-level embedding model

In [13]:
from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, util, InputExample,models
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
"""
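To produce sentence-level text embeddings:
first, a word-embedding model is needed to extract features from the text;
then a pooling layer aligns those features into a single sentence vector;
then a linear layer can be used to adjust the vector to any desired length.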
"""
# Define the word-embedding model (loaded from the 'bert-base-chinese' checkpoint)
word_embedding_model = models.Transformer(
    model_name_or_path='bert-base-chinese',
    max_seq_length=256,
)
# Define the pooling layer, sized to the word-embedding dimension reported by the model above
pooling_model = models.Pooling(
    word_embedding_dimension=word_embedding_model.get_word_embedding_dimension(),
)
# Finally, chain the two modules into one sentence-embedding model
model = SentenceTransformer(
    modules=[word_embedding_model, pooling_model],
)


Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


path: /cail/sentence_relation_set.json

In [7]:
# Forward-pass example: embed two short phrases and compare their embeddings.
example = ['银行借贷','套路贷']
example_tensor = model.encode(example)
# A bare expression in the middle of a cell is evaluated but never displayed,
# so the shape has to be printed explicitly to be visible.
print(example_tensor.shape)
# Cosine similarity between the first and the second embedding.
print(util.cos_sim(example_tensor[0], example_tensor[1]))

tensor([[0.6123]])


In [15]:
# Build the data for the downstream text-similarity task.
import json

path = "../cail/sentence_relation_set.json"
max_records = 4  # number of leading lines of the file used in this demo

train_set = []
# The context manager guarantees the file handle is closed, even if parsing fails.
with open(path, 'r', encoding='utf-8') as datafile:
    # Each line is parsed as one JSON document. zip() with range() stops after
    # max_records lines or at end-of-file, whichever comes first, so a short
    # file no longer feeds an empty string to json.loads.
    for _line_no, line in zip(range(max_records), datafile):
        line = line.strip()
        if not line:
            continue  # skip blank lines instead of failing in json.loads
        train_set.append(json.loads(line))

# The development samples reuse the very same records as the training set.
dev_samples = train_set

In [19]:
import torch 
from torch.utils.data import Dataset,DataLoader
from datetime import datetime  

In [39]:
class downstramset(Dataset):
    """Map-style dataset that serves sentence-pair records as ``InputExample`` objects.

    Each record in ``data`` is indexed with the keys ``'s1'``, ``'s2'`` and
    ``'label'``; the label is converted to ``float`` when an item is fetched.
    """

    def __init__(self, data) -> None:
        super().__init__()
        # Keep a reference to the records; they are converted lazily per item.
        self.data = data

    def __len__(self):
        """Return the number of stored records."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the record at ``index`` wrapped as an ``InputExample``."""
        record = self.data[index]
        sentence_pair = [record['s1'], record['s2']]
        return InputExample(texts=sentence_pair, label=float(record['label']))

In [40]:

# Training configuration
num_epochs = 1
model_name = 'law_bert'

# The output directory name embeds the model name and the current timestamp.
run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
model_save_path = 'output/-' + model_name + '-' + run_timestamp

# Wrap the loaded records in the dataset class and serve them in shuffled order.
trainset = downstramset(data=train_set)
train_dataloader = DataLoader(dataset=trainset, shuffle=True)

# Cosine-similarity loss attached to the sentence-embedding model.
train_loss = losses.CosineSimilarityLoss(model)
def dict2input(item):
    """Convert one sentence-pair record into an ``InputExample``.

    Args:
        item: mapping with the keys ``'s1'`` and ``'s2'`` (the two sentences)
            and ``'label'`` (their similarity score, numeric or numeric string).

    Returns:
        An ``InputExample`` whose ``texts`` is ``[s1, s2]`` and whose ``label``
        is the score as a ``float``.

    Raises:
        KeyError: if a required key is missing.
        ValueError: if the label cannot be parsed as a number.
    """
    # The label must be numeric: the examples built here are passed to a
    # similarity evaluator as gold scores. float() also keeps this helper
    # consistent with downstramset.__getitem__, which converts the same way.
    return InputExample(texts=[item['s1'], item['s2']], label=float(item['label']))
# Convert every loaded record into an InputExample and build the evaluator from them.
dev_samples = list(map(dict2input, train_set))
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev')


# Warm up over 10% of the total number of training steps (batches per epoch * epochs).
total_train_steps = len(train_dataloader) * num_epochs
warmup_steps = math.ceil(total_train_steps * 0.1)


In [41]:
# Display the records currently held in train_set for inspection.
train_set

[{'s1': '上海市长宁区人民检察院指控:2020年3月,被告人XXX受他人指使在广东省揭阳市使用自己的身份证办理中国农业银行卡、手机卡及网银U盾1套,并以人民币1,000元的价格出售给他人。',
  's2': '桂林市象山区人民检察院指控,2019年12月,XXX、XXX(二人均另案处理)、XXX(在逃)为获取非法利益,合谋组织搭建“亿万家”平台为境外赌博等非法网站提供支付结算服务收取佣金。',
  'label': '1'},
 {'s1': '桂林市象山区人民检察院指控,2019年12月,XXX、XXX(二人均另案处理)、XXX(在逃)为获取非法利益,合谋组织搭建“亿万家”平台为境外赌博等非法网站提供支付结算服务收取佣金。',
  's2': '上海市长宁区人民检察院指控:2020年3月,被告人XXX受他人指使在广东省揭阳市使用自己的身份证办理中国农业银行卡、手机卡及网银U盾1套,并以人民币1,000元的价格出售给他人。',
  'label': '1'},
 {'s1': '该农业银行卡支付结算金额超人民币1,860余万元。',
  's2': '截止2020年6月12日,XXX为“亿万家”平台提供转账等支付结算服务,资金结算金额达565884元,非法获利5000元。',
  'label': '1'},
 {'s1': '截止2020年6月12日,XXX为“亿万家”平台提供转账等支付结算服务,资金结算金额达565884元,非法获利5000元。',
  's2': '该农业银行卡支付结算金额超人民币1,860余万元。',
  'label': '1'}]

In [42]:
# Collect the training options in one place, then launch fine-tuning.
fit_kwargs = {
    # One objective: the shuffled dataloader paired with the cosine-similarity loss.
    'train_objectives': [(train_dataloader, train_loss)],
    'evaluator': evaluator,
    'epochs': num_epochs,
    'evaluation_steps': 1000,
    'warmup_steps': warmup_steps,
    'output_path': model_save_path,
}
model.fit(**fit_kwargs)

Iteration: 100%|██████████| 4/4 [00:00<00:00,  5.82it/s]
Epoch: 100%|██████████| 1/1 [00:00<00:00,  1.24it/s]


In [None]:
##############################################################################
#
# Load the stored model and evaluate its performance on the evaluation samples
#
##############################################################################

model = SentenceTransformer(model_save_path)
# `test_samples` is not defined anywhere in the visible notebook, so this cell
# would fail with a NameError. No separate held-out split is loaded, so the
# dev samples are reused as a stand-in.
# TODO: replace with a real held-out test set once one is available.
test_samples = dev_samples
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test')
test_evaluator(model, output_path=model_save_path)

In [11]:
from sentence_transformers import SentenceTransformer, SentencesDataset, InputExample, losses

# Load a pretrained sentence-embedding checkpoint by name.
model = SentenceTransformer('nli-distilroberta-base-v2')

# (first sentence, second sentence, similarity label) triples
labelled_pairs = [
    ('My first sentence', 'My second sentence', 0.8),
    ('Another pair', 'Unrelated sentence', 0.3),
]
train_examples = [
    InputExample(texts=[first, second], label=score)
    for first, second, score in labelled_pairs
]
# Wrap the examples together with the model in a SentencesDataset.
train_dataset = SentencesDataset(train_examples, model)

Downloading (…)7023f/.gitattributes: 100%|██████████| 736/736 [00:00<00:00, 737kB/s]
Downloading (…)_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 108kB/s]
Downloading (…)433037023f/README.md: 100%|██████████| 3.71k/3.71k [00:00<00:00, 3.74MB/s]
Downloading (…)3037023f/config.json: 100%|██████████| 679/679 [00:00<?, ?B/s] 
Downloading (…)ce_transformers.json: 100%|██████████| 122/122 [00:00<?, ?B/s] 
Downloading (…)33037023f/merges.txt: 100%|██████████| 456k/456k [00:01<00:00, 394kB/s]
Downloading (…)"pytorch_model.bin";: 100%|██████████| 329M/329M [00:36<00:00, 9.00MB/s] 
Downloading (…)nce_bert_config.json: 100%|██████████| 52.0/52.0 [00:00<00:00, 26.8kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 239/239 [00:00<00:00, 120kB/s]
Downloading (…)7023f/tokenizer.json: 100%|██████████| 1.36M/1.36M [00:01<00:00, 955kB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 1.12k/1.12k [00:00<00:00, 1.12MB/s]
Downloading (…)33037023f/vocab.json: 100%|██████████| 

In [None]:

from sentence_transformers import SentenceTransformer, models
from torch import nn

### 
# Word-embedding module loaded from the 'bert-base-uncased' checkpoint
word_embedding_model = models.Transformer(
    model_name_or_path='bert-base-uncased',
    max_seq_length=256,
)
# Pooling module sized to the word-embedding dimension
pooling_model = models.Pooling(
    word_embedding_dimension=word_embedding_model.get_word_embedding_dimension(),
)
# Dense module mapping the pooled sentence embedding to 256 features, with a Tanh activation
dense_model = models.Dense(
    in_features=pooling_model.get_sentence_embedding_dimension(),
    out_features=256,
    activation_function=nn.Tanh(),
)

# Chain the three modules in order: transformer -> pooling -> dense
model = SentenceTransformer(
    modules=[word_embedding_model, pooling_model, dense_model],
)



In [None]:

### Fine-tune a pretrained model with a cosine-similarity objective
# The original cell loaded the model and built the examples and dataloader
# twice, the second copy overwriting the first; everything is now done once.
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader

#Define the model. Either from scratch or by loading a pre-trained model
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

#Define your train examples. You need more than just two examples...
train_examples = [InputExample(texts=['My first sentence', 'My second sentence'], label=0.8),
    InputExample(texts=['Another pair', 'Unrelated sentence'], label=0.3)]

#Define your train dataset, the dataloader and the train loss
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
train_loss = losses.CosineSimilarityLoss(model)

#Tune the model
# NOTE(review): two examples with batch_size=16 give a single batch per epoch,
# far fewer than the 100 warm-up steps requested here - revisit warmup_steps
# once real training data is used.
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100)