In [2]:
import syntok.segmenter as segmenter

# Sample input for the segmentation demo below: an excerpt of the
# SentenceTransformers documentation mixing prose with code snippets.
# The text is kept verbatim (the lines starting with '#' further down are part
# of the string, not comments). The string starts and ends with a newline.
document = """
SentenceTransformers Documentation
SentenceTransformers is a Python framework for state-of-the-art sentence, text and image embeddings. The initial work is described in our paper Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks.

You can use this framework to compute sentence / text embeddings for more than 100 languages. These embeddings can then be compared e.g. with cosine-similarity to find sentences with a similar meaning. This can be useful for semantic textual similar, semantic search, or paraphrase mining.

The framework is based on PyTorch and Transformers and offers a large collection of pre-trained models tuned for various tasks. Further, it is easy to fine-tune your own models.

Installation
You can install it using pip:

pip install -U sentence-transformers
We recommand Python 3.6 or higher, and at least PyTorch 1.6.0. See installation for further installation options, especially if you want to use a GPU.

Usage
The usage is as simple as:

from sentence_transformers import SentenceTransformer
model = SentenceTransformer('paraphrase-distilroberta-base-v1')

#Our sentences we like to encode
sentences = ['This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.']

#Sentences are encoded by calling model.encode()
embeddings = model.encode(sentences)

#Print the embeddings
for sentence, embedding in zip(sentences, embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")
"""



In [4]:
# Segment `document` with syntok and print it one sentence per line.
# Two segmentation functions are available; choose the one you need/prefer.

# Variant 1 (active): segmenter.process() yields paragraphs, each paragraph
# yields sentences, and each sentence yields tokens.
for paragraph in segmenter.process(document):
    for sentence in paragraph:
        for token in sentence:
            # roughly reproduce the input,
            # except for hyphenated word-breaks
            # and replacing "n't" contractions with "not",
            # separating tokens by single spaces
            print(token.value, end=' ')
        print()  # print one sentence per line
    print()  # separate paragraphs with newlines

# Variant 2 (disabled): segmenter.analyze(). The whole loop is commented out,
# including its trailing print("\n"); that line was previously left active and
# ran inside the loop above, emitting two extra blank lines per paragraph.
# for paragraph in segmenter.analyze(document):
#     for sentence in paragraph:
#         for token in sentence:
#             # exactly reproduce the input
#             # and do not remove "imperfections"
#             print(token.spacing, token.value, sep='', end='')
#     print("\n")  # reinsert paragraph separators

# Two different segmenters are available (process and analyze); use whichever
# one is more convenient.

Sentence Transformers Documentation Sentence Transformers is a Python framework for state of the art sentence , text and image embeddings . 
The initial work is described in our paper Sentence BERT : Sentence Embeddings using Siamese BERT Networks . 



You can use this framework to compute sentence / text embeddings for more than 100 languages . 
These embeddings can then be compared e.g . with cosine similarity to find sentences with a similar meaning . 
This can be useful for semantic textual similar , semantic search , or paraphrase mining . 



The framework is based on Py Torch and Transformers and offers a large collection of pre trained models tuned for various tasks . 
Further , it is easy to fine tune your own models . 



Installation You can install it using pip : 



pip install - U sentence transformers We recommand Python 3.6 or higher , and at least Py Torch 1.6.0 . See installation for further installation options , especially if you want to use a GPU . 



Usage The u

In [7]:
from sentence_transformers import SentenceTransformer, util
import torch

# Load the pretrained 'stsb-bert-base' sentence-embedding model.
model = SentenceTransformer('stsb-bert-base')

# Example inputs, one string per sentence.
sentences = [
    'This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.',
]

# Encode every sentence in a single call.
embedding_list = model.encode(sentences)

  0%|          | 0.00/405M [00:00<?, ?B/s]

In [14]:
# Show the embedding matrix (an ndarray) followed by its shape, one per line.
print(embedding_list, embedding_list.shape, sep='\n')

[[-0.0931991   0.58960193  0.79686326 ...  0.17124614 -1.0291953
   0.16041785]
 [-0.34643695 -0.2202087   0.44292238 ...  0.14638452 -0.09052458
   0.97680795]
 [-0.5632062   0.16794418 -0.4360822  ...  0.5827177   0.71271753
   0.09079245]]
(3, 768)


In [25]:
# The text was split into sentences earlier; average the sentence embeddings
# back together to obtain a single document-level embedding.

# torch.mean needs a tensor. torch.as_tensor converts the ndarray returned by
# model.encode() and passes an existing tensor through unchanged, so re-running
# this cell no longer triggers the copy-construct UserWarning that
# torch.tensor(<tensor>) emits.
embedding_list = torch.as_tensor(embedding_list)

# Mean over dim 0 (the sentence axis) -> one vector for the whole document.
doc_1 = torch.mean(embedding_list, dim=0)

  after removing the cwd from sys.path.


In [27]:
# Sanity check: the pooled document embedding should be a single 1-D vector.
doc_1.shape

torch.Size([768])

In [31]:
# Sentences of a second "document" to compare against doc_1.
sentences_2 = [
    'With the help of this technique',
    'we could generate more labeled data as well with decent accuracy',
    'The quick brown fox jumps over the lazy dog.',
]

# Encode, convert to a tensor and average over the sentence axis in one chain.
doc_2 = torch.tensor(model.encode(sentences_2)).mean(dim=0)
doc_2.shape

torch.Size([768])

In [32]:
# Compare the two document embeddings.
# NOTE(review): despite the variable name, pytorch_cos_sim computes a cosine
# *similarity* (higher = more alike), not a distance — confirm against the
# sentence-transformers docs before using it as a distance downstream.
distance = util.pytorch_cos_sim(doc_1, doc_2)

In [33]:
# Display the score computed by util.pytorch_cos_sim for doc_1 vs doc_2.
distance

tensor([[0.4183]])