In [41]:
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_hub as hub
import tensorflow_text as text
import numpy as np

import io
import re
import string
import tqdm
from tensorflow.keras import layers

import seaborn as sns
from sklearn.metrics import pairwise

import os
import tensorboard
from tensorboard.plugins import projector

import collections
import pandas as pd

import random

import math

In [None]:
# ds = tfds.load('multi_news', split='train', with_info=True)

# #use index to get specific document
# index = 20
# count = 0
# with tf.Graph().as_default():
#     numpy_imgs = next(iter(ds))
#     # numpy_imgs = tfds.as_numpy(ds)
# count = 0
# document = []
# summary = []
# for x in numpy_imgs:
#     count += 1
#     if count == index:
#         # tf.print(x["document"])
#         # print("\n")
#         # print("\n")
#         # print("SUMMARY")
#         # tf.print(x["summary"])

#         document = x["document"]
#         summary = x["summary"]
#         break
# document = bytes(document.numpy())
# document = [document.decode("utf-8")]

# summary = bytes(summary.numpy())
# summary = [summary.decode("utf-8")]
# #create vocab
# d_tokens = document[0].lower().split()
# s_tokens = summary[0].lower().split()
# tokens = d_tokens + s_tokens
# vocab, index = {}, 1
# vocab["<pad>"] = 0
# for token in tokens:
#     if token not in vocab:
#         vocab[token] = index
#         index = index + 1

# inverse_vocab = {index: token for token, index in vocab.items()}
# example = [vocab[word] for word in s_tokens]
# embed = hub.load("https://tfhub.dev/google/nnlm-en-dim128/2")
# embeddings = embed(["cat is on the mat", "dog is in the fog"])
# print(embeddings)
# # stuff to do
# # get vocabulary
# # display output in a visual way
# #lean word2vec

# preprocess = hub.load('https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3')
# bert = hub.load('https://tfhub.dev/google/experts/bert/wiki_books/2')

# sentences = [
#   "Here We Go Then, You And I is a 1999 album by Norwegian pop artist Morten Abel. It was Abel's second CD as a solo artist.",
#   "The album went straight to number one on the Norwegian album chart, and sold to double platinum.",
#   "Among the singles released from the album were the songs \"Be My Lover\" and \"Hard To Stay Awake\".",
#   "Riccardo Zegna is an Italian jazz musician.",
#   "Rajko Maksimović is a composer, writer, and music pedagogue.",
#   "One of the most significant Serbian composers of our time, Maksimović has been and remains active in creating works for different ensembles.",
#   "Ceylon spinach is a common name for several plants and may refer to: Basella alba Talinum fruticosum",
#   "A solar eclipse occurs when the Moon passes between Earth and the Sun, thereby totally or partly obscuring the image of the Sun for a viewer on Earth.",
#   "A partial solar eclipse occurs in the polar regions of the Earth when the center of the Moon's shadow misses the Earth.",
# ]

# wordArray = []

# for i in sentences:
#   words = i.split()
#   for w in words:
#     w = w.replace(",", "")
#     w = w.replace(".", "")
#     wordArray.append(w)

# print("word array")
# print(wordArray)

# bert_inputs = preprocess(sentences)
# bert_outputs = bert(bert_inputs)
# pooled_output = bert_outputs['pooled_output']
# sequence_output = bert_outputs['sequence_output']

# print('\nPooled output:')
# print(pooled_output)
# print('\nSequence output:')
# print(sequence_output)

# def plot_similarity(features, labels):
#   """Plot a similarity matrix of the embeddings."""
#   cos_sim = pairwise.cosine_similarity(features)
#   sns.set(font_scale=1.2)
#   cbar_kws=dict(use_gridspec=False, location="left")
#   g = sns.heatmap(
#       cos_sim, xticklabels=labels, yticklabels=labels,
#       vmin=0, vmax=1, cmap="Blues", cbar_kws=cbar_kws)
#   g.tick_params(labelright=True, labelleft=False)
#   g.set_yticklabels(labels, rotation=0)
#   g.set_title("Semantic Textual Similarity")

# plot_similarity(bert_outputs["pooled_output"], wordArray)

In [42]:
# Generates skip-gram pairs with negative sampling for a list of sequences
# (int-encoded sentences) based on window size, number of negative samples
# and vocabulary size.
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
  # Elements of each training example are appended to these lists.
  targets, contexts, labels = [], [], []

  # Build the sampling table for vocab_size tokens.
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

  # Iterate over all sequences (sentences) in dataset.
  for sequence in tqdm.tqdm(sequences):

    # Generate positive skip-gram pairs for a sequence (sentence).
    positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
          sequence,
          vocabulary_size=vocab_size,
          sampling_table=sampling_table,
          window_size=window_size,
          negative_samples=0)

    # Iterate over each positive skip-gram pair to produce training examples
    # with positive context word and negative samples.
    for target_word, context_word in positive_skip_grams:
      context_class = tf.expand_dims(
          tf.constant([context_word], dtype="int64"), 1)
      negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
          true_classes=context_class,
          num_true=1,
          num_sampled=num_ns,
          unique=True,
          range_max=vocab_size,
          seed=SEED,
          name="negative_sampling")

      # Build context and label vectors (for one target word)
      negative_sampling_candidates = tf.expand_dims(
          negative_sampling_candidates, 1)

      context = tf.concat([context_class, negative_sampling_candidates], 0)
      label = tf.constant([1] + [0]*num_ns, dtype="int64")

      # Append each element from the training example to global lists.
      targets.append(target_word)
      contexts.append(context)
      labels.append(label)

  return targets, contexts, labels

def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  return tf.strings.regex_replace(lowercase,
                                  '[%s]' % re.escape(string.punctuation), '')

class Word2Vec(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim):
    super(Word2Vec, self).__init__()
    self.target_embedding = layers.Embedding(vocab_size,
                                      embedding_dim,
                                      input_length=1,
                                      name="w2v_embedding")
    self.context_embedding = layers.Embedding(vocab_size,
                                       embedding_dim,
                                       input_length=num_ns+1,
                                       name="w2v_context")

  def call(self, pair):
    target, context = pair
    # target: (batch, dummy?)  # The dummy axis doesn't exist in TF2.7+
    # context: (batch, context)
    if len(target.shape) == 2:
      target = tf.squeeze(target, axis=1)
    # target: (batch,)
    word_emb = self.target_embedding(target)
    # word_emb: (batch, embed)
    context_emb = self.context_embedding(context)
    # context_emb: (batch, context, embed)
    dots = tf.einsum('be,bce->bc', word_emb, context_emb)
    # dots: (batch, context)
    return dots

In [43]:
SEED = 42
AUTOTUNE = tf.data.AUTOTUNE
num_ns = 4

ngram_freq = pd.read_csv('ngram_freq.csv')
ngramWordList = list(ngram_freq['word'].values)

def textToArray(file_path):
    # turn text into array of words
    text_ds = tf.data.TextLineDataset(file_path).filter(lambda x: tf.cast(tf.strings.length(x), bool))
    text_ds = text_ds.enumerate()
    docTxt = []
    for i in text_ds.as_numpy_iterator():
        line = i[1].decode().split()
        for l in line:
            docTxt.append(l)
    
    return docTxt

def getVocab(file_path):
    doc_ds = tf.data.TextLineDataset(file_path).filter(lambda x: tf.cast(tf.strings.length(x), bool))
    vocab_size = 4096
    sequence_length = 220
    vectorize_layer = layers.TextVectorization(

        standardize=custom_standardization,
        max_tokens=vocab_size,
        output_mode='int',
        output_sequence_length=sequence_length)

    vectorize_layer.adapt(doc_ds.batch(1024))

    doc_vector_ds = doc_ds.batch(1024).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()
    sequences = list(doc_vector_ds.as_numpy_iterator())

    doc_vocab = vectorize_layer.get_vocabulary()
    return doc_vocab, sequences

def findKeyWords(docTxt, vocab, ngramWordList):
    #find the key words in the document
    keyWords = {}
    docFeq = collections.Counter(docTxt)
    maxWordFreq = max(docFeq, key=docFeq.get)
    maxFreq = docFeq[str(maxWordFreq)]

    for v in vocab:
        try:
            realFreq = docFeq[str(v)]
            v_index = ngramWordList.index(str(v)) + 1
            nFreq = int(maxFreq / v_index)

            if realFreq > nFreq and v_index > 20 and realFreq > 0:
                keyWords[str(v)] = str(realFreq) + " | " + str(nFreq)
        except:
            continue
    
    return keyWords

def generateTupleTraining(file_path, doc_vocab, keyWords, sequences):
    vec_keyWords = []
    for k in keyWords:
        vecNum = doc_vocab.index(str(k))
        vec_keyWords.append(vecNum)

    #labels: 1 - positive sample, 0 - negative sample
    targets, contexts, labels = generate_training_data(
        sequences=sequences,
        window_size=2,
        num_ns=4,
        vocab_size=(len(doc_vocab) + 1),
        seed=SEED)

    keySamples = []
    for index, t in enumerate(targets):
        if t in vec_keyWords:
            for c_index, c in enumerate(contexts[index]):
                sample = [t, int(c[0]), int(labels[index][c_index])]
                keySamples.append(sample)
    
    return keySamples

In [61]:
file_path = 'document.txt'
docTxt = textToArray(file_path)
doc_vocab, sequences = getVocab(file_path)
keyWords = findKeyWords(docTxt, doc_vocab, ngramWordList)
keySamples = generateTupleTraining(file_path, doc_vocab, keyWords, sequences)

100%|██████████| 89/89 [00:00<00:00, 513.81it/s]


In [45]:
def createWordVectors(file_path, useBoard):
    text_ds = tf.data.TextLineDataset(file_path).filter(lambda x: tf.cast(tf.strings.length(x), bool))
    
    # Define the vocabulary size and number of words in a sequence.
    vocab_size = 4096
    sequence_length = 300

    # Use the TextVectorization layer to normalize, split, and map strings to
    # integers. Set output_sequence_length length to pad all samples to same length.
    vectorize_layer = layers.TextVectorization(
        standardize=custom_standardization,
        max_tokens=vocab_size,
        output_mode='int',
        output_sequence_length=sequence_length)

    vectorize_layer.adapt(text_ds.batch(1024))

    inverse_vocab = vectorize_layer.get_vocabulary()

    text_vector_ds = text_ds.batch(1024).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()

    sequences = list(text_vector_ds.as_numpy_iterator())
    
    targets, contexts, labels = generate_training_data(
        sequences=sequences,
        window_size=2,
        num_ns=4,
        vocab_size=vocab_size,
        seed=SEED)

    contexts = np.array(contexts)[:,:,0]
    labels = np.array(labels)

    # BATCH_SIZE = 1024
    # BUFFER_SIZE = 10000
    BATCH_SIZE = 10
    BUFFER_SIZE = 15
    dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
    dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

    dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)

    embedding_dim = 128
    word2vec = Word2Vec(vocab_size, embedding_dim)
    word2vec.compile(optimizer='adam',
                    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                    metrics=['accuracy'])
    
    if useBoard:
        tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")
        word2vec.fit(dataset, epochs=1, callbacks=[tensorboard_callback])
    else:
        word2vec.fit(dataset, epochs=5)

    tWeights = word2vec.get_layer('w2v_embedding').get_weights()[0]
    cWeights = word2vec.get_layer('w2v_context').get_weights()[0]
    vocab = vectorize_layer.get_vocabulary()

    # tf_weight = tf.Variable(tWeights)
    # checkpoint = tf.train.Checkpoint(embedding=tf_weight)
    # checkpoint.save(os.path.join('logs', "embedding.ckpt"))
    return tWeights, cWeights, vocab, sequences

In [60]:
scores = []
for i in range(10):
    # doc_weights, doc_vocab = createWordVectors("document.txt", False)
    tWeights, cWeights, vocab, sequences = createWordVectors("summary.txt", False)
    cWeights = np.array(cWeights).reshape(128, 4096)

    #target and context weights for the summary model
    vocab = np.array(vocab)
    # sequences = np.array(sequences)

    #ToDo: generate positive and negative skip-grams for testing v/
    #ToDo: cloud version of ngram_freq.csv
    #create sequence function?

    #use the weights to predict the samples, produce a final evaluation (0-1)
    #model is not evaluated correctly
    # USE THE DOCUMENT VOCAB TO TRAIN THE SUMMARY WEIGHTS!!!!!!!!!!!!
    score = 0
    guesses = []
    answers = []
    probs = []
    badSample = 0
    goodSamples = []
    for k in keySamples:
        if  doc_vocab[k[0]] in list(vocab) and doc_vocab[k[1]] in list(vocab):
            t_idx = list(vocab).index(doc_vocab[k[0]])
            c_idx = list(vocab).index(doc_vocab[k[1]])
            goodSamples.append(k)
            answers.append(k[2])
        
    for g in goodSamples:
        t_idx = list(vocab).index(doc_vocab[g[0]])
        c_idx = list(vocab).index(doc_vocab[g[1]])

        Ht = tWeights[t_idx]
        HtWO = np.matmul(Ht, cWeights)
        softSum = 0
        for h in HtWO:
            softSum = softSum + math.e ** h
        
        prob = (math.e ** HtWO[c_idx]) / softSum
        guess = 1 if prob > (1 / softSum) else 0
        guesses.append(float(guess))
        probs.append(prob)

        # tM = tWeights[t_idx].reshape(1, 128)
        # cM = cWeights[c_idx].reshape(128, 1)

        # unSquashed = float(np.matmul(tM, cM))
        # guess = 1 / (1 + math.e ** -unSquashed)
        # print(guess)
        # guess = 1 if guess > 0.5 else 0
        # guesses.append(float(guess))
        
    cce = tf.keras.losses.CategoricalCrossentropy()
    m = tf.keras.metrics.Accuracy()

    m.update_state(np.array(answers), np.array(guesses))
    entroScore = cce(np.array(answers), np.array(guesses)).numpy() 
    accScore = m.result().numpy()

    scores.append(accScore)    
    # for k in keySamples:
    #     try:
    #         t_idx = list(vocab).index(doc_vocab[k[0]])
    #         c_idx = list(vocab).index(doc_vocab[k[1]])

    #     except:
    #         continue

    #     tM = tWeights[t_idx].reshape(1, 128)
    #     cM = cWeights[c_idx].reshape(128, 1)

    #     unSquashed = float(np.matmul(tM, cM))
    #     guess = 1 / (1 + math.e ** -unSquashed)
    #     guess = 1 if guess > 0.5 else 0
    #     guesses.append(guess)
        
    #     diff = abs(guess - k[2])
    #     mini_score = (1 - diff) * (100 / (len(keySamples) - badSample))
    #     score = score + mini_score

    # scores.append(score)
    # print(str(missed) + str(" / ") + str(len(keyWords)))
    # print("score: " + str(round(score, 2)))
    # print(guesses)

avg = round(np.average(scores), 2)
print("Average accuracy: " + str(avg))
print(scores)
print(probs)


100%|██████████| 1/1 [00:00<00:00, 200.67it/s]


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Average accuracy: 0.5
[0.49886876]
[0.00024224193514665274, 0.0002480595562089633, 0.0002437059217578762, 0.0002430118644912268, 0.00024826462527465355, 0.00024532397218979696, 0.00023970731402737133, 0.00024361723062414414, 0.0002453537729743801, 0.00023795548597495584, 0.00024524780257148413, 0.0002453537729743801, 0.00024353767112612044, 0.00023738330502646288, 0.0002453537729743801, 0.00023738330502646288, 0.00024384973825599405, 0.00024567176352856414, 0.00024282790488327043, 0.000246801715058369, 0.000242601200886798, 0.0002423017864610616, 0.0002468082160483504, 0.0002448264904533975, 0.0002468082160483504, 0.0002445092301159489, 0.00024313790091847582, 0.0002500921448241859, 0.0002420284309475297, 0.00024391193622293905, 0.0002437131486210327, 0.0002460737457602085, 0.0002437131486210327, 0.0002420284309475297, 0.00024524443993629154, 0.0002448599619376855, 0.0002487425996883957, 0.0002433645207950941, 0.0002433645207950941, 0.0

In [None]:
tWeights, cWeights, vocab, sequences = createWordVectors("christmas_carol.txt", False)

In [None]:
def bRep(vec):
    vec = np.array(vec)
    bVec = []
    bVec.append([0] * len(vec[0]))

    for v_index, v in enumerate(vec):
        if v_index == 0:
            continue
        bigVec = []
        unitVec = []
        squaredSum = 0
        for vi_index, vi in enumerate(v):
            vecNum = vec[v_index][vi_index] - vec[v_index - 1][vi_index]
            squaredSum = squaredSum + (vecNum ** 2)
            bigVec.append(vecNum)

        magnitude = math.sqrt(squaredSum)
        for b in bigVec:
            magnitude = 1
            b = round(b / magnitude, 3)
            unitVec.append(b)
        
        bVec.append(unitVec)

    return bVec

myArray = [[2, 5, 2, 4], [1, 2, 3, 2]]
test = bRep(myArray)
print(test)

In [None]:
corpus = ['the', 'large', 'man', 'walked', 'through', 'the', 'brown', 'and', 'sturdy', 'door', 'this', 'afternoon']
sum = ['the', 'man', 'walked', 'through', 'the', 'door']
corpusVec = []
sumVec = []
for c in corpus:
    cVec = tWeights[vocab.index(c)]
    corpusVec.append(cVec)
print(np.array(corpusVec).shape)
corpusVec = np.array(corpusVec).reshape(12, 3)
print('corpus vector')
print(corpusVec)

for c in sum:
    cVec = tWeights[vocab.index(c)]
    sumVec.append(cVec)
sumVec = np.array(sumVec).reshape(6, 3)
print('summary vector')
print(sumVec)

In [None]:
import tensorflow as tf
import tensorflow_hub as hub

module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
embed = hub.load(module_url)
embeddings = embed(["The", "cat", "in", "the", "hat"])
print(embeddings.shape)  #(3,128)

In [None]:
# importing mplot3d toolkits, numpy and matplotlib
from mpl_toolkits import mplot3d
import numpy as np
import matplotlib.pyplot as plt
 
fig = plt.figure()
 
# syntax for 3-D projection
ax = plt.axes(projection ='3d')

corpusVec = bRep(np.array(corpusVec).reshape(12,3))
print(corpusVec)
 
# defining all 3 axes
z = corpusVec[2]
x = corpusVec[0]
y = corpusVec[1]
 
# plotting
ax.scatter(x, y, z, 'green')
ax.set_title('corpus vector')
plt.show()

In [None]:
# importing mplot3d toolkits, numpy and matplotlib
from mpl_toolkits import mplot3d
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
 
fig = plt.figure()
 
# syntax for 3-D projection
ax = Axes3D(fig)
sumVec = bRep(np.array(sumVec).reshape(6,3))
print(sumVec)
 
# defining all 3 axes
z = sumVec[2]
x = sumVec[0]
y = sumVec[1]
 
# plotting
ax.scatter(x, y, z, 'green')
ax.set_title('corpus vector')
plt.show()

In [None]:
# importing mplot3d toolkits, numpy and matplotlib
from mpl_toolkits import mplot3d
import numpy as np
import matplotlib.pyplot as plt
 
fig = plt.figure()
 
# syntax for 3-D projection
ax = plt.axes(projection ='3d')

A = [[0, 0, 0], [1, 1, 1]]
B = [[1, 1, 0], [0, 0, 1]]
A = np.array(A).reshape(3, 2)
B = np.array(B).reshape(3, 2)
C = A

# defining all 3 axes
z = C[2]
x = C[0]
y = C[1]

# plotting
ax.scatter(x, y, z, 'blue')
ax.scatter(B[0], B[1], B[2], 'blue')
ax.set_title('corpus vector')
plt.show()

In [None]:
tWeights = [-0.094491, -0.443977, 0.313917, -0.490796, -0.229903, 0.065460, 0.072921, 0.172246, -0.357751, 0.104514, -0.463000, 0.079367, -0.226, -0.1547, -0.0384, 0.4061, -0.1928, -0.4420, 0.1818, 0.0883, 0.2776, -0.0553, 0.4918, 0.2631]
cWeights = [0.0230, 0.4799, 0.4321, 0.3758, -0.3647, -0.1198, 0.2661, -0.3510, -0.3680, 0.4248, -0.2571, -0.1488, 0.0339, 0.3538, -0.1449, 0.1309, 0.4224, 0.3645, 0.4679, -0.0203, -0.4239, -0.4388, 0.2686, -0.4468]
tWeights = np.array(tWeights)
cWeights = np.array(cWeights)
tWeights = tWeights.reshape(8, 3)
cWeights = cWeights.reshape(3, 8)
Ht = tWeights[1]
print(Ht)
# Ht = np.array(Ht).reshape(8, 3)
# cWeights = cWeights.reshape(3, 8)
HtWO = np.matmul(Ht, cWeights)
print(HtWO)
softSum = 0
for h in HtWO:
    softSum = softSum + math.e ** h
print(softSum)
print(math.e ** HtWO[1] / softSum)

In [18]:
%pip install transformers
%pip install sentence-transformers

Collecting transformers
  Using cached transformers-4.15.0-py3-none-any.whl (3.4 MB)
Collecting tokenizers<0.11,>=0.10.1
  Using cached tokenizers-0.10.3-cp38-cp38-win_amd64.whl (2.0 MB)
Collecting huggingface-hub<1.0,>=0.1.0
  Using cached huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
Collecting sacremoses
  Using cached sacremoses-0.0.46-py3-none-any.whl (895 kB)
Installing collected packages: tokenizers, huggingface-hub, sacremoses, transformers
Successfully installed huggingface-hub-0.2.1 sacremoses-0.0.46 tokenizers-0.10.3 transformers-4.15.0
Note: you may need to restart the kernel to use updated packages.


ERROR: After October 2020 you may experience errors when installing or updating packages. This is because pip will change the way that it resolves dependency conflicts.

We recommend you use --use-feature=2020-resolver to test your packages with the new resolver before it becomes the default.

huggingface-hub 0.2.1 requires packaging>=20.9, but you'll have packaging 20.4 which is incompatible.


Processing c:\users\brad\appdata\local\pip\cache\wheels\52\19\88\6625593382e23a926740e6fcee0f2df0a0de25766094842a28\sentence_transformers-2.1.0-py3-none-any.whl
Collecting sentencepiece
  Using cached sentencepiece-0.1.96-cp38-cp38-win_amd64.whl (1.1 MB)
Collecting torchvision
  Using cached torchvision-0.11.2-cp38-cp38-win_amd64.whl (985 kB)
Collecting torch>=1.6.0
  Using cached torch-1.10.1-cp38-cp38-win_amd64.whl (226.6 MB)

Installing collected packages: sentencepiece, torch, torchvision, sentence-transformers
Successfully installed sentence-transformers-2.1.0 sentencepiece-0.1.96 torch-1.10.1 torchvision-0.11.2


In [20]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

In [26]:
model = SentenceTransformer('stsb-roberta-large')

Downloading:   5%|▌         | 76.5M/1.42G [04:46<1:23:49, 267kB/s]
Downloading:   1%|▏         | 21.3M/1.42G [04:08<4:31:54, 85.8kB/s]
Downloading:  14%|█▎        | 67.6M/499M [01:03<06:45, 1.06MB/s]
Downloading:  14%|█▍        | 70.5M/499M [01:59<12:06, 589kB/s] 
Downloading:   3%|▎         | 41.0M/1.42G [00:15<09:18, 2.47MB/s]


Downloading: 100%|██████████| 1.42G/1.42G [09:02<00:00, 2.62MB/s]
Downloading: 100%|██████████| 52.0/52.0 [00:00<00:00, 17.6kB/s]
Downloading: 100%|██████████| 239/239 [00:00<00:00, 232kB/s]
Downloading: 100%|██████████| 1.36M/1.36M [00:00<00:00, 2.67MB/s]
Downloading: 100%|██████████| 1.17k/1.17k [00:00<00:00, 587kB/s]
Downloading: 100%|██████████| 798k/798k [00:00<00:00, 1.84MB/s]
Downloading: 100%|██████████| 191/191 [00:00<00:00, 95.5kB/s]


In [40]:
sentence1 = "the large man walked through the brown and sturdy door this afternoon"
sentence2 = "the man walked through the door"

docTxt = textToArray("document.txt")
docTxt = ' '.join(docTxt)
sumTxt = textToArray("summary.txt")
sumTxt = ' '.join(sumTxt)

sentence1 = docTxt
sentence2 = sumTxt

# encode sentences to get their embeddings
embedding1 = model.encode(sentence1, convert_to_tensor=True)
embedding2 = model.encode(sentence2, convert_to_tensor=True)

# compute similarity scores of two embeddings
cosine_scores = util.pytorch_cos_sim(embedding1, embedding2)
# print("Sentence 1:", sentence1)
# print("Sentence 2:", sentence2)
print("Similarity score:", cosine_scores.item())

Similarity score: 0.5372869372367859


In [49]:
print(random.random())

0.4029007060710259
