In [44]:
import os
from sentence_transformers import SentenceTransformer, util
from nltk.tokenize import sent_tokenize
from extractor import findSVOs
import en_core_web_lg
from sklearn.cluster import AgglomerativeClustering
import numpy as np
import json

# Load the spaCy `en_core_web_lg` pipeline once at startup; `nlp` is reused further
# down to parse the phrases rebuilt from `tokens_list` before SVO extraction.
nlp = en_core_web_lg.load()

### Getting Resolved Entities from Stanford CoreNLP

In [51]:
def get_resolved_entities(input_file, output_file):
    """Return a document's raw text and the fields extracted from its OpenIE output.

    The Stanford CoreNLP OpenIE output is read from
    ``./stanford-corenlp-4.4.0/<output_file>.txt``. If that file does not exist
    yet, OpenIE is run (with ``-resolve_coref true``) on ``<input_file>.txt``
    from inside the CoreNLP directory to create it, and the result is then read.

    Args:
        input_file: Base name (without ``.txt``) of the source document located
            in the CoreNLP directory.
        output_file: Base name (without ``.txt``) of the OpenIE output file
            located in the CoreNLP directory.

    Returns:
        A ``(text, tokens_list)`` tuple. ``text`` is the unmodified content of
        the input file. ``tokens_list`` holds, for every output line that has
        them, the tab-separated fields at positions 1-3 (presumably subject,
        relation and object in OpenIE's default output format -- confirm
        against the CoreNLP version in use).

    Raises:
        FileNotFoundError: If the input file is missing, or the output file
            still does not exist after OpenIE has been run.
    """
    # Local import: only this function needs it, and only on the cache-miss path.
    import subprocess

    corenlp_dir = os.path.join('.', 'stanford-corenlp-4.4.0')
    input_name = input_file + '.txt'
    output_name = output_file + '.txt'
    input_path = os.path.join(corenlp_dir, input_name)
    output_path = os.path.join(corenlp_dir, output_name)

    try:
        with open(output_path, 'r') as f:
            processed_text = f.read()
    except FileNotFoundError:
        # Only a missing file means "not processed yet"; any other error
        # (permissions, decoding, Ctrl-C) must surface instead of silently
        # triggering a multi-gigabyte Java run.
        print("processed file doesn't exist, processing...")
        # Argument list + cwd instead of a shell string: file names containing
        # spaces or shell metacharacters are passed through verbatim. The
        # literal '*' classpath is expanded by the JVM itself, not by a shell.
        subprocess.run(
            [
                'java', '-Xmx5g', '-cp', '*',
                'edu.stanford.nlp.naturalli.OpenIE',
                input_name,
                '-resolve_coref', 'true',
                '-output', output_name,
            ],
            cwd=corenlp_dir,
        )
        # If the run failed, this open raises FileNotFoundError for the caller.
        with open(output_path, 'r') as f:
            processed_text = f.read()

    with open(input_path, 'r') as f:
        text = f.read()

    tokens_list = []
    for line in processed_text.split('\n'):
        fields = line.split('\t')[1:4]
        # Blank lines (e.g. after the trailing newline) and lines without tabs
        # yield no fields; skip them so callers never receive empty entries.
        if fields:
            tokens_list.append(fields)

    return text, tokens_list

In [54]:
# Load the 'blockchain' document together with the fields parsed from its
# OpenIE output (the output file is generated on first use if it is missing).
text, tokens_list = get_resolved_entities(
    input_file='blockchain',
    output_file='blockchain_output',
)

processed file doesn't exist, processing...


### Generating SVO Phrases from the tokens

In [42]:
# For every entry of tokens_list: glue its fields back into one space-separated
# phrase, parse that phrase with the spaCy pipeline, and keep whatever findSVOs
# extracts from the parsed document (one result per entry, in order).
svo_phrases = [
    findSVOs(nlp(" ".join(fields)))
    for fields in tokens_list
]

## Working on Sentence BERT

In [10]:
# Instantiate the SentenceTransformer named 'all-MiniLM-L6-v2'; the clustering and
# similarity cells below call `model.encode` on it to embed sentences.
model = SentenceTransformer('all-MiniLM-L6-v2')

### Agglomerative Clustering

In [41]:
# Split the document into sentences and embed each one.
sentences = sent_tokenize(text)
sentence_embeddings = model.encode(sentences)

# Scale every embedding row to unit length before clustering.
row_norms = np.linalg.norm(sentence_embeddings, axis=1, keepdims=True)
sentence_embeddings = sentence_embeddings / row_norms

# No fixed cluster count: merging is governed by the distance threshold instead.
clustering_model = AgglomerativeClustering(
    n_clusters=None,
    distance_threshold=1.5,
).fit(sentence_embeddings)
cluster_assignment = clustering_model.labels_

# Group sentences by their assigned cluster label, preserving document order
# inside each group.
clustered_sentences = {}
for sentence, label in zip(sentences, cluster_assignment):
    clustered_sentences.setdefault(label, []).append(sentence)

# Report clusters with 1-based numbering.
for label, members in clustered_sentences.items():
    print("Cluster ", label + 1)
    print(members)
    print("")

Cluster  1
["There's no silver bullet to the climate crisis, but nuclear fusion may be the closest thing to it.", 'In the quest for a near-limitless, zero-carbon source of reliable power, scientists have generated fusion energy before, but they have struggled for decades to sustain it for very long.', 'On Wednesday, however, scientists working in the United Kingdom announced that they more than doubled the previous record for generating and sustaining nuclear fusion, which is the same process that allows the sun and stars to shine so brightly.', 'The JET tokamak near Oxford, England, produced a record amount of sustained energy from nuclear fusion.', 'In a giant donut-shaped machine known as a tokamak, scientists working in the English village of Culham, near Oxford, were able to generate a record-breaking 59 megajoules of sustained fusion energy over five seconds on December 21 last year.', 'A magnetic field is required to contain the high temperatures needed to carry out the fusion p

### Semantic Textual Similarity

In [57]:
# Build an extractive summary: keep every sentence that is semantically close
# to at least one sentence appearing earlier in the text.
sentences = sent_tokenize(text)

embeddings = model.encode(sentences, convert_to_tensor=True)

# Compute cosine similarities between every pair of sentence embeddings
cosine_scores = util.cos_sim(embeddings, embeddings)

# Minimum cosine score for two sentences to be treated as similar
similarity_threshold = 0.65

summary = []

# Output the pairs with their score
for i in range(len(sentences)):
    # Only earlier sentences (j < i) are candidates, so each pair is examined
    # once and a sentence is never compared with itself.
    for j in range(i):
        if cosine_scores[i][j] >= similarity_threshold:
            print("{} \t\t {} \t\t Score: {:.4f}\n".format(sentences[i], sentences[j], cosine_scores[i][j]))
            summary.append(sentences[i])
            # One match is enough; stop so the sentence is added only once.
            break

# Join once, outside any loop, so `result` is always defined -- it is simply
# the empty string when no pair reached the threshold.
result = " ".join(summary)


print("Text:\n{}\n\nSummary:\n{}".format(text, result))

As a database, a blockchain stores information electronically in digital format. 		 A blockchain is a distributed database that is shared among the nodes of a computer network. 		 Score: 0.6665

Blockchains are best known for their crucial role in cryptocurrency systems, such as Bitcoin, for maintaining a secure and decentralized record of transactions. 		 A blockchain is a distributed database that is shared among the nodes of a computer network. 		 Score: 0.6530

One key difference between a typical database and a blockchain is how the data is structured. 		 A blockchain is a distributed database that is shared among the nodes of a computer network. 		 Score: 0.7195

A blockchain collects information together in groups, known as blocks, that hold sets of information. 		 A blockchain is a distributed database that is shared among the nodes of a computer network. 		 Score: 0.6723

Blocks have certain storage capacities and, when filled, are closed and linked to the previously filled bl