# Setup

In [1]:
from db_funcs import get_all_content, get_db_con
import dotenv
dotenv.load_dotenv()
conn=get_db_con()

# Article Chunk Size
Determine what kinda size the current article chunks are, decided that for now, chunking is unnecesary

In [2]:
import numpy as np
import random
with conn.cursor() as cur:
    all_content = get_all_content(cur)
content_lengths = [len(content) for content in all_content]

print(f"Average length: {np.mean(content_lengths):.2f}")
print(f"Maximum length: {max(content_lengths)}")
print(f"Minimum length: {min(content_lengths)}")
print(f"Standard deviation: {np.std(content_lengths):.2f}")
print(f"\nSample of lengths: {random.sample(content_lengths, min(20, len(content_lengths)))}")

Average length: 802.61
Maximum length: 4063
Minimum length: 169
Standard deviation: 359.58

Sample of lengths: [885, 626, 1039, 869, 543, 600, 650, 665, 610, 631, 544, 508, 793, 709, 548, 531, 905, 754, 617, 1105]


# Extract Sentances and Entity Relationships

In [None]:
from transformers import pipeline
from utils import extract_triplets, Sentence_Extractor
from neo4j_funcs import Neo4j_Driver
from typing import List
from models import triplet
import time
from torch.utils.data import Dataset, DataLoader

sentence_extractor=Sentence_Extractor()
triplet_extractor = pipeline('text2text-generation', model='Babelscape/rebel-large', tokenizer='Babelscape/rebel-large')

neo4j = Neo4j_Driver()

sentences=[]
print("Starting to extract sentences from content")
start_time = time.time()
for content in all_content:
    sentences.append(sentence_extractor.get_sentences(content))
end_time = time.time()
print(f"Extracted all sentences from content in {end_time - start_time:.2f} seconds")

class SentenceDataset(Dataset):
    def __init__(self, sentences):
        self.sentences = [s for content_sentences in sentences for s in content_sentences]  # Flatten list
        
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, idx):
        return self.sentences[idx]
dataset = SentenceDataset(sentences)
batch_size=100
sentences_dataloader=DataLoader(dataset, batch_size=batch_size, shuffle=False)
total_batches = (len(dataset) + batch_size - 1) // batch_size  # Calculate total number of batches

timeDecoding = 0
timeExtracting = 0
timeInserting = 0

for batch_idx, sentences_batch in enumerate(sentences_dataloader):
    start_decoding = time.time()
    tokens = triplet_extractor(sentences_batch, return_tensors=True, return_text=False)
    token_ids = [token["generated_token_ids"] for token in tokens]
    extracted_text = triplet_extractor.tokenizer.batch_decode(token_ids)
    end_decoding = time.time()
    timeDecoding += (end_decoding - start_decoding)

    start_extracting = time.time()
    triplets = [triplet for text in extracted_text for triplet in extract_triplets(text)]
    end_extracting = time.time()
    timeExtracting += (end_extracting - start_extracting)

    start_inserting = time.time()
    for e in triplets:
        neo4j.insert_relationship(e.head, "entity", e.type, e.tail, "entity")
    end_inserting = time.time()
    timeInserting += (end_inserting - start_inserting)

    if (batch_idx + 1) % 10 == 0:
        print(f"\nCumulative time after {batch_idx + 1} batches:")
        print(f"  Decoding: {timeDecoding:.2f} seconds")
        print(f"  Extracting: {timeExtracting:.2f} seconds")
        print(f"  Inserting: {timeInserting:.2f} seconds")

    print(f"Batch {batch_idx + 1} of {total_batches} complete")

print(f"\nTotal time spent decoding: {timeDecoding:.2f} seconds")
print(f"Total time spent extracting: {timeExtracting:.2f} seconds")
print(f"Total time spent inserting: {timeInserting:.2f} seconds")
    

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt_tab to /home/chris/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
Device set to use cuda:0


Starting to extract sentences from content
Extracted all sentences from content in 0.09 seconds
Batch 1 of 640 complete
Batch 2 of 640 complete
Batch 3 of 640 complete
Batch 4 of 640 complete
Batch 5 of 640 complete
Batch 6 of 640 complete
Batch 7 of 640 complete
Batch 8 of 640 complete
Batch 9 of 640 complete


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset



Cumulative time after 10 batches:
  Decoding: 23.46 seconds
  Extracting: 0.00 seconds
  Inserting: 1.37 seconds
Batch 10 of 640 complete
Batch 11 of 640 complete
Batch 12 of 640 complete
Batch 13 of 640 complete
Batch 14 of 640 complete
Batch 15 of 640 complete
Batch 16 of 640 complete
Batch 17 of 640 complete
Batch 18 of 640 complete
Batch 19 of 640 complete

Cumulative time after 20 batches:
  Decoding: 45.71 seconds
  Extracting: 0.00 seconds
  Inserting: 2.39 seconds
Batch 20 of 640 complete
Batch 21 of 640 complete
Batch 22 of 640 complete
Batch 23 of 640 complete
Batch 24 of 640 complete
Batch 25 of 640 complete
Batch 26 of 640 complete
Batch 27 of 640 complete
Batch 28 of 640 complete


KeyboardInterrupt: 