# Setup

In [1]:
import psycopg2
import os
from db_funcs import get_all_content

# Database connection settings, each read from an environment variable.
# os.getenv yields None for any variable that is not set.
db_settings = {
    'dbname': os.getenv('DB_NAME'),
    'user': os.getenv('DB_USER'),
    'password': os.getenv('DB_PASSWORD'),
    'host': os.getenv('DB_HOST'),
    'port': os.getenv('DB_PORT'),
}

# Connection that the later cells open their cursors on.
conn = psycopg2.connect(**db_settings)

# Article Chunk Size
Determine the size distribution of the current article chunks. Conclusion: for now, further chunking is unnecessary.

In [2]:
import numpy as np
import random
# Fetch every stored article body; the cursor is only needed for this query.
with conn.cursor() as cur:
    all_content = get_all_content(cur)

# len() of each item, in the same order as all_content.
content_lengths = list(map(len, all_content))

# Summary statistics, each computed right before it is printed.
mean_length = np.mean(content_lengths)
print(f"Average length: {mean_length:.2f}")

longest = max(content_lengths)
print(f"Maximum length: {longest}")

shortest = min(content_lengths)
print(f"Minimum length: {shortest}")

spread = np.std(content_lengths)
print(f"Standard deviation: {spread:.2f}")

# Show up to 20 randomly chosen lengths (unseeded, so the sample differs
# between runs).
sample_size = min(20, len(content_lengths))
length_sample = random.sample(content_lengths, sample_size)
print(f"\nSample of lengths: {length_sample}")

Average length: 802.61
Maximum length: 4063
Minimum length: 169
Standard deviation: 359.58

Sample of lengths: [1109, 612, 469, 1473, 1076, 613, 731, 569, 623, 501, 1305, 682, 617, 1201, 903, 705, 1619, 1138, 565, 807]


In [10]:
from transformers import pipeline
from utils import extract_triplets, get_sentances


# Build a text2text-generation pipeline; the same checkpoint name supplies
# both the model and the tokenizer.
rebel_checkpoint = 'Babelscape/rebel-large'
triplet_extractor = pipeline(
    'text2text-generation',
    model=rebel_checkpoint,
    tokenizer=rebel_checkpoint,
)

# Example sentence; nothing in this cell reads it.
input_str="Punta Cana is a resort town in the municipality of Higuey, in La Altagracia Province, the eastern most province of the Dominican Republic"

# Split the first article into sentences and work on the first one only.
sentances = get_sentances(all_content[0])
first_sentence = sentances[0]

# Ask the pipeline for raw token ids instead of decoded text: the decoding is
# done manually with the tokenizer below because the special tokens are needed
# in the decoded string (presumably by extract_triplets -- confirm in utils).
extracted = triplet_extractor(
    first_sentence,
    return_tensors=True,
    return_text=False,
)
batch = [extracted[0]["generated_token_ids"]]
extracted_text = triplet_extractor.tokenizer.batch_decode(batch)

# Raw decoded model output, special tokens included.
decoded_output = extracted_text[0]
print(decoded_output)

# Structured triplets derived from the decoded string.
extracted_triplets = extract_triplets(decoded_output)
print(extracted_triplets)


Device set to use cuda:0
[nltk_data] Downloading package punkt_tab to /home/chris/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


<s><triplet> Norman <subj> France <obj> country <triplet> French <subj> France <obj> country <triplet> Normandy <subj> France <obj> country</s>
[{'head': 'Norman', 'type': 'country', 'tail': 'France'}, {'head': 'French', 'type': 'country', 'tail': 'France'}, {'head': 'Normandy', 'type': 'country', 'tail': 'France'}]
