# Setup

In [1]:
from db_funcs import get_all_content, get_db_con, insert_article, insert_question
from utils import Sentence_Extractor
from neo4j_funcs import Neo4j_Driver
from EntityRelationshipExtractor import ER_Extractor
from tqdm import tqdm
import requests
import dotenv
from my_qdrant import MyQdrant
from sentence_transformers import SentenceTransformer
import os

# Load variables from a local .env file into the environment first,
# ahead of constructing the helper objects.
dotenv.load_dotenv()

# Shared helpers: the sentence splitter, the entity-relationship extractor
# and the Neo4j driver are all used by the extraction cell further down.
neo4j = Neo4j_Driver()
sentence_extractor = Sentence_Extractor()
er_extractor = ER_Extractor()

[nltk_data] Downloading package punkt_tab to /home/chris/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
Device set to use cuda:0


# SQL Database Creation
Fills a SQL database with the content from the SQuAD 2.0 dataset.

In [2]:
# Download the SQuAD 2.0 dev dataset and store every paragraph (as an article)
# together with its questions in the SQL database.
url = 'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json'
# Without a timeout a stalled connection would hang the cell indefinitely.
response = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
response.raise_for_status()
squad_dev = response.json()

conn = get_db_con()

# NOTE(review): running this cell against a database that already holds the
# questions fails with a duplicate-key error on "questions_pkey" (see the
# captured output) - presumably from an earlier run; confirm whether
# insert_question should skip existing ids.
for topic in squad_dev["data"]:
    print(topic['title'])
    for paragraph in topic["paragraphs"]:
        try:
            with conn.cursor() as cur:
                article_id = insert_article(cur, topic["title"], paragraph["context"])
                for qa in paragraph["qas"]:
                    insert_question(cur, qa["id"], article_id, qa["question"], qa["answers"], qa["is_impossible"])
            # One commit per paragraph keeps an article and its questions together.
            conn.commit()
        except Exception:
            # Discard the half-inserted paragraph so the connection is not left
            # in a failed transaction, then surface the original error.
            conn.rollback()
            raise

Normans


UniqueViolation: duplicate key value violates unique constraint "questions_pkey"
DETAIL:  Key (id)=(56ddde6b9a695914005b9628) already exists.


# Embedding Ingestion
Creates an embedding for each content block and stores the embeddings in the Qdrant embedding database, each alongside its content_id.

In [None]:
# Fetch every content block and its id from the SQL database.
with get_db_con().cursor() as cur:
    content_ids, all_content = get_all_content(cur)

# Sentence-embedding model used to encode the content blocks.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Qdrant settings come from the environment.
# NOTE(review): os.getenv returns a string (or None), so the vector size
# reaches MyQdrant as text - confirm MyQdrant converts it to an int.
collection_name = os.getenv('QDRANT_COLLECTION_NAME')
vector_size = os.getenv('QDRANT_VECTOR_SIZE')
qdrant_client = MyQdrant(collection_name, vector_size)

print("Creating embeddings from content")
content_embeddings = model.encode(all_content)

# Embeddings and ids are passed in the same order, pairing each vector with its content id.
print("adding embeddings to db")
qdrant_client.add_points(content_embeddings, content_ids)

print("embedding db is ready")

# Extract Sentences and Entity Relationships
For each content block from the database, extract all of its sentences, then extract all entities and relationships from those sentences using the model and system defined in the entity relationship extractor. Finally, insert each extracted entity into the Neo4j database, keeping track of which content block the entity is from. Entities can appear in multiple content blocks, so Neo4j stores an array of content ids for each entity node.

In [None]:

# Fetch every content block and its id from the SQL database.
with get_db_con().cursor() as cur:
    content_ids, all_content = get_all_content(cur)

# Walk (content_id, content) pairs behind a progress bar; zip has no length,
# so the total is passed to tqdm explicitly.
progress = tqdm(
    zip(content_ids, all_content),
    total=len(content_ids),
    desc="Processing content",
)
for content_id, content in progress:
    # Split the block into sentences, then extract relationship triplets from them.
    sentences = sentence_extractor.get_sentences(content)
    triplets = er_extractor.extract_ERs(sentences)
    for triplet in triplets:
        # Head and tail are both stored under the "entity" label, tagged with the source content id.
        neo4j.insert_relationship(
            triplet.head, "entity", triplet.type, triplet.tail, "entity", content_id
        )
    