### Document parser



In [1]:
import traceback
! pip install xmltodict
! pip install grobid grobid[json]


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


'/home/ankush/workplace/tryout_repos/chatbot-rag-app/test_notebooks'

In [50]:
import requests
import xmltodict
import json
from grobid.tei import Parser

import os
import sys

# Absolute path of the current working directory (abspath of the empty string
# resolves against the cwd); add_folder_to_sys_path builds its paths from it.
CURRENT_DIR = os.path.abspath("")


def add_folder_to_sys_path(folder_path):
    """Append ``<CURRENT_DIR>/<folder_path>`` to ``sys.path`` if it is not listed yet.

    Args:
        folder_path: Folder to expose for imports, written relative to
            ``CURRENT_DIR`` (for example ``"../api"``).
    """
    target = "{}/{}".format(CURRENT_DIR, folder_path)
    if target in sys.path:
        # Already importable; avoid piling up duplicate entries on re-runs.
        return
    sys.path.append(target)


# add api folder to sys path
add_folder_to_sys_path("../api")

from vector_util import compute_vector, extract_and_map_sparse_vector, tokenizer

# PDF to parse. Set the PAPER_PATH environment variable to point the notebook at
# another file or machine; without it the original local path is used.
PAPER_PATH = os.environ.get(
    "PAPER_PATH",
    "/home/ankush/workplace/papers/KnowlegeGraphs/textrank_emnlp04.pdf",
)


def timer(func):
    """Decorator that prints how long each call to ``func`` takes.

    Args:
        func: Callable to wrap.

    Returns:
        A wrapper that forwards all positional and keyword arguments to
        ``func``, prints ``Time taken by <name> is <seconds>`` once the call
        returns, and passes the result through unchanged. Nothing is printed
        when ``func`` raises.
    """
    # Imported locally so the cell does not depend on another cell's imports.
    import functools
    import time

    @functools.wraps(func)  # keep func's __name__ / __doc__ on the wrapper
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so system clock adjustments cannot skew the result.
        start = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - start
        print(f"Time taken by {func.__name__} is {elapsed}")
        return result

    return wrapper


@timer
def parse_pdf_by_grobid(pdf_path="", grobid_url="http://localhost:8080"):
    """Send a PDF to a GROBID server and return the parsed article as a dict.

    Args:
        pdf_path: Path of the PDF file to upload.
        grobid_url: Base URL of the GROBID service; the full-text endpoint
            ``/api/processFulltextDocument`` is appended to it.

    Returns:
        The structure decoded from ``article.to_json()``, or ``None`` when any
        step fails (file cannot be opened, HTTP error, response cannot be
        parsed). Failures are not re-raised; their traceback is printed.
    """
    url = f"{grobid_url}/api/processFulltextDocument"
    try:
        # The context manager closes the PDF handle even if the request fails.
        with open(pdf_path, "rb") as pdf_file:
            response = requests.post(url, files={"input": pdf_file}, data={})
        # Fail on 4xx/5xx here rather than handing an error page to the TEI parser.
        response.raise_for_status()
        article = Parser(response.text).parse()
        return json.loads(article.to_json())
    except Exception:
        # Best-effort: report the problem and let the caller deal with None.
        print(traceback.format_exc())
        return None


# Parse the configured paper. The result is None when parsing failed, because
# parse_pdf_by_grobid prints the traceback instead of raising.
article_content = parse_pdf_by_grobid(PAPER_PATH)
# print(article_content)


Time taken by parse_pdf_by_grobid is 1.7213678359985352


In [46]:

# Peek at the parsed structure. Only the last expression is rendered as the cell
# output; the two lookups before it merely confirm that the keys exist.
# article_content = json.loads(article_content)
article_content["sections"][0].keys()
article_content["sections"][0]["title"]
article_content["sections"][0]["paragraphs"][0]
# article_content["sections"][0]["paragraphs"][0].keys()
# article_content

{'text': "Graph-based ranking algorithms like Kleinberg's HITS algorithm (Kleinberg, 1999) or Google's PageRank (Brin and Page, 1998) have been successfully used in citation analysis, social networks, and the analysis of the link-structure of the World Wide Web. Arguably, these algorithms can be singled out as key elements of the paradigm-shift triggered in the field of Web search technology, by providing a Web page ranking mechanism that relies on the collective knowledge of Web architects rather than individual content analysis of Web pages. In short, a graph-based ranking algorithm is a way of deciding on the importance of a vertex within a graph, by taking into account global information recursively computed from the entire graph, rather than relying only on local vertex-specific information.",
 'refs': [{'start': 63, 'end': 80, 'marker': 'bibr', 'target': '#b6'},
  {'start': 102, 'end': 123, 'marker': 'bibr', 'target': '#b0'}]}

## Index paragraphs

In [44]:
# Qdrant client setup
from qdrant_client import QdrantClient
from qdrant_client import models
import uuid

# Name of the Qdrant collection that holds the paper's paragraphs
COLLECTION_NAME = "literature_collection"

# Client for the Qdrant instance listening on localhost
client = QdrantClient("http://localhost:6333")

# Sparse-only layout: no dense vectors, a single sparse vector named "text"
# whose index is configured with on_disk=False.
text_sparse_params = models.SparseVectorParams(
    index=models.SparseIndexParams(on_disk=False)
)

# Drops any existing collection with this name and creates it afresh.
client.recreate_collection(
    collection_name=COLLECTION_NAME,
    vectors_config={},
    sparse_vectors_config={"text": text_sparse_params},
)

True

In [45]:
# NOTE(review): `paragraphs` is never populated in this cell; drop it if no later cell reads it.
paragraphs = []

# Build one sparse-vector point per paragraph, then send them all in a single
# upsert instead of paying one network round-trip per paragraph.
points = []
# title = article_content["title"]
for section in article_content["sections"]:
    section_title = section["title"]
    for para in section["paragraphs"]:
        vec, _tokens = compute_vector(para["text"])
        # Keep only the non-zero dimensions: their positions and their weights.
        indices = vec.nonzero().numpy().flatten()
        values = vec.detach().numpy()[indices]
        points.append(
            models.PointStruct(
                id=str(uuid.uuid4()),  # fresh random id for every paragraph
                payload={
                    # "paper_title": title,
                    "section_title": section_title,
                    "page_content": para["text"],
                },
                vector={
                    "text": models.SparseVector(
                        indices=indices.tolist(), values=values.tolist()
                    )
                },
            )
        )

# Skip the request entirely when the article produced no paragraphs.
if points:
    client.upsert(collection_name=COLLECTION_NAME, points=points)

## Find similar records

In [67]:
# Preparing a query vector

def get_context_docs(query_text, limit=10):
    """Retrieve the payloads of the stored paragraphs most similar to a query.

    Args:
        query_text: Natural-language query; encoded with ``compute_vector``.
        limit: Maximum number of hits to request from Qdrant.

    Returns:
        A list with the payload of every hit, in the order the search returned them.
    """
    query_vec, _query_tokens = compute_vector(query_text)

    # NOTE(review): the expansion result is not used below. The call is kept in
    # case the helper reports the expanded terms itself; confirm and remove if not.
    extract_and_map_sparse_vector(query_vec, tokenizer)

    # Keep only the non-zero dimensions of the query vector.
    query_indices = query_vec.nonzero().numpy().flatten()
    query_values = query_vec.detach().numpy()[query_indices]

    # Only payloads are returned, so stored vectors are not requested back.
    hits = client.search(
        collection_name=COLLECTION_NAME,
        query_vector=models.NamedSparseVector(
            name="text",
            vector=models.SparseVector(
                # Plain Python lists, the same form used when the points were indexed.
                indices=query_indices.tolist(),
                values=query_values.tolist(),
            ),
        ),
        limit=limit,
    )

    return [hit.payload for hit in hits]

## Build the prompt

In [86]:
def make_rag_prompt(question, docs):
    """Assemble the retrieval-augmented prompt sent to the chat model.

    Args:
        question: The user's question, inserted after the passages.
        docs: Iterable of dicts with ``section_title`` and ``page_content`` keys.

    Returns:
        The prompt text: fixed instructions and an example, the passages
        (each headed by ``NAME: <section_title>`` and separated by ``---``),
        then the question and an open ``Response:`` line.
    """
    passages = "\n---\n".join(
        f"NAME: {doc['section_title']}\n{doc['page_content']}" for doc in docs
    )

    # Every template line carries an eight-space indent and a trailing newline;
    # the text starts with a newline and ends with four spaces.
    indent = " " * 8
    template_lines = [
        "Use the following passages and chat history to answer the user's question. ",
        "Each passage has a NAME which is the title of the document. After your answer, leave a blank line and then give the source name of the passages you answered from. Put them in a comma separated list, prefixed with SOURCES:.",
        "",
        "Example:",
        "",
        "Question: What is the meaning of life?",
        "Response:",
        "The meaning of life is 42.",
        "",
        "SOURCES: Hitchhiker's Guide to the Galaxy",
        "",
        "If you don't know the answer, just say that you don't know, don't try to make up an answer.",
        "",
        "----",
        f"{passages}",
        "----",
        "",
        f"Question: {question}",
        "Response:",
    ]
    return "\n" + "".join(f"{indent}{line}\n" for line in template_lines) + " " * 4


## Talk to the LLM

In [87]:
from langchain_community.chat_models import ChatOpenAI
from langchain_core.messages import HumanMessage, SystemMessage

def get_llm(temperature=0):
    """Create a streaming ``ChatOpenAI`` client.

    Args:
        temperature: Sampling temperature passed to the model.

    Returns:
        A ``ChatOpenAI`` instance whose API key is read from the
        ``OPENAI_API_KEY`` environment variable.
    """
    return ChatOpenAI(
        openai_api_key=os.getenv("OPENAI_API_KEY"),
        streaming=True,
        temperature=temperature,
    )

# Question to ask about the indexed paper
query_text = "How does textrank algorithm work?"

# Retrieve supporting paragraphs and fold them into one RAG prompt
context_docs = get_context_docs(query_text)
qa_prompt = make_rag_prompt(query_text, context_docs)

# The whole prompt is sent to the chat model as a single human message
messages = [HumanMessage(content=qa_prompt)]

answer = get_llm().invoke(messages)
print(answer.content)

{'text': 2.8, '##rank': 2.28, 'algorithm': 1.83, 'work': 1.3, 'algorithms': 1.25, 'calculate': 1.13, 'rank': 1.08, 'method': 0.99, 'works': 0.91, 'index': 0.87, 'step': 0.85, 'function': 0.78, 'ranked': 0.73, 'mechanism': 0.68, 'math': 0.67, 'texts': 0.63, 'java': 0.58, 'fuzzy': 0.55, 'equation': 0.5, ':': 0.38, 'reading': 0.35, 'button': 0.34, 'tracking': 0.33, 'solve': 0.29, 'computer': 0.28, 'generator': 0.27, 'how': 0.26, 'tool': 0.26, 'create': 0.25, 'strategy': 0.24, 'help': 0.22, 'engine': 0.17, 'machine': 0.17, 'search': 0.17, 'error': 0.17, 'accuracy': 0.17, 'word': 0.14, 'graph': 0.14, 'optimization': 0.14, 'editor': 0.13, 'process': 0.11, 'data': 0.09, 'avery': 0.08, 'technique': 0.07, 'connection': 0.04, 'calculated': 0.04, 'useful': 0.03}
The TextRank algorithm works by building a graph associated with the text, where each vertex represents a text unit to be ranked. It then scores these text units based on the importance of other text units they are connected to in the gra

In [91]:
# Top-level keys of the parsed article:
# dict_keys(['bibliography', 'keywords', 'citations', 'sections', 'tables', 'abstract'])
article_content.keys()
# Last expression, so this is what the cell displays.
article_content["bibliography"]

{'title': 'TextRank: Bringing Order into Texts',
 'authors': [{'person_name': {'surname': 'Mihalcea', 'first_name': 'Rada'},
   'affiliations': [{'department': 'Department of Computer Science',
     'institution': 'University of North Texas',
     'laboratory': None}],
   'email': None},
  {'person_name': {'surname': 'Tarau', 'first_name': 'Paul'},
   'affiliations': [{'department': 'Department of Computer Science',
     'institution': 'University of North Texas',
     'laboratory': None}],
   'email': 'tarau¡@cs.unt.edu'}],
 'date': None,
 'ids': None,
 'target': None,
 'publisher': None,
 'journal': None,
 'series': None,
 'scope': None}