In [None]:
# Install the packages listed in requirements.txt via the %pip magic.
%pip install -r requirements.txt

In [None]:
import os
import re
from getpass import getpass

import pymupdf
import requests
from groq import Groq
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import CharacterTextSplitter
from pypdf import PdfReader
from pypdf import PdfReader, PdfWriter
from pypdf.annotations import Text
from serpapi.google_scholar_search import GoogleScholarSearch


Retrieve references from paper using named destinations

In [None]:
def refs_dict(pdf_path):
  """Map each citation destination in a PDF to the text found at its target.

  Every named destination whose name starts with ``'cite'`` is resolved to its
  page, and the text inside a 400 x 20 pt box anchored at the destination's
  ``/Left``/``/Top`` position is read with PyMuPDF.

  Args:
    pdf_path: Path of the PDF to scan.

  Returns:
    dict mapping destination name -> text extracted at that destination.
    Destinations that cannot be resolved to a page, or that carry no numeric
    ``/Left``/``/Top`` coordinates, are left out.
  """
  cites = {}
  reader = PdfReader(pdf_path)
  # The context manager closes the PyMuPDF document even if extraction raises.
  with pymupdf.open(pdf_path) as doc:
    for cite, info in reader.named_destinations.items():
      if not cite.startswith('cite'):
        continue
      left = info['/Left'] if '/Left' in info else None
      top = info['/Top'] if '/Top' in info else None
      # Destinations without explicit (or with null) coordinates give no
      # anchor point to read text from.
      if not isinstance(left, (int, float)) or not isinstance(top, (int, float)):
        continue
      # Let pypdf resolve the target page instead of comparing every page
      # object against the destination.
      page_index = reader.get_destination_page_number(info)
      if page_index is None or page_index < 0:
        continue
      # /Top is measured from the bottom of the page; subtracting it from the
      # target page's own height gives the top-down y used for the text box.
      height = float(reader.pages[page_index].mediabox.height)
      x1 = float(left)
      y1 = height - float(top)
      rect = pymupdf.Rect(x1, y1, x1 + 400, y1 + 20)
      cites[cite] = doc[page_index].get_textbox(rect)
  return cites

Create folder to store source files

In [None]:
def make_folder(pdf_path):
  """Create (if needed) the folder that will hold a paper's reference files.

  The folder name is ``pdf_path`` with every non-word character removed and
  the last three characters dropped, followed by ``"_refs"``.

  Args:
    pdf_path: Path or file name of the source PDF.

  Returns:
    The folder name, relative to the current working directory.
  """
  stem = re.sub(r'\W+', '', pdf_path)[:-3]
  folder = f"{stem}_refs"
  if os.path.exists(folder):
    return folder
  os.makedirs(folder)
  return folder

Retrieve documents via Serp API Google Scholar search

In [None]:
def get_doc(folder, title, cite):
    """Download the first Google Scholar resource found for ``title``.

    Args:
        folder: Directory to save into (relative to the working directory or
            absolute); it must already exist.
        title: Search query sent to Google Scholar through SerpApi.
        cite: Citation key; its first five characters are dropped and the
            remaining non-word characters removed to build the file name.

    Returns:
        The saved file's name (``"<key>.pdf"``), without the folder.

    Raises:
        KeyError, IndexError: the search response has no organic result with
            a resource link.
        requests.HTTPError: the download answered with an error status.
    """
    api_key = os.environ.get("SERP_API_KEY")
    filename = re.sub(r'\W+', '', cite[5:]) + ".pdf"

    search = GoogleScholarSearch({"q": title, "api_key": api_key})
    data = search.get_dict()
    url = data['organic_results'][0]['resources'][0]['link']

    # Bound the wait and fail on HTTP errors so an error page is never
    # written to disk under a .pdf name.
    r = requests.get(url, timeout=30)
    r.raise_for_status()

    # Save inside `folder` as given, matching how make_folder creates it and
    # how init_rag reads it, rather than assuming a fixed /content root.
    with open(os.path.join(folder, filename), "wb") as f:
        f.write(r.content)
    return filename

def get_docs(folder, cites):
  """Download one document per citation into ``folder``.

  Args:
    folder: Destination folder handed to ``get_doc``.
    cites: dict mapping citation key -> search text for that citation.
  """
  for cite_key, search_text in cites.items():
    get_doc(folder, search_text, cite_key)


Chunk text to fit within the model's context length and Groq's rate limit

In [None]:
# HELPER FUNCS FOR summarize()

def texthalf(text):
  """Split ``text`` at its midpoint into a two-element list.

  For odd lengths the second half is one item longer.
  """
  middle = len(text) // 2
  first, second = text[:middle], text[middle:]
  return [first, second]

def text_extractor(file, chunks = 2):
    """Return the text of every page of a PDF as a single line.

    Args:
        file: Whatever ``PdfReader`` accepts (a path in this notebook).
        chunks: Not used by the function; kept so existing calls still work.

    Returns:
        The concatenated page texts with each newline replaced by a space.
    """
    page_texts = [page.extract_text() for page in PdfReader(file).pages]
    combined = "".join(page_texts)
    return re.sub('\n', ' ', combined)

def get_chunked(folder, file):
    """Extract the text of ``file`` inside ``folder`` and split it in two halves.

    The path is built from ``folder`` as given (relative to the working
    directory or absolute) instead of assuming a fixed ``/content`` root, so
    it resolves to the same place where ``make_folder`` creates the folder.

    Args:
        folder: Folder containing the PDF.
        file: File name of the PDF inside ``folder``.

    Returns:
        Two-element list with the first and second half of the extracted text.
    """
    return texthalf(text_extractor(os.path.join(folder, file)))

Call Groq Llama 3 70B to summarize the halves of the document then combine the two

In [None]:
def summarize(client, text):
    """Send ``text`` to the chat model and return its reply.

    A fixed system prompt instructs the model to explain a citation using the
    passage and the cited-paper chunks contained in ``text``.

    Args:
        client: Object exposing ``chat.completions.create`` (a ``Groq`` client
            in this notebook).
        text: Content of the user message.

    Returns:
        The message content of the first choice in the completion.
    """
    system_message = {
        "role": "system",
        "content": """You are a professional summarizer hired by me to provide context to citations within papers. You will be provided with the text
              within which the citation appears as well as relevant text chunks from the cited paper. Please use the text chunks to provide context for the citation.
              Make your explanation as brief as possible while still fully explaining the citation's relevance to the passage.
              """
    }
    user_message = {
        "role": "user",
        "content": text,
    }
    response = client.chat.completions.create(
        messages=[system_message, user_message],
        model="llama3-70b-8192",
        temperature=0.5,
        max_tokens=1024,
        top_p=1,
        stop=None,
        stream=False,
    )
    first_choice = response.choices[0]
    return first_choice.message.content

def final(texts, ctx):
  """Build the user prompt asking the model to explain a passage.

  Args:
    texts: Passage in which the citation appears.
    ctx: Context taken from the cited paper.

  Returns:
    The prompt string with both values embedded in double quotes.
  """
  prompt = f"""Explain the following passage:
"{texts}"
using this context from the paper cited: "{ctx}". CONTEXTUALIZE: """
  return prompt

def summarize_paper(client, text, ref_ctx):
  """Explain a citing passage using context from the cited paper.

  Args:
    client: Chat client forwarded to ``summarize``.
    text: Passage in which the citation appears.
    ref_ctx: Context from the cited paper, embedded in the prompt by ``final``.

  Returns:
    Whatever ``summarize`` returns for the prompt built by ``final``.
  """
  return summarize(client, final(text, ref_ctx))



Map each citation to the locations (page number and link rectangle) where it is referenced, so summaries can later be attached there

In [None]:
def locs_dict(pdf_path, cites):
  """Locate every in-text link that points at one of the given citations.

  Scans all ``/Link`` annotations with a ``/GoTo`` action and groups their
  rectangles by the destination name they jump to.

  Args:
    pdf_path: Path of the PDF to scan.
    cites: Iterable of citation destination names to look for.

  Returns:
    dict mapping each name in ``cites`` -> {page index: [link /Rect, ...]}.
    Citations that are never linked map to an empty dict.
  """
  reader = PdfReader(pdf_path)
  locs = {cite: {} for cite in cites}
  for page_num, page in enumerate(reader.pages):
      if '/Annots' not in page:
          continue
      for annot in page['/Annots']:
          obj = annot.get_object()
          if not ('/Subtype' in obj and obj['/Subtype'] == '/Link' and '/A' in obj):
              continue
          action = obj['/A']
          if not ('/S' in action and action['/S'] == '/GoTo' and '/D' in action):
              continue
          cite_ref = action['/D']
          # A /D entry may be an explicit destination array instead of a name;
          # arrays are unhashable and can never match a citation key.
          if not isinstance(cite_ref, str) or cite_ref not in locs:
              continue
          locs[cite_ref].setdefault(page_num, []).append(obj['/Rect'])
  return locs

Get the text surrounding each in-text citation

In [None]:
def get_ctx(doc, reader, cite):
  """Collect the text around every in-text link to a citation.

  For each ``/Link`` annotation with a ``/GoTo`` action targeting ``cite``,
  reads the text in a band spanning x = 100..500 pt that starts 10 pt above
  the second ``/Rect`` coordinate (after flipping it against the page height)
  and is 25 pt tall.

  Args:
    doc: PyMuPDF document, indexable by page number.
    reader: pypdf reader opened on the same file.
    cite: Citation destination name to match exactly. A non-string
      collection of names is also accepted and matched by membership.

  Returns:
    dict mapping page index -> list of context strings (newlines replaced by
    spaces), one per matching link, in annotation order.
  """
  ctx = {}
  for page_num, page in enumerate(reader.pages):
    if '/Annots' not in page:
      continue
    # Use this page's own height for the y flip rather than assuming 792 pt.
    page_height = float(page.mediabox.height)
    for annot in page['/Annots']:
      obj = annot.get_object()
      if not ('/Subtype' in obj and obj['/Subtype'] == '/Link'
              and '/A' in obj and obj['/A']['/S'] == '/GoTo'):
        continue
      cite_ref = obj['/A']['/D']
      # Explicit destination arrays are not citation names.
      if not isinstance(cite_ref, str):
        continue
      # Compare names for equality: `in` on a string is a substring test and
      # would also match e.g. 'cite.a' against 'cite.ab'.
      if isinstance(cite, str):
        is_match = cite_ref == cite
      else:
        is_match = cite_ref in cite
      if not is_match:
        continue
      link_rect = obj['/Rect']
      y1 = page_height - float(link_rect[1]) - 10
      rect = pymupdf.Rect(100, y1, 500, y1 + 25)
      snippet = re.sub('\n', ' ', doc[page_num].get_textbox(rect))
      ctx.setdefault(page_num, []).append(snippet)
  return ctx


def ctx_dict(pdf_path, cites):
  """Gather the surrounding text for every given citation.

  Args:
    pdf_path: Path of the PDF to scan.
    cites: Iterable of citation destination names.

  Returns:
    dict mapping each citation name -> result of ``get_ctx`` for it.
  """
  reader = PdfReader(pdf_path)
  # The context manager closes the PyMuPDF document once all contexts are read.
  with pymupdf.open(pdf_path) as doc:
    return {cite: get_ctx(doc, reader, cite) for cite in cites}


In [None]:
def init_rag(folder):
    """Build a FAISS retriever over the text of the PDFs in ``folder``.

    Only regular files ending in ``.pdf`` (case-insensitive) are read, so
    stray entries such as checkpoint directories are ignored. Files are
    processed in sorted order.

    Args:
        folder: Directory holding the downloaded reference PDFs.

    Returns:
        The retriever obtained from the FAISS store via ``as_retriever()``.
    """
    text_splitter = CharacterTextSplitter(chunk_size=400, chunk_overlap=50, separator = " ")
    docs = []
    for file in sorted(os.listdir(folder)):
        path = os.path.join(folder, file)
        # Anything that is not a PDF file would make the PDF reader fail.
        if not (file.lower().endswith(".pdf") and os.path.isfile(path)):
            continue
        docs.append(text_extractor(path))
    texts = text_splitter.create_documents(docs)
    embeddings = HuggingFaceEmbeddings()
    db = FAISS.from_documents(texts, embeddings)
    retriever = db.as_retriever()
    return retriever

In [None]:
def summaries_dict(folder_path, ctx):
  """Generate an explanation for every citation context.

  Args:
    folder_path: Folder with the downloaded reference PDFs, indexed through
      ``init_rag``.
    ctx: dict mapping citation -> {page index: [context passage, ...]}.

  Returns:
    dict with the same nesting as ``ctx`` where each passage is replaced by
    the summary returned by ``summarize_paper``. Each summary is also printed.
  """
  key = os.environ.get("GROQ_API_KEY")
  client = Groq(api_key = key)
  summ = {}
  index = init_rag(folder_path)
  # Distinct loop names: the inner loop used to rebind the `ctx` parameter.
  for cite, pages in ctx.items():
    summ[cite] = {}
    for page, passages in pages.items():
      summ[cite][page] = []
      for passage in passages:
        # Keep at most two chunks even if the retriever disregards `k`.
        docs = index.get_relevant_documents(passage, k = 2)[:2]
        info = str([doc.page_content for doc in docs])
        summary = summarize_paper(client, passage, info)
        print(summary)
        summ[cite][page].append(summary)
  return summ




Example:

In [None]:
def add_popup_annotation(writer, page_number, rect, content):
    """Attach a text annotation showing ``content`` to a page of ``writer``.

    Args:
        writer: Object exposing ``add_annotation`` (a ``PdfWriter`` here).
        page_number: Index of the page that receives the annotation.
        rect: Rectangle passed to the annotation.
        content: Text of the annotation.
    """
    note = Text(rect=rect, text=content)
    writer.add_annotation(annotation=note, page_number=page_number)

def annotations(p, c, l, s, out_path="annotated.pdf"):
  """Write a copy of a PDF with one text annotation per citation link.

  Args:
    p: Path of the source PDF.
    c: dict whose keys are the citations to annotate.
    l: dict mapping citation -> {page index: [link rectangle, ...]}.
    s: dict mapping citation -> {page index: [summary, ...]}, aligned
      position by position with ``l``.
    out_path: Where the annotated copy is written; defaults to
      ``"annotated.pdf"`` in the working directory.

  Raises:
    KeyError, IndexError: ``s`` has no summary for a location listed in ``l``.
  """
  reader = PdfReader(p)
  writer = PdfWriter()
  for page in reader.pages:
    writer.add_page(page)
  for cite in c.keys():
    for page, coords in l[cite].items():
      for i, rect in enumerate(coords):
        # Turn line breaks into spaces so the words on either side are not
        # glued together in the annotation text.
        note = re.sub(r'\n+', ' ', s[cite][page][i])
        add_popup_annotation(writer, page, rect, note)
  with open(out_path, "wb") as fp:
      writer.write(fp)

In [None]:
# Read the API keys without echoing them into the notebook output and store
# them under the environment variable names that get_doc ("SERP_API_KEY") and
# summaries_dict ("GROQ_API_KEY") look up.
os.environ["GROQ_API_KEY"] = getpass("Groq API Key")
os.environ["SERP_API_KEY"] = getpass("Serp API Key")

pdf_path = "1706.03762v7.pdf"

folder_path = make_folder(pdf_path)

refs = refs_dict(pdf_path)

# Test with a single reference to keep the API usage small.
cite, title = list(refs.items())[0]
refs = {cite:title}

get_docs(folder_path, refs)

locs = locs_dict(pdf_path, refs.keys())

ctx = ctx_dict(pdf_path, refs.keys())

# Named `summaries` so the builtin `sum` is not shadowed for the rest of the session.
summaries = summaries_dict(folder_path, ctx)