In [1]:
import os, requests, json, openai, PyPDF2, pdfplumber, re, tiktoken
from dotenv import load_dotenv; load_dotenv()
from io import BytesIO
from pdfminer.high_level import extract_text

# Credentials come from the environment (populated by load_dotenv above); None if unset.
openai.api_key = os.getenv("OPENAI_API_KEY")

# PDF that every cell below reads from.
pdf_file = "attention.pdf"

In [6]:
class SectionNode:
    """A node of the document outline tree.

    Each node keeps its title, the ``start`` marker it was created with, a
    ``text`` field and an ordered list of child nodes. Text extraction and
    summarisation are still TODO stubs: both currently copy the title into
    ``text``.
    """

    def __init__(self, title, start):
        """Create a childless node and immediately fill ``text`` via the stubs.

        Args:
            title: Section heading.
            start: Marker for where the section begins; stored as given.
        """
        # The instance's id() doubles as the key used by getJson().
        self.id = str(id(self))
        self.start = start
        self.title = title
        self.text = ""
        self.children = []
        self.json = {}

        self.getText()
        self.getSummary()

    def addChild(self, other):
        """Attach ``other`` as the last child of this node."""
        self.children.append(other)

    def getJson(self):
        """Store this node's record under its id in ``self.json`` and return that dict.

        Children are referenced by id only, not nested.
        """
        self.json[self.id] = {
            "start": self.start,
            "title": self.title,
            "text": self.text,
            "children": [node.id for node in self.children],
        }
        return self.json

    def getText(self):
        """Placeholder: sets ``text`` to the title."""
        #TODO Use PyPDF to get the text based on self.start
        self.text = self.title

    def getSummary(self):
        """Placeholder: sets ``text`` to the title."""
        #TODO Use GPT to get the summary of text
        self.text = self.title

    def __str__(self):
        """Return a three-line summary: title, text and the children's titles."""
        child_titles = [node.title for node in self.children]
        return (
            f'Title: {self.title}\n'
            f'Text: {self.text}\n'
            f'Children: {child_titles}\n'
        )

In [7]:
def create_graph(pdf_path, title="Attention Is All You Need"):
    """Build the section tree for a PDF from its outline (bookmarks).

    Args:
        pdf_path: Path of the PDF file to read.
        title: Title given to the root node. Defaults to the paper this
            notebook was written for, so existing calls are unaffected.

    Returns:
        The root SectionNode; outline entries hang below it as children.
    """
    # The context manager closes the handle; the original left it open.
    # The outline is read inside the block, while the stream is still available.
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)

        root = SectionNode(title, "START")
        get_bookmark(root, pdf_reader.outline)

    return root


def get_bookmark(root, outline):
    """Recursively attach the entries of ``outline`` below ``root``.

    Every non-list item becomes a SectionNode child of ``root`` (built from
    its ``title`` and ``page``). A nested list is treated as the children of
    the item that precedes it.

    Args:
        root: Node that receives the entries of this outline level.
        outline: Nested list of outline items; ``None`` or empty is a no-op.
    """
    # Fall back to ``root`` when a nested list has no preceding sibling;
    # the original recursed with None there and crashed on None.addChild.
    parent_for_nested = root

    for item in outline or []:
        if isinstance(item, list):
            get_bookmark(parent_for_nested, item)
        else:
            parent_for_nested = SectionNode(item.title, item.page)
            root.addChild(parent_for_nested)


def get_json_output(json_output, root):
    """Append the JSON record of ``root`` and all its descendants to ``json_output``.

    Nodes are visited parent first, then children in their stored order.
    The list is filled in place; nothing is returned.
    """
    pending = [root]
    while pending:
        node = pending.pop()
        json_output.append(node.getJson())
        # Push children reversed so the first child is popped next.
        pending.extend(reversed(node.children))


def main(pdf_file):
    """Build the section tree for ``pdf_file`` and return it flattened.

    Returns:
        A list with one JSON record per node, as collected by get_json_output.
    """
    records = []
    get_json_output(records, create_graph(pdf_file))
    return records

[{'1660826770768': {'start': 'START',
   'title': 'Attention Is All You Need',
   'text': '',
   'children': ['1660827658064',
    '1660827658384',
    '1660827658576',
    '1660827660624',
    '1660827660880',
    '1660827662224',
    '1660827663248']}},
 {'1660827658064': {'start': IndirectObject(136, 0, 1660825721360),
   'title': 'Introduction',
   'text': '',
   'children': []}},
 {'1660827658384': {'start': IndirectObject(136, 0, 1660825721360),
   'title': 'Background',
   'text': '',
   'children': []}},
 {'1660827658576': {'start': IndirectObject(136, 0, 1660825721360),
   'title': 'Model Architecture',
   'text': '',
   'children': ['1660827658768',
    '1660827658960',
    '1660827659792',
    '1660827660048',
    '1660827660304']}},
 {'1660827658768': {'start': IndirectObject(169, 0, 1660825721360),
   'title': 'Encoder and Decoder Stacks',
   'text': '',
   'children': []}},
 {'1660827658960': {'start': IndirectObject(169, 0, 1660825721360),
   'title': 'Attention',
   'te

In [None]:
# `root` is otherwise only a local inside main(); build the tree here so this
# cell also works after Restart Kernel -> Run All.
root = create_graph(pdf_file)

# Show the root, its sections and their subsections (two levels deep).
print(root)

for child in root.children:
    print(child)

    for subchild in child.children:
        print(subchild)

In [None]:
# First two top-level sections of the tree.
child_intro, child_back = root.children[0], root.children[1]

# Dump the raw outline for inspection.
pdf_reader = PyPDF2.PdfReader(pdf_file)
print(pdf_reader.outline, '\n\n')

# Resolve the first section's start reference and display its /Contents entry.
intro_start_object = pdf_reader.get_object(child_intro.start)
intro_start_object.get('/Contents')

In [None]:
# Experiment: pull the text that follows one bookmark on its target page.
pdf_reader = PyPDF2.PdfReader(pdf_file)

# Bookmark destination: target page object number plus /Left and /Top.
outline_dict = {
        # The third IndirectObject argument must be the owning reader. The
        # original passed a bare integer (it looks like one copied from a
        # repr), which cannot be resolved against this file.
        '/Page': PyPDF2.generic.IndirectObject(186, 0, pdf_reader),
        '/Left': 108,
        '/Top': 166.913
    }

page_reference = outline_dict['/Page']
left = outline_dict['/Left']
top = outline_dict['/Top']


def _page_idnum(page):
    """Return a page's PDF object number, tolerating both attribute spellings."""
    reference = getattr(page, 'indirect_reference', None)
    if reference is None:
        reference = getattr(page, 'indirect_ref', None)
    return getattr(reference, 'idnum', None)


# Find the page index by matching object numbers. The original called
# getPageNumber() on the page's /Contents entry, which is not a reader.
page_number = next(
    (index for index, page in enumerate(pdf_reader.pages)
     if _page_idnum(page) == page_reference.idnum),
    None,
)
if page_number is None:
    raise ValueError(f"No page with object number {page_reference.idnum} in {pdf_file!r}")

# reader.pages[...] / extract_text() replace the removed getPage()/extractText().
pdf_page = pdf_reader.pages[page_number]
page_text = pdf_page.extract_text()

# The original filter compared the box with its own corner (x0 < left with
# x0 == left), so it was always False and extracted_text stayed empty.
# Plain extracted lines carry no coordinates; a visitor receives the text
# matrix of each fragment, whose tm[5] is used as the y position.
# NOTE(review): assumes a PyPDF2 version whose extract_text accepts
# visitor_text, and that tm[5] is comparable to the bookmark's /Top - confirm.
kept_fragments = []


def _keep_below_bookmark(fragment, cm, tm, font_dict, font_size):
    """Collect fragments located at or below the bookmark's /Top coordinate."""
    if tm[5] <= top:
        kept_fragments.append(fragment)


pdf_page.extract_text(visitor_text=_keep_below_bookmark)
extracted_text = "".join(kept_fragments)

In [None]:
def pdf2text(pdf_file, page_separator="\n"):
    """Extract the text of every page of a PDF with pdfplumber.

    Args:
        pdf_file: Path (or file object) accepted by ``pdfplumber.open``.
        page_separator: String placed between consecutive pages. Pages used
            to be concatenated directly, which fused the last word of one
            page with the first word of the next; pass ``""`` to get that
            old behaviour back.

    Returns:
        The stripped text of all pages joined by ``page_separator``.
    """
    with pdfplumber.open(pdf_file) as pdf:
        page_texts = [
            # `or ""` guards against a page whose extraction yields None.
            (page.extract_text(x_tolerance=2, y_tolerance=5, layout=False) or "").strip()
            for page in pdf.pages
        ]
    # A single join avoids rebuilding the string once per page.
    return page_separator.join(page_texts)


def count_tokens(history: list):
    """Estimate the token count of a chat history with the cl100k_base encoding.

    Each message contributes 4 tokens plus the encoded length of every value
    it holds; a message that has a "name" key contributes one token less.
    A final 2 tokens are added on top.
    NOTE(review): the constants presumably follow OpenAI's chat-format
    accounting for one specific model snapshot - confirm for the model in use.

    Args:
        history: List of message dicts whose values are strings.

    Returns:
        The estimated number of tokens as an int.
    """
    encoder = tiktoken.get_encoding("cl100k_base")
    total = 2
    for message in history:
        total += 4
        total += sum(len(encoder.encode(field)) for field in message.values())
        if "name" in message:
            total -= 1
    return total


def count_tokens_text(text: str):
    """Return how many cl100k_base tokens ``text`` encodes to."""
    tokens = tiktoken.get_encoding("cl100k_base").encode(text)
    return len(tokens)

In [None]:
def user_said(content, history):
    """Append a user-role message carrying ``content`` to ``history`` in place."""
    message = {"role": "user", "content": content}
    history.append(message)

def assistant_said(content, history):
    """Append an assistant-role message carrying ``content`` to ``history`` in place."""
    message = {"role": "assistant", "content": content}
    history.append(message)

def ask_chatgpt(user, history, system=None, new_chat=False, max_tokens=256, only_response=False, temp=0, model='gpt-3.5-turbo'):
    """Send ``user`` to the chat completion endpoint and return the reply.

    Args:
        user: Text of the new user message.
        history: Existing message list; extended in place unless ``new_chat``.
        system: Optional system prompt; only used when ``new_chat`` is true.
        new_chat: Start from an empty history instead of the one passed in.
        max_tokens: Passed through as the completion's ``max_tokens``.
        only_response: Return just the reply text and do not record it.
        temp: Passed through as ``temperature``.
        model: Model name passed to the API.

    Returns:
        The reply text when ``only_response`` is true, otherwise a
        ``(reply, history)`` tuple with the reply appended to the history.
    """
    if new_chat:
        history = []
        if system:
            history.append({"role":"system", "content":system})
    user_said(user, history)

    completion = openai.ChatCompletion.create(
        model=model,
        messages=history,
        temperature=temp,
        max_tokens=max_tokens,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    reply = completion['choices'][0]['message']['content']

    if only_response:
        return reply

    assistant_said(reply, history)
    return reply, history

In [None]:
# Extract the whole paper once, then show it together with its token count.
text = pdf2text(pdf_file)
text_token_count = count_tokens_text(text)
print(text, '\n\n', text_token_count)

In [None]:
# Ask the model to enumerate the paper's sections, each with a few verbatim
# words that follow it, so the sections can later be located in `text`.
system = 'Act as a professional scientist that reviews article.'
article = f'article: {text}'

prompt = (
    f'{article}.\n'
    'Objective: List sections and subsections of the article. '
    'To find those sections, include several exact words of the article '
    'that followed each section and subsection.'
)

response, history = ask_chatgpt(
    prompt,
    history=[],
    system=system,
    new_chat=True,
    max_tokens=1000,
    temp=0,
    model='gpt-3.5-turbo-16k',
)
print(response)

In [None]:
# Offset of the first occurrence of this heading in the extracted text
# (str.find returns -1 when it is absent).
text.find('Scaled Dot-Product Attention ')

In [None]:
# Look up where each line of the model's answer occurs in the extracted text;
# a line that does not appear verbatim yields -1.
section2text = {}
sections_list = response.strip().split('\n')
section_indexes = [text.find(section) for section in sections_list]

print(section_indexes)

In [None]:
# Display the extracted text from character offset 20000 to the end.
text[20000:]

In [None]:
# Print a hard-coded excerpt (the Table 1 caption and rows) so its "\n" escapes render as line breaks.
print("Table 1: Maximum path lengths, per-layer complexity and minimum number of sequential operations\nfor different layer types. n is the sequence length, d is the representation dimension, k is the kernel\nsize of convolutions and r the size of the neighborhood in restricted self-attention.\nLayer Type Complexity per Layer Sequential Maximum Path Length\nOperations\nSelf-Attention O(n2 · d) O(1) O(1)\nRecurrent O(n · d2) O(n) O(n)\nConvolutional O(k · n · d2) O(1) O(logk(n))\nSelf-Attention (restricted) O(r · n · d) O(1) O(n/r)\n")

In [None]:
# Display the slice of the extracted text between character offsets 2859 and 4780.
text[2859:4780]