In [155]:
import os, requests, json, openai, PyPDF2, pdfplumber, re, tiktoken, asyncio
from bson.json_util import dumps
from dotenv import load_dotenv; load_dotenv()
from io import BytesIO
from pdfminer.high_level import extract_text

# Read the OpenAI API key from the environment (load_dotenv() was called on import above,
# so a local .env file may have populated it). os.environ.get yields None when it is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY")
# Path of the PDF that the cells below open and parse.
pdf_file = "cell.pdf"

In [None]:
# async def your_function(input_str):
#     await asyncio.sleep(1)  # Simulating an async task
#     return f"Processed: {input_str}"
#
# input_strings = ["Input1", "Input2", "Input3"]
#
# async def main():
#     tasks = [your_function(input_str) for input_str in input_strings]
#     results = await asyncio.gather(*tasks)
#
#     for input_str, result in zip(input_strings, results):
#         print(f"Input: {input_str} - Output: {result}")
#
#
# asyncio.run(main())

In [None]:
def user_said(content, history):
    """Record `content` in `history` as a message with role "user" (mutates `history` in place)."""
    message = {"role": "user", "content": content}
    history.append(message)

def assistant_said(content, history):
    """Record `content` in `history` as a message with role "assistant" (mutates `history` in place)."""
    message = {"role": "assistant", "content": content}
    history.append(message)

def ask_chatgpt(user, history, system=None, new_chat=False, max_tokens=256, only_response=False, temp=0, model='gpt-3.5-turbo'):
    """Send one user message through openai.ChatCompletion.create and return the reply.

    Args:
        user: Text of the user message to send.
        history: List of message dicts to continue from. Ignored (replaced by a
            fresh list) when `new_chat` is true; otherwise mutated in place.
        system: Optional system message. Only inserted when `new_chat` is true.
        new_chat: Start from an empty history instead of the one passed in.
        max_tokens: Forwarded to the API as `max_tokens`.
        only_response: When true, return just the reply string and do not record
            the reply in the history (the user message is still appended).
        temp: Forwarded to the API as `temperature`.
        model: Forwarded to the API as `model`.

    Returns:
        The reply string when `only_response` is true, otherwise a
        `(reply, history)` tuple where `history` includes the new reply.
    """
    if new_chat:
        # A new conversation never touches the caller's list.
        history = []
        if system:
            history.append({"role":"system", "content":system})
    user_said(user, history)

    request = dict(
        model=model,
        messages=history,
        temperature=temp,
        max_tokens=max_tokens,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    completion = openai.ChatCompletion.create(**request)
    reply = completion['choices'][0]['message']['content']

    if only_response:
        return reply

    assistant_said(reply, history)
    return reply, history

In [156]:
class SectionNode:
    """One entry of the document outline: a heading, where it sits, its text and its sub-sections."""

    def __init__(self, title, page, top):
        # Identifier derived from Python's id() of this instance, stored as a string.
        self.id = str(id(self))
        self.title = title
        # Body text starts empty; other code assigns it later.
        self.text = ""
        self.children = []
        # Location of the heading as given by the caller (page reference and `top` value).
        self.page = page
        self.top = top
        # Unknown until a depth is assigned from outside.
        self.depth = None

    def addChild(self, other):
        """Append `other` to this node's list of children."""
        self.children.append(other)

    def getSummary(self):
        """Placeholder summary: currently just copies the title into `text`."""
        #TODO Use GPT to get the summary of text
        self.text = self.title

    def __str__(self):
        """Multi-line description: depth, title, page, text and the titles of the direct children."""
        child_titles = [child.title for child in self.children]
        lines = (
            f'Depth: {self.depth}',
            f'Title: {self.title}',
            f'Page: {self.page}',
            f'Text: {self.text}',
            f'Children: {child_titles}',
        )
        return '\n'.join(lines) + '\n'

In [157]:
def create_graph(pdf_reader, title, page):
    """Build the section tree for a PDF and return its root node.

    The root is a SectionNode made from `title` and `page` (with no `top`);
    get_bookmark then hangs the entries of `pdf_reader.outline` beneath it.
    """
    document_root = SectionNode(title, page, top=None)
    get_bookmark(document_root, pdf_reader.outline)
    return document_root


def get_bookmark(root, outline):
    """Recursively attach the entries of a nested outline list beneath `root`.

    `outline` is a list whose non-list items become SectionNode children of
    `root` (built from the item's `title`, `page` and `top` attributes). A
    nested list holds the sub-entries of the item that precedes it and is
    attached beneath that item's node.

    Args:
        root: Node that receives the entries found at this nesting level.
        outline: List mixing outline items and nested lists of outline items.
    """
    # Node created for the most recent non-list item at this level.
    latest = None

    for item in outline:
        if isinstance(item, list):
            # If a nested list comes first there is no preceding entry to own it;
            # attach its entries to `root` rather than failing on None.addChild.
            get_bookmark(latest if latest is not None else root, item)
        else:
            latest = SectionNode(item.title, item.page, item.top)
            root.addChild(latest)


# def get_json_output(json_output, root):
#     json_output.append(root.getJson())
#     for child in root.children:
#         get_json_output(json_output, child)


def get_json_output(json_output, root):
    """Flatten the tree under `root` into `json_output`, keyed by node id.

    Each node contributes one entry holding its depth, page, top, title and
    text, plus the ids of its direct children; descendants are then added
    the same way. `json_output` is mutated in place and nothing is returned.
    """
    json_output[root.id] = {
        "depth": root.depth,
        "page": root.page,
        "top": root.top,
        "title": root.title,
        "text": root.text,
        "children": [child.id for child in root.children],
    }

    for child in root.children:
        get_json_output(json_output, child)


def get_text_from_page(page, bottom, up):
    """Return the text fragments of `page` whose vertical position lies strictly between `bottom` and `up`.

    The page's `extract_text` is driven with a `visitor_text` callback; each
    fragment is kept only when `bottom < tm[5] < up`. The kept fragments are
    concatenated in the order they were visited and the result is stripped.
    """
    collected = []

    def keep_if_in_band(text, cm, tm, fontDict, fontSize):
        # tm[5] (sixth entry of the text matrix handed to the visitor) is used as the y position.
        if bottom < tm[5] < up:
            collected.append(text)

    # The return value of extract_text is ignored; only the visitor's side effect is used.
    page.extract_text(visitor_text=keep_if_in_band)
    return "".join(collected).strip()


def get_all_nodes(all_nodes, root):
    """Append `root` and all of its descendants to `all_nodes` in pre-order (parent before children)."""
    pending = [root]
    while pending:
        node = pending.pop()
        all_nodes.append(node)
        # Push children reversed so the first child is popped (and visited) first.
        pending.extend(reversed(node.children))


def get_page_number(pdf_reader, node):
    """Return the page number that `pdf_reader.get_page_number` reports for `node.page`.

    `node.page` is either already a page object, which is passed straight to
    `pdf_reader.get_page_number`, or some other reference that is first
    resolved with `pdf_reader.get_object`.

    Args:
        pdf_reader: Reader providing `get_page_number` and `get_object`.
        node: Object whose `page` attribute identifies the page.
    """
    page = node.page
    # isinstance (rather than an exact type comparison) also accepts subclasses of PageObject.
    if isinstance(page, PyPDF2.PageObject):
        return pdf_reader.get_page_number(page)
    return pdf_reader.get_page_number(pdf_reader.get_object(page))


def adjust_page_numbers(pdf_reader, all_nodes):
    """Overwrite each node's `page` with the value get_page_number returns for it (in place)."""
    for section in all_nodes:
        resolved = get_page_number(pdf_reader, section)
        section.page = resolved


def float2string(all_nodes):
    """Convert the `page`, `top` and `depth` attributes of every node to `str`, in place."""
    for node in all_nodes:
        for attr in ("page", "top", "depth"):
            setattr(node, attr, str(getattr(node, attr)))


def get_text_all_nodes(pdf_reader, all_nodes):
    """Fill in `text` for every node after the first with the page text that follows its heading.

    `all_nodes[0]` (the root) is skipped. For each later node the text runs
    from the node's own `top` down to the `top` of the next node in the list,
    spanning intermediate pages when the two headings are on different pages.
    The last node runs to the end of the document and is cut at the first
    occurrence of "References" when that word is present.

    Expects `node.page` to be an integer index into `pdf_reader.pages` and
    `node.top` to be comparable with the y positions used by
    get_text_from_page (NOTE(review): a node whose `top` is None would fail
    that comparison - confirm every outline entry carries one).

    Args:
        pdf_reader: Reader exposing `pages`, each with `extract_text`.
        all_nodes: Nodes in reading order, root first. Mutated in place.
    """
    def body_after_heading(raw):
        # Drop everything up to the first newline (treated as the heading line)
        # and flatten the remaining newlines. With no newline, find() gives -1,
        # so the slice starts at 0 and the text is kept whole.
        return raw[raw.find('\n')+1:].replace('\n', ' ')

    # Only the root (or nothing at all): there is no section to fill in.
    if len(all_nodes) < 2:
        return

    for prev_node, next_node in zip(all_nodes[1:], all_nodes[2:]):
        page = pdf_reader.pages[prev_node.page]

        if prev_node.page == next_node.page:
            # Both headings on one page: take the band between them.
            text = get_text_from_page(page, bottom=next_node.top, up=prev_node.top).strip()
            prev_node.text = body_after_heading(text)

        else:
            # Rest of the heading's own page. 42 is a hard-coded lower y bound
            # (presumably the bottom margin - TODO confirm).
            text = body_after_heading(get_text_from_page(page, bottom=42, up=prev_node.top).strip())

            # Whole pages strictly between the two headings.
            curr_page_num = prev_node.page + 1
            while curr_page_num < next_node.page:
                text += pdf_reader.pages[curr_page_num].extract_text().replace('\n', ' ')
                curr_page_num += 1

            # Top of the page holding the next heading. 750 is a hard-coded upper
            # y bound (presumably the top margin - TODO confirm).
            text += get_text_from_page(pdf_reader.pages[curr_page_num], bottom=next_node.top, up=750).replace('\n', ' ')
            prev_node.text = text

    # For the last node: everything from its heading to the end of the document.
    last_node = all_nodes[-1]
    page = pdf_reader.pages[last_node.page]
    text = body_after_heading(get_text_from_page(page, bottom=50, up=last_node.top).strip())

    curr_page_num = last_node.page + 1
    while curr_page_num < len(pdf_reader.pages):
        text += pdf_reader.pages[curr_page_num].extract_text().replace('\n', ' ')
        curr_page_num += 1

    # Cut at "References" only when it is actually there; slicing with the -1
    # that find() returns on a miss would silently drop the final character.
    cut = text.find("References")
    last_node.text = text if cut == -1 else text[:cut]


def get_title_root(pdf_reader):
    """Ask the chat model for the article title, given the extracted text of the first page.

    Returns the model's reply with surrounding whitespace stripped.
    """
    first_page_text = pdf_reader.pages[0].extract_text()
    prompt = f"What is the title of this article?\n{first_page_text}\nTitle: "
    # ask_chatgpt also returns the updated history, which is not needed here.
    answer, _history = ask_chatgpt(prompt, history=[], system=None, new_chat=True, max_tokens=50, temp=0)
    return answer.strip()


def get_depth(root, depth):
    """Set `depth` on `root` to the given value and on each descendant to its parent's depth plus one."""
    stack = [(root, depth)]
    while stack:
        node, level = stack.pop()
        node.depth = level
        # Reversed so children are processed in their original left-to-right order.
        stack.extend((child, level + 1) for child in reversed(node.children))


# def main
# Build the section tree from the PDF outline, fill in each section's text and
# flatten the result into `json_output`. The file is opened in a `with` block
# so the handle is closed once PyPDF2 has finished reading from it; everything
# that needs `pdf_reader` therefore happens inside the block.
with open(pdf_file, 'rb') as pdf_stream:
    pdf_reader = PyPDF2.PdfReader(pdf_stream)
    root = create_graph(pdf_reader, title=get_title_root(pdf_reader), page=pdf_reader.pages[0])
    all_nodes = []
    json_output = {}
    get_all_nodes(all_nodes, root)
    get_depth(root, 0)
    # Replace page references with page numbers before they are used as indexes into pdf_reader.pages.
    adjust_page_numbers(pdf_reader, all_nodes)
    get_text_all_nodes(pdf_reader, all_nodes)
# float2string(all_nodes)
get_json_output(json_output, root)
json_output

{'2334560093968': {'depth': 0,
  'page': 0,
  'top': None,
  'title': 'Cell2Sentence: Teaching Large Language Models the Language of Biology',
  'text': '',
  'children': ['2334586778512',
   '2334567165392',
   '2334567165776',
   '2334567166800']},
 '2334586778512': {'depth': 1,
  'page': 1,
  'top': 720,
  'title': 'Introduction',
  'text': 'Large language models (LLMs) such as GPT have demonstrated powerful capabilities in natural language processing tasks including question answering, summarization, and text generation ([1], [2], [3], [4], [5], [6]). However, applying LLMs to other domains like biology remains an open challenge. In particular, a method to directly apply existing LLMs to single-cell transcriptomics could enable new ways of analyzing, interpreting, and generating single-cell RNA sequencing data. Current methods in this domain rely on specialized neural networks that do not leverage the pretrained knowledge and language understanding of large language models. In this

In [None]:
# json_output2 = {}
# for node in json_output:
#     key = list(node.keys())[0]
#     json_output2[key] = node[key]
# json_output2

In [None]:
# To Debug
# Print the extracted text of each direct child of `root`, followed by blank lines as a separator.
for child in root.children:
    print(child.text, '\n\n')
    # for subchild in child.children:
    #     for subsubchild in subchild.children:
    #         print(subsubchild.text, '\n\n')

In [98]:
def pdf2text(pdf_file):
    """Extract the text of every page of `pdf_file` with pdfplumber and return it as one string.

    Each page's text is stripped and the pages are concatenated with no separator.
    """
    with pdfplumber.open(pdf_file) as pdf:
        page_texts = [
            page.extract_text(x_tolerance=2, y_tolerance=5, layout=False).strip()
            for page in pdf.pages
        ]
    return ''.join(page_texts)


def count_tokens(history: list):
    """Estimate the token count of a chat history using the "cl100k_base" encoding.

    Every message adds a fixed 4 plus the encoded length of each of its
    values; a "name" key subtracts 1; a final 2 is added once for the whole
    history. (NOTE(review): these constants look like the usual per-message
    overhead recipe for chat models - confirm against the model in use.)
    """
    encoding = tiktoken.get_encoding("cl100k_base")
    total = 2
    for message in history:
        total += 4
        for key, value in message.items():
            total += len(encoding.encode(value))
            if key == "name":
                total -= 1
    return total


def count_tokens_text(text: str):
    """Return how many tokens `text` encodes to under the "cl100k_base" encoding."""
    tokens = tiktoken.get_encoding("cl100k_base").encode(text)
    return len(tokens)

In [158]:
# Index into `all_nodes` of the node to inspect; later cells that build prompts also read `i`.
i = 3
# Show the node (formatted by SectionNode.__str__) followed by the token count of its text.
print(all_nodes[i], '\n', count_tokens_text(all_nodes[i].text))

Depth: 1
Title: Results
Page: 2
Text: In this section, we present several benchmarks and evaluations to demonstrate the different use cases of Cell2Sentence. All presented models are trained on a human immune tissue dataset [39] (see Section 3). "NL + C2S" models are pretrained on natural language and then fine-tuned on cell sentences. "C2S" models are only trained on cell sentences. 3Figure 2: Detailed overview of the Cell2Sentence framework. Single-cell gene expression profiles are transformed into cell sentences via expression rank orderering of gene names. Cell sentences may be annotated with biological metadata, such as cell type, tissue, or disease. LLMs are then fine-tuned on the cell sentences. Inference is done by generating cells via autoregressive cell completion, generating cells from text, or generating text from cells. The resulting generated cell sentences can be converted back to gene expression.
Children: ['Cell sentence encoding is a robust and reversible operation', 

In [154]:
# System message sent ahead of the prompt.
system = f'Act as a professional scientist that reviews articles.'
# Formats the whole node through SectionNode.__str__, so the prompt carries the
# depth/title/page/children lines as well as the section text.
article = f'article: {all_nodes[i]}'
prompt = f'{article}.\nExplain all important information, terms, and ideas in summarized bullet points: '

# new_chat=True makes ask_chatgpt start from a fresh history with `system` first.
response, history = ask_chatgpt(prompt, history=[], system=system, new_chat=True, max_tokens=280, temp=0)
# Print the reply and its token count.
print(response, '\n\n', count_tokens_text(response))

- The article compares self-attention layers to recurrent and convolutional layers for mapping sequences of symbol representations.
- Three desiderata are considered: computational complexity per layer, parallelizability, and path length between long-range dependencies.
- Self-attention layers have a constant number of sequentially executed operations and are faster than recurrent layers when the sequence length is smaller than the representation dimensionality.
- Self-attention can be restricted to a neighborhood of size r to improve computational performance for very long sequences.
- Convolutional layers require a stack of layers to connect all pairs of input and output positions, increasing the length of the longest paths in the network.
- Separable convolutions decrease complexity, but self-attention combined with a point-wise feed-forward layer has similar complexity.
- Self-attention could yield more interpretable models, as attention heads learn to perform different tasks relat

In [161]:
def summarize(section, max_tokens=280, temp=0):
    """Ask 'gpt-3.5-turbo' for a bullet-point summary of `section` and return the reply text.

    Args:
        section: Text to summarize; it is interpolated into the user prompt.
        max_tokens: Forwarded to the API as `max_tokens`.
        temp: Forwarded to the API as `temperature`.
    """
    system_prompt = 'Act as a professional scientist that reviews articles.'
    user_prompt = f'article: {section}.\nExplain all important information, terms, and ideas in summarized bullet points: '

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    completion = openai.ChatCompletion.create(
        model='gpt-3.5-turbo',
        messages=messages,
        temperature=temp,
        max_tokens=max_tokens,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    first_choice = completion['choices'][0]
    return first_choice['message']['content']


def answer_question(question, max_tokens=150, temp=0, section=None):
    """Ask 'gpt-3.5-turbo' to answer `question`, optionally grounded in an article section.

    Args:
        question: The question to put to the model.
        max_tokens: Forwarded to the API as `max_tokens`.
        temp: Forwarded to the API as `temperature`.
        section: Optional article text to answer from. When omitted, the
            question is sent on its own; no notebook-level variable is read.

    Returns:
        The text of the model's reply.
    """
    system = 'Act as a professional scientist that reviews articles.'

    # Include the article only when the caller supplied one.
    if section is None:
        prompt = f'Question: {question}\nAnswer: '
    else:
        prompt = f'article: {section}.\nQuestion: {question}\nAnswer: '

    history = [{"role": "system", "content": system}, {"role": "user", "content": prompt}]

    response = openai.ChatCompletion.create(
      model='gpt-3.5-turbo',
      messages=history,
      temperature=temp,
      max_tokens=max_tokens,
      top_p=1,
      frequency_penalty=0,
      presence_penalty=0
    )
    return response['choices'][0]['message']['content']

# Summarize the text of the node selected by `i` (set in an earlier cell) and print the reply.
print(summarize(all_nodes[i].text))

- The article discusses the use of Cell2Sentence, a framework for analyzing human immune tissue data.
- The models used in the study are trained on a dataset of human immune tissue.
- Two types of models are presented: "NL + C2S" models, which are pretrained on natural language and then fine-tuned on cell sentences, and "C2S" models, which are trained only on cell sentences.
- The Cell2Sentence framework involves transforming single-cell gene expression profiles into cell sentences using expression rank ordering of gene names.
- Cell sentences can be annotated with biological metadata such as cell type, tissue, or disease.
- Language models (LLMs) are then fine-tuned on the cell sentences.
- Inference can be done by generating cells via autoregressive cell completion, generating cells from text, or generating text from cells.
- The resulting generated cell sentences can be converted back to gene expression.


In [None]:
# Extract the whole PDF as a single string via pdf2text, then print it together with its token count.
text = pdf2text(pdf_file)
print(text, '\n\n', count_tokens_text(text))

In [None]:
# System message sent ahead of the prompt.
system = f'Act as a professional scientist that reviews article.'
# Embed the full extracted text (`text`, assigned in the pdf2text cell) in the prompt.
article = f'article: {text}'

prompt = f'{article}.\nObjective: List sections and subsections of the article. To find those sections, include several exact words of the article that followed each section and subsection.'

# Uses the 'gpt-3.5-turbo-16k' model with a 1000-token reply limit; new_chat=True starts from a fresh history.
response, history = ask_chatgpt(prompt, history=[], system=system, new_chat=True, max_tokens=1000, temp=0, model='gpt-3.5-turbo-16k')
print(response)

In [None]:
# Offset of this heading within `text`; str.find gives -1 when it does not occur.
text.find('Scaled Dot-Product Attention ')

In [None]:
# NOTE(review): section2text is initialised here but not used in any of the visible cells.
section2text = {}
# One entry per line of `response` (presumably the section listing returned by the previous cell).
sections_list = response.strip().split('\n')
section_indexes = []

# Offset of each reply line within `text`; str.find gives -1 for lines that do not appear verbatim.
for section in sections_list:
    section_indexes.append(text.find(section))

print(section_indexes)

In [None]:
# Display `text` from offset 20000 to the end.
text[20000:]

In [None]:
# Scratch: print a hard-coded snippet (a table caption and its rows) so the \n escapes render as line breaks.
print("Table 1: Maximum path lengths, per-layer complexity and minimum number of sequential operations\nfor different layer types. n is the sequence length, d is the representation dimension, k is the kernel\nsize of convolutions and r the size of the neighborhood in restricted self-attention.\nLayer Type Complexity per Layer Sequential Maximum Path Length\nOperations\nSelf-Attention O(n2 · d) O(1) O(1)\nRecurrent O(n · d2) O(n) O(n)\nConvolutional O(k · n · d2) O(1) O(logk(n))\nSelf-Attention (restricted) O(r · n · d) O(1) O(n/r)\n")

In [None]:
# Display the slice of `text` from offset 2859 up to (not including) 4780.
text[2859:4780]