In [5]:
#setup 
import openai as openai #Extracting content metadata
import fitz #pdf reading library
import time #to ensure we don't call too often from openai
from bs4 import BeautifulSoup #to extract XML info -> will be eliminated eventually
import matplotlib.pyplot as plt
import numpy as np 
import math
import json

# Library to import pre-trained model for sentence embeddings
from sentence_transformers import SentenceTransformer

# Calculate similarities between sentences
from sklearn.metrics.pairwise import cosine_similarity

# package for finding local minimas
from scipy.signal import argrelextrema


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
"""
This is partially a dummy function. This extraction is limited by the fact that XML isn't standard. 
However, I want to start parsing and iterating over txt using GPT as a way of mechanizing/beginning to evaluate our 
thoughts on how to gain greater info about these papers. 
"""
def splitXMLParagraphs(filepath):
    with open(filepath, 'r') as f:
        data = f.read()

    # Passing the stored data inside
    # the beautifulsoup parser, storing
    # the returned object
    Bs_data = BeautifulSoup(data, "xml")
    para = Bs_data.find_all('p')
    paragraphs = []

    for x in range(len(para)): 
        if len(para[x].text) < 2800:
            paragraphs.append(para[x].text)
        else: 
            para[x.text].split
            x -= 1
    return paragraphs

In [4]:
def pdfMetadata(filepath): 
    doc = fitz.open(filepath)
    metadata = doc.metadata
    return metadata 

{'format': 'PDF 1.4',
 'title': 'doi:10.1016/j.snb.2008.10.030',
 'author': '',
 'subject': '',
 'keywords': '',
 'creator': 'Elsevier',
 'producer': 'Acrobat Distiller 7.0 (Windows)',
 'creationDate': 'D:20090112130006Z',
 'modDate': "D:20090117135818+05'30'",
 'trapped': '',
 'encryption': None}

In [14]:
def pdfTextExtraction(filename, filepath): 
    doc = fitz.open(filepath)  # open document
    with open(filename + '.txt', 'w') as out:
        for page in doc:  # iterate the document pages
            text = page.get_text().encode("utf8")  # get plain text (is in UTF-8)
            out.write(text.decode('utf-8'))  # write text of page
            out.write(bytes((12,)).decode('utf-8'))  # write page delimiter (form feed 0x0C) 
    

In [None]:
#a bunch of code from This guy -> https://medium.com/@npolovinkin/how-to-chunk-text-into-paragraphs-using-python-8ae66be38ea6
def rev_sigmoid(x:float)->float:
    return (1 / (1 + math.exp(0.5*x)))
    
def activate_similarities(similarities:np.array, p_size=10)->np.array:
    """ Function returns list of weighted sums of activated sentence similarities

    Args:
        similarities (numpy array): it should square matrix where each sentence corresponds to another with cosine similarity
        p_size (int): number of sentences are used to calculate weighted sum 

    Returns:
        list: list of weighted sums
    """
    # To create weights for sigmoid function we first have to create space. P_size will determine number of sentences used and the size of weights vector.
    x = np.linspace(-10,10,p_size)
    # Then we need to apply activation function to the created space
    y = np.vectorize(rev_sigmoid) 
        # Because we only apply activation to p_size number of sentences we have to add zeros to neglect the effect of every additional sentence and to match the length ofvector we will multiply
    activation_weights = np.pad(y(x),(0,similarities.shape[0]-p_size))
    ### 1. Take each diagonal to the right of the main diagonal
    diagonals = [similarities.diagonal(each) for each in range(0,similarities.shape[0])]
    ### 2. Pad each diagonal by zeros at the end. Because each diagonal is different length we should pad it with zeros at the end
    diagonals = [np.pad(each, (0,similarities.shape[0]-len(each))) for each in diagonals]
    ### 3. Stack those diagonals into new matrix
    diagonals = np.stack(diagonals)
    ### 4. Apply activation weights to each row. Multiply similarities with our activation.
    diagonals = diagonals * activation_weights.reshape(-1,1)
    ### 5. Calculate the weighted sum of activated similarities
    activated_similarities = np.sum(diagonals, axis=0)
    return activated_similarities
  

def CreateModularContent(path, fname, sentencetransformer):
    """ Function returns a list of paragraphs from a pdf

    Args:
        path (string): the file path to the PDF in concern
        fname (string): file name of the pdf
        sentencetransformer (sentencetransformer instance): Takes an instance of the sentence transformer library

    Returns:
        paragraphs: list of paragraphs in the file
    """
    #reading the desired file
    with open(path+fname + ".txt", 'r') as file:
     contents = file.read()

    #separating the file into an array based on when there are periods. 
    list_of_contents = contents.split(".")
    embeddings = sentencetransformer.encode(list_of_contents)

        
    # Create similarities matrix
    similarities = cosine_similarity(embeddings)
    
    # Lets apply activated_similarities. For long sentences i reccomend to use 10 or more sentences (not sure what p_size does)
    activated_similarities = activate_similarities(similarities, p_size=similarities.shape[0])

    ### 6. Find relative minima of our vector. For all local minimas and save them to variable with argrelextrema function
    minmimas = argrelextrema(activated_similarities, np.less, order=2) #order parameter controls how frequent should be splits. I would not reccomend changing this parameter.
    # plot the flow of our text with activated similarities

    #visualization stuff that we don't need 
    # lets create empty fig for our plor
    #fig, ax = plt.subplots()
    #sns.lineplot(y=activated_similarities, x=range(len(activated_similarities)), ax=ax).set_title('Relative minimas');
    # Now lets plot vertical lines in order to see where we created the split
    #plt.vlines(x=minmimas, ymin=min(activated_similarities), ymax=max(activated_similarities), colors='purple', ls='--', lw=1, label='vline_multiple - full height')

    #Get the order number of the sentences which are in splitting points
    split_points = [each for each in minmimas[0]]
    # Create empty string
    text = ''
    for num,each in enumerate(list_of_contents):
        # Check if sentence is a minima (splitting point)
        if num in split_points:
            # If it is than add a dot to the end of the sentence and a paragraph before it.
            text+=f'\n{each}. '
        else:
            # If it is a normal sentence just add a dot to the end and keep adding sentences.
            text+=f'{each}. '
   
    with open(path + fname + "_para" + ".txt", 'w') as f:
        f.write(text)

    paragraphs = f.readline()
    return paragraphs   
   

In [39]:
#limitation -> recipe for paragraph extraction, but not necessarily basic metadata like authors etc
def contentMetadataRecipe(openai, filename, prompt): 
# Imports GPT3 model. Using davinci at the moment for final outputs. Curie for testing. 
    counter = 0
    res = ""
    ans = []
    #Wondering if we can retrieve the model earlier on -> so we don't have to do this multiple times. 
    #openai.Model.retrieve("text-curie-001")
    openai.Model.retrieve("text-curie-001")
    
    with open(filename, 'r') as f: 
        paragraphs = f.readlines()

    # structures the base prompt for the model
    #TO BE UPDATED. I want to train my own version of this. 
    #base_prompt = "Paragraph:So yeah, do you see in those ecosystems really cool as pop in? Lots of cool projects, many more I forgot a bunch, but yeah, Jocelyn is always curating this cool landscape, so just check it out. I have the Twitter right there. And yeah, so we just heard about it. So sharing scientific data is super important. Why? Because, well, if we share data, we can collaborate much more easily. We can build bigger data sets and bigger data sets means more statistical power, reliable results, right? So that's pretty cool. And it also means more access to the data that, so there's not the same access to cool instruments that help you with data collection across labs. So if you're in an underfunded research institution, you just may not have the ability to collect the same type of data that a well-funded institution may have. So if we all share data, we all have better access to make cool scientific discoveries. So that's pretty cool, right? But also sharing scientific data right now. It's pretty expensive, it's pretty vulnerable because it's stored on centralized databases where we just have to trust that they keep the database running. It's also not rewarded. So currently, what counts in science is having your PDF cited, but it doesn't matter if you make your data accessible, like you just cannot accrue credit to it. Or there's some ways you can, but it's just not really easy. And it's also pretty painful. So there's a couple of repos out there where you can store your data. These are funded by some governmental institutions. There you access not great. And then also, if you want to find the data, you need to know which repo it's stored at. So you need to find the repo. Then you need to find the data. It's all, it's a hassle, so it's not great.\nExample Summary:Sharing scientific data is important as it allows for better collaboration, bigger data sets, reliable results, and better access for researchers in underfunded institutions. However, currently sharing data is expensive, vulnerable, and not rewarded. It is stored on centralized databases which requires that we trust those servers to keep running. Also, there are no incentives for for making the data accessible. Currently, the only way that we can give credit for using someone else's work is citing their PDF. But with PDF citations, it doesn't matter if you make your data accessible. Sharing data right now isn't worth the cost and time for the researcher.\nParagraph:"

    base_prompt = "Does this paragraph describe the paper's " + prompt + "Answer with a Yes or No."
    
    for x in range(len(paragraphs)): 
        if len(paragraphs[x]) > 100: 
            thought = paragraphs[x].strip()
            p = base_prompt + thought
        else: 
            continue
        
        print('I enter the loop when my paragraph is as tiny as:' +str(len(paragraphs[x])))
        # Model parameters were determined through sandbox testing. Temp is fairly high to allow the model
        response = openai.Completion.create(
            #model = "text-curie-001",
            model="text-curie-001",
            prompt = p,
            max_tokens=400,
            temperature=0.7,
            top_p=1,
            frequency_penalty=0.5,
            presence_penalty=0.5
        )
        answer = response["choices"][0]["text"]
        ans.append(answer)
        
        if answer.find("Yes") != -1: 
            question = "What is the paper's" + prompt + "?" + thought
            # Model parameters were determined through sandbox testing. Temp is fairly high to allow the model
            response2 = openai.Completion.create(
                #model = "text-curie-001",
                model="text-curie-001",
                prompt = question,
                max_tokens=400,
                temperature=0.7,
                top_p=1,
                frequency_penalty=0.5,
                presence_penalty=0.5
            )
            res += response2["choices"][0]["text"]    
        
        
        counter+=1
        #print(counter)
        # A sleep counter because microsoft keeps limiting my creativity
        if counter%30==0 and counter!=0:
            print("\n\n\nI am so sleepy\n\n\n")
            time.sleep(60)
        
        if len(res) < 1600: 
            final = "Summarize these responses into one sentence that tells me the paper's" + prompt + "\n" + res
            response3 = openai.Completion.create(
                        #model = "text-curie-001",
                        model="text-curie-001",
                        prompt = final,
                        max_tokens=400,
                        temperature=0.7,
                        top_p=1,
                        frequency_penalty=0.5,
                        presence_penalty=0.5
                )
        else:
            modular = res.split('.')
            for i in range(len(modular)): 
                if i< len(modular)/2:
                    res1 += modular[i]
                else:
                    res2 += modular[i]
            
            if len(res1) < 1600: 
                final = "Summarize this responses into one sentence that tells me the paper's" + prompt + "\n" + res1
                response4 = openai.Completion.create(
                            #model = "text-curie-001",
                            model="text-curie-001",
                            prompt = final,
                            max_tokens=400,
                            temperature=0.7,
                            top_p=1,
                            frequency_penalty=0.5,
                            presence_penalty=0.5
                    )
            if len(res2) < 1600: 
                final = "Summarize this responses into one sentence that tells me the paper's" + prompt + "\n" + res2
                response5 = openai.Completion.create(
                            #model = "text-curie-001",
                            model="text-curie-001",
                            prompt = final,
                            max_tokens=400,
                            temperature=0.7,
                            top_p=1,
                            frequency_penalty=0.5,
                            presence_penalty=0.5
                    )

    return(response3)

In [40]:
def main(): 
    """
    A potential combo of the functions above to get a set of metadata out. 
    """
    openai.api_key = "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"

    filename = 'Papageorgiou et al_2017_Mechanical properties of graphene and graphene-based nanocomposites'

    filepathpdf = "/Users/desot1/Dev/desci/Papageorgiou et al_2017_Mechanical properties of graphene and graphene-based nanocomposites.pdf"
    
    text = pdfTextExtraction('Papageorgiou et al_2017_Mechanical properties of graphene and graphene-based nanocomposites.txt', filepathpdf)
    
    filepathtxt = filename + '.txt'

    descriptiveMetadata = pdfMetadata(filepathpdf)
    
    contentMetadata = {}


    categories = ['Research Question', 'Alterative Approaches', 'Hypothesis', 'Methodology', 'Results', 'Inferences']
    
    #paragraphs = splitXMLParagraphs(filepathxml)
    contentMetadata['Research Question'] = 'this is the research question'


    for i in range(len(categories)):
        print(contentMetadata)

        contentMetadata = {categories[i]: contentMetadataRecipe(openai, filepathtxt, categories[i])}
        
    print(contentMetadata)

    metadata = [contentMetadata, descriptiveMetadata]

    with open("metadata.json", "w") as write_file:
        json.dump(metadata, write_file, indent=4)  

if __name__ == "__main__": 
    main()    


{'Research Question': 'this is the research question'}
I enter the loop when my paragraph is as tiny as:119
I enter the loop when my paragraph is as tiny as:102
I enter the loop when my paragraph is as tiny as:150
I enter the loop when my paragraph is as tiny as:147
I enter the loop when my paragraph is as tiny as:145
I enter the loop when my paragraph is as tiny as:146
I enter the loop when my paragraph is as tiny as:146
I enter the loop when my paragraph is as tiny as:147
I enter the loop when my paragraph is as tiny as:148
I enter the loop when my paragraph is as tiny as:153
I enter the loop when my paragraph is as tiny as:142
I enter the loop when my paragraph is as tiny as:149
I enter the loop when my paragraph is as tiny as:151
I enter the loop when my paragraph is as tiny as:147
I enter the loop when my paragraph is as tiny as:147
I enter the loop when my paragraph is as tiny as:123
I enter the loop when my paragraph is as tiny as:202
I enter the loop when my paragraph is as tin

InvalidRequestError: This model's maximum context length is 2049 tokens, however you requested 2059 tokens (1659 in your prompt; 400 for the completion). Please reduce your prompt; or completion length.