In [1]:
import os
import glob

# Resolve the working directory only once per kernel session. Without this
# guard, re-running the cell would start from the already-changed working
# directory and climb two more levels on every execution.
if "WORK_DIR" not in globals():
    # Absolute path of the directory the kernel is currently running in
    project_dir = os.path.abspath('.')

    # The parent of the parent directory
    WORK_DIR = os.path.abspath(os.path.join(project_dir, '../../'))

# Change the working directory to the parent of the parent directory
# (a no-op when the cell is re-run and we are already there)
os.chdir(WORK_DIR)

# Verify the change by printing the current working directory
print("Current Working Directory:", os.getcwd())

Current Working Directory: /Users/david.amat/Documents/david/pdf-search-llm-rag


In [2]:
import os
from openai import OpenAI
import getpass
import json
import pandas as pd
import json_repair
from tqdm import tqdm
from dotenv import load_dotenv

In [3]:
# Load environment variables (e.g. OPENAI_API_KEY, read in the next section)
# from a .env file; presumably python-dotenv's default lookup is used -- TODO confirm
# where the .env file is expected to live. The value shown below is the call's return value.
load_dotenv()

True

## OpenAI

In [4]:
# Fail fast with an actionable message when the key is missing. Writing the
# result of os.getenv() straight back into os.environ is a no-op when the
# variable is set and raises an opaque "TypeError: str expected, not NoneType"
# when it is not.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )

In [5]:
# Build the OpenAI client using the API key held in the environment
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

## Read embeddings from PDF files

In [6]:
# Load the stored paragraph embeddings and renumber the rows from 0
data = pd.read_parquet(
    "data/debug_embeddings_create.parquet",
    engine="pyarrow",
).reset_index(drop=True)

In [7]:
# Display the loaded frame (pandas truncates the middle rows in the rendered output)
data

Unnamed: 0,file_name,page_number,paragraph,text,embeddings,context_id
0,attention_is_all_you_need,1,1,"Provided proper attribution is provided, Googl...","[-0.017848478630185127, 0.014465652406215668, ...",0
1,attention_is_all_you_need,1,2,Attention Is All You Need\nAshish Vaswani∗\nGo...,"[-0.07263379544019699, -0.1257372349500656, 0....",1
2,attention_is_all_you_need,1,3,∗Equal contribution. Listing order is random. ...,"[-0.08534739911556244, -0.09007889777421951, -...",2
3,attention_is_all_you_need,1,4,†Work performed while at Google Brain,"[-0.0912894532084465, -0.0209952425211668, 0.0...",3
4,attention_is_all_you_need,1,5,‡Work performed while at Google Research,"[-0.11088573932647705, 0.03885276988148689, 0....",4
...,...,...,...,...,...,...
138,attention_is_all_you_need,15,2,<EOS>\n<pad>\nThe\nLaw\nwill\nnever\nbe\nperfe...,"[-0.04332689568400383, 0.022461799904704094, -...",138
139,attention_is_all_you_need,15,3,<EOS>\n<pad>\nInput-Input Layer5\nThe\nLaw\nwi...,"[-0.06781154125928879, -0.0022821910679340363,...",139
140,attention_is_all_you_need,15,4,<EOS>\n<pad>\nThe\nLaw\nwill\nnever\nbe\nperfe...,"[-0.04332689568400383, 0.022461799904704094, -...",140
141,attention_is_all_you_need,15,5,<EOS>\n<pad>Figure 5: Many of the attention he...,"[0.019566060975193977, -0.022429874166846275, ...",141


In [8]:
def generate_questions(context, n_questions=2, model="gpt-4o-mini"):
    """Ask the chat model for question/answer pairs about ``context``.

    The requested count and the JSON template shown to the model are both
    derived from ``n_questions`` so the prompt never contradicts itself.

    Args:
        context: Text extracted from a PDF that the questions should cover.
        n_questions: Number of question/answer pairs to request (default 2).
        model: Name of the chat completion model to call.

    Returns:
        dict: The parsed JSON reply (expected to hold ``"questions"`` and
        ``"answers"`` lists) with the input text stored under ``"context"``.

    Raises:
        ValueError: If ``n_questions`` is smaller than 1.
        TypeError: If the model reply cannot be parsed into a JSON object.
    """
    if n_questions < 1:
        raise ValueError("n_questions must be at least 1")

    # Placeholders for the template, e.g. "question1", "question2"
    numbers = range(1, n_questions + 1)
    question_slots = ", ".join(f'"question{i}"' for i in numbers)
    answer_slots = ", ".join(f'"answer{i}"' for i in numbers)

    instruction = f"""
    The given text is the result of the text extraction from the PDF files.
    Generate {n_questions} meaningful questions on the text and the respective answers.
    Reply strictly in the JSON format:
    {{
      "questions": [{question_slots}],
      "answers": [{answer_slots}]
    }}

    Ensure that the lists of questions and answers are complete and properly formatted.
    DO NOT include any additional information or characters outside the specified JSON format.
    The response must consist only of the requested JSON structure.
    If the generated content does not meet the specified format, please make the necessary adjustments to ensure compliance."""

    prompt = f"\nContext: {context}\nQuestion: {instruction}"

    # Single-turn request: the whole prompt is sent as one user message
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "user", "content": prompt},
        ],
    )

    # The message content can be None (e.g. a refusal); treat it as empty text
    content = completion.choices[0].message.content or ""

    # json_repair tolerates slightly malformed JSON, but may still hand back a
    # non-dict value (string, list) when the reply is not a JSON object
    response = json_repair.loads(content)
    if not isinstance(response, dict):
        raise TypeError(
            "Model reply could not be parsed into a JSON object: "
            f"{content[:200]!r}"
        )

    response['context'] = context

    return response


### Generate questions for a given text

In [9]:
# Define a function to count the number of words
def word_count(text):
    """Return the number of whitespace-separated words in ``text``.

    Values that are not text (for example ``None`` or the ``NaN`` pandas uses
    for a missing cell) count as zero words instead of raising
    ``AttributeError``, so the function is safe to use with ``Series.apply``.
    """
    if not isinstance(text, (str, bytes)):
        return 0
    return len(text.split())

# Keep only the rows whose text has at least 5 words, then renumber them from 0
df_filtered = (
    data.loc[data['text'].map(word_count) >= 5]
    .reset_index(drop=True)
)


In [10]:
# Inspect the text of the first remaining paragraph
df_filtered.loc[0, 'text']

'Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works'

In [12]:
# Inspect a longer paragraph (row label 7 after filtering)
df_filtered.loc[7, 'text']

'Recurrent models typically factor computation along the symbol positions of the input and output\nsequences. Aligning the positions to steps in computation time, they generate a sequence of hidden\nstates ht, as a function of the previous hidden state ht−1and the input for position t. This inherently\nsequential nature precludes parallelization within training examples, which becomes critical at longer\nsequence lengths, as memory constraints limit batching across examples. Recent work has achieved\nsignificant improvements in computational efficiency through factorization tricks [ 21] and conditional\ncomputation [ 32], while also improving model performance in case of the latter. The fundamental\nconstraint of sequential computation, however, remains'