In [1]:
import os
import fitz
import openai
from dotenv import load_dotenv
import tiktoken
import pandas as pd
from openai.embeddings_utils import distances_from_embeddings

In [2]:
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

In [3]:
def extract_text(pdf_file):
    """Extract text from PDF file"""
    doc = fitz.open(pdf_file)
    text = "".join(page.get_text() for page in doc)

    return text

In [4]:
pdfs_path = os.path.join(os.getcwd(), 'pdf_files') # Path to the pdfs folder
# Get all the pdf files in the pdfs_path directory
pdfs = [os.path.join(pdfs_path, pdf) for pdf in os.listdir(pdfs_path) if pdf.endswith('.pdf')]
pdfs

['D:\\Git_Code_Repositories\\Mini-ChatPDF\\pdf_files\\bert.pdf',
 'D:\\Git_Code_Repositories\\Mini-ChatPDF\\pdf_files\\transformer.pdf']

In [5]:
# Extract the text from each pdf file
texts = " ".join(extract_text(pdf) for pdf in pdfs)
texts

'BERT: Pre-training of Deep Bidirectional Transformers for\nLanguage Understanding\nJacob Devlin\nMing-Wei Chang\nKenton Lee\nKristina Toutanova\nGoogle AI Language\n{jacobdevlin,mingweichang,kentonl,kristout}@google.com\nAbstract\nWe introduce a new language representa-\ntion model called BERT, which stands for\nBidirectional Encoder Representations from\nTransformers. Unlike recent language repre-\nsentation models (Peters et al., 2018a; Rad-\nford et al., 2018), BERT is designed to pre-\ntrain deep bidirectional representations from\nunlabeled text by jointly conditioning on both\nleft and right context in all layers. As a re-\nsult, the pre-trained BERT model can be ﬁne-\ntuned with just one additional output layer\nto create state-of-the-art models for a wide\nrange of tasks, such as question answering and\nlanguage inference, without substantial task-\nspeciﬁc architecture modiﬁcations.\nBERT is conceptually simple and empirically\npowerful.\nIt obtains new state-of-the-art re-\n

In [6]:
tokenizer = tiktoken.get_encoding("cl100k_base") # Load the tokenizer

In [7]:
max_tokens = 500

def split_into_many(text, max_tokens=max_tokens):
    """Split the text into chunks of a maximum number of tokens"""
    sentences = text.split(". ") # Split the text into sentences
    n_tokens = [len(tokenizer.encode(f" {sentence}")) for sentence in sentences] # Count the number of tokens in each sentence

    chunks = [] # Initialize an empty list to store the chunks
    tokens_so_far = 0 # Count the number of tokens so far
    chunk = [] # Initialize an empty list to store the sentences in a chunk

    for sentence, token in zip(sentences, n_tokens): # Iterate over the sentences and their number of tokens
        
        if tokens_so_far + token > max_tokens:
            chunks.append(". ".join(chunk)+". ") # Append the chunk to the list of chunks
            chunk = [] # Reinitialize the chunk
            tokens_so_far = 0 # Reinitialize the number of tokens so far

        if token > max_tokens:
            continue # If the sentence is longer than the maximum number of tokens, skip it

        chunk.append(sentence) # Append the sentence to the chunk
        tokens_so_far += token + 1 # Update the number of tokens so far
    
    return chunks

In [8]:
def create_embeddings(text):
    """Create embeddings from a text"""
    data = pd.DataFrame([text], columns=['text'])
    data['n_tokens'] = data.text.apply(lambda x: len(tokenizer.encode(x)))

    shortened = [] # Initialize an empty list to store the shortened texts

    for row in data.iterrows(): # Iterate over the rows of the dataframe

        if row[1]['text'] is None:
            continue
    
        if row[1]['n_tokens'] > max_tokens:
            shortened.extend(split_into_many(row[1]['text']))
        else:
            shortened.append(row[1]['text'])
    
    df = pd.DataFrame(shortened, columns=['text']) # Create a new dataframe with the shortened texts
    df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x))) # Count the number of tokens in each text
    df['embeddings'] = df.text.apply(lambda x: openai.Embedding.create(input=x, engine='text-embedding-ada-002')['data'][0]['embedding']) # Create the embeddings

    return df

In [9]:
data = create_embeddings(texts)
data

Unnamed: 0,text,n_tokens,embeddings
0,BERT: Pre-training of Deep Bidirectional Trans...,404,"[-0.016198763623833656, -0.007893914356827736,..."
1,These include sentence-level tasks such as\nna...,413,"[-0.018787944689393044, -0.00808318518102169, ..."
2,Such re-\nstrictions are sub-optimal for sente...,401,"[-0.02014259621500969, -0.011141416616737843, ..."
3,BERT is the ﬁrst ﬁne-\ntuning based representa...,494,"[-0.017435533925890923, -0.010742495767772198,..."
4,They extract\ncontext-sensitive features from ...,349,"[-0.015512029640376568, 0.008189160376787186, ..."
...,...,...,...
58,Deep residual learning for im-\nage recognitio...,480,"[-0.019290529191493988, 0.0012030559591948986,..."
59,"arXiv preprint\narXiv:1703.10722, 2017.\n[22] ...",498,"[-0.03611760213971138, 0.01731313019990921, 0...."
60,"arXiv\npreprint arXiv:1608.05859, 2016.\n[31] ...",456,"[-0.023556077852845192, -0.0073468321934342384..."
61,In\nAdvances in Neural Information Processing ...,470,"[-0.043427955359220505, 0.006641719955950975, ..."


In [10]:
def create_context(question, df, max_len=1800, size="ada"):
    """Create a context from a question and a dataframe of embeddings"""
    q_embeddings = openai.Embedding.create(input=question, engine='text-embedding-ada-002')['data'][0]['embedding']

    # Calculate the distances between the question and the embeddings
    df['distances'] = distances_from_embeddings(q_embeddings, df['embeddings'].values, distance_metric='cosine')

    res = [] # Initialize an empty list to store the results
    cur_len = 0 # Initialize the current length of the context

    for _, row in df.sort_values('distances', ascending=True).iterrows(): # Iterate over the rows of the dataframe
        cur_len += row['n_tokens'] + 4 # Update the current length of the context

        if cur_len > max_len:
            break
        
        res.append(row['text']) # Append the text to the list of results
    
    return "\n\n###\n\n".join(res)

In [11]:
def answer_question(
        df,
        question="What is the meaning of life?",
        max_len=1800,
        size="ada",
        debug=False,
        max_tokens=150,
        stop_sequence=None
):
    """Answer a question from a dataframe of embeddings"""
    context = create_context(question, df, max_len=max_len, size=size) # Create a context from the question and the dataframe of embeddings

    prompt = f"""Answer the question as truthfully as possible using the provided text, and if the answer is not contained within the text below, say "I don't know"

        Context:{context}

        Q:{question}
        A:"""

    if debug:
        print("Context:\n" + context)
        print("\n\n")
    
    try:
        response = openai.ChatCompletion.create(
                        model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}]
        )

        return response['choices'][0]["message"]["content"]
    except Exception as e:
        print(e)
        return ""


In [12]:
res = answer_question(data, question="What is transformer and bert?", debug=False)
res

'BERT is a language representation model that is designed to pre-train deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers. It is based on a multi-layer bidirectional Transformer encoder architecture and can be finetuned with just one additional output layer to create state-of-the-art models for a wide range of tasks, such as question answering and language inference, without substantial task-specific architecture modifications. Transformer is a type of neural network architecture used in BERT.'

In [13]:
res = answer_question(data, question="What are the advantages of transformer?", debug=False)
res

'The advantages of the Transformer are discussed, including relying entirely on self-attention to compute representations without using sequence-aligned RNNs or convolution, and achieving state-of-the-art results on tasks such as machine translation at a fraction of the training cost of previous models. Specific advantages over other models are also discussed.'

In [14]:
res = answer_question(data, question="What are the advantages of bert?", debug=False)
res

'BERT has state-of-the-art performance on a large suite of sentence-level and token-level tasks, and can be fine-tuned with just one additional output layer to create state-of-the-art models for a wide range of tasks, without substantial task-specific architecture modifications. BERT is also designed to pretrain deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers.'

In [15]:
res = answer_question(data, question="The relationship between bert and transformer?", debug=False)
res

'BERT is a language representation model that is a multi-layer bidirectional Transformer encoder. It is specifically designed for pre-training deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers. Therefore, BERT utilizes the Transformer architecture as its underlying model.'

In [16]:
res = answer_question(data, question="What is gpt-3?", debug=False)
res

Rate limit reached for default-gpt-3.5-turbo in organization org-txHll6TuOANv0CmnNngAbqOl on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method.


''

In [17]:
res = answer_question(data, question="What is the text discusses?", debug=False)
res

'The text discusses the BERT model and its fine-tuning results on 11 NLP tasks. It also includes additional details and ablation studies for BERT.'

In [18]:
res = answer_question(data, question="Please summarize what is in the text?", debug=False)
res

Rate limit reached for default-gpt-3.5-turbo in organization org-txHll6TuOANv0CmnNngAbqOl on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method.


''