In [16]:
import os
import re
import pandas as pd
import numpy as np
from typing import List
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field, validator
from pydantic import FieldSerializationInfo, validator
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from langchain.output_parsers import RegexParser, PydanticOutputParser
from pprint import pprint
import random
import warnings
warnings.filterwarnings(action = "ignore")
import requests
from langchain.prompts import HumanMessagePromptTemplate, ChatPromptTemplate
import PyPDF2
import dotenv
from langchain_openai import ChatOpenAI
from transformers import OpenAIGPTTokenizer, OpenAIGPTModel
from langchain_openai import OpenAIEmbeddings
from pinecone import Pinecone
import torch
import ftfy
from tqdm import tqdm
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryByteStore
# from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import fitz 
from PIL import Image
import io
from openai import OpenAI
from pinecone import Pinecone

from langchain_openai import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain

import pdfplumber
import json

In [17]:
# with open("my_key.txt",'r') as f:
#     line = f.readline().strip()
# my_OpenAI_key = line

In [18]:
def clean_filename(filename):
    return re.sub(r'[\\/*?:"<>|]', "", filename)

def get_unique_filename(output_folder, base_name, file_ext, side=None):
    if side:
        unique_name = f"{base_name}_{side}.{file_ext}"
        if os.path.exists(os.path.join(output_folder, unique_name)):
            sides = ["left", "right", "top", "bottom"]
            sides.remove(side)
            for new_side in sides:
                unique_name = f"{base_name}_{new_side}.{file_ext}"
                if not os.path.exists(os.path.join(output_folder, unique_name)):
                    break
    else:
        counter = 1
        unique_name = f"{base_name}.{file_ext}"
        while os.path.exists(os.path.join(output_folder, unique_name)):
            unique_name = f"{base_name}_{counter}.{file_ext}"
            counter += 1
    return unique_name

def extract_images_from_pdf(pdf_path, output_folder):
    pdf_document = fitz.open(pdf_path)
    pdf_file = open(pdf_path, "rb")
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    
    image_count = 0

    for page_number in range(len(pdf_reader.pages)):
        page = pdf_reader.pages[page_number]
        try:
            xObject = page['/Resources']['/XObject'].get_object()
            for obj in xObject:
                if xObject[obj]['/Subtype'] == '/Image':
                    image = xObject[obj]
                    size = (image['/Width'], image['/Height'])
                    data = image.get_data()

                    if image['/ColorSpace'] == '/DeviceRGB':
                        mode = "RGB"
                    elif image['/ColorSpace'] == '/DeviceCMYK':
                        mode = "CMYK"
                    else:
                        mode = "P"

                    if image['/Filter'] == '/DCTDecode':
                        file_ext = "jpg"
                        image_data = io.BytesIO(data)
                        img = Image.open(image_data)
                    elif image['/Filter'] == '/FlateDecode':
                        file_ext = "png"
                        img = Image.frombytes(mode, size, data)
                    elif image['/Filter'] == '/JPXDecode':
                        file_ext = "jp2"
                        image_data = io.BytesIO(data)
                        img = Image.open(image_data)
                    else:
                        file_ext = "bmp"
                        image_data = io.BytesIO(data)
                        img = Image.open(image_data)

                    pdf_page = pdf_document.load_page(page_number)
                    text = pdf_page.get_text("text")
                    figure_number = None
                    side = None

                    for line in text.split('\n'):
                        if 'Figure' in line:
                            figure_number = line.strip().split()[1]
                            if 'left' in line.lower():
                                side = "left"
                            elif 'right' in line.lower():
                                side = "right"
                            elif 'top' in line.lower():
                                side = "top"
                            elif 'bottom' in line.lower():
                                side = "bottom"
                            break

                    if figure_number:
                        sanitized_figure_number = clean_filename(figure_number)
                        base_name = f"{os.path.basename(pdf_path).split('.')[0]}_Figure_{sanitized_figure_number}"
                        unique_name = get_unique_filename(output_folder, base_name, file_ext, side)
                    else:
                        image_count += 1
                        base_name = f"{os.path.basename(pdf_path).split('.')[0]}_image{image_count}"
                        unique_name = get_unique_filename(output_folder, base_name, file_ext)

                    image_path = os.path.join(output_folder, unique_name)
                    img.save(image_path)
                    print(f"Saved image {image_path}")

        except KeyError:
            print(f"No images found on page {page_number + 1}")
        except Exception as e:
            print(f"Could not extract image on page {page_number + 1}, object {obj}: {e}")

    pdf_file.close()

def extract_text(pdf_path):
    reader = PyPDF2.PdfReader(pdf_path)
    text = ""
    for page_num in range(len(reader.pages)):
        page = reader.pages[page_num]
        text += page.extract_text() + "\n"
    return text

def chunk_text(text, chunk_size=2500, chunk_overlap=800):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )
    return splitter.create_documents([text])

def process_pdf(pdf_path, output_dir):
    extract_images_from_pdf(pdf_path, output_dir)
    text = extract_text(pdf_path)
    chunks = chunk_text(text)

    doc_name = os.path.basename(pdf_path)
    serial_id = 0
    total_word_count = 0

    final_docs = []
    for chunk in chunks:
        word_count = len(chunk.page_content.split())
        chunk.metadata['document_id'] = doc_name
        chunk.metadata['word_cnt'] = word_count
        chunk.metadata['sid'] = serial_id
        final_docs.append(chunk)
        total_word_count += word_count
        serial_id += 1

    print(f'Documents were split into {len(final_docs)} total docs for {doc_name}')
    mean_words = total_word_count / len(final_docs)
    print(f'Average word count of each document: {mean_words}')

    return final_docs

def process_all_pdfs(folder_path, output_dir):
    all_final_docs = []

    for file_name in os.listdir(folder_path):
        if file_name.lower().endswith('.pdf'):
            pdf_path = os.path.join(folder_path, file_name)
            final_docs = process_pdf(pdf_path, output_dir)
            all_final_docs.extend(final_docs)

    return all_final_docs

In [19]:
input_folder = '/Users/arielazria/Library/CloudStorage/OneDrive-TheUniversityofChicago/Summer Hackathon/Academic_Papers'
output_folder = 'extracted_images'

if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Process all PDFs in the folder
all_final_docs = process_all_pdfs(input_folder, output_folder)

No images found on page 2
No images found on page 3
No images found on page 4
No images found on page 6
No images found on page 9
No images found on page 10
No images found on page 12
No images found on page 13
No images found on page 14
No images found on page 15
No images found on page 16
No images found on page 17
No images found on page 19
No images found on page 24
No images found on page 25
No images found on page 26
No images found on page 27
No images found on page 29
No images found on page 30
No images found on page 31
No images found on page 32
No images found on page 33
No images found on page 34
No images found on page 35
No images found on page 36
No images found on page 37
No images found on page 38
No images found on page 40
No images found on page 42
No images found on page 43
No images found on page 44
No images found on page 45
No images found on page 47
No images found on page 49
No images found on page 50
No images found on page 51
No images found on page 52
No ima

In [20]:
# Setting up environment
from openai import OpenAI

my_OpenAI_key = os.getenv('OPENAI_API_KEY')
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
HUGGING_FACE_API_KEY = os.getenv('HUGGING_FACE_API_KEY')
PINECONE_API_ENV = os.getenv('PINECONE_API_ENV')

pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index("ai-cademic")
client = OpenAI(api_key=my_OpenAI_key)
EMBEDDING_MODEL = "text-embedding-3-small"

In [22]:
embeddings_model = OpenAIEmbeddings(model=EMBEDDING_MODEL,
openai_api_key=my_OpenAI_key,
disallowed_special=())

# Assuming chunks is a list of strings
def get_embedding(text, model="text-embedding-3-small"):
    return client.embeddings.create(input=[text], model=model).data[0].embedding

def split_text(text, max_length=8000):
    text = text.replace("\n", " ")
    words = text.split()
    chunks = []
    current_chunk = []

    current_length = 0
    for word in words:
        if current_length + len(word) + 1 > max_length:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(word)
        current_length += len(word) + 1

    if current_chunk:
        chunks.append(' '.join(current_chunk))
    
    return chunks


In [23]:

texts = [doc.page_content for doc in all_final_docs]
text_string = ' '.join(texts)

chunks = split_text(text_string)
chunk_list = [ ]

for x,y in enumerate(chunks):
    chunk_list.append((x,y))

In [24]:
embedding_list = []
exceptions = [ ]

for row in chunk_list:
    if len(row) != 2:
        print(f"Skipping row {row} because it does not contain exactly two elements")
        continue

    x, y = row
    print(f"Processing x: {x}, y: {y}")  # Debugging line to check the values of x and y

    try:
        y = get_embedding(y, model='text-embedding-3-small')
        z = (str(x), y)
        embedding_list.append(z)
    except Exception as e:
        exceptions.append(e)
        print(f"Error processing row {row}: {e}")

print("Final embedding list:", embedding_list)

Processing x: 0, y: Challenges and Applications of Large Language Models Jean Kaddourα,†,∗, Joshua Harrisβ,∗, Maximilian Mozesα, Herbie Bradleyγ,δ,ϵ, Roberta Raileanuζ, and Robert McHardyη,∗ αUniversity College LondonβUK Health Security AgencyγEleutherAI δUniversity of CambridgeϵStability AIζMeta AI ResearchηInstaDeep Abstract Large Language Models (LLMs) went from non-existent to ubiquitous in the machine learn- ing discourse within a few years. Due to the fast pace of the field, it is difficult to identify the remaining challenges and already fruitful application areas. In this paper, we aim to es- tablish a systematic set of open problems and application successes so that ML researchers can comprehend the field’s current state more quickly and become productive. Contents 1 Introduction 1 2 Challenges 2 2.1 Unfathomable Datasets . . . . . . 2 2.2 Tokenizer-Reliance . . . . . . . . 4 2.3 High Pre-Training Costs . . . . . 6 2.4 Fine-Tuning Overhead . . . . . . 10 2.5 High Inference Lat

In [25]:
exceptions # there were no exceptions

[]

In [26]:

# # embeddings = [get_embedding(chunk, model='text-embedding-3-small') for chunk in chunks]

# id_vector_tuples = [(str(idx), embedding) for idx, embedding in enumerate(embeddings)]

# # Now we have to split up our id_vector_tuples for upload
# # We'll do this in batches
# def upload_in_batches(index, vectors, batch_size=100):
#     for i in range(0, len(vectors), batch_size):
#         batch = vectors[i:i + batch_size]
#         index.upsert(vectors=batch)


In [27]:
def upload_in_batches(index, vectors, batch_size=100):
    for i in range(0, len(vectors), batch_size):
        batch = vectors[i:i + batch_size]
        index.upsert(vectors=batch)

upload_in_batches(index, embedding_list)

In [28]:
dic_embed = { }
for x,y in chunk_list:
    dic_embed[x] = y

In [44]:
user_question="Why does processing multi-model text pose such a challenge for researchers?"

In [45]:
import os
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
'''
    # Now do stuff
    if 'trial' not in pc.list_indexes().names():
        pc.create_index(
            name='trial', 
            dimension=1536, 
            metric='cosine',
            spec=ServerlessSpec(
                cloud='aws',
                region='us-east-1'
            )
   
        )

'''

myindex=pc.Index('ai-cademic')

queryresult=myindex.query(
    #namespace="( Default )",
    vector=get_embedding(user_question, model='text-embedding-3-small') ,
    top_k=5,
    include_values=True
)

queryresult['matches'][0]

{'id': '0',
 'score': 0.528125346,
 'values': [0.00493527623,
            0.0196973663,
            0.069808282,
            -0.00589025579,
            0.015687909,
            0.0125532439,
            0.0252231248,
            -0.000379531499,
            -0.0302094314,
            0.0613519736,
            0.0153379934,
            -0.0665423945,
            -0.0482009575,
            0.0192016512,
            -0.0134790633,
            -0.00104245869,
            0.0222196784,
            -0.0265207309,
            0.0139018791,
            0.0506212078,
            0.0350499377,
            0.0210387111,
            -0.00473844865,
            0.0149078881,
            0.0281682536,
            -0.0318423733,
            0.047851041,
            0.020542996,
            7.75123626e-05,
            -0.0302094314,
            0.0583776869,
            -0.0363329649,
            -0.0501254946,
            -0.0269727074,
            -0.0209803917,
            0.0584651642,
          

In [32]:
dic_embed[int(queryresult['matches'][0]['id'])]

'Challenges and Applications of Large Language Models Jean Kaddourα,†,∗, Joshua Harrisβ,∗, Maximilian Mozesα, Herbie Bradleyγ,δ,ϵ, Roberta Raileanuζ, and Robert McHardyη,∗ αUniversity College LondonβUK Health Security AgencyγEleutherAI δUniversity of CambridgeϵStability AIζMeta AI ResearchηInstaDeep Abstract Large Language Models (LLMs) went from non-existent to ubiquitous in the machine learn- ing discourse within a few years. Due to the fast pace of the field, it is difficult to identify the remaining challenges and already fruitful application areas. In this paper, we aim to es- tablish a systematic set of open problems and application successes so that ML researchers can comprehend the field’s current state more quickly and become productive. Contents 1 Introduction 1 2 Challenges 2 2.1 Unfathomable Datasets . . . . . . 2 2.2 Tokenizer-Reliance . . . . . . . . 4 2.3 High Pre-Training Costs . . . . . 6 2.4 Fine-Tuning Overhead . . . . . . 10 2.5 High Inference Latency . . . . . . 11

In [33]:
import json

def save_dict_to_json(dictionary, file_path):
    with open(file_path, 'w') as file:
        json.dump(dictionary, file, indent=4)
save_dict_to_json(dic_embed, "dicembed.json")

In [34]:
len(queryresult['matches'])

5

In [35]:
answer_context = ""

for x in range(0, len(queryresult['matches'])): 
    j = dic_embed[int(queryresult['matches'][x]['id'])]

    answer_context += j
    answer_context += '\n'


In [36]:
answer_context



In [37]:
file_path = 'answer_context.txt'

with open(file_path, 'w') as file: 
    file.write(answer_context)

In [38]:
with open('answer_context.txt', 'r') as f:
    doc_text = f.read()

doc_name = 'answer_context.txt.txt'

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2500,
    chunk_overlap=800,
    length_function = len,
)

# with open(file_path, "r") as f:
#     text = f.read()

split_docs = text_splitter.create_documents([doc_text])
# Keep serial id counter to assign a chronological number to each document
serial_id = 0
# Append modified split docs to this final list
final_docs = []
# To keep track of word count, will divide by number of sections to see the avg section length
total_word_cnt = 0
for doc in split_docs:
    word_cnt = len(doc.page_content.split())
    # doc.metadata['folder_id'] = folder_id
    doc.metadata['document_id'] = doc_name
    doc.metadata['word_cnt'] = word_cnt
    # sub_doc.metadata = doc.metadata
    doc.metadata['sid'] = serial_id
    final_docs.append(doc)
    total_word_cnt += word_cnt
    serial_id += 1
print(f'Legal document split into {len(final_docs)} total docs\n')
avg_word_cnt = total_word_cnt / len(final_docs)
print(f'Average word count of each document: {avg_word_cnt}\n')

Legal document split into 25 total docs

Average word count of each document: 339.16



#### QUERYING

In [39]:
from langchain import OpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate

# Initialize the OpenAI model (make sure you have set your OpenAI API key in the environment)
model = OpenAI(model_name="gpt-4o")  # Example model name

qa_prompt_template = '''<s>[INST] You are an expert in academic papers that specializes in helping students, faculty, and researchers 
answer pressing question. Given the following context, answer the provided question using wording that anyone can understand. 
Use a level of detail in your answer that helps the researcher understand the topic better. 
If you don't know the answer to the question from the conext, say you don't know the answer. Do not make anything up [/INST]<s>
[INST]
Context: {context}
Question: {question}
Helpful answer: [/INST]'''

# Define your prompt template
qa_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="Context: {context}\n\nQuestion: {question}\n\nAnswer:"
)

#Ouput parser
output_parser = RegexParser(
    regex = r"(.*?)\nScore: (.*)", 
    output_keys=["answer", "score"],
)

qa_prompt = PromptTemplate(
    template = qa_prompt_template, 
    input_variables=["context", 
    "question"
    ], 
    output_parser =output_parser,
)

# Load the QA chain with the appropriate parameters
load_qa_chain_text = load_qa_chain(
    model,
    chain_type="stuff",
    prompt=qa_prompt
)

print("QA Chain successfully loaded!")

QA Chain successfully loaded!


In [50]:

def get_llm_answer(question, context, api_key):
    # Setting up the headers for the API request
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    
    # Setting up the payload for the API request
    payload = {
        "model": "gpt-3.5-turbo",
        "messages": [
            {
                "role": "user",
                "content": question + context
            }
        ],
        "max_tokens": 300
    }
    
    # Making the POST request to the OpenAI API
    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
    
    # Returning the JSON response
    return response.json()



In [54]:
def get_llm_answer(question, context, api_key):
    # Validate input parameters
    if not api_key:
        raise ValueError("API key is required")
    if not question:
        raise ValueError("Question is required")
    
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    
    payload = {
        "model": "gpt-3.5-turbo",
        "messages": [
            {
                "role": "user",
                "content": question + context
            }
        ],
        "max_tokens": 300
    }
    
    try:
        response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
        response.raise_for_status()  # Raise an error for bad HTTP status codes
        
        # Extract the plain text answer from the response
        answer = response.json()['choices'][0]['message']['content'].strip()
        return answer
    
    except requests.exceptions.RequestException as e:
        return {"error": str(e)}


In [57]:
answer_context = dic_embed[int(queryresult['matches'][0]['id'])]
# checking work

In [58]:
# Example usage:
# api_key = "your_openai_api_key"
# response = get_llm_answer("What is the capital of France?", "", api_key)
# print(response)
openai_api_key # your open AI key here
instructions=" Here's some context, only use it if it is relevant to answer the question, if it is not, mention that the information found did not satisfy the need of the user."
get_llm_answer(question=user_question+instructions,context=answer_context,api_key=openai_api_key)

'Multi-model text processing poses a challenge for researchers due to a variety of reasons, including:\n\n1. Unfathomable Datasets: The size of modern pre-training datasets for large language models (LLMs) renders it impractical for any individual to read or conduct quality assessments thoroughly. This can result in issues such as near-duplicates, data contamination, and other dataset-related challenges.\n\n2. Tokenizer-Reliance: There is a heavy reliance on tokenization methods in processing multi-model text, which can sometimes be limiting or insufficient for capturing the complexities of natural language.\n\n3. High Pre-Training Costs: Training and pre-training large language models on vast amounts of data can be resource-intensive and expensive, posing a challenge for researchers with limited resources.\n\n4. Fine-Tuning Overhead: Fine-tuning LLMs for specific tasks can also be challenging and time-consuming, as it requires additional data and resources to adapt the model to specif