In [1]:
from pinecone import Pinecone, ServerlessSpec
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_pinecone import PineconeVectorStore
import os

os.environ["PINECONE_API_KEY"]="7c6028ed-8d2a-4003-8e7f-fca445aad9a1"
api_key = os.environ.get("PINECONE_API_KEY")
pc = Pinecone(api_key=api_key)

In [2]:
# Load PDF gpt technical report and split
loader = PyPDFLoader("gpt-4.pdf")
documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=550,
                                                   chunk_overlap=55)
texts = text_splitter.split_documents(documents)

In [3]:
# Load the embedding model 
model_name="BAAI/bge-m3"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True}
embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)



In [4]:
index_name = 'langchain-retrieval'
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1024,
        spec=ServerlessSpec(
            cloud='aws', 
            region='us-east-1'
        ) 
    )

In [4]:
embeddings

HuggingFaceBgeEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 8192, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='BAAI/bge-m3', cache_folder=None, model_kwargs={'device': 'cpu'}, encode_kwargs={'normalize_embeddings': False}, query_instruction='Represent this question for searching relevant passages: ', embed_instruction='')

In [11]:
len(texts)

610

In [12]:
print(pc.Index(index_name).describe_index_stats())

{'dimension': 1024,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}


In [5]:
# Connect to Pinecone index and insert the chunked docs as contents

gptDB = PineconeVectorStore.from_documents(texts, embeddings, index_name=index_name)

In [14]:
print(pc.Index(index_name).describe_index_stats())

{'dimension': 1024,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 610}},
 'total_vector_count': 610}


In [6]:
query = "How is the performance of GPT-4 compared to GPT-3 or GPT-3.5?"
docs = gptDB.similarity_search_with_score(query, k=3)
for i in docs:
    doc, score = i
    print({"score": score, "content": doc.page_content, "metadata": doc.metadata },"\n")

{'score': 0.674135506, 'content': 'gpt-4 (no vision)\ngpt3.5Figure 4. GPT performance on academic and professional exams. In each case, we simulate the\nconditions and scoring of the real exam. Exams are ordered from low to high based on GPT-3.5\nperformance. GPT-4 outperforms GPT-3.5 on most exams tested. To be conservative we report the\nlower end of the range of percentiles, but this creates some artifacts on the AP exams which have very\nwide scoring bins. For example although GPT-4 attains the highest possible score on AP Biology (5/5),', 'metadata': {'page': 5.0, 'source': 'gpt-4.pdf'}} 

{'score': 0.669958115, 'content': 'Figure 11: Results on IF evaluations across GPT3.5, GPT3.5-Turbo, GPT-4-launch\n98', 'metadata': {'page': 97.0, 'source': 'gpt-4.pdf'}} 

{'score': 0.659658134, 'content': 'Askell et al.\n2022Askell et al.\n2022gpt-3.5-base gpt-3.5-base gpt-3.5-turbo gpt-4-base gpt-4-base gpt-4\n0%10%20%30%40%50%60%70%\nModelAccuracyAccuracy on adversarial questions (TruthfulQA

In [7]:
query = "Wie hat sich die Performance von GPT-4 im Vergleich zu GPT-3 ode GPT3.5 verändert?"
docs = gptDB.similarity_search_with_score(query, k=3)
for i in docs:
    doc, score = i
    print({"score": score, "content": doc.page_content, "metadata": doc.metadata },"\n")

{'score': 0.689277887, 'content': 'Figure 11: Results on IF evaluations across GPT3.5, GPT3.5-Turbo, GPT-4-launch\n98', 'metadata': {'page': 97.0, 'source': 'gpt-4.pdf'}} 

{'score': 0.660343409, 'content': '2 GPT-4 Observed Safety Challenges\nGPT-4 demonstrates increased performance in areas such as reasoning, knowledge retention, and\ncoding, compared to earlier models such as GPT-2[ 22] and GPT-3.[ 10] Many of these improvements\nalso present new safety challenges, which we highlight in this section.\nWe conducted a range of qualitative and quantitative evaluations of GPT-4. These evaluations\nhelped us gain an understanding of GPT-4’s capabilities, limitations, and risks; prioritize our', 'metadata': {'page': 43.0, 'source': 'gpt-4.pdf'}} 

{'score': 0.644460142, 'content': 'GPT-4 signiﬁcantly reduces hallucinations relative to previous GPT-3.5 models (which have them-\nselves been improving with continued iteration). GPT-4 scores 19 percentage points higher than our\nlatest GPT-3.

In [8]:
retriever = gptDB.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [9]:
retriever.invoke(query)

[Document(metadata={'page': 97.0, 'source': 'gpt-4.pdf'}, page_content='Figure 11: Results on IF evaluations across GPT3.5, GPT3.5-Turbo, GPT-4-launch\n98'),
 Document(metadata={'page': 43.0, 'source': 'gpt-4.pdf'}, page_content='2 GPT-4 Observed Safety Challenges\nGPT-4 demonstrates increased performance in areas such as reasoning, knowledge retention, and\ncoding, compared to earlier models such as GPT-2[ 22] and GPT-3.[ 10] Many of these improvements\nalso present new safety challenges, which we highlight in this section.\nWe conducted a range of qualitative and quantitative evaluations of GPT-4. These evaluations\nhelped us gain an understanding of GPT-4’s capabilities, limitations, and risks; prioritize our'),
 Document(metadata={'page': 9.0, 'source': 'gpt-4.pdf'}, page_content='GPT-4 signiﬁcantly reduces hallucinations relative to previous GPT-3.5 models (which have them-\nselves been improving with continued iteration). GPT-4 scores 19 percentage points higher than our\nlatest 

In [10]:
from langfuse.callback import CallbackHandler
from langfuse import Langfuse

langfuse_handler = CallbackHandler(
  secret_key="sk-lf-1bd1db59-41de-49e5-896b-ee636349abd2",
  public_key="pk-lf-17015370-b722-4971-bc9b-898e2ff784bd",
  host="https://cloud.langfuse.com"
)

In [12]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate


llm = ChatOllama(model="llama3")
system_prompt = ("You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question."
    "If you don't know the answer, say that you don't know. "
    "Use four sentences maximum and keep the answer as precise as possible."
    "\n\n"
    "{context}")
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)


chain = prompt | llm | StrOutputParser()


question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

response = rag_chain.invoke({"input": "How is the performance of GPT-4 compared to GPT-3 or GPT-3.5?"}, config={"callbacks": [langfuse_handler]})
#response = rag_chain.invoke({"input": "Wie ist das Wetter heute?"})

print(response["answer"])

According to the context, GPT-4 outperforms GPT-3.5 on most exams tested, and it significantly outperforms both GPT-3.5 and Askell et al [100] in terms of accuracy on adversarial questions (TruthfulQA mc1).


In [None]:
pc.delete_index(index_name)