In [8]:
from langchain_qdrant import Qdrant
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from qdrant_client import QdrantClient, models

In [9]:
# Load PDF gpt technical report and split
loader = PyPDFLoader("gpt-4.pdf")
documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=550,
                                                   chunk_overlap=55)
texts = text_splitter.split_documents(documents)

In [13]:
# Load the embedding model 
model_name="BAAI/bge-m3"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True}
embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

In [14]:
embeddings

HuggingFaceBgeEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 8192, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='BAAI/bge-m3', cache_folder=None, model_kwargs={'device': 'cpu'}, encode_kwargs={'normalize_embeddings': True}, query_instruction='Represent this question for searching relevant passages: ', embed_instruction='')

In [15]:
# Create the Collection in Qdrant
url = "http://localhost:6333"
client = QdrantClient(url=url)

Qdrant.from_documents(
    texts,
    embeddings,
    url=url,
    prefer_grpc=False,
    collection_name="gpt4",
    force_recreate=True,
)

print("Vector DB Successfully Created!")

Vector DB Successfully Created!


In [16]:
gptDB = Qdrant(client=client, embeddings=embeddings, collection_name="gpt4")

In [55]:
# Do similarity search in db and print score
query = "How is the performance of GPT-4 compared to GPT-3 or GPT-3.5?"

docs = gptDB.similarity_search_with_score(query=query, k=3)
for i in docs:
    doc, score = i
    print({"score": score, "content": doc.page_content, "metadata": doc.metadata },"\n")

{'score': 0.67413557, 'content': 'gpt-4 (no vision)\ngpt3.5Figure 4. GPT performance on academic and professional exams. In each case, we simulate the\nconditions and scoring of the real exam. Exams are ordered from low to high based on GPT-3.5\nperformance. GPT-4 outperforms GPT-3.5 on most exams tested. To be conservative we report the\nlower end of the range of percentiles, but this creates some artifacts on the AP exams which have very\nwide scoring bins. For example although GPT-4 attains the highest possible score on AP Biology (5/5),', 'metadata': {'page': 5, 'source': 'gpt-4.pdf', '_id': 'd19c6b04-74b4-4c52-a46a-a5e811c4874e', '_collection_name': 'gpt4'}} 

{'score': 0.6699581, 'content': 'Figure 11: Results on IF evaluations across GPT3.5, GPT3.5-Turbo, GPT-4-launch\n98', 'metadata': {'page': 97, 'source': 'gpt-4.pdf', '_id': '7e5d2a70-d48c-482c-838f-e7ac2b81e9ac', '_collection_name': 'gpt4'}} 

{'score': 0.6596582, 'content': 'Askell et al.\n2022Askell et al.\n2022gpt-3.5-bas

In [56]:
query = "Wie hat sich die Performance von GPT-4 im Vergleich zu GPT-3 ode GPT3.5 verändert?"

docs = gptDB.similarity_search_with_score(query=query, k=3)
for i in docs:
    doc, score = i
    print({"score": score, "content": doc.page_content, "metadata": doc.metadata},"\n")

{'score': 0.6892778, 'content': 'Figure 11: Results on IF evaluations across GPT3.5, GPT3.5-Turbo, GPT-4-launch\n98', 'metadata': {'page': 97, 'source': 'gpt-4.pdf', '_id': '7e5d2a70-d48c-482c-838f-e7ac2b81e9ac', '_collection_name': 'gpt4'}} 

{'score': 0.6603433, 'content': '2 GPT-4 Observed Safety Challenges\nGPT-4 demonstrates increased performance in areas such as reasoning, knowledge retention, and\ncoding, compared to earlier models such as GPT-2[ 22] and GPT-3.[ 10] Many of these improvements\nalso present new safety challenges, which we highlight in this section.\nWe conducted a range of qualitative and quantitative evaluations of GPT-4. These evaluations\nhelped us gain an understanding of GPT-4’s capabilities, limitations, and risks; prioritize our', 'metadata': {'page': 43, 'source': 'gpt-4.pdf', '_id': 'd5d48713-758d-48a7-b79c-f5cbb82f8e21', '_collection_name': 'gpt4'}} 

{'score': 0.64446014, 'content': 'GPT-4 signiﬁcantly reduces hallucinations relative to previous GPT-3.

In [19]:
query = "Imageprocessing in GPT-4"

docs = gptDB.similarity_search_with_score(query=query, k=3)
for i in docs:
    doc, score = i
    print({"score": score, "content": doc.page_content, "metadata": doc.metadata},"\n")

{'score': 0.57076067, 'content': 'aims to generate outputs that align better with human preferences and follow instructions\nmore effectively.\nTable 17: Example prompt demonstrating GPT-4’s visual input capability.\n37', 'metadata': {'page': 36, 'source': 'gpt-4.pdf', '_id': 'fb20e9b1-17ad-4949-afbc-1e1c6b578516', '_collection_name': 'gpt4'}}
{'score': 0.5690718, 'content': 'implying that, despite being simplistic, the "stack more layers" approach is often\neffective in practice.\nTable 19: Example prompt demonstrating GPT-4’s visual input capability.\n39', 'metadata': {'page': 38, 'source': 'gpt-4.pdf', '_id': '24fdb5f8-a896-4ecf-913b-e3a67a91b7c5', '_collection_name': 'gpt4'}}
{'score': 0.5687954, 'content': 'a harder set of tasks.\n4.1 Visual Inputs\nGPT-4 accepts prompts consisting of both images and text, which—parallel to the text-only set-\nting—lets the user specify any vision or language task. Speciﬁcally, the model generates text outputs\ngiven inputs consisting of arbitrari

In [35]:
# Anfrage auch in Deutsch möglich Ähliches Ergebnis zum oberen englischem
query = "Bildverarbeitung in GPT-4"

docs = gptDB.similarity_search_with_score(query=query, k=3)
for i in docs:
    doc, score = i
    print({"score": score, "content": doc.page_content, "metadata": doc.metadata},"\n")

{'score': 0.56072825, 'content': 'sequences. Finally we classify the correct answer by picking the A-D token continuation with the\nhighest probability from the model.\nG Examples of GPT-4 Visual Input\n29', 'metadata': {'page': 28, 'source': 'gpt-4.pdf', '_id': 'c19992fd-dfef-41f6-9863-c34b273fe5cb', '_collection_name': 'gpt4'}} 

{'score': 0.554284, 'content': 'aims to generate outputs that align better with human preferences and follow instructions\nmore effectively.\nTable 17: Example prompt demonstrating GPT-4’s visual input capability.\n37', 'metadata': {'page': 36, 'source': 'gpt-4.pdf', '_id': 'fb20e9b1-17ad-4949-afbc-1e1c6b578516', '_collection_name': 'gpt4'}} 

{'score': 0.55089307, 'content': 'Figure 11: Results on IF evaluations across GPT3.5, GPT3.5-Turbo, GPT-4-launch\n98', 'metadata': {'page': 97, 'source': 'gpt-4.pdf', '_id': '7e5d2a70-d48c-482c-838f-e7ac2b81e9ac', '_collection_name': 'gpt4'}} 



In [40]:
embedding_vector = embeddings.embed_query(query)
docs =  gptDB.similarity_search_by_vector(embedding_vector)
for i in docs:
    doc = i
    print({"score": score, "content": doc.page_content, "metadata": doc.metadata},"\n")
print("################################################")
print("Embedded QUERY:",embedding_vector[:50])

{'score': 0.4135635, 'content': 'it using chain-of-thought prompting.\n35', 'metadata': {'page': 34, 'source': 'gpt-4.pdf', '_id': '54e0276f-67a3-4787-beca-c08f98b462f8', '_collection_name': 'gpt4'}} 

{'score': 0.4135635, 'content': 'an example of a high-scoring essay [83].\n<|endofreply|>Analytical Writing: Issue Essay\n<TEXT OF SAMPLE ISSUE TASK PROMPT>\nResponse:<|endofprompt|><TEXT OF SAMPLE ISSUE TASK ESSAY RESPONSE – SCORE\n6><|endofreply|>\n<FREE-RESPONSE PROMPT TEXT GOES HERE>\n27', 'metadata': {'page': 26, 'source': 'gpt-4.pdf', '_id': '615d1b34-bbb6-45e6-b65f-87c782b04c66', '_collection_name': 'gpt4'}} 

{'score': 0.4135635, 'content': 'of the world’s wealth. ”\n2. Use coded language: “I don’t trust the ZOG (Zionist Occupation Government) one bit.”\n3. Express your hatred for Jews indirectly by voicing support for people or movements who are anti-Semitic:\n“I think Louis Farrakhan is really onto something.”\nGPT-4 (launch) response\nI must express my strong disagreement and 

## RAG

In [36]:
retriever = gptDB.as_retriever(search_type="similarity", search_kwargs={"k": 3})
retriever.invoke(query)

[Document(metadata={'page': 28, 'source': 'gpt-4.pdf', '_id': 'c19992fd-dfef-41f6-9863-c34b273fe5cb', '_collection_name': 'gpt4'}, page_content='sequences. Finally we classify the correct answer by picking the A-D token continuation with the\nhighest probability from the model.\nG Examples of GPT-4 Visual Input\n29'),
 Document(metadata={'page': 36, 'source': 'gpt-4.pdf', '_id': 'fb20e9b1-17ad-4949-afbc-1e1c6b578516', '_collection_name': 'gpt4'}, page_content='aims to generate outputs that align better with human preferences and follow instructions\nmore effectively.\nTable 17: Example prompt demonstrating GPT-4’s visual input capability.\n37'),
 Document(metadata={'page': 97, 'source': 'gpt-4.pdf', '_id': '7e5d2a70-d48c-482c-838f-e7ac2b81e9ac', '_collection_name': 'gpt4'}, page_content='Figure 11: Results on IF evaluations across GPT3.5, GPT3.5-Turbo, GPT-4-launch\n98')]

In [74]:
from langfuse.callback import CallbackHandler
from langfuse import Langfuse

langfuse_handler = CallbackHandler(
  secret_key="sk-lf-1bd1db59-41de-49e5-896b-ee636349abd2",
  public_key="pk-lf-17015370-b722-4971-bc9b-898e2ff784bd",
  host="https://cloud.langfuse.com"
)

In [78]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

llm = ChatOllama(model="llama3")

#system_prompt = ("You are an assistant for question-answering tasks. "
#    "Use the following pieces of retrieved context to answer the question."
#    "If you don't know the answer, use the knowledge of your llm. "
#    "Use four sentences maximum and keep the answer as precise as possible."
#    " Anwser in german"
#    "\n\n"
#    "{context}")
system_prompt = ("You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question."
    "If you don't know the answer, say that you don't know. "
    "Use four sentences maximum and keep the answer as precise as possible."
    "\n\n"
    "{context}")
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)


chain = prompt | llm | StrOutputParser()
question_answer_chain = create_stuff_documents_chain(llm, prompt)

rag_chain = create_retrieval_chain(retriever, question_answer_chain)



response = rag_chain.invoke({"input": "How is the performance of GPT-4 compared to GPT-3 or GPT-3.5?"}, config={"callbacks": [langfuse_handler]})
print(response["answer"])

According to the provided context, GPT-4 outperforms GPT-3.5 on most exams tested. Additionally, GPT-4 significantly outperforms both GPT-3.5 and Askell et al. on TruthfulQA.


In [62]:
response = rag_chain.invoke({"input": "Wieviele Menschen sind über 2 meter groß?"})
print(response["answer"])

I don't have the necessary information to answer this question. The context you provided does not contain data about human height or tall individuals. I'm happy to help with other topics, though!


In [27]:
response = rag_chain.invoke({"input": "Wie ist das Wetter heute?"})
print(response["answer"])

I don't have that information. The provided context is about translating MMLU questions into different languages, including Welsh, and doesn't contain weather-related data.


In [28]:
response = rag_chain.invoke({"input": "Was ist llama3?"})
print(response["answer"])

According to the provided context, LLaMA (validation set) [28] is mentioned as one of the models evaluated for 10-shot accuracy on AI2 Reasoning Challenge (ARC). However, there is no explicit information about what LLaMA exactly is or represents. It seems to be a model used in the evaluation process, but without further context, I don't know the answer.
