## Import libraries

In [39]:
from langchain_google_vertexai import ChatVertexAI
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_pinecone import PineconeVectorStore
from langchain_google_vertexai import VertexAIEmbeddings

from pinecone import Pinecone
from dotenv import load_dotenv
import os

In [12]:
# Import environment variables from .env file
load_dotenv()

PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME")
PINECONE_ENVIRONMENT = os.getenv("PINECONE_ENVIRONMENT")

## Instantiate vectorstore

In [9]:
# Initialize Pinecone and fetch the index
pc = Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_ENVIRONMENT)
index = pc.Index(PINECONE_INDEX_NAME)

In [13]:
# Initialize embedding model
embedding_model = VertexAIEmbeddings(model_name="text-embedding-005")

# Initialise vectorstore
vector_store = PineconeVectorStore(index=index, embedding=embedding_model)

In [14]:
# Test if the vectorstore is connected through a similarity search
vector_store.similarity_search("What are the latest advancements in transformer models?", k=3)

[Document(id='4ea51373-146b-4e09-883c-27d8559cb887', metadata={'source': 'raw_data/LLMs_Transformers_2405.06640v1.pdf'}, page_content='Efficiency improvements for vanilla transformers have narrowed the capabilities gap between\nvanilla and linear transformers. The KV cache Pope et al. (2023)greatly narrows the inference\nefficiency gap between linear and vanilla transformers. RingAttention Liu et al. (2023) allows for\nvery long context scaling of vanilla attention without approximation.\nState Space Models.\nState-space models (SSMs) such as H3 (Dao et al., 2022), Hyena (Poli et al.,\n2023), and Mamba (Gu & Dao, 2023) are recent alternatives to vanilla transformers, combining the\nstrengths of convolutional and recurrent models with efficient hardware implementations. Instead\nof parallelizing training over the sequence, they produce an efficient way to train the sequential\nRNN. While these models are competitive with vanilla transformers on some tasks, we show that\nSSMs share the l

## Instantiate the prompt, parser and LLM

In [16]:
# Instantiate LLM
llm = ChatVertexAI(model="gemini-2.5-pro-exp-03-25", temperature=0.2)

# Instantiate output parser
parser = StrOutputParser()

In [18]:
# Initialize the prompt template string
prompt_template = """
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Keep the answer as concise as possible.
Question: {question} 
Context: {context} 
Answer:
"""

# Generate the prompt template from the template string
prompt = ChatPromptTemplate.from_template(prompt_template)

## Setup retriever and function

In [33]:
def invoke_system(question):
    # Retrieve relevant docs
    docs = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 5}).invoke(question)
    context = "\n\n".join([doc.page_content for doc in docs])

    # Instantiate the chain
    chain = prompt | llm | parser

    # Run your existing chain
    result = chain.invoke({"question": question, "context": context})

    # Print the answer
    print("Answer:\n", result)

    # Print sources
    print("\nSources used:")
    seen = set()
    for i, doc in enumerate(docs, 1):
        source = doc.metadata.get("source", "unknown")
        if source not in seen:
            print(f"{i}. {source}")
            seen.add(source)


## Invoke the chain

In [37]:
invoke_system("What are the latest advancements in transformer models?")

Answer:
 Recent advancements in transformer models include:
*   Efficiency improvements such as KV cache for inference and RingAttention for long context scaling.
*   Architectural improvements like RMSNorm, SwiGLU, and removal of bias.
*   Development of more scalable variants like DIFF Transformer.
*   Emergence of alternatives like State-Space Models (SSMs) such as Mamba, Hyena, and H3.

Sources used:
1. raw_data/LLMs_Transformers_2405.06640v1.pdf
2. raw_data/LLMs_Transformers_2412.07201v1.pdf
3. raw_data/LLMs_Transformers_2410.05258v1.pdf
4. raw_data/LLMs_Transformers_2405.04515v2.pdf
5. raw_data/LLMs_Transformers_2108.00104v1.pdf


## Query Loop

In [38]:
while True:
    question = input("Ask a question (or 'exit'): ")
    if question.lower() == "exit":
        break
    invoke_system(question)

Answer:
 A Transformer is a machine learning architecture introduced in 2017, initially for machine translation, which has become a major breakthrough in AI and natural language processing (NLP). It uses attention mechanisms and multi-layer perceptrons (MLPs) to understand relationships within sequences, like words in a sentence. Formally, it's defined as a function mapping an input sequence to an output sequence through layers containing multi-head attention and position-wise feed-forward networks.

Sources used:
1. raw_data/LLMs_Transformers_2403.00807v1.pdf
2. raw_data/LLMs_Transformers_2402.09748v1.pdf
3. raw_data/LLMs_Transformers_2412.07201v1.pdf
5. raw_data/LLMs_Transformers_2410.14706v2.pdf
Answer:
 NLP is an interdisciplinary field involving linguistics, computer science, and mathematics that aims to enable computers to understand, process, and generate natural language text or speech.

Sources used:
1. raw_data/LLMs_Transformers_2503.02435v1.pdf
5. raw_data/LLMs_Transformers_