In [31]:
import pymupdf4llm
md_text = pymupdf4llm.to_markdown(r"file_location")

Processing C:\Users\Anubhav\OneDrive\Desktop\Projects\GenAI\RAG Langchain\data\time-to-place-our-bets-europes-ai-opportunity.pdf...


In [82]:
md_text

'QuantumBlack, AI by McKinsey\n# Time to place our bets: Europe’s AI opportunity\n\n##### Boosting Europe’s competitiveness across the AI value chain.\n\n_[by Alexander Sukharevsky, Eric Hazan, Sven Smit, Marc-Antoine de la Chevasnerie, Marc de Jong,](https://www.mckinsey.com/our-people/alexander-sukharevsky)_\n_[Solveigh Hieronimus, Jan Mischke, and Guillaume Dagorret](https://www.mckinsey.com/our-people/solveigh-hieronimus)_\n\n\n-----\n\n###### At a glance\n\n— A three-lens approach–on adoption,\n\ncreation, and energy–is required to assess\nEurope’s competitiveness in the emerging\ngenerative AI (gen AI) economy. While much\nof the current discourse centers around large\nlanguage models (LLMs), European policy\nmakers and business leaders must look\nbeyond LLMs. Adopting a holistic approach to\ncapitalize fully on gen AI’s potential could boost\nEuropean labor productivity by up to 3 percent\nannually through 2030.\n\n— On adoption, European organizations lag\n\nbehind their US cou

In [81]:
# CHunking
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter  = RecursiveCharacterTextSplitter(
    chunk_size = 800,
    chunk_overlap = 50,
)

chunks = text_splitter.create_documents([md_text])

In [85]:
len(chunks)

86

In [63]:
# Setting up the vector store
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

vectorstore = Chroma.from_documents(
    documents = chunks,
    collection_name = 'rag_langggraph',
    embedding = OpenAIEmbeddings()
)

retriever = vectorstore.as_retriever()

In [86]:
retriever.invoke("What is Europe's Gen AI strategy?")

[Document(metadata={}, page_content='Creation of gen AI in Europe\nRegarding creating gen AI, winning in every\nsegment isn’t a realistic strategy for Europe.\nA differentiated approach, based on current\nstrengths, is crucial for the region to stay relevant.\nPotential steps include the following:\n\n— Increase investment. In 2023, US private\n\ninvestments in AI reached $67 billion, compared\nwith just $11 billion in Europe.[60] This gap is\neven more striking when looking specifically\nat investments in gen AI. In 2023, US private'),
 Document(metadata={}, page_content='Creation of gen AI in Europe\nRegarding creating gen AI, winning in every\nsegment isn’t a realistic strategy for Europe.\nA differentiated approach, based on current\nstrengths, is crucial for the region to stay relevant.\nPotential steps include the following:\n\n— Increase investment. In 2023, US private\n\ninvestments in AI reached $67 billion, compared\nwith just $11 billion in Europe.[60] This gap is\neven more

In [88]:
# Setting up the router

from typing import Literal
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI

# Schema for Router
class Router(BaseModel):
    """
    Route a user query to the most relevant
    path for response generation.

    """
    path: Literal["vectorstore","web_search"] = Field(...,
                                                      description = "Given a user question choose to route it to web search or vectorstore")


# LLM

llm = ChatOpenAI(model = 'gpt-3.5-turbo',temperature = 0)
structured_llm_router = llm.with_structured_output(Router)

# Prompt
system = """ You're an expert at routing a user question to a vector store or web search.
The vectorstore contains documents related to Europe's competitive position and opportunities in the 
generative AI value chain, covering sectors like AI semiconductor manufacturing, cloud infrastructure, 
and energy demands.

Use the vectorstore for questions around these topics.

Otherwise use Web Search

"""
route_prompt = ChatPromptTemplate.from_messages(
    [
        ('system',system),
        ('human',"{question}")
    ]
)

router_chain = route_prompt | structured_llm_router


# Testing
route = router_chain.invoke({'question': 'How much of Europe’s productivity growth can generative AI potentially contribute annually by 2030?'})

route_2 = router_chain.invoke({'question': 'What is capital of Finland?'})


print(route_2)



path='web_search'


In [94]:
# Retrieval Grader

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI

# Schema for grading

class GradeDocuments(BaseModel):
    """
    Binary score for quanitifying the relevance of the retrieved documents

    """
    binary_score: str = Field(
        description = "Documents are relevant to the question,'yes' or 'no'"
    )

# LLM 

llm = ChatOpenAI(model = 'gpt-3.5-turbo',temperature = 0)

# Getting structured o/p from llm for grading
llm_grader = llm.with_structured_output(GradeDocuments)

# Prompt
system = """You are a grader assessing relevance of a retrieved document to a user question. \n 
    It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n
    If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant. \n
    Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question."""

grading_prompt = ChatPromptTemplate.from_messages(
    [
        ('system',system),
        ('human',"Retrieved_document: \n\n {document} \n\n User question: {question}"),
    ]
)

grader_chain = grading_prompt | llm_grader

question = "Which country in Europe has a leading position in AI semiconductor equipment manufacturing?"

docs = retriever.get_relevant_documents(question)

# Firs retrieved chunk from the retrieval
doc_text = docs[2].page_content

print(grader_chain.invoke({'question': question,
                           'document':doc_text}))

binary_score='yes'


In [96]:
# Generating response

from langchain import hub
from langchain_core.output_parsers import StrOutputParser

# Prompt
prompt = hub.pull("rlm/rag-prompt")

# llm 

llm = ChatOpenAI(model = 'gpt-3.5-turbo')

# combining the retrieved docs
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# Chain
rag_chain = prompt | llm | StrOutputParser()

# Response

response = rag_chain.invoke({'context': docs,
                             'question': question})

print(response)


The Netherlands has a leading position in AI semiconductor equipment manufacturing, with companies like ASML being the market leader for lithography machines required for high-end semiconductors suitable for AI. Additionally, European companies also lead in other equipment segments like atomic layer deposition and metal-organic chemical vapor deposition. However, in some key niches like dry etchers and dicing machines, European companies are less present.


In [97]:
# Hallucinations measure

# Schema

class GradeHallucinations(BaseModel):
    """
    Binary Score for indicating if hallucinations
    are present in the generated answer

    """
    binary_score: str = Field(
        description = "Generated answer is grounded in the context provided, 'yes' or 'no'"
        )

# Hallucination grading LLM
llm = ChatOpenAI(model = 'gpt-3.5-turbo', temperature = 0)
llm_hallucinations = llm.with_structured_output(GradeHallucinations)

# Prompt
system = """You are a grader assessing whether an Generated answer is grounded in / supported by a set of retrieved context. \n 
     Give a binary score 'yes' or 'no'. 'Yes' means that the answer is grounded in / supported by the set of facts."""

hallucination_prompt = ChatPromptTemplate.from_messages(
    [
        ('system',system),
        ('human',"Context: \n\n {documents} \n\n Generated answer: {response}"),
   
    ]
)

hallucination_grader_chain = hallucination_prompt | llm_hallucinations

hallucination_grader_chain.invoke({'documents':docs,
                                   'response': response})

GradeHallucinations(binary_score='yes')

In [98]:
### Answer Grader


# Data model
class GradeAnswer(BaseModel):
    """Binary score to assess answer addresses question."""

    binary_score: str = Field(
        description="Answer addresses the question, 'yes' or 'no'"
    )


# LLM with function call
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature = 0)

structured_llm_grader = llm.with_structured_output(GradeAnswer)

# Prompt
system = """You are a grader assessing whether an answer addresses / resolves a question \n 
     Give a binary score 'yes' or 'no'. Yes' means that the answer resolves the question."""
answer_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "User question: \n\n {question} \n\n LLM generation: {response}"),
    ]
)

answer_grader = answer_prompt | structured_llm_grader
answer_grader.invoke({"question": question, "response": response})

GradeAnswer(binary_score='yes')

In [100]:
question

'Which country in Europe has a leading position in AI semiconductor equipment manufacturing?'

In [99]:
### Question Re-writer

# LLM
llm = ChatOpenAI(model="gpt-3.5-turbo")

# Prompt
system = """You are a question re-writer that converts an input question to a better version that is optimized \n 
     for vectorstore retrieval. Look at the input and try to reason about the underlying semantic intent / meaning."""
re_write_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        (
            "human",
            "Here is the initial question: \n\n {question} \n Formulate an improved question.",
        ),
    ]
)

question_rewriter = re_write_prompt | llm | StrOutputParser()
question_rewriter.invoke({"question": question})

'Which European country is at the forefront of manufacturing AI semiconductor equipment?'

In [77]:
# Web Search tool

from langchain_community.tools.tavily_search import TavilySearchResults

web_search_tool = TavilySearchResults(k=3)

In [101]:
web_search_tool.invoke("What is capital of Finland?")

[{'url': 'https://www.mappr.co/capital-cities/finland/',
  'content': "What is the Capital of Finland? Helsinki is the capital of Finland. It is also the most populous city and serves as the country's economic, political, educational, and cultural center. Helsinki, the capital city of Finland. Helsinki has a long and complex history, and over the centuries, it evolved into a modern and cosmopolitan city."},
 {'url': 'https://www.worldatlas.com/articles/what-is-the-capital-of-finland.html',
  'content': "Helsinki is the capital of Finland. The city is located in southern Finland's Uusimaa region on the Gulf of Finland's shores. The city is 80 km away from the north of Tallinn in Estonia, 388 km west of Russia's St. Petersburg, 400 km to the east of Stockholm in Sweden. The capital city of Finland hosts a population of 629,512, and it is"},
 {'url': 'https://www.wikiwand.com/en/articles/Helsinki',
  'content': "Helsinki[lower-alpha 1][lower-alpha 2] is the capital and most populous city 