## Initialization

In [None]:
# Mount Google Drive at /content/drive (Colab only)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install llama-index
!pip install llama-index-core
!pip install llama-index-embeddings-openai
!pip install llama-index-postprocessor-flag-embedding-reranker
!pip install git+https://github.com/FlagOpen/FlagEmbedding.git
!pip install llama-parse

Collecting llama-index
  Downloading llama_index-0.10.26-py3-none-any.whl (6.9 kB)
Collecting llama-index-agent-openai<0.3.0,>=0.1.4 (from llama-index)
  Downloading llama_index_agent_openai-0.2.2-py3-none-any.whl (12 kB)
Collecting llama-index-cli<0.2.0,>=0.1.2 (from llama-index)
  Downloading llama_index_cli-0.1.11-py3-none-any.whl (26 kB)
Collecting llama-index-core<0.11.0,>=0.10.26 (from llama-index)
  Downloading llama_index_core-0.10.26-py3-none-any.whl (15.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.4/15.4 MB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting llama-index-embeddings-openai<0.2.0,>=0.1.5 (from llama-index)
  Downloading llama_index_embeddings_openai-0.1.7-py3-none-any.whl (6.0 kB)
Collecting llama-index-indices-managed-llama-cloud<0.2.0,>=0.1.2 (from llama-index)
  Downloading llama_index_indices_managed_llama_cloud-0.1.5-py3-none-any.whl (6.7 kB)
Collecting llama-index-legacy<0.10.0,>=0.9.48 (from llama-index)
  Downloading 

In [None]:
# llama-parse is async-first, running the async code in a notebook requires the use of nest_asyncio
import nest_asyncio
nest_asyncio.apply()

import os
from getpass import getpass

# Credentials must never be written into the notebook itself.
# - LLAMA_CLOUD_API_KEY: API access to llama-cloud
# - OPENAI_API_KEY: OpenAI API for embeddings/llms
# A key that is already present in the environment is kept as-is; otherwise it
# is requested interactively without echoing it to the cell output.
for _key_name in ("LLAMA_CLOUD_API_KEY", "OPENAI_API_KEY"):
    if not os.environ.get(_key_name):
        os.environ[_key_name] = getpass(f"{_key_name}: ")


In [None]:
import tiktoken
from llama_index.core import Settings, VectorStoreIndex
from llama_index.core.callbacks import CallbackManager, TokenCountingHandler
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

# Chat model and embedding model, registered as the global LlamaIndex defaults.
llm = OpenAI(model="gpt-3.5-turbo-0125")
embed_model = OpenAIEmbedding(model="text-embedding-3-small")
Settings.llm = llm
Settings.embed_model = embed_model

# Token counter: counts tokens with the gpt-3.5-turbo tiktoken encoding and is
# attached through the global callback manager.
gpt35_encode = tiktoken.encoding_for_model("gpt-3.5-turbo").encode
token_counter = TokenCountingHandler(tokenizer=gpt35_encode)
Settings.callback_manager = CallbackManager([token_counter])

## PARSE PDF

In [None]:
from llama_parse import LlamaParse

# Parse the sample PDF stored on the mounted Drive, requesting markdown output.
pdf_path = '/content/drive/MyDrive/sample_pdfs/download_file_2022-TCFDReport-e.pdf'
markdown_parser = LlamaParse(result_type="markdown")
documents = markdown_parser.load_data(pdf_path)

Started parsing the file under job_id 9dde571d-e60a-4077-983e-a29e670d997e


In [None]:
# Show the first 1000 characters of the first parsed document.
first_doc_preview = documents[0].text[:1000]
print(first_doc_preview + '...')

NO_CONTENT_HERE
---
|Commitment to Net Zero Emissions|03|
|---|---|
|Roadmap to Net Zero Emissions and Milestones|4|
|Sustainable Climate Governance|07|
|Climate Change Governance and Management Framework|7|
|Enforce Climate Governance|8|
|Enterprise Risk Management|11|
|Risk Management Framework|11|
|Major Climate Risks and Opportunities|12|
|Financial Impact Analysis of Climate Risks and Opportunities|14|
|Climate Scenario Analysis|15|
|Forward-looking Climate Strategy|22|
|Energy Conservation and Carbon Reduction in TSMC Operations|23|
|Adaptation to Climate Disasters|28|
|Low-carbon Product Innovations|31|
|Increase Climate Influence on Supply Chain|33|
|Management Performance and Goals|36|
|Net Zero Performance Evaluation and Commitment|36|
|Prospects|43|
|Appendix|44|
|About This Report|44|
|TCFD Disclosure Index|44|
|TCFD Index on Cross-industry, Climate-related Metric Categories|45|
|TSMC's Reports and Policies on Climate Change|46|
|Reference|46|

Source of Cover Photo: wpd Ta

In [None]:
from llama_index.core.node_parser import MarkdownElementNodeParser

# The element parser gets its own gpt-3.5-turbo-0125 client and 8 workers.
element_llm = OpenAI(model="gpt-3.5-turbo-0125")
node_parser = MarkdownElementNodeParser(llm=element_llm, num_workers=8)

In [None]:
# Run the MarkdownElementNodeParser over the LlamaParse documents to obtain nodes.
nodes = node_parser.get_nodes_from_documents(documents)

43it [00:00, 13459.33it/s]
columns
  field required (type=value_error.missing)
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/llama_index/core/response_synthesizers/refine.py", line 477, in _agive_response_single
    structured_response = await program.acall(
  File "/usr/local/lib/python3.10/dist-packages/llama_index/core/response_synthesizers/refine.py", line 92, in acall
    answer = await self._llm.astructured_predict(
  File "/usr/local/lib/python3.10/dist-packages/llama_index/core/llms/llm.py", line 377, in astructured_predict
    return await program.acall(**prompt_args)
  File "/usr/local/lib/python3.10/dist-packages/llama_index/program/openai/base.py", line 220, in acall
    return _parse_tool_calls(
  File "/usr/local/lib/python3.10/dist-packages/llama_index/program/openai/base.py", line 64, in _parse_tool_calls
    output = output_cls.parse_raw(function_call.arguments)
  File "/usr/local/lib/python3.10/dist-packages/pydantic/v1/main.py", l

In [None]:
# Split the parsed nodes into the two collections returned by the parser;
# both are combined again when the recursive index is built.
base_nodes, objects = node_parser.get_nodes_and_objects(nodes)

Ici, nous avons un `VectorDB` custom

In [None]:
# Recursive index: built from the parser's base nodes plus its objects.
recursive_index_nodes = base_nodes + objects
recursive_index = VectorStoreIndex(nodes=recursive_index_nodes)

# Raw index: built directly from the parsed documents, for comparison.
raw_index = VectorStoreIndex.from_documents(documents)

Ici on utilise `bge-reranker-large`

In [None]:
from llama_index.postprocessor.flag_embedding_reranker import FlagEmbeddingReranker

# Reranker configured with the BAAI/bge-reranker-large model and top_n=5.
reranker = FlagEmbeddingReranker(model="BAAI/bge-reranker-large", top_n=5)

# Every engine retrieves with the same similarity_top_k so they can be compared.
retrieval_top_k = 15

# Recursive index + reranker (verbose).
recursive_query_engine = recursive_index.as_query_engine(
    similarity_top_k=retrieval_top_k, node_postprocessors=[reranker], verbose=True
)

# Raw index + reranker.
raw_query_engine = raw_index.as_query_engine(
    similarity_top_k=retrieval_top_k, node_postprocessors=[reranker]
)

# Raw index without any reranking.
no_reranker_query_engine = raw_index.as_query_engine(similarity_top_k=retrieval_top_k)

In [None]:
# Number of nodes returned by node_parser.get_nodes_from_documents
print(len(nodes))

126


## Q&A

Types de query engines permis:
- \<raw_query_engine>
- \<recursive_query_engine>

Token counter part 1

In [None]:
def _ask_raw_engine(question, banner):
    """Query raw_query_engine with `question`, print `banner` then the answer, and return the answer."""
    answer = raw_query_engine.query(question)
    print(banner)
    print(answer)
    return answer


query = "What company is this and when was the report written?"
response_1 = _ask_raw_engine(
    query, "\n***********New LlamaParse+ Basic Query Engine***********"
)

query2 = "Does X acknowledge climate change as a significant issue for the business?"
response_2 = _ask_raw_engine(
    query2, "\n***********New LlamaParse+ Basic Query Engine Engine***********"
)



***********New LlamaParse+ Basic Query Engine***********
The company mentioned in the context is TSMC (Taiwan Semiconductor Manufacturing Company). The report was written in 2023.

***********New LlamaParse+ Basic Query Engine Engine***********
Yes, X acknowledges climate change as a significant issue for the business.


Token counter part 2

In [None]:
# Label/value pairs read from the token counter, each followed by a newline item;
# print() joins all items with its default single-space separator.
token_report_items = []
for label, count in (
    ("Embedding Tokens: ", token_counter.total_embedding_token_count),
    ("LLM Prompt Tokens: ", token_counter.prompt_llm_token_count),
    ("LLM Completion Tokens: ", token_counter.completion_llm_token_count),
    ("Total LLM Token Count: ", token_counter.total_llm_token_count),
):
    token_report_items += [label, count, "\n"]

print(*token_report_items)

Embedding Tokens:  38988 
 LLM Prompt Tokens:  36417 
 LLM Completion Tokens:  4854 
 Total LLM Token Count:  41271 



Benchmarks avec différents engines

In [None]:
from time import perf_counter, time


def _timed_query(engine, question, banner):
    """Run `question` on `engine`, report how long the query took, and return the response.

    Args:
        engine: object exposing a ``query(question)`` method.
        question: query string passed to the engine.
        banner: header line printed before the timing and the response.

    Returns:
        Whatever ``engine.query(question)`` returned.
    """
    # perf_counter is monotonic, so the measurement cannot be skewed by
    # system clock adjustments the way time() can.
    started = perf_counter()
    answer = engine.query(question)
    elapsed = perf_counter() - started

    print(banner)
    print(f"Elapsed: {round(elapsed, 2)}s")
    print(answer)
    return answer


query = "What company is this and when was the report written?"

# Same question against each engine so the timings and answers are comparable.
response_1 = _timed_query(
    raw_query_engine, query, "\n***********Raw Basic Query Engine***********"
)
response_2 = _timed_query(
    recursive_query_engine, query, "\n***********Recursive Query Engine Engine***********"
)
response_3 = _timed_query(
    no_reranker_query_engine, query, "\n***********No Reranker Query Engine Engine***********"
)

Elapsed: 114.62s

***********Raw Basic Query Engine***********
The company is General Electric (GE), and the report was written in 2022.
[1;3;38;2;11;159;203mRetrieval entering id_47dfc781-e6d9-42a0-8381-a247f16b6eda_38_table: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query What company is this and when was the report written?
[0m[1;3;38;2;11;159;203mRetrieval entering id_176bbb0e-1828-4738-948d-a545c1c358b0_50_table: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query What company is this and when was the report written?
[0m[1;3;38;2;11;159;203mRetrieval entering id_088bac8d-f94b-49c6-87fe-51be39e1459f_25_table: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query What company is this and when was the report written?
[0m[1;3;38;2;11;159;203mRetrieval entering id_176bbb0e-1828-4738-948d-a545c1c358b0_46_table: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query What company 

In [None]:
# Same climate-change question as before, phrased with "the company".
# NOTE(review): this rebinds response_3, which the benchmark cell also assigns.
query3 = "Does the company acknowledge climate change as a significant issue for the business?"
response_3 = raw_query_engine.query(query3)
print(
    "\n***********New LlamaParse+ Basic Query Engine Engine***********",
    response_3,
    sep="\n",
)


***********New LlamaParse+ Basic Query Engine Engine***********
The company acknowledges climate change as a significant issue for the business.


# Output Parser

TODO : formater les réponses au format JSON pour les intégrer au site web

In [None]:
from pydantic import BaseModel
from typing import List

from llama_index.core.program import LLMTextCompletionProgram

classes

In [None]:
# Pydantic model pairing a company name with a year.
# NOTE(review): presumably the structured answer to "What company is this and
# when was the report written?" - confirm against the output-parser usage.
class Year(BaseModel):
  company: str  # company name
  year: int  # year, as an integer

# Pydantic model for one question with a boolean answer.
# NOTE(review): the name is plural but each instance holds a single question.
class Questions(BaseModel):
  question: str  # question text
  answer: bool  # boolean answer to the question

# Sample questions; each one starts with "Does X " followed by the clause below.
# NOTE(review): "X" looks like a placeholder for the company name - confirm.
_question_clauses = (
    "acknowledge climate change as a significant issue for the business?",
    "recognize climate change as a relevant risk and/or opportunity for the business?",
    "have a policy (or equivalent) commitment to action on climate change?",
)
test_questions = ["Does X " + clause for clause in _question_clauses]

In [None]:
# define jupyter display function
def display_eval_df(query: str, response: Response, eval_result: str) -> None:
    eval_df = pd.DataFrame(
        {
            "Query": query,
            "Response": str(response),
            "Source": (
                response.source_nodes[0].node.get_content()[:1000] + "..."
            ),
            "Evaluation Result": eval_result,
        },
        index=[0],
    )
    eval_df = eval_df.style.set_properties(
        **{
            "inline-size": "600px",
            "overflow-wrap": "break-word",
        },
        subset=["Response", "Source"]
    )
    display(eval_df)