In [29]:
# install llamaindex, llamaindex azure-openai and embedding libraries
!pip install -U -q llama-index llama-hub pypdf
!pip install -U -q llama-index-embeddings-azure-openai
!pip install -U -q llama-index-llms-azure-openai
!pip install -U -q llama-index-embeddings-instructor
!pip install llama-index-question-gen-guidance

# install other utility libraries
!pip install -U -q pypdf

Collecting llama-index-question-gen-guidance
  Downloading llama_index_question_gen_guidance-0.1.2-py3-none-any.whl (2.5 kB)
Collecting llama-index-program-guidance<0.2.0,>=0.1.1 (from llama-index-question-gen-guidance)
  Downloading llama_index_program_guidance-0.1.2-py3-none-any.whl (4.6 kB)
Collecting guidance<0.2.0,>=0.1.10 (from llama-index-program-guidance<0.2.0,>=0.1.1->llama-index-question-gen-guidance)
  Downloading guidance-0.1.15-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (239 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m239.7/239.7 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Collecting diskcache (from guidance<0.2.0,>=0.1.10->llama-index-program-guidance<0.2.0,>=0.1.1->llama-index-question-gen-guidance)
  Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ordered-set (from guidance<0.2.0,>=0.1.10-

In [2]:
import nest_asyncio
# Patch asyncio so an event loop can be re-entered while another is already running.
# NOTE(review): presumably required because the notebook kernel already runs a loop
# and the query engines used below execute async code — confirm before removing.
nest_asyncio.apply()

In [3]:
from IPython.display import HTML, display

def set_css(*_event_args):
  """Inject a CSS rule that makes `<pre>` output wrap instead of overflowing.

  Intended to be used as an IPython `pre_run_cell` callback. IPython documents
  that callback as being called with one `info` argument, so any positional
  arguments are accepted and ignored; calling `set_css()` with no arguments
  still works exactly as before.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))


# Run set_css before every cell execution so the style is emitted with each cell.
# NOTE(review): presumably needed because each cell's output is rendered
# separately in Colab — confirm.
get_ipython().events.register('pre_run_cell', set_css)

In [4]:
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.query_engine import SubQuestionQueryEngine

In [6]:
from google.colab import userdata
from google.colab import drive

In [7]:
# Configure the Azure OpenAI chat model used for answering and sub-question generation.
# The key and endpoint are read from Colab's `userdata` store, so no credential
# appears in the notebook text.
api_version = "2023-12-01-preview"

api_key = userdata.get('AZURE_OPENAI_API_KEY')
azure_endpoint = userdata.get("AZURE_OPENAI_ENDPOINT")

# NOTE(review): `model` says "gpt-35-turbo" while the deployment is called
# "gpt-4-32k" — confirm which model the deployment actually serves and align them.
llm = AzureOpenAI(
    # connection settings
    azure_endpoint=azure_endpoint,
    api_key=api_key,
    api_version=api_version,
    api_type="azure",
    # which model / deployment to call
    deployment_name="gpt-4-32k",
    model="gpt-35-turbo",
)

In [10]:
# Configure the Azure OpenAI embedding model; it reuses the key, endpoint and
# API version defined for the LLM.
#
# NOTE(review): the original author flagged that this "doesn't work like this"
# and suspected a configuration error — TODO confirm the deployment name and
# API version against the Azure resource.
embed_model = AzureOpenAIEmbedding(
    # connection settings (shared with the LLM)
    azure_endpoint=azure_endpoint,
    api_key=api_key,
    api_version=api_version,
    # which embedding model / deployment to call
    deployment_name="text-embedding-ada",
    model="text-embedding-ada-002",
)

In [11]:
# Register both models on the global LlamaIndex `Settings` object. The indexes and
# query engines below are never given a model explicitly, so they presumably fall
# back to these.
Settings.llm, Settings.embed_model = llm, embed_model

In [13]:
# Mount Google Drive at /content/drive so the PDF paths defined below resolve;
# force_remount=True requests a fresh mount even if the drive is already mounted.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [14]:
# Folder on the mounted Google Drive that holds every PDF used in this notebook.
hs_data_path = '/content/drive/MyDrive/Colab_Notebooks/HS_Data'


def _hs_file(filename):
    """Return the full path of `filename` inside `hs_data_path`."""
    return hs_data_path + '/' + filename


# Datasets 1 and 2: a "TextA" / "TextX" pair each.
doc1_text_a_path = _hs_file('Dataset1_TextA.pdf')
doc1_text_x_path = _hs_file('Dataset1_TextX.pdf')
doc2_text_a_path = _hs_file('Dataset2_TextA.pdf')
doc2_text_x_path = _hs_file('Dataset2_TextX.pdf')

# Dataset 3: four party election programmes (A-D) and the Government Programme (X), all 2023.
doc3_text_a_path = _hs_file('Dataset3_TextA_National_Coalition_Party_election_program_2023.pdf')
doc3_text_b_path = _hs_file('Dataset3_TextB_Finns_Party_election_program_2023.pdf')
doc3_text_c_path = _hs_file('Dataset3_TextC_SFP_election_program_2023.pdf')
doc3_text_d_path = _hs_file('Dataset3_TextD_Christian_Democrats_election_program_2023.pdf')
doc3_text_x_path = _hs_file('Dataset3_TextX_Government_Programme_2023.pdf')

# Child-protection ("lastensuojelu") reports. Note that the second file name contains spaces.
lastensuojelu_1_path = _hs_file('Selvityshenkilon_edellytykset_lastensuojelun.pdf')
lastensuojelu_2_path = _hs_file('Toimiva lastensuojelu loppuraportti_final_19062013.pdf')
lastensuojelu_3_path = _hs_file('R_31_18_Selvityshenkilon_ehdotus_lastensuojelun_laatu.pdf')
lastensuojelu_4_path = _hs_file('STM_2020_28_rap.pdf')
lastensuojelu_5_path = _hs_file('STM_2023_r_27.pdf')

# Simple Query Engine
This doesn't keep the documents separate, so it doesn't guarantee that information is drawn equally from both documents.

In [40]:
# Read both child-protection reports into ONE document list and build a single
# vector index over it; the two PDFs are not kept apart in this index.
combined_files = [lastensuojelu_1_path, lastensuojelu_2_path]
documents = SimpleDirectoryReader(input_files=combined_files).load_data()
base_index = VectorStoreIndex.from_documents(documents)

In [43]:
# Query engine over the combined index, created with similarity_top_k=4.
# NOTE(review): presumably the 4 best-matching chunks are retrieved per query,
# drawn from either document — nothing forces both documents to be represented.
query_engine = base_index.as_query_engine(similarity_top_k=4)

In [44]:
# Ask the combined engine for a cross-document comparison; the prompt itself
# requests that the answer be written in Finnish.
query = 'What are the differences about the subject "education" in the two documents? answer in finnish'
response = query_engine.query(query)

print(response)

Ensimmäisessä dokumentissa keskitytään sosiaalityöntekijöiden koulutukseen ja sen kehittämiseen. Siinä ehdotetaan, että osa erikoistumiskoulutuksesta voisi keskittyä erityisesti vastavalmistuneiden sosiaalityöntekijöiden lastensuojelun asiakastyöhön täydentävän osaamisen hankkimiseen. Dokumentissa tuodaan esille myös koulutuksen ja käytännön työn väliset ristiriidat, kuten teoreettisen tiedon soveltamisen vaikeus käytäntöön ja eettisten periaatteiden noudattamisen haasteet suuren asiakasmäärän kanssa.

Toisessa dokumentissa ei keskitytä suoraan koulutukseen, vaan se käsittelee laajemmin lastensuojelun kysymyksiä, kuten ammattilaisten tilanteen kehittämistä, tehtävärakennesuosituksen tarkistamista, eri ammattilaisten yhteistyötä, lasten ja nuorten osallisuusasioita sekä jälkihuollon ja maahanmuuttajataustaisten lasten ja perheiden kysymyksiä.


# SubQuestionQueryEngine

This query engine does the following:
- Can treat different documents as different "tools"
- Breaks down a complex question into sub-questions over any subset of different documents
- Does retrieval independently per document
- Combines results at the end.

In [15]:
# create a separate vector store index for each document

def _load_and_index(pdf_path):
    """Load one PDF and build a vector store index over it alone.

    Args:
        pdf_path: path of the single file to read.

    Returns:
        A `(documents, index)` tuple: the documents returned by
        `SimpleDirectoryReader(...).load_data()` and the `VectorStoreIndex`
        built from exactly those documents.
    """
    loaded = SimpleDirectoryReader(input_files=[pdf_path]).load_data()
    return loaded, VectorStoreIndex.from_documents(loaded)


# Dataset 3, one index per text: party programmes A-D, then the Government Programme (X).
document_1_document, document_1_index = _load_and_index(doc3_text_a_path)
document_2_document, document_2_index = _load_and_index(doc3_text_b_path)
document_3_document, document_3_index = _load_and_index(doc3_text_c_path)
document_4_document, document_4_index = _load_and_index(doc3_text_d_path)
document_5_document, document_5_index = _load_and_index(doc3_text_x_path)



In [22]:
# Build one query engine per document index, each created with similarity_top_k=3,
# so retrieval for a document only ever looks at that document's own index.
(
    document_1_engine,
    document_2_engine,
    document_3_engine,
    document_4_engine,
    document_5_engine,
) = [
    per_document_index.as_query_engine(similarity_top_k=3)
    for per_document_index in (
        document_1_index,
        document_2_index,
        document_3_index,
        document_4_index,
        document_5_index,
    )
]

In [23]:
# Wrap each per-document query engine as a named tool and hand all of them to a
# SubQuestionQueryEngine. No query is sent in this cell.

# NOTE: despite its name, `question` is not a query — it is the description text
# attached to every tool, and all five tools share it; only the names differ.
question="Provides information about climate actions"

query_engine_tools = [
    QueryEngineTool(
        query_engine=tool_engine,
        metadata=ToolMetadata(name=tool_name, description=question),
    )
    for tool_name, tool_engine in (
        ("National_Coalition_Party_election_program_2023", document_1_engine),
        ("Finns_Party_election_program", document_2_engine),
        ("SFP_election_program", document_3_engine),
        ("Christian_Democrats_election_program", document_4_engine),
        ("Government_program", document_5_engine),
    )
]

s_engine = SubQuestionQueryEngine.from_defaults(query_engine_tools=query_engine_tools)

In [24]:
# Send the comparison question to the sub-question engine and print its answer.
# Note that this rebinds `response`, which the cells below read.
response = s_engine.query(
    "Compare and contrast the climate actions of the five documents"
)

print(response)

Generated 5 sub questions.
[1;3;38;2;237;90;200m[National_Coalition_Party_election_program_2023] Q: What are the key climate actions proposed in the National Coalition Party election program 2023?
[0m[1;3;38;2;90;149;237m[Finns_Party_election_program] Q: What are the key climate actions proposed in the Finns Party election program?
[0m[1;3;38;2;11;159;203m[SFP_election_program] Q: What are the key climate actions proposed in the SFP election program?
[0m[1;3;38;2;155;135;227m[Christian_Democrats_election_program] Q: What are the key climate actions proposed in the Christian Democrats election program?
[0m[1;3;38;2;237;90;200m[Government_program] Q: What are the key climate actions proposed in the Government program?
[0m[1;3;38;2;237;90;200m[National_Coalition_Party_election_program_2023] A: The National Coalition Party election program 2023 proposes to ensure the sustainable use of natural resources. They believe that responsible forestry and circular economy can offer solut

In [165]:
# Number of source nodes attached to the most recent `response`.
print(len(response.source_nodes))

20


In [28]:
# Dump every source node of `response` (content plus all metadata), each followed
# by a blank line, a dashed rule and another blank line.
for node in response.source_nodes:
  print(
      node.get_content(metadata_mode="all"),
      '',
      '--------------------------------------------',
      '',
      sep='\n',
  )

Sub question: What are the key climate actions proposed in the National Coalition Party election program 2023?
Response: The National Coalition Party election program 2023 proposes to ensure the sustainable use of natural resources. They believe that responsible forestry and circular economy can offer solutions for fighting climate change and increase well-being everywhere in Finland.

--------------------------------------------

Sub question: What are the key climate actions proposed in the Finns Party election program?
Response: The Finns Party election program proposes several key climate actions. They advocate for a realistic, properly scheduled, and effective climate policy, promoting low-emission technologies as quickly as technically and economically possible. They propose moving Finland's carbon neutrality goal to 2050 and believe that the technology needed for a carbon-neutral society is in a good stage of development. They also propose that the licensing of wind and solar po