# Try different models for the embedding

- Creating and Querying Index
- Saving and Loading Index
- Customize LLM
- Customize Prompt
- Customize Embedding


In [1]:
import openai
import environ
from IPython.display import Markdown, display

from llama_index import LLMPredictor, ServiceContext
from llama_index import VectorStoreIndex
from llama_index import SimpleDirectoryReader
from llama_index import Prompt
from llama_index.llms import OpenAI

from langchain.chat_models import ChatOpenAI


# import pickle
# import os
# from datetime import datetime
# from pathlib import Path

In [2]:
def get_response(user_query, query_engine):
    """
    Send a user query to a query engine and hand back its answer.

    Args:
        user_query (str): The question asked by the user.
        query_engine: A query engine created from an index; its ``query``
            method is called with ``user_query``.

    Returns:
        Whatever ``query_engine.query`` returns (in this notebook, a
        Response object holding the answer and other metadata).
    """
    return query_engine.query(user_query)


def display_response(response):
    """
    Render a query-engine response as Markdown in the notebook output.

    Args:
        response: A response from a query engine; it is formatted into a
            string before being rendered.

    Returns:
        None. The formatted response is shown through IPython's ``display``.
    """
    rendered = Markdown(f"{response}")
    display(rendered)

In [3]:
# Configure the OpenAI client from the environment so the key is never
# hard-coded in the notebook.
env = environ.Env()
# NOTE(review): presumably loads variables from a local .env file - confirm
# where that file is expected to live.
environ.Env.read_env()
# Read the key from the OPENAI_API_KEY variable and hand it to the openai module.
API_KEY = env('OPENAI_API_KEY')
openai.api_key = API_KEY



In [4]:
# Load the source PDF ("Merkblatt fuer Arbeitslose"). The reader returns a list
# of Document objects; the preview in the next cell shows one per page label.
doc_path = "./docs/merkblatt_fuer_arbeitslose/merkblatt-fuer-arbeitslose_ba036520.pdf"
documents = SimpleDirectoryReader(input_files=[doc_path]).load_data()

In [32]:
# Peek at the first three loaded documents (text plus page/file metadata).
documents[:3]

[Document(id_='e05407b7-5016-44a5-93ef-f666b9b191c0', embedding=None, metadata={'page_label': '1', 'file_name': 'merkblatt-fuer-arbeitslose_ba036520.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='2d9a660e58719a47a0a844e0a4f2e1aeab80a2910c2e579c81af05daf01afe99', text='49466_BA_MB_1.indd   1 10.02.2015   13:20:58Agentur für Arbeit  \nMusterstadthausen  Merkblatt\n1Merkblatt für\nArbeitslose \nIhre Rechte –\nIhre Pflichten ', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'),
 Document(id_='e1c40455-db23-48fd-99e9-a8985e1c69db', embedding=None, metadata={'page_label': '2', 'file_name': 'merkblatt-fuer-arbeitslose_ba036520.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='96291ef98137c571c20b29d30df5a6c087563271148520a97ce911686ac9592a', text='3 \nIhre Agentur für Arbeit hält eine Fülle von \n Informati

In [11]:
# Customize the LLM: wrap a deterministic (temperature=0) gpt-3.5-turbo chat
# model in an LLMPredictor. Note that `llm` holds the predictor, not the raw model.
llm = LLMPredictor(llm=ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo"))
# No embed_model is passed here, so the service context uses its default embedding.
service_context = ServiceContext.from_defaults(llm_predictor=llm)

In [12]:
# Build a vector index over the loaded documents with the service context
# configured above (custom LLM, default embedding).
llm_index = VectorStoreIndex.from_documents(
    documents,
    service_context=service_context
)

In [14]:
# Sanity check with a question unrelated to the indexed leaflet: the engine
# should decline rather than answer from outside the context (see output).
llm_query_engine = llm_index.as_query_engine()
response = llm_query_engine.query("Who is the president of the U.S.A.?")
print(response)

I'm sorry, but I cannot answer that question based on the given context information.


In [23]:
# Define a custom question-answering prompt.
# {context_str} and {query_str} are the placeholders filled in at query time;
# the instruction asks the model to start each answer with the code word "Response".
template = (
    "We have provided context information below. \n"
    "---------------------\n"
    "{context_str}"
    "\n---------------------\n"
    "Given this information, please answer the question and each answer should start with code word Response: {query_str}\n"
)
qa_template = Prompt(template)

# Use the custom prompt when querying the index built with the custom LLM
query_engine_with_prompt = llm_index.as_query_engine(text_qa_template=qa_template)

In [54]:
# Ask about benefit duration without stating the insured months in the last
# five years; the custom-prompt engine is used, so the answer starts with "Response:".
query_user = "I worked in Germany for 3 years. My contract will end in four months How long will I receive the unemployment benefit?"
response = get_response(query_user, query_engine_with_prompt)
display_response(response)

Response: Based on the provided information, the duration of your entitlement to unemployment benefits depends on the total duration of your insurance obligations with the Federal Employment Agency within the last 5 years. Unfortunately, the specific duration cannot be determined without knowing the exact number of months you have been insured during the past 5 years. Please refer to the table provided in section 3.2 of the document "merkblatt-fuer-arbeitslose_ba036520.pdf" to determine the duration of your entitlement.

In [55]:
# Same question, now including the number of months worked in the last five
# years, to see whether the engine gives a concrete duration.
query_user = "I worked in Germany for 3 years. In the last 5 years I have worked in Germany for 36 months. My contract will end in four months How long will I receive the unemployment benefit?"
response = get_response(query_user, query_engine_with_prompt)
display_response(response)

Response: Based on the information provided, if you have worked in Germany for 36 months within the last 5 years, you would be eligible for a maximum of 24 months of unemployment benefit. However, please note that the specific duration of your benefit will also depend on your age and other factors. It is recommended to refer to the "Merkblatt 20" or consult with your local employment agency for more accurate information regarding your individual case.

# Custom Embedding

In [5]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index import LangchainEmbedding, ServiceContext

In [26]:
# Load a specific Hugging Face sentence-transformers embedding model and wrap
# it so llama_index can use it
embed_model = LangchainEmbedding(HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2'))

# Create a service context with the custom embedding model
# (no LLM is passed, so the default LLM is used for answering)
service_context_emb = ServiceContext.from_defaults(embed_model=embed_model)

# Create an index over the same documents using this service context
index_emb = VectorStoreIndex.from_documents(
    documents,
    service_context=service_context_emb,
)

# Query engine with the default prompt, backed by the custom embeddings
query_engine_emb = index_emb.as_query_engine()

  from .autonotebook import tqdm as notebook_tqdm


In [56]:
# Query the engine built on the all-MiniLM-L6-v2 embeddings.
query_user = "List the points to remember"
response = get_response(query_user, query_engine_emb)
display_response(response)

- The eligibility criteria for accessing statutory health insurance and long-term care insurance include being at least 55 years old and not having had any form of statutory health insurance in the last 5 years.
- Another requirement is that during this time, the individual must have been exempt from health insurance, exempt from insurance, or not subject to insurance due to self-employment for at least two and a half years.
- If these conditions apply, a certificate of exemption is not necessary, and there is no need to submit an exemption application or certificate to a health insurance fund.
- If the individual is not subject to compulsory insurance, the Employment Agency will cover the contributions to their private health and long-term care insurance, up to the amount of contributions for statutory health and long-term care insurance.
- The obligation to be a member of statutory health and long-term care insurance ends when the entitlement to unemployment benefits is exhausted, unless a new circumstance of insurance obligation or family insurance in statutory health insurance occurs immediately thereafter.
- For questions regarding the continuation of private health and long-term care insurance during or after the receipt of benefits, individuals should contact their insurance company.

In [57]:
query_user = "List the points to remember"
# Same question through the engine that uses the default embedding and the
# custom "Response:" prompt, to compare against the custom-embedding answer.
response = get_response(query_user, query_engine_with_prompt)
display_response(response)

Response: 

1. You are required to notify the employment agency of your job search at least three months before the termination of your employment or training. If you are informed of the termination less than three months in advance, you must notify within three days. Failure to do so may result in a penalty.
2. You can notify the employment agency of your job search online, in person, by phone, or in writing.
3. The obligation to notify does not apply to vocational training within a company or school.
4. Unemployment benefits will be paid from the day you register as unemployed online or in person at the employment agency.
5. You are responsible for actively seeking employment, accepting suitable job offers, or participating in vocational integration measures to prevent or end unemployment.
6. The employment agency may request proof of your job search efforts.
7. You are solely responsible for notifying the employment agency of your employment start, including probationary employment.
8. You must inform the employment agency when you become unable to work due to illness and when you become fit for work again.
9. If your illness or incapacity for work is due to your fault or the fault of a third party, or if it is related to organ/tissue donation or sterilization, you must provide separate information.
10. You must provide a certificate from your doctor if you have an individual employment ban under the Maternity Protection Act.
11. You must inform the employment agency if you apply for or receive maternity benefits, transition benefits, or any other type of pension.
12. You must inform the employment agency if you have a part-time job that is less than 15 hours per week, even if it is not subject to taxation or social insurance contributions.
13. You must inform the employment agency if your income or the time commitment of your part-time job increases.
14. You must inform the employment agency if you are attending school, a similar educational institution, or if you are enrolled as a student.
15. You must inform the employment agency if you change your place of residence or your address.
16. You must inform the employment agency if you get married or permanently separate from your spouse or life partner.

# Compare Embedding models

In [6]:
def query_engine_from_model(
    mod_name,
    documents,
    llm_model='text-davinci-003',
    temperature=0,
    max_tokens=256,
):
    """
    Create a query engine given a model name from the "sentence-transformers" package.
    See for example: https://www.sbert.net/docs/pretrained_models.html

    Args:
        mod_name: The name of the embedding model to use
        documents: A list of Documents loaded with "SimpleDirectoryReader"
        llm_model: Name of the OpenAI model that writes the answers.
            Defaults to 'text-davinci-003'.
        temperature: Sampling temperature passed to the LLM. Defaults to 0.
        max_tokens: Upper bound on the length of each generated answer.
            Defaults to 256; raise it if answers come back cut off.

    Returns:
        A query engine to use to send queries to a LLM.
    """
    # Embedding model
    embedding_model = LangchainEmbedding(HuggingFaceEmbeddings(model_name=mod_name))

    # The answering LLM is an OpenAI model; only the embedding is swapped out
    llm = OpenAI(model=llm_model, temperature=temperature, max_tokens=max_tokens)

    # Service context combining the LLM with the embedding model
    service_context = ServiceContext.from_defaults(
        llm=llm,
        embed_model=embedding_model
    )

    # Index (embeds every document with the chosen model)
    index = VectorStoreIndex.from_documents(
        documents,
        service_context=service_context,
    )

    # Progress marker: building an index can take a while per model
    print("DONE!")

    # Return a query engine
    return index.as_query_engine()

Baseline for `ServiceContext`

`llm`: The LLM used to generate natural language responses to queries.
- If not provided, defaults to `gpt-3.5-turbo` from OpenAI.
- If your OpenAI key is not set, defaults to `llama2-chat-13B` from Llama.cpp.

`embed_model`: The embedding model used to generate vector representations of text.
- If not provided, defaults to `text-embedding-ada-002`.
- If your OpenAI key is not set, defaults to `BAAI/bge-small-en`.

In [7]:
# Baseline (OpenAI models): nothing is passed to the service context, so the
# defaults listed in the notes above apply:
#
# llm="gpt-3.5-turbo"
# embed_model="text-embedding-ada-002"
#
# NOTE(review): the engines built with query_engine_from_model answer with
# text-davinci-003 (max_tokens=256), so differences from this baseline reflect
# both the embedding and the LLM - confirm this is intended.
service_context_baseline = ServiceContext.from_defaults()
index_baseline = VectorStoreIndex.from_documents(
    documents,
    service_context=service_context_baseline
)
query_engine_baseline = index_baseline.as_query_engine()

In [8]:
# Reuse the documents loaded at the top of the notebook instead of parsing the
# same PDF a second time from a duplicated path literal.
docs = documents

# One query engine per embedding model under comparison; each call embeds the
# whole document set, so this cell is slow (and downloads models on first run).
query_engine_minilm_v2 = query_engine_from_model("sentence-transformers/all-MiniLM-L6-v2", docs)
query_engine_multilingual_v1 = query_engine_from_model("sentence-transformers/distiluse-base-multilingual-cased-v1", docs)
query_engine_bert_base_german = query_engine_from_model("PM-AI/bi-encoder_msmarco_bert-base_german", docs)
query_engine_german_semantic_sts_v2 = query_engine_from_model("aari1995/German_Semantic_STS_V2", docs)

  from .autonotebook import tqdm as notebook_tqdm


DONE!
DONE!
DONE!


Downloading (…)5dc24/.gitattributes: 100%|██████████| 1.48k/1.48k [00:00<00:00, 6.32MB/s]
Downloading (…)a19105dc24/README.md: 100%|██████████| 5.67k/5.67k [00:00<00:00, 18.2MB/s]
Downloading (…)9105dc24/config.json: 100%|██████████| 685/685 [00:00<00:00, 2.82MB/s]
Downloading model.safetensors: 100%|██████████| 1.34G/1.34G [02:54<00:00, 7.69MB/s]
Downloading pytorch_model.bin: 100%|██████████| 1.34G/1.34G [04:37<00:00, 4.84MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 125/125 [00:00<00:00, 371kB/s]
Downloading (…)5dc24/tokenizer.json: 100%|██████████| 729k/729k [00:00<00:00, 1.65MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 454/454 [00:00<00:00, 1.67MB/s]
Downloading (…)a19105dc24/vocab.txt: 100%|██████████| 240k/240k [00:00<00:00, 1.70MB/s]
No sentence-transformers model found with name /home/daniele/.cache/torch/sentence_transformers/aari1995_German_Semantic_STS_V2. Creating a new one with MEAN pooling.


DONE!


In [9]:
query_user = "List the most important points to remember"

# Engines to compare, keyed by a short label for the embedding behind each one
query_engine_dict = {
    "baseline": query_engine_baseline,
    "minilm_v2": query_engine_minilm_v2,
    "multilingual_v1": query_engine_multilingual_v1,
    "bert_base_german": query_engine_bert_base_german,
    "german_semantic_sts_v2": query_engine_german_semantic_sts_v2,
}

# Responses are grouped per query: the query text plus one response per engine label
resp_dict = {
    "query_1": {
        "query": query_user,
    }
}

# Send the same query to every engine and report progress after each one
tot_mods = len(query_engine_dict)
for i, (qe, engine) in enumerate(query_engine_dict.items(), start=1):
    resp_dict["query_1"][qe] = get_response(query_user, engine)
    print(f"DONE {i} of {tot_mods}")

DONE 1 of 5
DONE 2 of 5
DONE 3 of 5
DONE 4 of 5
DONE 5 of 5


In [10]:
# Answer from the baseline engine (default LLM and default embedding).
display_response(resp_dict["query_1"]["baseline"])

The most important points to remember are:
1. When ending an employment or training relationship, you must notify the relevant authorities of your job search status at least three months in advance. If you are informed of the termination less than three months in advance, you must notify them within three days.
2. You can notify them online, in person, by phone, or in writing.
3. Failure to notify them on time may result in a penalty.
4. The obligation to notify does not apply to company or school-based training relationships.
5. Unemployment benefits will only be paid from the day you register as unemployed online or in person.
6. You are responsible for actively seeking employment, accepting suitable job offers, or participating in vocational integration measures to prevent or end unemployment.
7. You may be required to provide evidence of your job search efforts upon request.
8. You must inform the employment agency when you start a new job, including trial employment.
9. If you become sick or injured and unable to work, or if you have a medical restriction due to pregnancy, you must inform the agency.
10. You must inform the agency if you receive any type of benefits, such as maternity benefits or pensions.
11. If you have a part-time job that is less than 15 hours per week, you must inform the agency.
12. You must inform the agency if there are any changes to your income or the hours of your part-time job.
13. You must inform the agency if you are a student or if there are any changes to your address or marital status.

In [11]:
# Answer from the engine using sentence-transformers/all-MiniLM-L6-v2 embeddings.
display_response(resp_dict["query_1"]["minilm_v2"])


1. Familiarize yourself with the Meldepflicht, Mitwirkungspflichten, and Mithelfende/r Familienangehörige/r regulations.
2. Understand the rules for Nebeneinkommen, Nebentätigkeit, and Private Altersversorgung.
3. Be aware of the Pfändung, Pflegeunterstützungsgeld, and Pflegeversicherung regulations.
4. Know the rules for Reise, Rentenversicherung, and Rente wegen Erwerbsminderung.
5. Familiarize yourself with the Saisonkurzarbeitergeld, Schüler, and Selbständige regulations.
6. Understand the rules for Sozialdaten, Sperrzeit, and Transferkurzarbeitergeld.
7. Be aware of the Übergangsgeld, Umzug, and Unfallversicherung regulations.
8. Know the rules for Veränderungsmitteilung, Verfügbarkeit, and Verletztengeld.
9.

In [12]:
# Answer from the engine using distiluse-base-multilingual-cased-v1 embeddings.
display_response(resp_dict["query_1"]["multilingual_v1"])


1. When ending an employment or training relationship, you must register as unemployed at least three months before the end date. If you find out less than three months before the end date, you must register within three days.
2. Unemployment benefits will be paid from the day you register as unemployed online or in person at the Federal Employment Agency.
3. To prevent or end unemployment, you are obligated to search for employment independently, take up a suitable job, or participate in a professional integration program.
4. You must provide proof of your own efforts upon request from the Employment Agency.

In [13]:
# Answer from the engine using PM-AI/bi-encoder_msmarco_bert-base_german embeddings.
display_response(resp_dict["query_1"]["bert_base_german"])


1. Make sure to provide complete and accurate information to the Agency for Employment.
2. Notify the Agency for Employment of any changes in your circumstances.
3. If you receive benefits to which you are not entitled, you must repay them.
4. You must also reimburse the contributions to health and nursing care insurance, which is approximately 35% of the benefit.
5. If you are leaving your job, you must notify the Agency for Employment of your job search.
6. If you are unemployed, you must register with the Agency for Employment.
7. You can register online with the Federal Employment Agency.

In [14]:
# Answer from the engine using aari1995/German_Semantic_STS_V2 embeddings.
display_response(resp_dict["query_1"]["german_semantic_sts_v2"])


1. Notify your local employment agency of any new employment.
2. Notify your local employment agency if you become ill or are unable to work.
3. Notify your local employment agency if you receive maternity leave or similar benefits.
4. Notify your local employment agency if you receive any type of pension.
5. Notify your local employment agency if you take on a part-time job.
6. Notify your local employment agency if you change your address.
7. Notify your local employment agency if you get married or separate from your partner.

In [65]:
# mod_name = "PM-AI/bi-encoder_msmarco_bert-base_german"
# em_tmp = LangchainEmbedding(HuggingFaceEmbeddings(model_name=mod_name))

In [89]:
# Metadata attached to the baseline response: page label and file name per id
# (presumably the chunks retrieved as context - confirm).
resp_dict["query_1"]["baseline"].metadata

{'fc4cb656-6935-4c4e-8e22-caa4590baaaa': {'page_label': '6',
  'file_name': 'merkblatt-fuer-arbeitslose_ba036520.pdf'},
 'c85c6abb-6d7b-4529-a76a-3fe8ac2630a0': {'page_label': '65',
  'file_name': 'merkblatt-fuer-arbeitslose_ba036520.pdf'}}

In [91]:
# Metadata attached to the minilm_v2 response (page label and file name per id).
resp_dict["query_1"]["minilm_v2"].metadata

{'199da534-8a0c-401c-b0df-436af2154d21': {'page_label': '100',
  'file_name': 'merkblatt-fuer-arbeitslose_ba036520.pdf'},
 '675b4290-32de-425a-8a05-c8bdb912ca4e': {'page_label': '99',
  'file_name': 'merkblatt-fuer-arbeitslose_ba036520.pdf'}}

In [90]:
# Metadata attached to the multilingual_v1 response (page label and file name per id).
resp_dict["query_1"]["multilingual_v1"].metadata

{'4c143f8b-c281-4f01-b00b-6298a5ebb8c9': {'page_label': '9',
  'file_name': 'merkblatt-fuer-arbeitslose_ba036520.pdf'},
 'cf22f8c4-050f-465d-91b2-e00ebb2512db': {'page_label': '6',
  'file_name': 'merkblatt-fuer-arbeitslose_ba036520.pdf'}}

In [88]:
# Metadata attached to the bert_base_german response (page label and file name per id).
resp_dict["query_1"]["bert_base_german"].metadata

{'e7c71a41-6342-4c6c-b940-ae73956fb4ab': {'page_label': '67',
  'file_name': 'merkblatt-fuer-arbeitslose_ba036520.pdf'},
 '996c6e24-2838-494f-92da-1b07d60983ba': {'page_label': '12',
  'file_name': 'merkblatt-fuer-arbeitslose_ba036520.pdf'}}

In [15]:
# Metadata attached to the german_semantic_sts_v2 response (page label and file name per id).
resp_dict["query_1"]["german_semantic_sts_v2"].metadata

{'fca8b094-4f0c-4d8e-b153-eae3d0847957': {'page_label': '11',
  'file_name': 'merkblatt-fuer-arbeitslose_ba036520.pdf'},
 'd3593aa0-14a5-449a-8574-9b51d6c8fdbc': {'page_label': '65',
  'file_name': 'merkblatt-fuer-arbeitslose_ba036520.pdf'}}