<a href="https://colab.research.google.com/github/AreebAhmad-02/Embedding-Models-Finetuning/blob/main/Fine_tuning_Embedding_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## imports and install

In [10]:
!pip install llama_index -q -U

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m43.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.4/290.4 kB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [11]:
import json

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader,StorageContext,load_index_from_storage
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.schema import MetadataMode
from llama_index.core import Settings

In [17]:
TRAIN_CORPUS_FPATH_SENTENCE_SPLIT = './corpus/train_corpus_sentence_split.json'
EVAL_CORPUS_FPATH_SENTENCE_SPLIT = './corpus/eval_corpus_sentence_split.json'


In [14]:
TRAIN_FILES = ["/content/data/batch1preprocessed_file.txt","/content/data/batch2preprocessed_file.txt"]
VAL_FILES = ['/content/data/batch0preprocessed_file.txt']

TRAIN_CORPUS_FPATH = './corpus/train_corpus.json'
VAL_CORPUS_FPATH = './corpus/val_corpus.json'

#Chunking

## simple chunking splitting

In [None]:
def load_corpus(files, verbose=False):
    if verbose:
        print(f"Loading files {files}")

    reader = SimpleDirectoryReader(input_files=files)
    docs = reader.load_data()
    if verbose:
        print(f'Loaded {len(docs)} docs')

    parser = SimpleNodeParser.from_defaults()
    nodes = parser.get_nodes_from_documents(docs, show_progress=verbose)

    if verbose:
        print(f'Parsed {len(nodes)} nodes')

    corpus = {node.node_id: node.get_content(metadata_mode=MetadataMode.NONE) for node in nodes}
    return corpus

In [None]:
train_corpus = load_corpus(TRAIN_FILES, verbose=True)
val_corpus = load_corpus(VAL_FILES, verbose=True)

Loading files ['/content/batch1preprocessed_file.txt', '/content/batch2preprocessed_file.txt']
Loaded 2 docs


Parsing nodes:   0%|          | 0/2 [00:00<?, ?it/s]

Parsed 66 nodes
Loading files ['/content/data/preprocessed_file.txt']
Loaded 1 docs


Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsed 30 nodes


In [None]:
with open(TRAIN_CORPUS_FPATH, 'w+') as f:
    json.dump(train_corpus, f)

with open(VAL_CORPUS_FPATH, 'w+') as f:
    json.dump(val_corpus, f)

## Sentence split chunking using

In [13]:
from llama_index.core.node_parser import SentenceSplitter


def load_corpus_sentence_splitter(files, verbose=False):
    if verbose:
        print(f"Loading files {files}")

    reader = SimpleDirectoryReader(input_files=files)
    docs = reader.load_data()
    if verbose:
        print(f'Loaded {len(docs)} docs')


    splitter = SentenceSplitter(
    chunk_size=524,
    chunk_overlap=20,)
    nodes = splitter.get_nodes_from_documents(docs, show_progress=verbose)

    if verbose:
        print(f'Parsed {len(nodes)} nodes')

    corpus = {node.node_id: node.get_content(metadata_mode=MetadataMode.NONE) for node in nodes}
    return corpus


In [15]:
# train_corpus_sentence_split = load_corpus_sentence_splitter(TRAIN_FILES, verbose=True)
val_corpus_sentence_split = load_corpus_sentence_splitter(VAL_FILES, verbose=True)

Loading files ['/content/data/batch0preprocessed_file.txt']
Loaded 1 docs


Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsed 52 nodes


We do a very naive train/val split by having the
.


In [18]:
# with open(TRAIN_CORPUS_FPATH_SENTENCE_SPLIT, 'w+') as f:
#     json.dump(train_corpus_sentence_split, f)

with open(EVAL_CORPUS_FPATH_SENTENCE_SPLIT, 'w+') as f:
    json.dump(val_corpus_sentence_split, f)

## importings and inporting data

#  Initilazing HuggingFace LLM

In [5]:
!pip install llama-index-llms-huggingface -q -U

In [2]:
%pip install -q -U git+https://github.com/huggingface/transformers.git
%pip install accelerate -q -U
%pip install -i https://pypi.org/simple/ bitsandbytes -q
%pip install sentence-transformers -q
%pip install --upgrade transformers -q -U
%pip install llama-index-llms-openai -q -U
%pip install llama-index-embeddings-openai -q -U
%pip install llama-index-finetuning



  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.4/15.4 MB[0m [31m50.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m141.9/141.9 kB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m326.8/326.8 kB[0m [31m43.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m60.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m11.3 MB/s[0m e

In [6]:
!pip install --upgrade transformers -q


In [32]:
!pip install transformers[torch] -q


In [3]:
!pip install accelerate -q -U

## quantization Configuration and initializing LLM zephyr-7b-beta

In [7]:
import torch
from transformers import BitsAndBytesConfig
from llama_index.core.prompts import PromptTemplate
from llama_index.llms.huggingface import HuggingFaceLLM


import os
import warnings

# warnings.filterwarnings('ignore')




In [8]:
# use this code when using GPU
quantization_conf = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)


# # Disable GPU-specific configurations
# quantization_conf = BitsAndBytesConfig(
#     load_in_4bit=False,  # Disable 4-bit quantization
#     bnb_4bit_compute_dtype=torch.float32,  # Use float32 for CPU
#     # bnb_4bit_quant_type= None,  # Disable quantization type
#     bnb_4bit_use_double_quant=False  # Disable double quantization
# )




#We recreate template (or messages) and ensure that they have the correct format, as per,
#https://github.com/run-llama/llama_index/issues/9277#issuecomment-1837545398 for zephyr-7b-beta

def messages_to_prompt(messages):
    prompt = ""
    for message in messages:
        if message.role == 'system':
            prompt += f"<|system|>\n{message.content}</s>\n"
        elif message.role == 'user':
            prompt += f"<|user|>\n{message.content}</s>\n"
        elif message.role == 'assistant':
            prompt += f"<|assistant|>\n{message.content}</s>\n"

    # ensure we start with a system prompt, insert blank if needed
    if not prompt.startswith("<|system|>\n"):
        prompt = "<|system|>\n</s>\n" + prompt

    # add final assistant prompt
    prompt = prompt + "<|assistant|>\n"
    return prompt

### for GPU settings

In [9]:
def huggingface_llm(model_name="HuggingFaceH4/zephyr-7b-beta",
                    tokenizer_name="HuggingFaceH4/zephyr-7b-beta",
                    context_window=3900,
                    max_new_tokens=256,
                    quantization_config = quantization_conf
                   ):
    llm = HuggingFaceLLM(
        model_name=model_name,
        tokenizer_name=tokenizer_name,
        query_wrapper_prompt=PromptTemplate("<|system|>\n</s>\n<|user|>\n{query_str}</s>\n<|assistant|>\n"),
        context_window=context_window,
        max_new_tokens=max_new_tokens,
        model_kwargs={"quantization_config": quantization_config},
        # tokenizer_kwargs={},
        generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95},
        messages_to_prompt=messages_to_prompt,
        device_map="auto",
    )

    return llm

llm = huggingface_llm()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

### LLM  with CPU Setting

In [None]:
%pip install llama-index-llms-huggingface-api -q -U


In [None]:
def huggingface_llm(model_name="stabilityai/stable-code-3b",
                    tokenizer_name="stabilityai/stable-code-3b",
                    context_window=3900,
                    max_new_tokens=256,
                    # quantization_config = quantization_conf
                   ):
    llm = HuggingFaceLLM(
        model_name=model_name,
        tokenizer_name=tokenizer_name,
        query_wrapper_prompt=PromptTemplate("<|system|>\n</s>\n<|user|>\n{query_str}</s>\n<|assistant|>\n"),
        context_window=context_window,
        max_new_tokens=max_new_tokens,
        # model_kwargs={"quantization_config": quantization_config},
        # tokenizer_kwargs={},
        generate_kwargs={ "top_k": 50, do_sample=True},
        messages_to_prompt=messages_to_prompt,
        device_map="cpu",
    )

    return llm

llm = huggingface_llm()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Generate synthetic data


## Generating QA Embedding Pair

In [None]:
TRAIN_QUERIES_FPATH = './data/train_queries.json'
TRAIN_RELEVANT_DOCS_FPATH = './data/train_relevant_docs.json'

VAL_QUERIES_FPATH = './data/val_queries.json'
VAL_RELEVANT_DOCS_FPATH = './data/val_relevant_docs.json'

In [None]:
# with open(TRAIN_CORPUS_FPATH_SENTENCE_SPLIT, 'r+') as f:
#     train_corpus = json.load(f)

with open(EVAL_CORPUS_FPATH_SENTENCE_SPLIT, 'r+') as f:
    val_corpus = json.load(f)

In [21]:
from typing import Dict, List, Tuple

from llama_index.core.bridge.pydantic import BaseModel
from llama_index.core.llms.utils import LLM
from llama_index.core.schema import MetadataMode, TextNode
from tqdm import tqdm


In [22]:
class EmbeddingQAFinetuneDataset(BaseModel):
    """Embedding QA Finetuning Dataset.

    Args:
        queries (Dict[str, str]): Dict id -> query.
        corpus (Dict[str, str]): Dict id -> string.
        relevant_docs (Dict[str, List[str]]): Dict query id -> list of doc ids.

    """

    queries: Dict[str, str]  # dict id -> query
    corpus: Dict[str, str]  # dict id -> string
    relevant_docs: Dict[str, List[str]]  # query id -> list of doc ids
    mode: str = "text"

    @property
    def query_docid_pairs(self) -> List[Tuple[str, List[str]]]:
        """Get query, relevant doc ids."""
        return [
            (query, self.relevant_docs[query_id])
            for query_id, query in self.queries.items()
        ]

    def save_json(self, path: str) -> None:
        """Save json."""
        with open(path, "w") as f:
            json.dump(self.dict(), f, indent=4)

    @classmethod
    def from_json(cls, path: str) -> "EmbeddingQAFinetuneDataset":
        """Load json."""
        with open(path) as f:
            data = json.load(f)
        return cls(**data)


DEFAULT_QA_GENERATE_PROMPT_TMPL = """\
Context information is below.

---------------------
{context_str}
---------------------

Given the context information and no prior knowledge.
generate only questions based on the below query.

You are a Teacher/ Professor. Your task is to setup \
{num_questions_per_chunk} questions for an upcoming \
quiz/examination. The questions should be diverse in nature \
across the document. Restrict the questions to the \
context information provided."
"""
# generate queries as a convenience function

def generate_qa_embedding_pairs(
    node_dict,
    llm: LLM,
    qa_generate_prompt_tmpl: str = DEFAULT_QA_GENERATE_PROMPT_TMPL,
    num_questions_per_chunk: int = 2,
) -> EmbeddingQAFinetuneDataset:
    """Generate examples given a set of nodes."""


    queries = {}
    relevant_docs = {}
    for node_id, text in tqdm(node_dict.items()):
        query = qa_generate_prompt_tmpl.format(
            context_str=text, num_questions_per_chunk=num_questions_per_chunk
        )
        response = llm.complete(query)

        result = str(response).strip().split("\n")
        questions = [
            re.sub(r"^\d+[\).\s]", "", question).strip() for question in result
        ]
        questions = [question for question in questions if len(question) > 0][
            :num_questions_per_chunk
        ]

        num_questions_generated = len(questions)
        if num_questions_generated < num_questions_per_chunk:
            warnings.warn(
                f"Fewer questions generated ({num_questions_generated}) "
                f"than requested ({num_questions_per_chunk})."
            )

        for question in questions:
            question_id = str(uuid.uuid4())
            queries[question_id] = question
            relevant_docs[question_id] = [node_id]

        print(questions)

    # construct dataset
    return EmbeddingQAFinetuneDataset(
        queries=queries, corpus=node_dict, relevant_docs=relevant_docs
    )

In [None]:
train_dataset = generate_qa_embedding_pairs(train_corpus, llm)

  0%|          | 0/119 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|          | 1/119 [00:30<1:00:15, 30.64s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  2%|▏         | 2/119 [00:53<50:37, 25.96s/it]  Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  3%|▎         | 3/119 [01:01<34:28, 17.83s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  3%|▎         | 4/119 [01:15<31:29, 16.43s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  4%|▍         | 5/119 [01:21<24:02, 12.65s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  5%|▌         | 6/119 [01:33<23:01, 12.23s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  6%|▌         | 7/119 [01:43<21:32, 11.54s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  7%|▋         | 8/119 [02:05<27:48, 15.03s/it]Setting `pad_token_id` to `eos_token_

In [None]:
train_dataset.save_json("trained_synthetic_dataset.json")

In [None]:
train_synthetic_dataset = EmbeddingQAFinetuneDataset.from_json("/content/trained_synthetic_dataset.json")

In [23]:
# generating evaluation synthetic data set
eval_dataset = generate_qa_embedding_pairs(val_corpus, llm)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  2%|▏         | 1/52 [00:29<25:29, 29.99s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['What is the purpose of the Ordinance presented in the context information, and what actions does it propose to take?', 'Who are the sponsors of this Ordinance, and what is the proposed timeline for its implementation?']


  4%|▍         | 2/52 [00:43<16:42, 20.05s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['What is the purpose of Ordinance No. 153-22, and what actions does it propose to take in relation to the County Transportation Authority and sales tax?', "How does the proposed Ordinance amend the Business and Tax Regulations Code, and what is the duration and rate of the local transactions and use tax that it seeks to continue in effect? Additionally, what is the amount by which the Transportation Authority's appropriations limit would be increased, and for how long? Finally, what is the role of limited tax bonds in this proposal, and what is the basis for the Transportation Authority's determination under the California Environmental Quality Act?"]


  6%|▌         | 3/52 [01:05<17:25, 21.35s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['What is Proposition B, and how was it approved by the voters in California in 1989?', "What is the New Transportation Expenditure Plan adopted as part of Proposition K in 2003, and how does it differ from Proposition B's Transportation Expenditure Plan?"]


  8%|▊         | 4/52 [01:29<17:42, 22.13s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['What is the purpose of amending Business and Tax Regulations Code Article 14 and Division 12.5 of the California Public Utilities Code?', 'a) To increase the tax rate from 0.5% to 1.0%']


 10%|▉         | 5/52 [01:52<17:37, 22.51s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['What is the purpose of the Tax imposed by Article 14, and where will the proceeds be spent?', 'What is the definition of "Authority" and "District" in the context of Article 14? What is the significance of these terms in relation to the Tax?']


 12%|█▏        | 6/52 [02:15<17:22, 22.65s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['What is the purpose of the ordinance being considered by the Board of Supervisors, as outlined in the context information?', 'What is the role of the Expenditure Plan Advisory Committee in relation to the proposed ordinance?']


 13%|█▎        | 7/52 [02:25<13:51, 18.48s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['How does the proposed New Transportation Expenditure Plan aim to achieve its purposes, and what provisions are included to accomplish these goals?', 'What similarities and differences are there between the proposed transactions and use tax in San Francisco and the Sales and Use Tax Law of California? How will the tax be administered and collected by the State Board of EqualizationCalifornia Department of Tax and Fee Administration? What measures are in place to minimize the cost of collecting the tax and minimize the burden of recordkeeping for those subject to taxation?']


 15%|█▌        | 8/52 [02:32<11:00, 15.01s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['What is the purpose of the tax proposed in this ordinance, and what projects will it fund? (refer to sections (g) and (h))', 'What is the role of the Authority in administering the tax and delivering transportation improvements, and what powers does it have? (refer to section SEC. 1404)']


 17%|█▋        | 9/52 [02:56<12:38, 17.64s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['What is the operative date for the implementation of the Tax and the amendments passed in the November 8, 2022 election, as outlined in Section 1405?', 'What is the rate of the Transactions Tax, as specified in Section 1406?']


 19%|█▉        | 10/52 [03:02<09:56, 14.20s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['What is the rate of the use tax imposed in this district, and when does it take effect? (SEC. 1408)', 'How will the place of consumption be determined for retail sales made by a retailer without a permanent place of business in California? (SEC. 1407)']


 21%|██        | 11/52 [03:26<11:38, 17.04s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['In implementing the provisions of Part 1 (commencing with Section 6001) of Division 2 of the California Revenue and Taxation Code, where should the name of the Authority be substituted instead of the State of California? (a) Whenever the State of California is named or referred to as the taxing agency, (b) When the result of that substitution would require action to be taken by or against the Authority or any agency, officer, or employee thereof rather than by or against the State Board of EqualizationCalifornia Department of Tax and Fee Administration, in performing the functions incident to the administration or operation of this ordinance Article 14, (c) The substitution shall not be made when the word "State" is used as part of the title of the State Controller, the State Treasurer, the State Board of Control, the State Board of Equalization, the State Treasury, or the Constitution of the State of California, or (d) The substitution shall not be made in those sections, including,

 23%|██▎       | 12/52 [03:38<10:20, 15.51s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['Question 1:', 'Explain the significance of Subsection (b) in the context information provided. How does it impact retailers engaged in business in the District, and what are the criteria for a retailer to be considered engaged in business in the District?']


 25%|██▌       | 13/52 [04:01<11:35, 17.82s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


["Question 1: (2) Sales of property to be used outside the District which is shipped to a point outside the District, pursuant to the contract of sale, by delivery to such point by the retailer or the retailer's agent, or by delivery by the retailer to a carrier for shipment to a consignee at such point. For the purposes of this paragraph (b)(2), delivery to a point outside the District shall be satisfied:", 'a) With respect to vehicles (other than commercial vehicles) subject to registration pursuant to Chapter 1 (commencing with Section 4000) of Division 3 of the California Vehicle Code, aircraft licensed in compliance with Section 21411 of the California Public Utilities Code, and undocumented vessels registered under Chapter 2 of Division 3.5 (commencing with Section 98509840) of the California Vehicle Code by registration to an out-of-District address and by a declaration under penalty of perjury, signed by the buyer, stating that such address is, in fact, his/her principal place 

 27%|██▋       | 14/52 [04:18<11:04, 17.49s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['Question 1:', 'Explain the meaning of the term "exempted" as used in subsection (c) of the context information provided. How does this exemption differ from the exemptions provided in Sections 6366 and 6366.1 of the California Revenue and Taxation Code? Provide examples of the types of tangible personal property that are exempted under this subsection.']


 29%|██▉       | 15/52 [04:29<09:41, 15.72s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['Question 1:', 'Explain the difference between transactions tax and use tax, and how they are related in the context of the provided text material. Additionally, provide an example of a situation where a person may be able to claim a credit against use tax based on transactions tax paid.']


 31%|███       | 16/52 [04:52<10:44, 17.90s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['What is the total principal amount that can be borrowed through the tax imposed by Article 14? (a) $1,880,00,000 (b) $1,910,000,000 (c) $2,000,000,000 (d) $2,100,000,000', 'How should the proceeds from the tax imposed by Article 14 be used? (a) For the projects and purposes set forth in the New Transportation Expenditure Plan approved by voters in 2003 and its updates and revisions (b) For the projects and purposes set forth in the 2022 Transportation Expenditure Plan and any updates or revisions to such Plan, as well as for interest and principal on bonds and administration costs (c) For any purpose deemed necessary by the Board of Supervisors (d) For the projects and purposes set forth in the New Transportation Expenditure Plan approved by voters in 2003, as well as for interest and principal on bonds and administration costs.']


 33%|███▎      | 17/52 [05:16<11:23, 19.52s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['What is the condition for allocation of funds by the Authority mentioned in the context information?', 'What is the appropriations limit for the Authority for fiscal year 2003-04 and each year thereafter, as stated in the context information?']


 35%|███▍      | 18/52 [05:27<09:42, 17.13s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['What is the significance of Section 1418 in the context of this ordinance? How does it affect the validity of the provisions in Article 14 and Part 1.6 of Division 2 of the California Revenue and Taxation Code?', 'What is the purpose of Section 1419 in this ordinance? How does it prevent legal action against the State of California or the Authority in relation to the collection of taxes under this ordinance and Part 1.6 of Division 2 of the California Revenue and Taxation Code?']


 37%|███▋      | 19/52 [05:49<10:16, 18.69s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['What is the purpose of the 2022 Transportation Expenditure Plan, and how long is it intended to be implemented over?', 'What major categories of transportation improvements are included in the 2022 Transportation Expenditure Plan, and what specific benefits do they aim to provide?']


 38%|███▊      | 20/52 [06:13<10:41, 20.04s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['How was equity prioritized in the development of the 2022 Transportation Expenditure Plan?', 'What types of transportation investments will be prioritized for funding through the 2022 Sales Tax, as outlined in the transportation expenditure plan?']


 40%|████      | 21/52 [06:36<10:53, 21.07s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


["How does the SFTP 2050 align with regional trends and available funding to address changing needs in San Francisco's transportation system?", "What are the goals of ConnectSF and the SFTP 2050, and how do they contribute to the overall vision for San Francisco's transportation system as outlined in the SFTP 2050?"]


 42%|████▏     | 22/52 [07:00<10:55, 21.86s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['What is the purpose of the 2022 Transportation Expenditure Plan, and why is the adoption of a sales tax ordinance necessary for its implementation? (Section 1: Introduction)', 'What policies and administration guidelines are outlined in Section 2: General Provisions of the 2022 Transportation Expenditure Plan?']


 44%|████▍     | 23/52 [07:12<09:11, 19.01s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['How does the 2022 Transportation Expenditure Plan differ from the Proposition K Expenditure Plan adopted in 2003? What is the duration of the 2022 Transportation Expenditure Plan and at what rate will the sales tax be imposed?', 'What are the two scenarios for revenue projections under the 2022 Transportation Expenditure Plan, and how do they differ in terms of average growth rate and inflation-based discount rate? Which scenario corresponds to Priority 1 funding levels, and what is the estimated total revenue level for this scenario?']


 46%|████▌     | 24/52 [07:20<07:17, 15.61s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['What conditions must be met for 2022 Sales Tax funds to be spent outside the territorial limits of San Francisco, as outlined in the context information?', 'What is the definition of a "quantifiable benefit" in relation to the expenditure of 2022 Sales Tax funds outside of San Francisco, as described in the context information? How is this benefit measured?']


 48%|████▊     | 25/52 [07:43<08:05, 17.97s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['What is the role of the San Francisco County Transportation Authority in administering the funds from the Proposition K sales tax and the new Proposition _ sales tax?', 'What is the maximum amount that can be used for administration costs from the annual net amount of revenues raised by the 2022 Sales Tax, as outlined in Public Utilities Code Section 131107?']


 50%|█████     | 26/52 [08:07<08:30, 19.63s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['What is the National Environmental Policy Act (NEPA) and how does it relate to the approval and implementation of projects funded by the 2022 Sales Tax?', 'What is the California Environmental Quality Act (CEQA) and how does it apply to projects funded by the 2022 Sales Tax?']


 52%|█████▏    | 27/52 [08:20<07:21, 17.66s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['Analyze the Transportation Expenditure Plan presented in Table 1 and calculate the percentage of funding allocated to major transit projects under Proposition 2. What are the specific projects included in this category and how much funding is allocated to each?', 'Compare and contrast the funding allocated to transit maintenance and enhancements in Section B of the Transportation Expenditure Plan. Which mode of transportation receives the most funding for maintenance and why? How much funding is allocated to each specific mode of transportation, and what specific projects are included in this category? Additionally, what is the total amount allocated to transit enhancements, and which specific projects fall under this category?']


 54%|█████▍    | 28/52 [08:31<06:17, 15.73s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['How does the 2022 Transportation Expenditure Plan allocate funds across different transportation initiatives, and what specific projects fall under each category? (Refer to Table 1: 2022 Transportation Expenditure Plan)', 'What are the Safe and Complete Streets initiative and its components, as mentioned in the context information? How does this initiative differ from the Pedestrian and Bicycle Facilities maintenance category? Provide examples of projects that fall under each category. (Refer to sections 1 and 2)']


 56%|█████▌    | 29/52 [08:49<06:19, 16.50s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['How does the Equity Priority Transportation Program prioritize funding for transportation projects?', 'a. What is the Equity Priority Transportation Program and how does it prioritize funding for transportation projects?']


 58%|█████▊    | 30/52 [09:06<06:04, 16.56s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['What types of projects are eligible for funding under the "Muni Reliability and Efficiency Improvements" program in the 2022 Transportation Expenditure Plan? Who is the sponsoring agency for this program?', 'What is the purpose of the "Muni Rail Core Capacity" program in the 2022 Transportation Expenditure Plan, and what types of improvements does it support for Muni\'s rail system? Who is the sponsoring agency for this program?']


 60%|█████▉    | 31/52 [09:17<05:15, 15.00s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['What are the priority projects for improving public transportation in San Francisco, as outlined in the context information? Which project has the highest priority, and what are the key engineering improvements and funding sources involved?', 'What is the BART Core Capacity project, and how will it increase the capacity of BART through the Transbay Tube? What types of project expenses are covered by the funding, and what is the total funding amount and expected environmental impact statement (EP) cost? Additionally, what is the prerequisite for allocating funds to this project, and which counties are involved?']


 62%|██████▏   | 32/52 [09:26<04:24, 13.20s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['What types of projects are eligible for funding under the Transit Maintenance and Enhancements category? Provide specific examples of project types that fall under this category.', 'What is the Caltrain Downtown Rail Extension, and what are its key features? How will this project accommodate blended service with future California High-Speed Rail? What is the total funding for this project, and who are the sponsor agencies?']


 63%|██████▎   | 33/52 [09:50<05:09, 16.30s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['What types of facilities, equipment, and systems are eligible for funding under the Rehabilitation, Upgrades, and Replacement Program for Muni and BART?', 'What specific improvements are being made to Muni and BART facilities, equipment, and systems under this program, and how will they address issues related to electrification, climate change, and transit priority and reliability?']


 65%|██████▌   | 34/52 [10:14<05:35, 18.62s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['What is the total funding and local match contribution for the Caltrain capital program, and who is responsible for providing the local match until 2022 Sales Tax funds run out?', 'What types of improvements are eligible for funding under the Transit Enhancements program, and which agencies are responsible for sponsoring these projects?']


 67%|██████▋   | 35/52 [10:22<04:21, 15.38s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['What are the sponsor agencies involved in funding the Bayview Caltrain Station project, and how much funding is allocated for this project?', 'What is the Next Generation Transit Investments program, and which major transit capital projects are eligible for funding through this program? How much funding is allocated for this program, and how is it divided between Priority 1 and Priority 2 projects?']


 69%|██████▉   | 36/52 [10:45<04:45, 17.83s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['What is the total funding allocated for the maintenance, rehabilitation, and replacement of city streets, including sidewalks and curb ramps, as well as equipment replacement for street repair and cleaning? How much of this funding is considered priority 1 and priority 2, and what is the estimated EP for this project? (Refer to section "Street Resurfacing, Rehabilitation, and Maintenance")', 'What types of pedestrian and bicycle facilities are included in the maintenance project, and which agencies are responsible for this project\'s project development and capital costs? How much funding is allocated for this project, and what is the estimated EP? (Refer to section "Pedestrian and Bicycle Facilities Maintenance")']


 71%|███████   | 37/52 [10:54<03:44, 14.98s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['How does the Safe and Complete Streets program aim to improve transportation safety in the city, and what types of projects are eligible for funding?', 'What are the objectives of the Multi-modal street improvements program, and which agencies are responsible for its implementation and funding? Additionally, what types of projects fall under this program, and how are they prioritized for funding?']


 73%|███████▎  | 38/52 [11:04<03:08, 13.45s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['What are the two freeway safety and operational improvement projects listed in the context information, and what are their eligible project types and sponsor agencies?', "What is the Vision Zero Ramps program, and which city streets and intersections will benefit from programmatic improvements to support the City's policy to eliminate traffic deaths? What types of pedestrian safety measures are eligible for funding through this program, and which sponsor agencies are responsible for planning, project development, and capital costs?"]


 75%|███████▌  | 39/52 [11:12<02:34, 11.87s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['What types of projects are eligible for funding under the Improvements to the Regional Transportation System category? Provide specific examples of project types that fall under this category.', 'What is Transportation Demand Management (TDM), and how can it be implemented to shift trips to sustainable modes of transportation? What types of projects are eligible for TDM funding, and what are some examples of successful TDM initiatives in San Francisco?']


 77%|███████▋  | 40/52 [11:35<03:03, 15.30s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['What is the Neighborhood Transportation Program (NTP) and how does it differ from the Equity Priority Transportation Program (EPTP)? What types of projects are eligible for funding through each program?', 'How much funding is allocated to the Neighborhood Transportation Program (NTP) and the Equity Priority Transportation Program (EPTP) in the 2022 Transportation Expenditure Plan? Which priority level does the majority of funding go to for each program?']


 79%|███████▉  | 41/52 [11:47<02:35, 14.14s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['What is the Development-Oriented Transportation Program, and how does it support increased housing density in low-density neighborhoods? Which sponsor agencies are involved in funding this program, and what types of projects are prioritized?', 'What is the Bayshore Caltrain Pedestrian Connection project, and what phases of planning, development, and capital costs are included in the funding? Which sponsor agencies are involved in this project, and what is the total funding amount, including EP? Which priority level is allocated to this project, and why?']


 81%|████████  | 42/52 [11:58<02:13, 13.33s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['How does the Strategic Plan contribute to the prioritization process for transportation projects in San Francisco? What factors should be considered when preparing a 5YPP for allocation of 2022 Sales Tax revenues?', "What is the role of the Transportation Authority in the prioritization process for transportation projects in San Francisco? How does the Transportation Authority ensure that proposed projects are consistent with the SFTP and the City's General Plan? What agencies are responsible for preparing the 5YPPs, and what factors should they consider when doing so?"]


 83%|████████▎ | 43/52 [12:14<02:07, 14.18s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['How does the Transportation Authority prioritize projects for funding through the 5YPP program? (d) and (e) from the context information provide specific criteria for prioritization. Based on this information, create a question that asks students to explain how community support and benefits to disadvantaged populations are factored into project selection.', 'What role does public outreach and engagement play in the development of 5YPPs and referrals to City Departments or Commissions? (e) from the context information highlights the importance of an inclusive planning process. Create a question that asks students to describe the public outreach and engagement requirements for 5YPP development and referrals. Additionally, students should be able to explain how this process ensures that the needs and perspectives of disadvantaged populations are taken into account.']


 85%|████████▍ | 44/52 [12:32<02:02, 15.32s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['What is the role of designated agencies in the development of the 5YPP, and how can sponsoring agencies explore alternative methods for project delivery?', 'What guidelines will the Transportation Authority Board adopt for project delivery oversight of major capital projects funded by the 2022 Sales Tax, and how will these guidelines consider the total cost and complexity of a project? Additionally, what objectives will these guidelines aim to achieve, and how will the status of these projects be communicated to the Transportation Authority Board?']


 87%|████████▋ | 45/52 [12:48<01:47, 15.39s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['What is the role of the Transportation Authority Board in distributing remaining funds from completed programs or legacy projects in the 2022 Transportation Expenditure Plan? How does this process differ from funding new programs in compliance with prioritization provisions? (2 points)', 'What abbreviations are used in the 2022 Transportation Expenditure Plan, and which organizations or departments do they represent? (1 point each for identifying 5 abbreviations)']


 88%|████████▊ | 46/52 [13:10<01:44, 17.48s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['How will the creation of the Department of Sanitation and Streets impact specific duties currently performed by San Francisco Public Works, as outlined in Board of Supervisors Motion 21-181?', 'In what ways does the Expenditure Plan Advisory Committee contribute to the allocation of resources in San Francisco, and which specific committees fall under the categories of Advocacy: Environment, Advocacy: Seniors and People with Disabilities, and Neighborhoods/Communities?']


 90%|█████████ | 47/52 [13:18<01:12, 14.57s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


["In relation to the Board of Supervisors' ordinance, what is the scope of the amendments being made to Article 14 of the Business and Tax Regulations Code?", 'Which individuals and organizations are listed as advocacy groups, equity priority communities, and business/civic entities in the context information provided? What are their specific areas of focus or priorities?']


 92%|█████████▏| 48/52 [13:29<00:54, 13.60s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['What is the significance of Section 5 in the context of this ordinance? How does it impact the validity of the remaining portions of the ordinance if a section is declared invalid or unconstitutional?', 'What is the effective and operative date of the 2022 Transportation Expenditure Plan and the amendments to Article 14 of the Business and Tax Regulations Code, as outlined in Section 6 of the ordinance? How will the City Attorney replace references to the operative date in Article 14 after the election?']


 94%|█████████▍| 49/52 [13:53<00:49, 16.54s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['What is the purpose of calling a special election in San Francisco on November 8, 2022, as outlined in the context information?', 'What specific proposals will be presented to voters in the special election, as detailed in the context information?']


 96%|█████████▌| 50/52 [14:07<00:31, 15.98s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


["What is the purpose of the Board of Supervisors' decision to adopt, establish, and designate voting precincts, polling places, and officers of election for the Special Election called?", 'What is the role of the Department of Elections in relation to the Special Election called by the Board of Supervisors, as directed by California Public Utilities Code Section 131108, subdivision (h)?']


 98%|█████████▊| 51/52 [14:17<00:14, 14.13s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['Based on the context information provided, what is the role of the Supervisors Mandelman, Walton, Peskin, Ronen, Melgar, Stefani, Preston, Mar, and Safai in this situation?', 'What is the significance of California Public Utilities Code Section 131055 mentioned in the context information? What action is being directed by the Board of Supervisors in relation to this section?']


100%|██████████| 52/52 [14:25<00:00, 16.64s/it]

['What is the purpose of the Ordinance passed by the Board of Supervisors on July 19, 2022, and what does it entail?', 'What is the California Environmental Quality Act, and why is it mentioned in the context information provided? Does the Ordinance mentioned in the context information require any environmental assessments or approvals? If so, what are they?']





In [24]:
eval_dataset.save_json("eval_synthetic_qa_dataset.json")

## Generating Anchor and positive pair

In [None]:
class EmbeddingAnchorPositiveDataset(BaseModel):
    """Embedding Anchor-Positive Dataset.

    Args:
        anchors (Dict[str, str]): Dict id -> anchor text.
        positives (Dict[str, str]): Dict id -> positive text.
        similarities (Dict[str, float]): Dict id -> similarity score.
        corpus (Dict[str, str]): Dict id -> string.
    """

    anchors: Dict[str, str]  # dict id -> anchor text
    positives: Dict[str, str]  # dict id -> positive text
    similarities: Dict[str, float]  # dict id -> similarity score
    corpus: Dict[str, str]  # dict id -> string
    mode: str = "text"

    @property
    def anchor_positive_pairs(self) -> List[Tuple[str, str, float]]:
        """Get anchor, positive pairs, and their similarity scores."""
        return [
            (self.anchors[anchor_id], self.positives[anchor_id], self.similarities[anchor_id])
            for anchor_id in self.anchors.keys()
        ]

    def save_json(self, path: str) -> None:
        """Save json."""
        with open(path, "w") as f:
            json.dump(self.dict(), f, indent=4)

    @classmethod
    def from_json(cls, path: str) -> "EmbeddingAnchorPositiveDataset":
        """Load json."""
        with open(path) as f:
            data = json.load(f)
        return cls(**data)


In [None]:
DEFAULT_ANCHOR_POSITIVE_PROMPT_TMPL = """\
Context information is below.

---------------------
{context_str}
---------------------

Given the context information and no prior knowledge,
generate anchor and positive pairs based on the below query.

You are an NLP specialist. Your task is to identify \
{num_pairs_per_chunk} anchor-positive pairs for semantic similarity. \
For each pair, also provide a similarity score as a float point value. \
The similarity score should be between 0 and 1, indicating how similar \
the anchor is to the positive example. Restrict the pairs to the \
context information provided.

Format:
Anchor | Positive | Similarity
------------------------------
"""

def generate_anchor_positive_pairs(
    node_dict,
    llm: LLM,
    prompt_tmpl: str = DEFAULT_ANCHOR_POSITIVE_PROMPT_TMPL,
    num_pairs_per_chunk: int = 2,
) -> EmbeddingAnchorPositiveDataset:
    """Generate anchor-positive pairs given a set of nodes."""

    anchors = {}
    positives = {}
    similarities = {}

    for node_id, text in tqdm(node_dict.items()):
        print(node_id)
        print(text)
        query = prompt_tmpl.format(
            context_str=text, num_pairs_per_chunk=num_pairs_per_chunk
        )
        response = llm.complete(query)

        print("response:  ->", resonse)
        result = str(response).strip().split("\n")
        print("results:  ->", results)
        pairs = [
            line.split(" | ") for line in result if "|" in line
        ]
        pairs = [
            (anchor.strip(), positive.strip(), float(similarity.strip()))
            for anchor, positive, similarity in pairs
        ]

        num_pairs_generated = len(pairs)
        if num_pairs_generated < num_pairs_per_chunk:
            warnings.warn(
                f"Fewer pairs generated ({num_pairs_generated}) "
                f"than requested ({num_pairs_per_chunk})."
            )

        for anchor, positive, similarity in pairs:
            pair_id = str(uuid.uuid4())
            anchors[pair_id] = anchor
            positives[pair_id] = positive
            similarities[pair_id] = similarity

        print(pairs)

    # construct dataset
    return EmbeddingAnchorPositiveDataset(
        anchors=anchors, positives=positives, similarities=similarities, corpus=node_dict
    )



NameError: name 'response' is not defined

In [None]:
# Example usage
train_corpus = {
    "1": "the quick brown fox jumps over the lazy dog.",
    # "2": "The Way It Is Now\nCHANGES TO BUSINESS TAXES\nThe City collects various business taxes on an annual basis including:\nO\n\u2022\nSAN FRANCISCO\nFILED\n2024 MAY 15 PM 3:10\nDEPARTMENT OF ELECTIONS\nA gross receipts tax that is a percentage of a business's San Francisco gross receipts.\nDepending on business type, the City determines a business's San Francisco gross\nreceipts based on sales in San Francisco, payroll expenses for employees working there,\nor both. Rates range from 0.053% to 1.008% and are scheduled to increase in coming\nyears. Rates depend on business type, and higher rates apply as a business generates\nmore gross receipts. For 2023, most businesses with gross receipts up to $2.19 million\nare exempt.\nA homelessness gross receipts tax that is an additional tax on businesses with San\nFrancisco gross receipts over $50 million. Rates range from 0.175% to 0.69%.\nAn overpaid executive gross receipts tax that is an additional tax on businesses that pay\ntheir highest-paid managerial employee much higher than the median compensation they\npay their San Francisco employees. Rates are between 0.1% and 0.6%.\nA business registration fee that is an additional tax. For most businesses the fee is\ncurrently between $47 and $45,150, based on business type and amount of gross receipts.\n\u2022 An administrative office tax on payroll expenses that certain large businesses pay instead\nof these other business taxes. The combined rates in 2024 range from 3.04% to 5.44%,\nand in 2025 are scheduled to range from 3.11% to 5.51%. Business registration fees for\nthese businesses currently range from $19,682 to $45,928.\nState law limits the total revenue, including tax revenue, the City may spend each year. The\nvoters may approve increases to this limit for up to four years.",
    # "3": "Punjab, known as the 'Land of Five Rivers,' is a vibrant Indian state renowned for its rich cultural heritage, delectable cuisine, colorful festivals, and warm hospitality"
}


In [None]:


train_dataset = generate_anchor_positive_pairs(train_corpus, llm)

# Save the dataset to a JSON file
train_dataset.save_json("train_dataset.json")

# Load the dataset from a JSON file
loaded_dataset = EmbeddingAnchorPositiveDataset.from_json("train_dataset.json")


  0%|          | 0/1 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


1
he quick brown fox jumps over the lazy dog.


  0%|          | 0/1 [04:59<?, ?it/s]


NameError: name 'resonse' is not defined

# Embedding Finetuning


## checking generated data of qa pairs

In [5]:
train_synthetic_dataset = EmbeddingQAFinetuneDataset.from_json("/content/trained_synthetic_dataset.json")

In [6]:
train_synthetic_dataset.queries["a0bce105-3c69-47a0-a3ff-fc5df8c181e8"]

'What is the role of the Controller in determining the applicable tax rates for the gross receipts tax in Santa Cruz?'

In [None]:
train_synthetic_dataset.corpus["36b77480-727d-40f4-8490-5e845e808178"]


'CONTROLLER TO PUBLISH AND CERTIFY TAXABLE GROSS RECEIPTS\nAMOUNTS.\n(a) On or before October 3, 2022, for purposes of determining the applicable tax rates for tax\nyear 2023, the Controller shall publish the total amount of taxable gross receipts for tax year 2021\nreported by taxpayers as of June 30, 2022, and if that amount is equal to or greater than 90% of\n53SANTA CO\ntaxable gross receipts for tax year 2019 reported by taxpayers as of June 30, 2020, the Controller shall\nDEPARTMEN OP QRECTIONS\ncertify that the 90% gross receipts threshold has been met for tax year 2023.\n(b) On or before October 2, 2023, for purposes of determining the applicable tax rates for tax\nyear 2024, the Controller shall publish the total amount of taxable gross receipts for tax year 2022\nreported by taxpayers as of June 30, 2023, and if that amount is equal to or greater than 95% of\ntaxable gross receipts for tax year 2019 reported by taxpayers as of June 30, 2020, the Controller shall\neertify that

## imports

In [32]:
!pip install -U sentence-transformers -q

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
llama-index-finetuning 0.1.9 requires sentence-transformers<3.0.0,>=2.3.0, but you have sentence-transformers 3.0.1 which is incompatible.[0m[31m
[0m

In [33]:
from sentence_transformers import SentenceTransformer

In [34]:
model_id = "BAAI/bge-small-en"
model = SentenceTransformer(model_id)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': True}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [25]:
import json

from torch.utils.data import DataLoader
from sentence_transformers import InputExample

In [48]:
TRAIN_DATASET_FPATH = '/content/trained_synthetic_qa_dataset.json'
VAL_DATASET_FPATH = '/content/eval_synthetic_qa_dataset.json'


BATCH_SIZE = 10

In [49]:
with open(TRAIN_DATASET_FPATH, 'r+') as f:
    train_dataset = json.load(f)

with open(VAL_DATASET_FPATH, 'r+') as f:
    val_dataset = json.load(f)

In [50]:
dataset = train_dataset

corpus = dataset['corpus']
queries = dataset['queries']
relevant_docs = dataset['relevant_docs']

examples = []
for query_id, query in queries.items():
    node_id = relevant_docs[query_id][0]
    text = corpus[node_id]
    example = InputExample(texts=[query, text])
    examples.append(example)


In [51]:
loader = DataLoader(
    examples, batch_size=BATCH_SIZE
)

## MultipleNegativesRankingLoss

### Define loss
MultipleNegativesRankingLoss is a great loss function if you only have positive pairs, for example, only pairs of similar texts like pairs of paraphrases, pairs of duplicate questions, pairs of (query, response), or pairs of (source_language, target_language).

This loss function works great to train embeddings for retrieval setups where you have positive pairs (e.g. (query, relevant_doc)) as it will sample in each batch n-1 negative docs randomly.


The performance usually increases with increasing batch sizes.


In [30]:
from sentence_transformers import losses

In [35]:
loss = losses.MultipleNegativesRankingLoss(model)


### Defing Evaluator

We setup an evaluator with our val split of the dataset to monitor how well the embedding model is performing during training.



In [26]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator


In [61]:
dataset = val_dataset

corpus = dataset['corpus']
queries = dataset['queries']
relevant_docs = dataset['relevant_docs']

evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs,write_csv=True)

### Run Training
The training loop is very straight forward to steup thanks to sentencetransformers' high-level model training API. All we need to do is plugging in the data loader, loss function, and evaluator that we defined in the previous cells (along with a couple of additional minor settings).

In [38]:
!pip3 install datasets -q -U


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 16.1.0 

In [38]:
!pip install --upgrade pyarrow datasets -q

In [41]:
!pip uninstall pyarrow datasets -y
!pip install pyarrow==14.0.1 datasets -q


Found existing installation: pyarrow 16.1.0
Uninstalling pyarrow-16.1.0:
  Successfully uninstalled pyarrow-16.1.0
Found existing installation: datasets 2.20.0
Uninstalling datasets-2.20.0:
  Successfully uninstalled datasets-2.20.0
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.0/38.0 MB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m54.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [43]:
!pip install accelerate -U -q

In [22]:
!pip install transformers[torch] -q



In [None]:
from datasets import Dataset

In [72]:
# We train the model for very few epochs in this toy example.
# This should typically be higher for better performance.
EPOCHS = 1

In [68]:
warmup_steps = int(len(loader) * EPOCHS)

In [73]:
model.fit(
    train_objectives=[(loader, loss)],
    epochs=EPOCHS,
    warmup_steps=warmup_steps,
    output_path='exp_finetune',
    show_progress_bar=True,
    evaluator=evaluator,
    # evaluation_steps=50,
)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/24 [00:00<?, ?it/s]

### checking the evaluation

In [74]:
results = evaluator(model)

In [76]:
print(results)

0.7249176564282218


## Sharing a Sentence Transformers to the Hugging Face Hub

In [59]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): Traceback (most recent call last):
  File "/usr/lib/python3.10/getpass.py", line 77, in unix_getpass
    passwd = _raw_input(prompt, stream, input=input)
  File "/usr/lib/python3.10/getpass.py", line 146, in _raw_input
    line = input.readline()
KeyboardInterrupt

During han

In [None]:
# 6. Save the trained model and optionally push it to the Hugging Face Hub
model.save_pretrained("bert-base-all-nli-stsb-quora-nq")
model.push_to_hub("bert-base-all-nli-stsb-quora-nq")

In [1]:
# Define the directory to save the model
save_path = 'fine_tuned_model'

# Save the model
model.save(save_path)

NameError: name 'model' is not defined

In [79]:
import shutil
from google.colab import files

# Zip the model directory
shutil.make_archive(save_path, 'zip', save_path)

# Download the zip file
files.download(f'{save_path}.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

##