In [13]:
import uuid
import nest_asyncio
import json

from dotenv import load_dotenv
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import BSHTMLLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from ragas.testset import TestsetGenerator
from langchain_core.documents import Document
from datasets import load_dataset
import pandas as pd
from sentence_transformers import SentenceTransformer
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from sentence_transformers import InputExample
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from langchain_huggingface import HuggingFaceEmbeddings
from tqdm import tqdm

In [3]:
load_dotenv()
nest_asyncio.apply()

In [30]:
# Datasets that contain info about 2 big cities

paris_ds = load_dataset("gaianet/paris")
london_ds = load_dataset("gaianet/london")

README.md:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

london.txt:   0%|          | 0.00/93.5k [00:00<?, ?B/s]

london_chunks.txt:   0%|          | 0.00/93.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/661 [00:00<?, ? examples/s]

In [37]:
paris_ds_as_text = " ".join(paris_ds["train"]["text"])
london_ds_as_text = " ".join(london_ds["train"]["text"])

In [47]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=50, length_function=len
)

training_context = text_splitter.split_documents([Document(paris_ds_as_text), Document(london_ds_as_text)])

len(training_context)

387

In [51]:
# Generate 100 Q&D samples

generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings(model="text-embedding-3-small"))
generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings)
dataset = generator.generate_with_langchain_docs(training_context, testset_size=100)
dataset_as_df = dataset.to_pandas()

Applying SummaryExtractor:   0%|          | 0/385 [00:00<?, ?it/s]

Applying CustomNodeFilter:   0%|          | 0/387 [00:00<?, ?it/s]

Node 104b4226-ac88-4e1c-96d1-e0bb7f57d50a does not have a summary. Skipping filtering.
Node 31bf2fe4-d811-4cf8-80e1-0145155c6e89 does not have a summary. Skipping filtering.


Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:   0%|          | 0/1159 [00:00<?, ?it/s]

Applying OverlapScoreBuilder:   0%|          | 0/1 [00:00<?, ?it/s]

Generating personas:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Scenarios:   0%|          | 0/2 [00:00<?, ?it/s]

Generating Samples:   0%|          | 0/100 [00:00<?, ?it/s]

In [60]:
documents_total = len(dataset_as_df)
test_and_validation_shift = int((
    documents_total * 10  #  10% of docs to validation, 10% to test, 80% to train
) / 100)
training_split_documents = dataset_as_df[
    : documents_total - test_and_validation_shift * 2
]
validation_split_documents = dataset_as_df[
    documents_total - test_and_validation_shift * 2 : documents_total
    - test_and_validation_shift
]
test_split_documents = dataset_as_df[documents_total - test_and_validation_shift :]

len(test_split_documents), len(validation_split_documents), len(training_split_documents)

(10, 10, 80)

In [88]:
def create_questions(df: pd.DataFrame) -> tuple[dict]:
    training_questions = {}
    training_relevant_contexts = {}
    context_ids = {}
    
    for _, row in df.iterrows():
        question = row["user_input"]
        q_id = str(uuid.uuid4())
        training_questions[q_id] = question
        
        for context in row["reference_contexts"]:
            c_id = str(uuid.uuid4())
            training_relevant_contexts[q_id] = [c_id]
            context_ids[c_id] = context

    return training_questions, training_relevant_contexts, context_ids

In [89]:
training_questions, training_relevant_contexts, training_context_ids = create_questions(training_split_documents)

val_questions, val_relevant_contexts, val_context_ids = create_questions(validation_split_documents)

test_questions, test_relevant_contexts, test_context_ids = create_questions(test_split_documents)

In [91]:
train_dataset = {
    "questions" : training_questions,
    "relevant_contexts" : training_relevant_contexts,
    "corpus" : training_context_ids,
}

val_dataset = {
    "questions" : val_questions,
    "relevant_contexts" : val_relevant_contexts,
    "corpus" : val_context_ids
}

test_dataset = {
    "questions" : test_questions,
    "relevant_contexts" : test_relevant_contexts,
    "corpus" : test_context_ids
}

In [92]:
embedding_model_id = "Snowflake/snowflake-arctic-embed-l"
embedding_model = SentenceTransformer(model_id)

In [93]:
BATCH_SIZE = 64

In [94]:
corpus = train_dataset['corpus']
queries = train_dataset['questions']
relevant_docs = train_dataset['relevant_contexts']

examples = []
for query_id, query in queries.items():
    doc_id = relevant_docs[query_id][0]
    text = corpus[doc_id]
    example = InputExample(texts=[query, text])
    examples.append(example)

In [95]:
loader = DataLoader(
    examples, batch_size=BATCH_SIZE
)

In [96]:
matryoshka_dimensions = [768, 512, 256, 128, 64]
inner_train_loss = MultipleNegativesRankingLoss(embedding_model)
train_loss = MatryoshkaLoss(
    embedding_model, inner_train_loss, matryoshka_dims=matryoshka_dimensions
)

In [102]:
corpus = val_dataset['corpus']
queries = val_dataset['questions']
relevant_docs = val_dataset['relevant_contexts']

evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs)

In [103]:
EPOCHS = 10

In [104]:
import wandb
wandb.init(mode="disabled")

In [117]:
# Unfortunately I have not enough computing power here, unable to fit model using new data

warmup_steps = int(len(loader) * EPOCHS * 0.1)

embedding_model.fit(
    train_objectives=[(loader, train_loss)],
    epochs=EPOCHS,
    warmup_steps=warmup_steps,
    output_path='cities_optimized_embedding_model',
    show_progress_bar=True,
    evaluator=evaluator,
    evaluation_steps=50
)

RuntimeError: MPS backend out of memory (MPS allocated: 17.52 GB, other allocations: 426.72 MB, max allowed: 18.13 GB). Tried to allocate 339.94 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [16]:
# Here is the code sample that should test v1 model and v2
# Since I can't fit new model it's just how evaluation should look like

with open("cities_test_dataset.jsonl") as f:
    test_dataset = json.load(f)

with open("cities_train_dataset.jsonl") as f:
    train_dataset = json.load(f)

with open("cities_val_dataset.jsonl") as f:
    val_dataset = json.load(f)

def evaluate_openai(
    dataset,
    embed_model,
    top_k=5,
    verbose=False,
):
  corpus = dataset['corpus']
  questions = dataset['questions']
  relevant_docs = dataset['relevant_contexts']
  documents = [Document(page_content=content, metadata={"id": doc_id}) for doc_id, content in corpus.items()]
  vectorstore = FAISS.from_documents(documents, embed_model)

  retriever = vectorstore.as_retriever(search_kwargs={"k": top_k})

  eval_results = []
  for id, question in tqdm(questions.items()):
    retrieved_nodes = retriever.invoke(question)
    retrieved_ids = [node.metadata["id"] for node in retrieved_nodes]
    expected_id = relevant_docs[id][0]
    is_hit = expected_id in retrieved_ids
    eval_results.append({"id": id, "question": question, "expected_id": expected_id, "is_hit": is_hit})

  return eval_results



100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 34.80it/s]


In [19]:
raw_model_results_df = pd.DataFrame(raw_model_results)
hit_rate = raw_model_results_df["is_hit"].mean()
hit_rate

np.float64(0.7)

In [None]:
# finetuned_model_results_df = pd.DataFrame(finetuned_results)
# hit_rate = finetuned_model_results_df["is_hit"].mean()
# hit_rate