In [None]:
from langchain_community.document_loaders import TextLoader

# Read the source corpus from disk; the loaded documents are split and
# indexed by the cells that follow.
# NOTE(review): no `encoding=` is passed, so decoding presumably follows the
# platform default — confirm the file's encoding before running elsewhere.
loader = TextLoader("hmao_npa.txt")
base_docs = loader.load()

In [None]:
# Show the metadata attached to every loaded document.
for doc in base_docs:
  print(doc.metadata)

In [None]:
from langchain.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the loaded documents into overlapping chunks (size 256, overlap 64)
# that will be embedded and indexed below.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=64,
    chunk_size=256,
)

docs = text_splitter.split_documents(documents=base_docs)

In [None]:
# Number of chunks produced by the splitter.
len(docs)

In [None]:
# Longest chunk, measured in characters of its text (sanity check on the split).
print(max(len(piece.page_content) for piece in docs))

In [None]:
# Peek at the text of the first chunk.
docs[0].page_content

In [None]:
# Build a Chroma index over the chunks, embedding them with multilingual-e5-large.
# NOTE: `vectorstore` is rebound by the FAISS cell further down, so this index
# is only in effect until that cell runs.
model_kwargs = dict(device='cuda:2')
vectorstore = Chroma.from_documents(
    documents=docs,
    embedding=HuggingFaceEmbeddings(
        model_name='intfloat/multilingual-e5-large',
        model_kwargs=model_kwargs,
    ),
    collection_metadata={"hnsw:space": "cosine"},
)

In [None]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
# Build a FAISS index over the same chunks; embeddings are computed on cuda:2
# in batches of 1536 texts.
encode_kwargs = dict(batch_size=1536)
model_kwargs = dict(device='cuda:2')

vectorstore = FAISS.from_documents(
    documents=docs,
    embedding=HuggingFaceEmbeddings(
        model_name='intfloat/multilingual-e5-large',
        encode_kwargs=encode_kwargs,
        model_kwargs=model_kwargs,
    ),
)

In [None]:
# Persist the current index to disk under this name so it can be reloaded later.
vectorstore.save_local("e5large_256_64_faiss")

In [None]:
# Embedding model handed to FAISS.load_local in the next cell.
encode_kwargs = dict(batch_size=1024)
model_kwargs = dict(device='cuda:2')

embeddings = HuggingFaceEmbeddings(
    model_name='intfloat/multilingual-e5-large',
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
)

In [None]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

# Reload the index stored under "e5large_256_64_faiss", passing the
# `embeddings` object defined in the previous cell.
# NOTE(review): allow_dangerous_deserialization=True explicitly opts in to
# deserializing the saved files — only load index directories you created yourself.
vectorstore = FAISS.load_local("e5large_256_64_faiss", embeddings, allow_dangerous_deserialization=True)

In [None]:
from langchain_postgres import PGVector
from langchain_postgres.vectorstores import PGVector

import os

# NOTE(review): this rebinds `embeddings` (previously multilingual-e5-large) to
# multilingual-e5-small, so every later cell that reads `embeddings` gets the
# small model — confirm this is intended.
embeddings = HuggingFaceEmbeddings(model_name='intfloat/multilingual-e5-small')

# The connection string can be supplied through PGVECTOR_CONNECTION so real
# credentials never have to live in the notebook; the fallback is the original
# local development database.
connection = os.environ.get(
    "PGVECTOR_CONNECTION",
    "postgresql+psycopg://langchain:langchain@localhost:6024/langchain",  # Uses psycopg3!
)

# NOTE(review): "hnsw:space" looks like a Chroma-style setting; here it is
# presumably stored as plain collection metadata and does not select the
# distance function — confirm against the PGVector documentation.
vector_store = PGVector(
    embeddings=embeddings,
    collection_name='docs',
    connection=connection,
    use_jsonb=True,
    collection_metadata={"hnsw:space": "cosine"}
)

In [None]:
# Sanity-check retrieval directly against the vector store: ask for the
# 3 best hits together with their relevance scores.
query_text = 'Каков объем экспорта услуг категории "Поездки" в региональном проекте "Экспорт услуг" категории "Поездки" в Ханты-Мансийском автономном округе - Югре?'
results = vectorstore.similarity_search_with_relevance_scores(
    query=query_text,
    k=3,
)

In [None]:
# Display the hits returned by the similarity search above.
results

In [None]:
# Wrap the vector store as a retriever using similarity search with k=2.
base_retriever = vectorstore.as_retriever(search_type='similarity', search_kwargs={"k" : 2})

In [None]:
# Ask the retriever for the documents it considers relevant to the question.
s = 'Каков объем экспорта услуг категории "Поездки" в региональном проекте "Экспорт услуг" категории "Поездки" в Ханты-Мансийском автономном округе - Югре?'
# `invoke` is the standard entry point for retrievers (and matches how the
# chain is called later); `get_relevant_documents` is deprecated.
relevant_docs = base_retriever.invoke(s)

In [None]:
# How many documents the retriever returned.
len(relevant_docs)

In [None]:
# Text of the top retrieved document.
relevant_docs[0].page_content

In [None]:
from langchain.prompts import ChatPromptTemplate

# Prompt template (Russian). In English: "Answer the question using only the
# following context. If you cannot answer from the context, reply 'I don't
# know'", followed by a CONTEXT section ({context}) and a QUESTION section
# ({question}).
template = """Ответьте на вопрос, опираясь только на следующий контекст. Если вы не можете ответить на вопрос, опираясь на контекст, пожалуйста, ответьте «Я не знаю»:

### КОНТЕКСТ
{context}

### ВОПРОС
ВОПРОС: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

In [None]:
# Display the constructed prompt object.
prompt

In [None]:
from langchain_community.llms import VLLM
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from operator import itemgetter
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

# LLM served through vLLM; generation is capped at 128 new tokens.
# NOTE(review): temperature=0.8 together with top_k/top_p means sampled output,
# so answers (and the metrics computed from them) will presumably vary between
# runs — confirm that is acceptable for evaluation.
# NOTE(review): the model id has no "-Instruct" suffix; presumably this is the
# base checkpoint — confirm it is the intended one for question answering.
llm = VLLM(
    model="Qwen/Qwen2-7B",
    trust_remote_code=True,  # mandatory for hf models
    max_new_tokens=128,
    top_k=10,
    top_p=0.95,
    temperature=0.8,
)
# RAG chain. Invoke with {"question": "<user question>"}; the result is a dict
# with "response" (the LLM output) and "context" (the retrieved documents).
retrieval_augmented_qa_chain = (
    # Step 1: build the two prompt variables from the input.
    #   "context"  : the question is piped into base_retriever
    #   "question" : the question itself, passed through unchanged
    {"context": itemgetter("question") | base_retriever, "question": itemgetter("question")}
    # Step 2: format the prompt with both values and call the LLM, while also
    # returning the retrieved documents so callers can inspect them.
    # The `RunnablePassthrough.assign(context=itemgetter("context"))` step that
    # used to sit between the two steps was removed: it re-assigned "context"
    # to its own value and therefore had no effect.
    # NOTE(review): "context" reaches the prompt as the retriever's raw output,
    # so the template presumably renders its default string form rather than
    # the joined page text — confirm that is intended.
    | {"response": prompt | llm, "context": itemgetter("context")}
)

Let's test it out!

In [None]:
# Run the full chain once on a sample question and print the raw result dict.
question = 'Каков объем экспорта услуг категории "Поездки" в региональном проекте "Экспорт услуг" категории "Поездки" в Ханты-Мансийском автономном округе - Югре?'

result = retrieval_augmented_qa_chain.invoke({"question" : question})

print(result)

In [None]:
# Just the generated answer.
result['response']

In [None]:
# Full chain output (answer plus retrieved context).
result

In [None]:
!pip uninstall distro

In [None]:
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    answer_correctness,
    answer_similarity
)

from ragas.metrics.critique import harmfulness
from ragas import evaluate

def create_ragas_dataset(rag_pipeline, eval_dataset):
    """Run the RAG pipeline over an evaluation set and collect rows for ragas.

    Args:
        rag_pipeline: object exposing ``invoke``; it is called with
            ``{"question": ...}`` and its result is read through the
            "response" and "context" keys.
        eval_dataset: iterable of rows that provide "question" and
            "ground_truth" entries.

    Returns:
        A ``Dataset`` built from a DataFrame with the columns "question",
        "answer", "contexts" (page text of each retrieved document) and
        "ground_truths" (single-element list holding the reference answer).
    """
    def _to_record(row):
        # One pipeline call per evaluation row.
        output = rag_pipeline.invoke({"question" : row["question"]})
        return {
            "question" : row["question"],
            "answer" : output["response"],
            "contexts" : [retrieved.page_content for retrieved in output["context"]],
            "ground_truths" : [row["ground_truth"]],
        }

    records = [_to_record(row) for row in tqdm(eval_dataset)]
    return Dataset.from_pandas(pd.DataFrame(records))

def evaluate_ragas_dataset(ragas_dataset, llm, embeddings):
    """Score a prepared dataset with a fixed set of ragas metrics.

    Args:
        ragas_dataset: dataset forwarded to ``evaluate`` as its first argument.
        llm: forwarded to ``evaluate`` as ``llm``.
        embeddings: forwarded to ``evaluate`` as ``embeddings``.

    Returns:
        Whatever ``evaluate`` returns for the six metrics listed below.
    """
    selected_metrics = [
        context_precision,
        faithfulness,
        answer_relevancy,
        context_recall,
        answer_correctness,
        answer_similarity,
    ]
    return evaluate(
        ragas_dataset,
        metrics=selected_metrics,
        llm=llm,
        embeddings=embeddings,
    )

In [None]:
from datasets import Dataset
import pandas as pd

# Load the evaluation spreadsheet and keep only its first 50 rows.
d = pd.read_excel("v2_ragas_npa_dataset_firstPart.xlsx").iloc[:50]
eval_dataset = Dataset.from_pandas(d)

In [None]:
# Display the evaluation dataset summary.
eval_dataset

In [None]:
from tqdm import tqdm
import pandas as pd

# Run the RAG chain on every evaluation question and collect the rows for ragas.
basic_qa_ragas_dataset = create_ragas_dataset(retrieval_augmented_qa_chain, eval_dataset)

In [None]:
# Score the generated answers with the ragas metrics.
# NOTE(review): `embeddings` is whatever the name was last bound to — on a
# top-to-bottom run that is the multilingual-e5-small model from the PGVector
# cell, not the e5-large model used for the FAISS index; confirm this is intended.
basic_qa_result = evaluate_ragas_dataset(basic_qa_ragas_dataset, llm, embeddings)

In [None]:
# Display the evaluation result.
basic_qa_result