In [None]:
!pip install -qU langchain_community faiss-cpu langchain_huggingface

In [None]:
import faiss
import numpy as np
import torch
from datasets import Dataset
from langchain_community.docstore.document import Document # text wrapper stored in the docstore
from langchain_community.docstore.in_memory import InMemoryDocstore # dict doc store
from langchain_community.vectorstores import FAISS # semantic search
from langchain_huggingface.embeddings import HuggingFaceEmbeddings # embedding model
from transformers import AutoModelForCausalLM, AutoTokenizer # causal LM + tokenizer loaders

In [None]:
def file_read(file_name, encoding="utf-8"):
  """Read a text file and wrap its paragraphs in a ``Dataset``.

  The content is split on blank lines (two consecutive newlines). Each chunk
  is stripped of surrounding whitespace, and chunks that end up empty are
  dropped so that no empty string is embedded and indexed later.

  Args:
    file_name: Path of the text file to read.
    encoding: Encoding used to decode the file. Passed explicitly so the
      result does not depend on the platform's default locale.

  Returns:
    A ``Dataset`` with a single ``"text"`` column, one row per paragraph.
  """
  with open(file_name, 'r', encoding=encoding) as handle:
    content = handle.read()
  # Runs of three or more newlines produce empty / whitespace-only chunks.
  stripped_chunks = (chunk.strip() for chunk in content.split("\n\n"))
  paragraphs = [chunk for chunk in stripped_chunks if chunk]
  return Dataset.from_dict({"text": paragraphs})

dataset = file_read("./cache/church_text") # file
text = list(dataset["text"]) # key: the paragraphs as a plain list of strings
embedder = HuggingFaceEmbeddings()
sample_key = embedder.embed_query(text[0]) # one embedding, used only to read its length

## FAISS index

# First step: tell FAISS the dimensionality of the incoming embeddings.
index = faiss.IndexFlatL2(len(sample_key))

# Now embed the whole dataset, in one batched call instead of one model call per paragraph.
vectors = embedder.embed_documents(text)
# FAISS works on float32 matrices; np.array on Python floats would give float64.
index.add(np.asarray(vectors, dtype="float32"))

In [None]:
# `index` already holds one vector per paragraph of `text`, so the docstore and
# the row -> id mapping must describe those same rows; left empty, no search hit
# could be resolved back to a document.
documents_by_id = {
    str(row): Document(page_content=passage) for row, passage in enumerate(text)
}
vector_storage = FAISS( # works with semantic search & rag pipelines
    embedding_function=embedder, # convert text to vectors; reuses the model loaded above
    index=index, # our loaded vectors
    docstore=InMemoryDocstore(documents_by_id), # dict in memory: doc id -> Document
    index_to_docstore_id={row: str(row) for row in range(len(documents_by_id))}, # FAISS row -> doc id
)

In [None]:
def import_model(model_name):
  """Load a causal language model together with its tokenizer.

  Args:
    model_name: Model identifier passed unchanged to both
      ``AutoModelForCausalLM.from_pretrained`` and
      ``AutoTokenizer.from_pretrained``.

  Returns:
    A ``(model, tokenizer)`` tuple. The model is loaded with float16 weights
    and ``device_map="auto"``. The tokenizer is guaranteed to have a pad
    token: its own if it defines one, otherwise its EOS token.
  """
  model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16, ## gpu support
    device_map="auto",
    # quantization_config=BitsAndBytesConfig(load_in_8bit=True) # bitsAndBytes
    )
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  # Only fall back to EOS when the tokenizer ships without a pad token;
  # a pad token the model already defines must not be overwritten.
  if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
  return model, tokenizer

# Candidate checkpoints; the entry at position 0 is the one loaded below.
models = [
    "deepseek-ai/deepseek-llm-7b-base",
    "mistralai/Mistral-7B-Instruct-v0.3",
    "meta-llama/Llama-3.1-8B-Instruct",
    "microsoft/phi-2",
]
model, tokenizer = import_model(models[0])

In [None]:
def gen(question,
        model,
        tokenizer,
        embedder,
        top_k=3, # how many of the nearest paragraphs are retrieved as context
        max_new_tokens=500, # budget for the generated answer only (prompt excluded)
        ):
    """Answer ``question`` with retrieval-augmented greedy generation.

    The question is embedded, the ``top_k`` nearest paragraphs are looked up in
    the notebook-level FAISS ``index`` and read from the notebook-level
    ``dataset``, and the model continues a prompt built from that context.

    Args:
        question: The user question.
        model: Causal LM exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        embedder: Object with ``embed_query``; it must produce vectors of the
            dimensionality ``index`` was built with.
        top_k: Number of nearest paragraphs to retrieve.
        max_new_tokens: Maximum number of tokens generated after the prompt.

    Returns:
        The generated continuation as a stripped string, without the prompt.
    """
    # vector store search
    query_vector = np.asarray([embedder.embed_query(question)], dtype="float32") # FAISS expects a float32 matrix
    _, neighbour_ids = index.search(query_vector, top_k) # top_k most similar paragraphs
    passages = dataset["text"] # read the column once, not once per hit
    # FAISS pads the result with -1 when the index holds fewer than top_k vectors;
    # a negative id would otherwise silently pick a paragraph from the end.
    retrieved_texts = [passages[int(i)] for i in neighbour_ids[0] if i >= 0]
    context = "\n".join(retrieved_texts) # join them together

    prompt = f"Context:{context}, Question: {question}\nAnswer:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        # max_length would count the prompt as well, so a long retrieved context
        # could leave no room to generate; bound only the new tokens instead.
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        do_sample=False, # greedy decoding; top_p / temperature only apply when sampling
        pad_token_id=tokenizer.eos_token_id,
    )
    # The output sequence starts with the prompt tokens. Decoding only what
    # follows avoids splitting on "Answer:", which breaks when the context or
    # the continuation contains that marker too.
    prompt_length = inputs["input_ids"].shape[1]
    answer = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return answer.strip()

In [None]:
# Smoke test: run one question through retrieval and generation.
gen(question="larnaka church", model=model, tokenizer=tokenizer, embedder=embedder)