In [None]:
!pip install -qU langchain_community faiss-cpu langchain_huggingface

In [None]:
import faiss
import numpy as np
from langchain_community.vectorstores import FAISS # semantic search
from langchain_community.docstore.in_memory import InMemoryDocstore # dict doc store
from langchain_huggingface.embeddings import HuggingFaceEmbeddings # embedding model
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from datasets import Dataset

In [None]:
class LangChainRAGPipeline:
  """Minimal retrieval-augmented generation (RAG) pipeline over a text file.

  On construction the pipeline:
    1. reads ``file_path`` and splits it into paragraph chunks (``file_read``),
    2. embeds every chunk and stores the vectors in a FAISS index
       (``build_index``),
    3. loads the causal language model and its tokenizer (``import_model``).

  ``gen`` then answers a question by retrieving the nearest chunks and
  prompting the model with them as context.

  Attributes:
    model_name: Hugging Face model id passed to ``from_pretrained``.
    file_path: Path of the text file used as the knowledge base.
    model / tokenizer: Loaded by ``import_model``.
    dataset: ``datasets.Dataset`` with a single ``"text"`` column of chunks.
    vectorstore: LangChain ``FAISS`` store holding chunks and their vectors.
    index: The raw faiss index owned by ``vectorstore``; row ``i`` of the
      index corresponds to ``dataset["text"][i]``.
    embedder: The single ``HuggingFaceEmbeddings`` instance shared by
      indexing and querying.
  """

  def __init__(self, model_name, file_path):
    """Build the whole pipeline (file -> index -> model).

    Args:
      model_name: Hugging Face model id of the causal LM to load.
      file_path: Path to the plain-text knowledge base.
    """
    self.model_name = model_name
    self.file_path = file_path

    self.model = None
    self.tokenizer = None
    self.dataset = None
    self.index = None
    self.vectorstore = None
    # One embedder for both indexing and querying, so the embedding model is
    # loaded once and documents/queries live in the same vector space.
    self.embedder = HuggingFaceEmbeddings()

    self.file_read()
    self.build_index()
    self.import_model()

  @staticmethod
  def _split_paragraphs(raw_text):
    """Split text on blank lines and drop chunks that are empty/whitespace."""
    return [chunk.strip() for chunk in raw_text.split("\n\n") if chunk.strip()]

  def file_read(self):
    """Load ``self.file_path`` into ``self.dataset`` as paragraph chunks.

    Raises:
      ValueError: If the file contains no non-blank paragraph.
    """
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(self.file_path, 'r', encoding="utf-8") as file:
      data = file.read()

    chunks = self._split_paragraphs(data)
    if not chunks:
      raise ValueError(f"No text chunks found in {self.file_path!r}")
    self.dataset = Dataset.from_dict({"text": chunks})

  def build_index(self):
    """Embed every chunk and build the FAISS vector store / index.

    Raises:
      ValueError: If the dataset has no chunks to index.
    """
    texts = list(self.dataset["text"])
    if not texts:
      raise ValueError("Cannot build an index from an empty dataset")

    # Batch-embed all chunks in one call instead of one call per chunk.
    vectors = self.embedder.embed_documents(texts)

    # from_embeddings builds the faiss index (flat L2 by default) AND fills the
    # docstore / id mapping, so the vector store stays consistent with the
    # index. Vectors are added in order, so index row i == texts[i].
    self.vectorstore = FAISS.from_embeddings(
      text_embeddings=list(zip(texts, vectors)),
      embedding=self.embedder,
    )
    self.index = self.vectorstore.index

  def import_model(self):
    """Load the causal LM and tokenizer named by ``self.model_name``."""
    self.model = AutoModelForCausalLM.from_pretrained(
      self.model_name,
      torch_dtype=torch.float16,  # half precision to reduce memory use
      device_map="auto",
    )
    self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
    # Only fall back to EOS when the tokenizer really has no pad token, so a
    # model-specific pad token is not overwritten.
    if self.tokenizer.pad_token is None:
      self.tokenizer.pad_token = self.tokenizer.eos_token

  def get_model_and_tokenizer(self):
    """Return the loaded ``(model, tokenizer)`` pair."""
    return self.model, self.tokenizer

  def gen(self, question, top_k=3, max_length=500):
    """Answer ``question`` using the ``top_k`` most similar chunks as context.

    Args:
      question: The user question.
      top_k: Number of chunks to retrieve; clamped to the corpus size.
      max_length: Passed to ``generate`` as ``max_length`` (in transformers
        this bounds prompt + generated tokens together).

    Returns:
      The generated answer text, stripped of surrounding whitespace.
    """
    # Read the text column once instead of once per retrieved hit.
    texts = self.dataset["text"]

    # Never ask faiss for more neighbours than there are chunks.
    k = min(top_k, len(texts))
    retrieved_texts = []
    if k > 0:
      query = np.asarray([self.embedder.embed_query(question)], dtype="float32")
      _, neighbours = self.index.search(query, k)
      # faiss pads missing results with -1; skip those so a negative index
      # cannot silently select the last chunk.
      retrieved_texts = [texts[int(i)] for i in neighbours[0] if i >= 0]
    context = "\n".join(retrieved_texts)

    prompt = f"Context:{context}, Question: {question}\nAnswer:"
    inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

    # Greedy decoding. Sampling knobs (top_p / temperature) are intentionally
    # omitted: they are ignored when do_sample=False and only trigger warnings.
    outputs = self.model.generate(
      **inputs,
      max_length=max_length,
      num_return_sequences=1,
      do_sample=False,
      pad_token_id=self.tokenizer.eos_token_id,
    )

    # Decode only the newly generated tokens. Splitting the full text on
    # "Answer:" breaks when the context or the generation contains that marker.
    prompt_len = inputs["input_ids"].shape[1]
    answer = self.tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    return answer.strip()

In [None]:
# Candidate Hugging Face checkpoints; pick one by index below.
models = [
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    "mistralai/Mistral-7B-Instruct-v0.3",
    "meta-llama/Llama-3.1-8B-Instruct",
    "microsoft/phi-2",
]
# Plain-text knowledge base that the pipeline splits into chunks and indexes.
files = "./church_text"

# Constructing the pipeline reads the file, builds the index and loads the model.
pipeline = LangChainRAGPipeline(model_name=models[0], file_path=files)

# Expose the raw model/tokenizer as notebook variables for ad-hoc use.
model, tokenizer = pipeline.get_model_and_tokenizer()

# Ask a single question against the indexed text and show the answer.
response = pipeline.gen("What can you tell me about Larnaka church?")
print(response)