In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

from langchain.document_loaders import PyPDFLoader
from glob import glob
from tqdm import tqdm

import yaml

## Load Documents and Build the Vector Store

In [None]:
def load_config(path="./config/config.yaml"):
    """Read the YAML configuration file and return its parsed content.

    args:
        path: location of the YAML file; defaults to ./config/config.yaml,
            resolved relative to the current working directory.
    returns: whatever yaml.safe_load produces for the file.
    raises: any error raised by open() (e.g. the file is missing) or by
        yaml.safe_load (malformed YAML).
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, "r", encoding="utf-8") as file:
        return yaml.safe_load(file)

# Module-level settings, loaded once when this cell runs. The helper functions
# below read it both inside their bodies and for their default arguments.
config = load_config()

def load_documents(directory : str):
    """Load every PDF in a directory and return the split Document chunks.

    args:
        directory: folder to scan for ``*.pdf`` files (top level only). A
            trailing separator is optional: "data/" and "data" both work.
    returns: list of the chunks produced by PyPDFLoader.load_and_split for
        each PDF, concatenated in sorted file-path order.

    Chunk size and overlap are read from config["TextSplitter"] at call time.
    """
    import os  # local import: the notebook only imports os in a later cell

    text_splitter = RecursiveCharacterTextSplitter(chunk_size = config["TextSplitter"]["chunk_size"],
                                                   chunk_overlap = config["TextSplitter"]["chunk_overlap"])

    # Join rather than concatenate, so "data" does not turn into the pattern
    # "data*.pdf". Sort because glob returns paths in arbitrary order and the
    # chunk order should be reproducible between runs.
    pdf_paths = sorted(glob(os.path.join(directory, "*.pdf")))

    documents = []
    for item_path in tqdm(pdf_paths):
        loader = PyPDFLoader(item_path)
        documents.extend(loader.load_and_split(text_splitter=text_splitter))

    return documents

def load_embeddings(model_name=config["embeddings"]["name"],
                    model_kwargs = {'device': config["embeddings"]["device"]}):
    """Create the HuggingFaceEmbeddings object used to embed documents and queries.

    args:
        model_name: model identifier passed to HuggingFaceEmbeddings; the
            default is read from config["embeddings"]["name"] when the
            function is defined.
        model_kwargs: keyword arguments forwarded as ``model_kwargs``; the
            default carries the device from config["embeddings"]["device"].
    returns: the constructed HuggingFaceEmbeddings instance.
    """
    # The default dict is created once at definition time and shared by every
    # call. Pass a shallow copy so the embeddings object never holds a
    # reference to that shared default (or to the caller's own dict).
    kwargs_copy = dict(model_kwargs) if model_kwargs is not None else model_kwargs
    return HuggingFaceEmbeddings(model_name=model_name, model_kwargs = kwargs_copy)


def load_db(embedding_function, save_path=config["faiss_indexstore"]["save_path"], index_name=config["faiss_indexstore"]["index_name"]):
    """Load a FAISS index from disk via FAISS.load_local.

    args:
        embedding_function: embeddings object handed to FAISS.load_local.
        save_path: folder holding the index; default taken from
            config["faiss_indexstore"]["save_path"] at definition time.
        index_name: name of the index inside that folder; default taken from
            config["faiss_indexstore"]["index_name"] at definition time.
    returns: the object returned by FAISS.load_local.
    """
    return FAISS.load_local(
        folder_path=save_path,
        index_name=index_name,
        embeddings=embedding_function,
    )

def save_db(db, save_path=config["faiss_indexstore"]["save_path"], index_name=config["faiss_indexstore"]["index_name"]):
    """Persist a vector store with db.save_local and report where it went.

    args:
        db: object exposing ``save_local(folder, index_name)``.
        save_path: target folder; default taken from
            config["faiss_indexstore"]["save_path"] at definition time.
        index_name: index name inside the folder; default taken from
            config["faiss_indexstore"]["index_name"] at definition time.
    """
    import os  # local import: the notebook only imports os in a later cell

    db.save_local(save_path, index_name)
    # Join the parts so the message is still a sensible path when save_path
    # has no trailing separator ("faiss_db" + "books" would read "faiss_dbbooks").
    print("Saved db to " + os.path.join(save_path, index_name))

In [None]:
# Load and split every PDF found directly under data/.
documents = load_documents("data/")

In [None]:
# Build the embedding model using the defaults taken from config.
embedding_function = load_embeddings()

In [None]:
# Build a FAISS vector store from the document chunks and the embedding model.
db = FAISS.from_documents(documents, embedding_function)

In [None]:
# Save through the helper so the index is written to the same configured
# location that load_db() reads from later in the notebook.
save_db(db)

In [None]:
# Quick sanity check: run a similarity search against the index built above.
print(db.similarity_search("5G Subscribers in Asia"))

## Build the QA Retriever

In [1]:
import sys, os
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
# Extend the module search path with a relative folder before importing
# `constants` (presumably that is where constants.py lives; confirm).
sys.path.append("../../Langchain")
from constants import openai_key

# Expose the key through the environment (presumably where ChatOpenAI looks for it; confirm).
os.environ['OPENAI_API_KEY'] = openai_key

In [None]:
# Reload the persisted index from disk (location comes from load_db's defaults).
db = load_db(embedding_function)
# Search options such as the number of retrieved chunks must be passed through
# `search_kwargs`; a `kwargs=` keyword is not a retriever search setting, so
# the k=7 request would not be applied.
qa = RetrievalQA.from_llm(llm = ChatOpenAI(temperature=0.1),
                          retriever = db.as_retriever(search_kwargs={"k": 7}),
                          return_source_documents = False)

In [None]:
# Ask the chain one question and show only the answer text.
question = "What are the main factors fuelling subscriber growth "
response = qa(question)
print(response['result'])

In [None]:
# Display the current module search path.
sys.path