In [1]:
import os
os.chdir("/home/dephinate/ASU/DL/MisterRetriveRite/")
! pwd

/home/dephinate/ASU/DL/MisterRetriveRite


Install Modules

In [2]:
# ! pip install python-box==6.0.2
# ! pip install ensure==1.0.2
# ! pip install -e .


In [3]:
from misterRetriveRite.config.configurations import ConfigurationManager


In [4]:
# Instantiate the project's configuration manager; the recorded log output
# shows this also creates the `artifacts` directory.
config = ConfigurationManager()

[2024-03-05 13:20:45,601,INFO,common,created directory at: artifacts]


In [5]:
# Show the model and vectorization settings exposed by the configuration manager.
print(config.get_model_config())
print(config.get_vectorization_config())

ModelConfig(model_name='gpt-3.5-turbo-instruct', temperature=0.9, max_tokens=500, chunk_size=2000, chunk_overlap=100)
[2024-03-05 13:20:46,235,INFO,common,created directory at: artifacts/vector_db]
VectorizationConfig(root_dir='artifacts/vector_db', encoder_name='all-mpnet-base-v2', model_ckpt='None', data_path='None', k='None', num_of_cells='None', nprobe='None')


Data Loader

In [17]:
from langchain.document_loaders import UnstructuredURLLoader
from misterRetriveRite.logging import logger
class DataLoader():
    """Thin wrapper around ``UnstructuredURLLoader`` for loading web pages."""

    def __init__(self) -> None:
        """Create a loader; no instance state is kept."""

    def load_from_url(self,urls: list):
        """Load the content behind each URL and log which URLs were read.

        Args:
            urls: URLs handed to ``UnstructuredURLLoader``.

        Returns:
            Whatever ``UnstructuredURLLoader(urls=urls).load()`` returns.
        """
        documents = UnstructuredURLLoader(urls=urls).load()
        logger.info(f"Data loaded from : {urls}")
        return documents

Data Splitter

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
class Splitter():
    """Chunk already-loaded documents with ``RecursiveCharacterTextSplitter``."""

    def __init__(self,data) -> None:
        """Keep the documents that later split calls operate on."""
        self.data = data

    def split_recursive(self, chunk_size:int,chunk_overlap:int,sperators:list[str]):
        """Split the stored documents recursively on the given separators.

        Args:
            chunk_size: Passed through as the splitter's ``chunk_size``.
            chunk_overlap: Passed through as the splitter's ``chunk_overlap``.
            sperators: Separator strings passed through as ``separators``
                (the parameter name is kept as spelled for existing callers).

        Returns:
            The result of ``split_documents`` applied to ``self.data``.
        """
        splitter_options = {
            "separators": sperators,
            "chunk_size": chunk_size,
            "chunk_overlap": chunk_overlap,
        }
        text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
        return text_splitter.split_documents(self.data)

Vectorization

In [35]:
import os
from typing import Optional

from langchain.vectorstores import FAISS
from misterRetriveRite.utils.common import pickel_dump

class Vectorizer():
    """Build FAISS vector indexes over a fixed set of document splits.

    Each ``build_*`` method embeds ``data_splits`` with a different embedding
    backend, wraps the result in a FAISS vector store and can optionally
    persist that store through the project's ``pickel_dump`` helper.
    """

    def __init__(self,data_splits) -> None:
        """Remember the document chunks every index is built from."""
        self.data_splits = data_splits

    @staticmethod
    def _resolve_save_path(save_to_local: bool,
                           file_dir: Optional[str],
                           file_name: Optional[str]) -> Optional[str]:
        """Return the destination path, or ``None`` when saving is disabled.

        Raises:
            ValueError: If saving is requested but ``file_dir`` or
                ``file_name`` is missing.
        """
        if not save_to_local:
            return None
        if file_dir is None or file_name is None:
            raise ValueError(
                "file_dir and file_name are required when save_to_local is True"
            )
        return os.path.join(file_dir, file_name)

    def _index_documents(self, embeddings, save_path: Optional[str]):
        """Embed the stored splits into a FAISS store and optionally persist it."""
        vectorindex = FAISS.from_documents(self.data_splits, embeddings)
        if save_path is not None:
            # NOTE(review): whether a FAISS vector store can be pickled depends
            # on the installed langchain/faiss versions; `FAISS.save_local` is
            # the library's own persistence API - consider it if the consumer
            # of this file can change.
            pickel_dump(file_path=save_path, data=vectorindex)
        return vectorindex

    def build_vectorindex_with_faiss_and_openai(self, save_to_local: bool,
                                                file_dir: Optional[str] = None,
                                                file_name: Optional[str] = None):
        """Build a FAISS index using ``OpenAIEmbeddings``.

        Args:
            save_to_local: When truthy, persist the index to
                ``os.path.join(file_dir, file_name)``.
            file_dir: Target directory; only needed when saving.
            file_name: Target file name; only needed when saving.

        Returns:
            The vector store returned by ``FAISS.from_documents``.

        Raises:
            ValueError: If saving is requested without ``file_dir`` and
                ``file_name``.
        """
        # Validate the destination first so a bad call fails before any
        # embedding work is done.
        save_path = self._resolve_save_path(save_to_local, file_dir, file_name)
        # Imported lazily so the OpenAI backend is only required when used.
        from langchain.embeddings import OpenAIEmbeddings
        return self._index_documents(OpenAIEmbeddings(), save_path)

    def build_vectorindex_with_faiss_and_huggingface(self, model_name: str,
                                                     save_to_local: bool,
                                                     file_dir: Optional[str] = None,
                                                     file_name: Optional[str] = None):
        """Build a FAISS index using a HuggingFace embedding model.

        Args:
            model_name: Encoder name passed to ``HuggingFaceBgeEmbeddings``.
            save_to_local: When truthy, persist the index to
                ``os.path.join(file_dir, file_name)``.
            file_dir: Target directory; only needed when saving.
            file_name: Target file name; only needed when saving.

        Returns:
            The vector store returned by ``FAISS.from_documents``.

        Raises:
            ValueError: If saving is requested without ``file_dir`` and
                ``file_name``.
        """
        # Validate the destination first so a bad call fails before the model
        # is loaded and the documents are embedded.
        save_path = self._resolve_save_path(save_to_local, file_dir, file_name)
        # Imported lazily so the HuggingFace backend is only required when used.
        from langchain.embeddings.huggingface import HuggingFaceBgeEmbeddings
        # NOTE(review): HuggingFaceBgeEmbeddings targets BGE models and, as far
        # as the langchain docs describe, prepends a query instruction at query
        # time; for non-BGE encoders such as 'all-mpnet-base-v2' the plain
        # HuggingFaceEmbeddings class may be the better fit - confirm.
        embeddings = HuggingFaceBgeEmbeddings(model_name = model_name)
        return self._index_documents(embeddings, save_path)




Test    
* load
* split
* vectorize
* retrieve and prompt

Load

In [11]:
# Sample article URLs used to exercise the load -> split -> vectorize pipeline.
urls = ["https://www.muscleandfitness.com/features/feature-news/joey-swoll-hilariously-surprises-girl-as-she-films-herself-flexing/",
    "https://www.muscleandfitness.com/flexonline/flex-news/lessons-being-learned-on-bodybuildings-best-podcasts/"
]
# Sanity check: `load_from_url` annotates its `urls` argument as a list.
type(urls)

list

In [16]:
# Load both pages through DataLoader, which wraps UnstructuredURLLoader.
data_loader = DataLoader()
data = data_loader.load_from_url(urls=urls)

[2024-03-05 14:14:24,248,INFO,xml,Reading document from string ...]
[2024-03-05 14:14:24,252,INFO,html,Reading document ...]
[2024-03-05 14:14:24,902,INFO,xml,Reading document from string ...]
[2024-03-05 14:14:24,906,INFO,html,Reading document ...]
[2024-03-05 14:14:24,918,INFO,3548886393,Loading data from : ['https://www.muscleandfitness.com/features/feature-news/joey-swoll-hilariously-surprises-girl-as-she-films-herself-flexing/', 'https://www.muscleandfitness.com/flexonline/flex-news/lessons-being-learned-on-bodybuildings-best-podcasts/']]


Split

In [25]:
# Split the loaded documents into chunks, trying paragraph, line, sentence and
# comma boundaries in that order.
# NOTE(review): chunk_size=1000 / chunk_overlap=50 differ from the ModelConfig
# values printed earlier (chunk_size=2000, chunk_overlap=100) - confirm which
# settings are intended.
splitter = Splitter(data=data)
splits_rec = splitter.split_recursive(chunk_size=1000,chunk_overlap=50,sperators=['\n\n', '\n', '.', ','])

Vectorize

In [30]:
# Repeats the import and ConfigurationManager setup from the top of the
# notebook (presumably so this section can be run on its own), then fetches
# the vectorization settings.
from misterRetriveRite.config.configurations import ConfigurationManager
config = ConfigurationManager()
vectorization_config = config.get_vectorization_config()
# Bare expression: display the config as the cell output.
vectorization_config

[2024-03-05 14:29:32,696,INFO,common,created directory at: artifacts]
[2024-03-05 14:29:32,697,INFO,common,created directory at: artifacts/vector_db]


VectorizationConfig(root_dir='artifacts/vector_db', encoder_name='all-mpnet-base-v2', model_ckpt='None', data_path='None', k='None', num_of_cells='None', nprobe='None')

In [41]:
# Embed the recursive splits with the configured encoder and store the FAISS
# index under the vectorization root directory.
vectorizer = Vectorizer(data_splits=splits_rec)
vector_index_huggingface = vectorizer.build_vectorindex_with_faiss_and_huggingface(
    model_name=vectorization_config.encoder_name,
    save_to_local=True,
    file_dir=vectorization_config.root_dir,
    file_name="faiss_Store_huggingface.pkl",
)

[2024-03-05 15:17:05,036,INFO,SentenceTransformer,Load pretrained SentenceTransformer: all-mpnet-base-v2]
[2024-03-05 15:17:07,303,INFO,SentenceTransformer,Use pytorch device_name: cpu]


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[2024-03-05 15:17:15,027,INFO,common,created file at : artifacts/vector_db/faiss_Store_huggingface.pkl]


Prompt

In [43]:
# Load environment variables from the local `.env` file; the cell output is
# load_dotenv's return value. Presumably this supplies the OpenAI API key used
# further down - confirm against the .env contents.
from dotenv import load_dotenv
load_dotenv(dotenv_path=".env")

True

In [3]:
# Same step through the project's own helper.
# NOTE(review): it is called with an empty string here - confirm which path
# `load_env("")` resolves to.
from misterRetriveRite.utils.common import load_env
load_env("")


True

In [6]:
# Build a retrieval-QA-with-sources chain over the HuggingFace-embedded FAISS index.
# NOTE(review): the recorded run of this cell failed with
# "NameError: name 'vector_index_huggingface' is not defined" - the index must
# be built (or reloaded from disk) in the current kernel before this cell runs.
from langchain import OpenAI
from langchain.chains import RetrievalQAWithSourcesChain

# NOTE(review): these literals repeat the ModelConfig values printed by the
# config cell (temperature=0.9, max_tokens=500, 'gpt-3.5-turbo-instruct');
# consider reading them from the config instead.
llm = OpenAI(temperature=0.9, max_tokens=500, model='gpt-3.5-turbo-instruct' ) 
chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=vector_index_huggingface.as_retriever())
# Bare expression: display the chain as the cell output.
chain

NameError: name 'vector_index_huggingface' is not defined

In [2]:
from misterRetriveRite.utils.common import *

In [None]:
# NOTE(review): unexecuted scratch call with no arguments; the other call sites
# in this notebook pass file_path= and data=. Complete or remove before running.
pickel_dump()