In [4]:
import os
from git import Repo
from langchain.text_splitter import Language #To enable context aware splitting
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import LanguageParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationSummaryMemory
from langchain.chains import ConversationalRetrievalChain

##### Clone GitHub Repo

In [19]:
# Show the kernel's working directory; the relative paths used in this notebook resolve against it.
%pwd

'/home/avesh/Source-Code-Analysis-using-GenAI/research'

In [49]:
# Create the checkout directory in pure Python so the cell is portable and can
# be re-run safely: exist_ok=True makes this a no-op when it already exists.
os.makedirs("test_repo", exist_ok=True)

In [50]:
repo_path = "test_repo/"
repo_url = "https://github.com/entbappy/End-to-end-ML-Project-Implementation"

# Clone only when no checkout exists yet: cloning into a directory that already
# holds files fails, which would break re-running this cell (Restart & Run All).
if os.path.isdir(os.path.join(repo_path, ".git")):
    # Reuse the existing working copy instead of cloning again.
    repo = Repo(repo_path)
else:
    repo = Repo.clone_from(repo_url, to_path=repo_path)

# Display the Repo object as the cell output.
repo

<git.repo.base.Repo '/home/avesh/Source-Code-Analysis-using-GenAI/research/test_repo/.git'>

In [51]:
repo_path = "test_repo/"

# Build the source directory with os.path.join rather than string concatenation,
# which produced a doubled separator ("test_repo//src/mlProject").
source_dir = os.path.join(repo_path, "src", "mlProject")

# Load every *.py file below the package directory and parse it as Python.
# NOTE(review): per the LangChain docs, parser_threshold is the minimum number of
# lines a file needs before language-aware parsing is applied — confirm against
# the installed version.
loader = GenericLoader.from_filesystem(
    source_dir,
    glob="**/*",
    suffixes=[".py"],
    parser=LanguageParser(language=Language.PYTHON, parser_threshold=500),
)

In [52]:
# Run the loader and keep the resulting documents for the splitting step.
documents = loader.load()

#### Perform Context-Aware Splitting


##### Chunking

In [53]:
# from_language selects separators suited to the given language, so chunks tend
# to break at Python class/function boundaries rather than in mid-statement.
documents_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size=2000,
    chunk_overlap=200,
)

In [54]:
# Split the loaded documents into chunks with the splitter built in the previous cell.
texts = documents_splitter.split_documents(documents)

In [55]:
from dotenv import load_dotenv

# Load variables from a local .env file (if any), then look up the OpenAI key.
# secret_key is None when the variable is not defined.
load_dotenv()
secret_key = os.environ.get('OPENAI_API_KEY')

In [56]:
# os.environ only accepts string values, so a missing key (secret_key is None)
# would otherwise fail here with an unhelpful TypeError. Fail early with a
# message that says what to fix.
if not secret_key:
    raise EnvironmentError(
        "OPENAI_API_KEY is not set; add it to your .env file or environment."
    )
os.environ["OPENAI_API_KEY"] = secret_key

In [57]:
# NOTE(review): the empty disallowed_special presumably lets source text that
# resembles tokenizer special tokens be embedded instead of rejected — confirm
# against the OpenAIEmbeddings documentation.
embeddings = OpenAIEmbeddings(
    disallowed_special=(),
)

In [58]:
# Embed every chunk and store the vectors in a Chroma collection under ./data,
# then write the collection to disk.
# NOTE(review): re-running this cell against an existing ./data directory may add
# the same chunks again — verify, and clear the directory first if so.
vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embeddings,
    persist_directory="./data",
)
vectordb.persist()

##### RAG System

In [59]:
# Chat model with ChatOpenAI's default settings.
# To use GPT-4 instead, construct it as ChatOpenAI(model_name="gpt-4").
llm = ChatOpenAI()

In [60]:
# Conversation memory that uses the same LLM to summarise the dialogue so far;
# the summary is exposed to the chain under the "chat_history" key.
memory = ConversationSummaryMemory(
    llm=llm,
    memory_key="chat_history",
    return_messages=True,
)

In [61]:
# Retrieve 3 chunks per query using the "mmr" search type of the vector store.
retriever = vectordb.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 3},
)

# Conversational RAG chain: retrieved chunks plus summarised chat history go to the LLM.
qa = ConversationalRetrievalChain.from_llm(llm, retriever=retriever, memory=memory)

### Inference

In [62]:
# Example query about a class defined in the indexed repository.
question = "what is DataIngestion class?"

In [63]:
# Run the chain on the question; the generated reply is stored under 'answer'.
result = qa(question)
answer = result['answer']
print(answer)

The `DataIngestion` class is a class defined in the `mlProject.components.data_ingestion` module. It is used for data ingestion tasks in the ML project. In the provided code, the `DataIngestion` class is instantiated with a configuration object and contains methods to download and extract data files.
