In [74]:
from google.colab import userdata
import os

In [80]:
!pip install python-dotenv



In [85]:
from dotenv import load_dotenv

openai.api_key = os.getenv('OPENAI_API_KEY')

In [86]:
load_dotenv()

True

In [77]:
pinecode_api_key = userdata.get('pinecode_api_key')
pinecode_env = "us-east-1-aws"


In [47]:
!pip install openai pinecone-client chromadb langchain-community pypdf tiktoken



In [48]:
import langchain, openai, pinecone

In [49]:
embed_model = "text-embedding-ada-002"

In [50]:
from langchain_community.document_loaders import PyPDFLoader, UnstructuredPDFLoader,OnlinePDFLoader, TextLoader,DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.embeddings.openai import OpenAIEmbeddings

from langchain.vectorstores import Pinecone,Chroma

from langchain.chains.question_answering import load_qa_chain
from langchain.chains import RetrievalQA

from pinecone import Pinecone,Index,ServerlessSpec
from langchain.llms import OpenAI

In [51]:
### Import the document
loader = DirectoryLoader('/content/', glob="*.pdf", loader_cls=PyPDFLoader,show_progress=True)
documents = loader.load()

100%|██████████| 1/1 [00:12<00:00, 12.84s/it]


In [52]:
# Split the Text
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

In [53]:
type(texts)

list

In [54]:
# texts

In [55]:
len(texts)

762

In [56]:
texts[0]

Document(metadata={'source': '/content/MLBasicsBook.pdf', 'page': 0}, page_content='Machine Learning: The Basics\nAlexander Jung, August 27, 2023\nplease cite as:\nA. Jung,“Machine Learning: The Basics,” Springer, Singapore, 2022\nobservations\ndatahypothesis\nmake prediction validate/adapt\nloss\ninferencemodel\nFigure 1: Machine learning combines three main components: model, data and loss. Machine\nlearning methods implement the scientiﬁc principle of “trial and error”. These methods\ncontinuously validate and reﬁne a model based on the loss incurred by its predictions about\na phenomenon that generates data.\n1')

In [57]:
texts[1].metadata["source"]

'/content/MLBasicsBook.pdf'

Tried with Pinecone

In [58]:
# Pinecone and Opeanai Embedding setup
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_KEY,model=embed_model)
index_name="rag-index"
#initialize pinecone client
pc=Pinecone(api_key=pinecode_api_key)

if index_name not in pc.list_indexes().names():
  pc.create_index(
      name=index_name,
      dimension=1536,
      metric="cosine",
      spec=ServerlessSpec(cloud="aws",region="us-east-1"),
  )

In [59]:
#set up of Index

index = pc.Index(index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [60]:
# !pip install --upgrade langchain-pinecone

In [61]:
# from langchain.vectorstores import Pinecone as LangChainPinecone

# vector_store = LangChainPinecone.from_documents(texts, embeddings, index_name="rag-index")

Try with chroma db

In [62]:
persist_directory="db"

#define vector db and store the vectores
vectordb = Chroma.from_documents(documents=texts,
                                 embedding=embeddings,
                                 persist_directory=persist_directory)

In [63]:
# persist data
vectordb.persist()
vectordb = None

In [64]:
# Now we can load the persisted database from disk, and use it as normal.
vectordb = Chroma(persist_directory=persist_directory,
                  embedding_function=embeddings)

**Make a Retrival**

In [65]:
retriever=vectordb.as_retriever()

In [66]:
docs = retriever.get_relevant_documents("Explain Deep learninig?")

In [67]:
len(docs)

4

In [68]:
retriever = vectordb.as_retriever(search_kwargs={"k": 2})

In [69]:
retriever.search_type

'similarity'

In [70]:
retriever.search_kwargs

{'k': 2}

Make llm chain for generate Responce

In [89]:
from langchain.chat_models import ChatOpenAI
# Initialize the language model
llm = ChatOpenAI(temperature=0)

# Create the QA chain using 'from_chain_type'
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True
)


  warn_deprecated(


In [90]:
def process_llm_response(llm_response):
    print(llm_response['result'])
    print('\n\nSources:')
    for source in llm_response["source_documents"]:
        print(source.metadata['source'])

In [91]:
# full example
query = "What is machine learning?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

  warn_deprecated(


Machine learning is portrayed as the combination of three basic components: data, model, and loss. It involves using computational methods to continuously adapt a hypothesis about a phenomenon that generates data. By using a hypothesis map, machine learning computes predictions of a quantity of interest based on the features of a data point.


Sources:
/content/MLBasicsBook.pdf
/content/MLBasicsBook.pdf


In [92]:
query = "What is deep learning learning?"
llm_response = qa_chain(query)
llm_response

{'query': 'What is deep learning learning?',
 'result': 'Deep learning is a machine learning method that uses cloud computing frameworks to train large models on large datasets. It operates on a much finer granularity for data and computation compared to linear regression. Deep learning involves iteratively updating a model based on the discrepancy between model predictions and actual observed data, measured via a loss function.',
 'source_documents': [Document(metadata={'page': 1, 'source': '/content/MLBasicsBook.pdf'}, page_content='ML methods are deep learning and linear regression.\nDeep learning methods use cloud computing frameworks to train large models on large\ndatasets. Operating on a much ﬁner granularity for data and computation, linear regression\ncan typically be implemented on small embedded systems. Nevertheless, deep learning meth-\nods and linear regression use the very same principle of iteratively updating a model based\non the discrepancy between model predictions 

In [94]:
query = "Which are the metrics used in machine learning?"
llm_response = qa_chain(query)
llm_response["result"]

'The metrics commonly used in machine learning include accuracy, precision, recall, F1 score, ROC-AUC, and mean squared error (MSE).'