### RAG
Implementation of Retrieval-Augmented Generation (RAG) using LangChain and Large Language Models (LLMs)

In [1]:
# pip install -r ./requirements.txt -q

In [2]:
# pip install langchain-community langchain-core -q

In [3]:
pip show langchain

Name: langchain
Version: 0.3.4
Summary: Building applications with LLMs through composability
Home-page: https://github.com/langchain-ai/langchain
Author: 
Author-email: 
License: MIT
Location: C:\Users\grzeg\AppData\Local\Programs\Python\Python311\Lib\site-packages
Requires: aiohttp, langchain-core, langchain-text-splitters, langsmith, numpy, pydantic, PyYAML, requests, SQLAlchemy, tenacity
Required-by: langchain-community
Note: you may need to restart the kernel to use updated packages.


In [4]:
import os
from dotenv import load_dotenv, find_dotenv
# Locate the nearest .env file and load its variables into the process environment.
# override=True lets values from .env replace variables that are already set.
# NOTE(review): presumably this supplies the OpenAI and Pinecone API keys used by the
# cells below — confirm against the project's .env template.
load_dotenv(find_dotenv(), override=True)

True

In [5]:
def load_documents(files):
    """Load every supported file in ``files`` and return the combined documents.

    Supported formats are PDF (``.pdf``) and Word (``.docx``); the extension is
    matched case-insensitively. Files with any other extension are reported on
    stdout and skipped.

    Args:
        files: Iterable of file paths to load.

    Returns:
        A flat list holding, in input order, everything each loader's ``load()``
        returned. Empty when ``files`` is empty or nothing is supported.
    """
    import os
    # Import from langchain_community: ``langchain.document_loaders`` is only a
    # deprecated alias for the same classes.
    from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader

    loaders_by_extension = {
        '.pdf': PyPDFLoader,
        '.docx': Docx2txtLoader,
    }

    all_docs = []  # List to hold all loaded documents

    for file in files:
        # Lower-case the extension so names such as 'REPORT.PDF' are accepted.
        extension = os.path.splitext(file)[1].lower()
        loader_cls = loaders_by_extension.get(extension)
        if loader_cls is None:
            print(f'Document format for {file} is not supported')
            continue  # Skip unsupported files

        print(f'Loading {file}')
        all_docs.extend(loader_cls(file).load())

    return all_docs

# List of files to load.
# The paths are relative, so they resolve against the notebook's working directory.
# NOTE(review): the last file name contains a space after "Rohs_directive_" —
# presumably this matches the file on disk; confirm before "fixing" it.
file_list = [
    "files/eu_ecodesign_directive_2019_2020.pdf",
    "files/eu_energy_labeling_directive_2017_1369.pdf",
    "files/eu_low_voltage_directive_2014_35_EU.pdf",
    "files/Rohs_directive_ 2011_65_EU.pdf"
]



In [6]:
def chunk_data(data, chunk_size=256, chunk_overlap=0):
    """Split loaded documents into smaller chunks suitable for embedding.

    Args:
        data: Documents to split (e.g. the list returned by ``load_documents``).
        chunk_size: Maximum size of each chunk, passed to the splitter unchanged.
        chunk_overlap: Amount of content shared between neighbouring chunks,
            passed to the splitter unchanged. Defaults to 0 (no overlap), which
            is what this function always used before the parameter existed.

    Returns:
        The list of chunks produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(data)
    

In [7]:
def embedding_cost(texts, model='text-embedding-ada-002', price_per_1k_tokens=0.0004):
    """Print the token count of ``texts`` and an estimated embedding cost in USD.

    The defaults reproduce the values this function previously hard-coded.
    ``insert_embeddings`` in this notebook embeds with ``text-embedding-3-small``,
    so pass that model name and its current price for a matching estimate.
    NOTE(review): confirm the per-1K-token price against the provider's price
    list — it is not derived from anything in this notebook.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string.
        model: Model name handed to ``tiktoken.encoding_for_model`` to pick the
            tokenizer.
        price_per_1k_tokens: Price in USD per 1000 tokens.

    Returns:
        None. The two result lines are written to stdout.
    """
    import tiktoken

    enc = tiktoken.encoding_for_model(model)
    # Generator expression: no need to build an intermediate list just to sum it.
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    print(f'Total tokens: {total_tokens}')
    print(f' Embedding cost in USD: {total_tokens / 1000 * price_per_1k_tokens:.6f}')

In [23]:
def insert_embeddings(index, chunks):
    """Return a Pinecone-backed vector store for ``index``, creating it if needed.

    If an index with this name already exists, its stored embeddings are reused
    and ``chunks`` is not read. Otherwise the index is created and ``chunks``
    are embedded into it.

    Args:
        index: Name of the Pinecone index to load or create.
        chunks: Documents to embed when the index has to be created.

    Returns:
        The vector store, in both the "already exists" and the "created" case.
    """
    import pinecone
    from langchain_community.vectorstores import Pinecone
    from langchain_openai import OpenAIEmbeddings
    from pinecone import PodSpec

    # Always work from the argument; never rely on a notebook-level global that
    # happens to be called ``index_name``.
    index_name = index
    # One value shared by the embedding model and the index so they cannot drift apart.
    dimension = 1536

    pc = pinecone.Pinecone()
    embeddings = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=dimension)

    if index_name in pc.list_indexes().names():
        print(f' Index {index_name} alredy exist. Loading embeddings...', end='')
        vector_store = Pinecone.from_existing_index(index_name, embeddings)
        print('Ok')
    else:
        print(f'Creating index {index_name} and embeddings ...', end='')
        pc.create_index(
            name=index_name,
            dimension=dimension,
            metric='cosine',
            spec=PodSpec(environment='gcp-starter')
        )
        vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)
        print("Ok")

    # Returned after the if/else so re-running against an existing index does
    # not hand back None.
    return vector_store
            

In [9]:
def delete_pinecone_index(index_name='all'):
    """Delete one Pinecone index, or every index when ``index_name`` is 'all'.

    Args:
        index_name: Name of the index to delete. The default value 'all' is a
            sentinel meaning "delete every index listed by the client", so an
            index literally named 'all' cannot be deleted on its own here.

    Returns:
        None. Progress is written to stdout.
    """
    import pinecone

    # Module name spelled correctly; the misspelled form raised NameError on every call.
    pc = pinecone.Pinecone()
    if index_name == 'all':
        indexes = pc.list_indexes().names()
        print("Deleting all indexes", end='')
        for index in indexes:
            pc.delete_index(index)
        print('OK')
    else:
        print(f"Deleting index {index_name}", end='')
        pc.delete_index(index_name)
        print("Ok")

### Q&A

In [31]:
def ask_and_get_response(vector_store, q, k=3):
    """Answer question ``q`` using documents retrieved from ``vector_store``.

    Args:
        vector_store: Vector store exposing ``as_retriever`` (for example the
            value returned by ``insert_embeddings``).
        q: The question text.
        k: Number of similar chunks to retrieve and pass to the LLM.
            Defaults to 3, the value previously hard-coded.

    Returns:
        Whatever ``chain.invoke(q)`` returns; in this notebook's recorded
        output that is a dict with the keys 'query' and 'result'.

    Raises:
        ValueError: If ``vector_store`` is None, which would otherwise surface
            as an opaque AttributeError on ``as_retriever``.
    """
    if vector_store is None:
        raise ValueError('vector_store is None; create or load the index first')

    from langchain.chains import RetrievalQA
    # Same package the embeddings already come from; ``langchain.chat_models``
    # is only a deprecated alias for this class.
    from langchain_openai import ChatOpenAI

    llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)

    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': k})

    chain = RetrievalQA.from_chain_type(llm=llm, chain_type='stuff', retriever=retriever)
    return chain.invoke(q)

### Testing code

In [10]:
# Load every file named in file_list into one flat list; the message below
# reports the length of that list as the number of pages.
data = load_documents(file_list)

print(f'You have {len(data)} pages loaded in total.')

Loading files/eu_ecodesign_directive_2019_2020.pdf
Loading files/eu_energy_labeling_directive_2017_1369.pdf
Loading files/eu_low_voltage_directive_2014_35_EU.pdf
Loading files/Rohs_directive_ 2011_65_EU.pdf
You have 96 pages loaded in total.


In [11]:
# Split the loaded documents with chunk_data's default chunk size and show how
# many chunks were produced.
chunks = chunk_data(data)
print(len(chunks))

1507


In [12]:
# embedding_cost prints its own report and returns nothing, so call it directly;
# wrapping it in print() only appends a stray "None" line to the output.
embedding_cost(chunks)

Total tokens: 74246
 Embedding cost in USD: 0.029698
None


In [24]:
# Name of the Pinecone index used for this notebook's embeddings.
index_name = 'askadokument'
# Create the index from the chunks, or load it if it already exists; the result
# is the vector store queried by the Q&A cells.
vector_store = insert_embeddings(index_name, chunks)


Creating index askadokument and embeddings ...Ok


In [32]:
# One-off smoke test of the retrieval chain. The printed value is the raw return
# of ask_and_get_response (a dict with 'query' and 'result' in the output below).
q = "what is the whole document about"
answer = ask_and_get_response(vector_store, q)
print(answer)

{'query': 'what is the whole document about', 'result': 'The provided context seems to be excerpts from a directive or regulation related to technical documentation and compliance requirements for products in the European Union. The document likely outlines specific criteria and procedures that manufacturers need to follow to demonstrate compliance with EU regulations, including the mandatory parts of technical documentation that must be entered into a database. It appears to cover aspects such as product identification, technical specifications, and compliance verification processes.'}


In [None]:
import time

# Interactive question loop: keeps asking until the user enters 'q' or 'e'.
# Each question is answered independently; no conversation history is passed on.
i = 1
print("Write q or e for exit")
while True:
    # Strip surrounding whitespace so ' q ' still exits and '   ' counts as blank.
    q = input(f'Question #{i}').strip()
    if q.lower() in ['q', 'e']:
        print("Exiting...")
        time.sleep(2)
        break
    if not q:
        # Blank input: prompt again instead of spending an API call on an empty
        # query, and leave the question counter untouched.
        continue
    i += 1
    answer = ask_and_get_response(vector_store, q)
    print(f'\nAnswer: {answer}')
    print(f'\n {"-" * 50} \n')
        
    
    

Write q or e for exit


Question #1 is there an artcle about: Procedure for dealing with electrical equipment presenting a risk at national level?



Answer: {'query': 'is there an artcle about: Procedure for dealing with electrical equipment presenting a risk at national level?', 'result': 'Yes, the excerpt provided is from Article 19 of a directive that discusses the procedure for dealing with electrical equipment presenting a risk at the national level.'}

 -------------------------------------------------- 



Question #2 Can you explain it?



Answer: {'query': 'Can you explain it?', 'result': "I don't have enough context to provide a specific explanation. It seems to be referring to regulations related to electrical equipment and exemptions within the European Union. If you have a specific question or need clarification on a particular point, feel free to ask."}

 -------------------------------------------------- 

