# RAG QA Bot - OpenAI + Pinecone + Streamlit

In [55]:
!pip install openai pinecone streamlit PyPDF2 python-docx tiktoken



In [58]:
import os
import openai
from pinecone import Pinecone, ServerlessSpec
import PyPDF2
import docx
import tiktoken
import hashlib
import time
from google.colab import userdata

# --- Credentials and configuration -----------------------------------------
# Both API keys are read from the Colab `userdata` secret store.
openai.api_key = userdata.get('OPENAI_API_KEY')
PINECONE_API_KEY = userdata.get('PINECONE_API_KEY')
INDEX_NAME = 'rag-qa-index'

# --- Pinecone index bootstrap ----------------------------------------------
pc = Pinecone(api_key=PINECONE_API_KEY)
_index_missing = INDEX_NAME not in pc.list_indexes().names()
if _index_missing:
    # The index dimension has to agree with the length of the vectors
    # produced by the embedding model used elsewhere in this notebook.
    pc.create_index(
        name=INDEX_NAME,
        dimension=1536,
        metric='cosine',
        spec=ServerlessSpec(cloud='gcp', region='gcp-starter'),
    )
    # Poll once per second until the freshly created index reports ready.
    while True:
        if pc.describe_index(INDEX_NAME).status['ready']:
            break
        time.sleep(1)
index = pc.Index(INDEX_NAME)

def extract_text(file_path):
    """Return the plain text of a ``.pdf``, ``.docx`` or ``.txt`` file.

    Args:
        file_path: Path (string) of the document. The reader is selected
            from the file extension, compared case-insensitively.

    Returns:
        The document text as a single string. PDF pages and DOCX
        paragraphs are joined with newlines; ``.txt`` files are read as
        UTF-8.

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    lowered = file_path.lower()
    if lowered.endswith('.pdf'):
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # Guard against a page yielding no text (None / empty), which
            # would otherwise make str.join raise a TypeError.
            return '\n'.join([page.extract_text() or '' for page in reader.pages])
    if lowered.endswith('.docx'):
        doc = docx.Document(file_path)
        return '\n'.join([para.text for para in doc.paragraphs])
    if lowered.endswith('.txt'):
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    # Fail loudly here instead of returning None and crashing later in the
    # pipeline with an unrelated-looking error.
    raise ValueError(f"Unsupported file type: {file_path!r}")

def chunk_text(text, chunk_size=800, encoding=None):
    """Split *text* into period-delimited chunks of at most *chunk_size* tokens.

    The text is cut after every ``.`` and the resulting pieces are packed
    greedily into chunks. Pieces keep exactly the periods that were present
    in the input: no period is appended to a final piece that lacked one,
    and no extra period is produced for text that already ends with one.

    Args:
        text: The document text to split.
        chunk_size: Maximum number of tokens per chunk.
        encoding: Optional tokenizer object exposing ``encode(str)`` that
            returns a sequence of tokens. Defaults to the tiktoken encoding
            for ``gpt-3.5-turbo``.

    Returns:
        A list of whitespace-stripped chunk strings; empty when *text*
        contains nothing but whitespace.

    Note:
        A single piece that is longer than *chunk_size* tokens on its own
        is emitted as one oversized chunk; it is not split further.
    """
    if encoding is None:
        encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

    pieces = text.split('.')
    # Every piece except the last was followed by a period in the input.
    sentences = [piece + "." for piece in pieces[:-1]]
    # The trailing piece had no period after it; keep it only if it
    # carries actual content.
    if pieces[-1].strip():
        sentences.append(pieces[-1])

    chunks = []
    current_chunk = ""
    for sentence in sentences:
        potential_chunk = current_chunk + sentence
        # The whole candidate is re-tokenized because token counts of
        # concatenated strings are not guaranteed to be additive.
        if len(encoding.encode(potential_chunk)) <= chunk_size:
            current_chunk = potential_chunk
        else:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks

def get_embedding(text):
    """Embed *text* with OpenAI's ``text-embedding-3-small`` model.

    Args:
        text: The input passed to the embeddings endpoint.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    result = openai.embeddings.create(
        input=text,
        model='text-embedding-3-small',
    )
    first_item = result.data[0]
    return first_item.embedding

def store_document(chunks, doc_name):
    """Embed every chunk and upsert the vectors into the Pinecone index.

    Args:
        chunks: Sequence of text chunks belonging to one document.
        doc_name: Document name; stored in each vector's metadata and used,
            together with the chunk position, to derive the vector id.

    Returns:
        The number of chunks given (``len(chunks)``).
    """
    batch_size = 100

    # All embeddings are computed first; uploading starts afterwards.
    records = [
        {
            # MD5 of "<doc_name>_<position>" gives a stable id per chunk slot.
            'id': hashlib.md5(f"{doc_name}_{position}".encode()).hexdigest(),
            'values': get_embedding(piece),
            'metadata': {'text': piece, 'document': doc_name},
        }
        for position, piece in enumerate(chunks)
    ]

    # Upload in slices of `batch_size` vectors per upsert call.
    start = 0
    while start < len(records):
        index.upsert(vectors=records[start:start + batch_size])
        start += batch_size
    return len(chunks)

def query_docs(query, top_k=3):
    """Return the stored text of the *top_k* index matches for *query*.

    Args:
        query: The question or search string to embed and look up.
        top_k: How many matches to request from the index.

    Returns:
        A list with the ``metadata['text']`` value of each returned match,
        in the order the index returned them.
    """
    response = index.query(
        vector=get_embedding(query),
        top_k=top_k,
        include_metadata=True,
    )
    texts = []
    for hit in response['matches']:
        texts.append(hit['metadata']['text'])
    return texts

def generate_answer(query):
    """Answer *query* with ``gpt-3.5-turbo`` using retrieved chunks as context.

    The chunks returned by ``query_docs`` are joined with blank lines,
    placed in a single user message together with the question, and sent
    to the chat completions endpoint.

    Args:
        query: The user's question.

    Returns:
        The message content of the first completion choice.
    """
    context = '\n\n'.join(query_docs(query))
    prompt = f"""Context: {context}\n\nQuestion: {query}\n\nAnswer based on the context:"""
    user_message = {'role': 'user', 'content': prompt}
    completion = openai.chat.completions.create(
        model='gpt-3.5-turbo',
        messages=[user_message],
        max_tokens=500,
        temperature=0.7,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [41]:
def ask_rag_bot(query):
    """Return the answer for *query* by delegating to ``generate_answer``."""
    return generate_answer(query)

In [59]:
def main():
    """Ingest the sample document, then ask one demo question.

    The document is extracted, chunked and stored; any exception during
    that phase is printed and ends the run. Afterwards a fixed question is
    sent to the bot and the answer (or the error) is printed.
    """
    sample_doc_path = 'sample_document.txt'
    print(f"Processing document: {sample_doc_path}")

    # Phase 1: ingestion. A failure here means there is nothing to query.
    try:
        raw_text = extract_text(sample_doc_path)
        pieces = chunk_text(raw_text)
        stored = store_document(pieces, os.path.basename(sample_doc_path))
        print(f"Processed and stored {stored} chunks from {sample_doc_path}")
    except Exception as e:
        print(f"Error processing document: {e}")
        return

    print("-" * 30)

    # Phase 2: question answering.
    query = "What are the benefits of a RAG system?"
    print(f"Asking question: {query}")
    try:
        reply = ask_rag_bot(query)
        print(f"Answer: {reply}")
    except Exception as e:
        print(f"Error querying RAG bot: {e}")


main()

Processing document: sample_document.txt
Processed and stored 1 chunks from sample_document.txt
------------------------------
Asking question: What are the benefits of a RAG system?
Answer: A RAG (Red, Amber, Green) system is a visual management tool used to quickly assess and communicate the status of different tasks or projects. The benefits of a RAG system include improved transparency, increased accountability, better decision-making, and enhanced communication among team members. It allows for quick identification of areas that need attention or intervention, helping teams stay on track and achieve their goals more effectively.
