# Import Libraries

In [17]:
import os
import PyPDF2
import openai
import uuid

import warnings
warnings.filterwarnings('ignore')

import pinecone
import tiktoken

from itertools import chain
from tqdm.auto import tqdm
from pinecone import Pinecone
from pinecone import ServerlessSpec
from Credentials import OPENAI_API_KEY, PINECONE_API_KEY

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone as lang_pinecone
from langchain.embeddings import OpenAIEmbeddings

# Load Data

In [2]:
os.chdir('..')
curr_dir = (os.getcwd()).replace('\\', '/')
file_path = curr_dir + '/Dataset/Corrective RAG.pdf'

In [3]:
tokenizer = tiktoken.get_encoding('p50k_base')

# create the length function
def tiktoken_len(text):
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)

In [4]:
def read_pdf(document):
       
    loader = PyPDFLoader(file_path)
    pdf_text = loader.load()
    pdf_text[0].page_content = pdf_text[0].page_content.replace('\n', '')

    return pdf_text    
        
def pdf_to_chunks(text, 
                  chunk_size, 
                  overlap):
    
    split_text = RecursiveCharacterTextSplitter(chunk_size=chunk_size, 
                                                chunk_overlap=overlap, 
                                                length_function=tiktoken_len)
    docs = split_text.split_text(text[0].page_content)
    
    chunks = [str(doc) for doc in docs]
        
    return chunks

In [5]:
text = read_pdf(file_path)

In [18]:
text

[Document(page_content='Introduction to RAG  Retrieval -augmented generation (RAG) is an AI framework for improving the quality of  LLM -generated  responses by grounding the model on external sources of knowledge to supplement the LLM’s internal representation of information. Implementing RAG in an LLM -based question answering system has two main benefits: It ensures that the model has access to the  most current, reliable facts, and that users have access to the model’s sources, ensuring that its claims can be checked for accuracy and ultimately trusted.  Prior research has introduced retrieval techniques to incorporate relevant knowledge and augment  generation, as  exemplified  by retrieval  augmented  generation ( RAG). In this framework, the input to  models is augmented  by prepending  relevant  documents  that are retrieved  from  an external  knowledge  corpus .  While  RAG  serves  as a practicable complement to  LLMs, its effectiveness is contingent upon the relevance  and 

In [6]:
pdf_chunks = pdf_to_chunks(text, 220, 10)

In [7]:
token_counts = tiktoken_len(text[0].page_content)
token_counts

791

In [41]:
uid = str(uuid.uuid4())
uid

data = [
    {
        'id' : f'{uid}-{chunk_idx}',
        'text' : chunk
    } for chunk_idx, chunk in enumerate(pdf_chunks)
]
data

[{'id': '07ff60c0-87ee-49d6-aabe-681ed759cd7f-0',
  'text': 'Introduction to RAG  Retrieval -augmented generation (RAG) is an AI framework for improving the quality of  LLM -generated  responses by grounding the model on external sources of knowledge to supplement the LLM’s internal representation of information. Implementing RAG in an LLM -based question answering system has two main benefits: It ensures that the model has access to the  most current, reliable facts, and that users have access to the model’s sources, ensuring that its claims can be checked for accuracy and ultimately trusted.  Prior research has introduced retrieval techniques to incorporate relevant knowledge and augment  generation, as  exemplified  by retrieval  augmented  generation ( RAG). In this framework, the input to  models is augmented  by prepending  relevant  documents  that are retrieved  from  an external  knowledge  corpus .  While  RAG  serves  as a practicable complement to  LLMs, its effectiveness i

# Text Embedding and Storing Vector in Pinecone

In [14]:
model_name = 'text-embedding-ada-002'

embed = OpenAIEmbeddings(
    model=model_name,
    api_key=OPENAI_API_KEY
)

In [16]:
pc = Pinecone(api_key=PINECONE_API_KEY)


    
index_name = 'chatbot'
if index_name not in pc.list_indexes().names():
    pc.create_index(name=index_name,
                    dimension=1536,
                    metric='cosine',
                    spec=ServerlessSpec(
                        cloud='aws', 
                        region='us-west-2'
                    )
    )

index = pc.Index(index_name)

index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [20]:
batch_limit = 100

texts = []

for text in enumerate(data[0].):
    print(text)

(0, 'id')
(1, 'text')


In [54]:
uid = str(uuid.uuid4())
chunk_id = []
embeddings = []
chunk_id = [str(f'{uid}-{idx}') for idx in range(len(pdf_chunks))]
embeddings = embed.embed_documents(pdf_chunks)
metadata = [{'chunk': chunk} for chunk in pdf_chunks]
index.upsert(vectors=zip(chunk_id, embeddings, metadata))

{'upserted_count': 4}

In [55]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 4}},
 'total_vector_count': 4}

# Store embeddings in Pinecone

In [9]:
pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = 'chatbot'
# pc.create_index(index_name, 
#                 dimension=768,
#                 metric='cosine',
#                 spec=ServerlessSpec(
#                     cloud="aws",
#                     region="us-west-2"
#                 )
# )

index = pc.Index(index_name)

for idx, embedding in enumerate(embeddings):
    embedding_list = embedding.tolist()
    
    chunk_id = f'chunk_{idx+1}'
    index.upsert([embedding_list], [chunk_id])

ValueError: Invalid vector value passed: cannot interpret type <class 'list'>

In [15]:
len(embeddings)

9

In [11]:
pc.list_indexes()

{'indexes': [{'dimension': 768,
              'host': 'chatbot-wlh2nlv.svc.apw5-4e34-81fa.pinecone.io',
              'metric': 'cosine',
              'name': 'chatbot',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-west-2'}},
              'status': {'ready': True, 'state': 'Ready'}}]}

In [33]:
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}