In [1]:
pip install python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
from dotenv import load_dotenv, find_dotenv
env_file = find_dotenv()
load_dotenv(env_file, override=True)

False

In [3]:
def load_file(file_path):
    import os
    base_name, file_extension = os.path.splitext(file_path)

    if file_extension == '.pdf':
        from langchain.document_loaders import PyPDFLoader
        loader = PyPDFLoader(file_path)
    elif file_extension == '.docx':
        from langchain.document_loaders import Docx2txtLoader
        loader = Docx2txtLoader(file_path)
    elif file_extension == '.txt':
        from langchain.document_loaders import TextLoader
        loader = TextLoader(file_path)
    else:
        return None

    document_data = loader.load()
    return document_data


In [4]:
def fetch_from_wikipedia(search_query, language='en', max_docs_to_load=2):
    from langchain.document_loaders import WikipediaLoader
    wiki_loader = WikipediaLoader(query=search_query, lang=language, load_max_docs=max_docs_to_load)
    loaded_data = wiki_loader.load()
    return loaded_data


In [5]:
def split_into_chunks(document_data, chunk_length=256):
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_length, chunk_overlap=0)
    document_chunks = splitter.split_documents(document_data)
    return document_chunks


In [6]:
def calculate_embedding_cost(pages):
    import tiktoken
    tokenizer = tiktoken.encoding_for_model('text-embedding-3-small')
    token_count = sum([len(tokenizer.encode(page.page_content)) for page in pages])
    print(f'Total Tokens: {token_count}')
    print(f'Embedding Cost in USD: {token_count / 1000 * 0.00002:.6f}')


In [7]:
def manage_embeddings(index_key, chunk_data):
    import pinecone
    from langchain_community.vectorstores import Pinecone
    from langchain_openai import OpenAIEmbeddings
    from pinecone import ServerlessSpec
    pinecone_api = pinecone.Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
    embedding_service = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=1536, api_key=os.getenv("OPEN_AI_KEY"))
    
    if index_key in pinecone_api.list_indexes().names():
        print(f'Found index {index_key}. Retrieving embeddings ... ', end='')
        vector_store_instance = Pinecone.from_existing_index(index_key, embedding_service)
        print('Done')
    else:
        print(f'Index {index_key} not found. Creating and storing embeddings ...', end='')
        pinecone_api.create_index(
            name=index_key,
            dimension=1536,
            metric='cosine',
            spec=ServerlessSpec(
                cloud="aws",
                region="us-east-1"
            )
        )
        vector_store_instance = Pinecone.from_documents(chunk_data, embedding_service, index_name=index_key)
        print('Done')
        
    return vector_store_instance


In [8]:
def delete_pinecone_indices(target_index='all'):
    import pinecone
    pinecone_service = pinecone.Pinecone()
    
    if target_index == 'all':
        indexes_to_delete = pinecone_service.list_indexes().names()
        print('Deleting all indices ... ')
        for index_name in indexes_to_delete:
            pinecone_service.delete_index(index_name)
        print('Completed')
    else:
        print(f'Deleting index {target_index} ...', end='')
        pinecone_service.delete_index(target_index)
        print('Completed')


In [9]:
def fetch_and_process_query(vector_store, search_query, top_k_results=3):
    from langchain.chains import RetrievalQA
    from langchain_openai import ChatOpenAI
    chat_model = ChatOpenAI(model='gpt-3.5-turbo', temperature=1, api_key=os.getenv("OPEN_AI_KEY"))
    search_retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': top_k_results})
    qa_pipeline = RetrievalQA.from_chain_type(llm=chat_model, chain_type="stuff", retriever=search_retriever)
    result = qa_pipeline.invoke(search_query)
    return result


In [10]:
def divide_text(text_data):
    segment_length = 1000
    segments = [text_data[i:i + segment_length] for i in range(0, len(text_data), segment_length)]
    return segments

text = "This is some example text that we want to split into smaller chunks. " * 50
segments = divide_text(text)
print(len(segments))


4


In [11]:
import tiktoken

def count_tokens(input_text, model="gpt-3.5"):
    tokenizer = tiktoken.get_encoding("cl100k_base")
    token_count = len(tokenizer.encode(input_text))
    return token_count

def calculate_cost_per_embedding(text_chunks, rate_per_1000_tokens=0.0004):
    total_token_count = 0
    for text in text_chunks:
        total_token_count += count_tokens(text)
    embedding_cost = (total_token_count / 1000) * rate_per_1000_tokens
    return embedding_cost

text_chunks = [
    "This is the first chunk of text. It's just an example to demonstrate how token counting works.",
    "Here is the second chunk of text, another example with more content.",
    "Finally, this is the third chunk of text to make sure we have multiple chunks for the demonstration."
]

estimated_cost = calculate_cost_per_embedding(text_chunks)
print(f"Estimated embedding cost:


SyntaxError: unterminated string literal (detected at line 22) (3382900417.py, line 22)