In [2]:
import google.generativeai as genai
import os
import warnings
from dotenv import load_dotenv
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_core.messages import HumanMessage
from datetime import datetime
from typing import List, Dict, Any, Optional
import uuid
from pinecone import Pinecone, ServerlessSpec
import numpy as np
import time
from sklearn.metrics.pairwise import cosine_similarity

# Reduce console noise for the rest of the notebook.
# TF_CPP_MIN_LOG_LEVEL is presumably read by TensorFlow's native logger,
# with "3" as its quietest setting — TODO confirm against the TensorFlow docs.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
# NOTE(review): CUDA_VISIBLE_DEVICES is not a logging switch; it looks like this
# restricts CUDA work to device 0 — confirm that this is intended.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Ignore every Python warning raised in this process from here on
# (this also hides deprecation notices from the imported libraries).
warnings.filterwarnings("ignore")

# Load environment variables via python-dotenv; the API-key lookups further
# down read their values with os.getenv.
load_dotenv()

def get_api_key(key_name="OPENROUTER_API_KEY"):
    """
    Look up an API key in the process environment.

    Args:
        key_name: Name of the environment variable holding the key.

    Returns:
        The value stored under ``key_name``.

    Raises:
        ValueError: If the variable is unset or set to an empty string.
    """
    value = os.environ.get(key_name)
    # An empty string is treated the same as a missing variable.
    if value:
        return value
    raise ValueError(f"Invalid API key: {key_name} not found in environment variables")

def initialize_llm(model_name="meta-llama/llama-3.1-8b-instruct",
                  temperature=0.4,
                  use_streaming=True):
    """
    Create a ChatOpenAI client that targets the OpenRouter endpoint.

    Args:
        model_name: Model identifier forwarded to the client.
        temperature: Sampling temperature forwarded to the client.
        use_streaming: Value passed as the client's ``streaming`` flag.

    Returns:
        The configured ChatOpenAI instance.

    Raises:
        ValueError: Propagated from get_api_key() when the default key
            (OPENROUTER_API_KEY) is not available.
    """
    # Resolve the key first so a missing key fails before anything is built.
    openrouter_key = get_api_key()
    client_settings = {
        "model_name": model_name,
        "temperature": temperature,
        "streaming": use_streaming,
        # The stdout handler is attached regardless of use_streaming.
        "callbacks": [StreamingStdOutCallbackHandler()],
        "openai_api_key": openrouter_key,
        "openai_api_base": "https://openrouter.ai/api/v1",
    }
    return ChatOpenAI(**client_settings)

# Notebook-wide LLM client built with initialize_llm's default arguments.
# Runs when the cell executes and raises ValueError if OPENROUTER_API_KEY is unset.
llm = initialize_llm()

In [3]:
def initialize_google_embedding_model():
    """
    Configure the google.generativeai client and return the embedding model name.

    Reads GEMINI_API_KEY from the environment and passes it to
    ``genai.configure``. No model object is created here; only the model
    identifier string is returned.

    Returns:
        The string "text-embedding-004".

    Raises:
        ValueError: If GEMINI_API_KEY is unset or empty.
    """
    gemini_key = os.environ.get("GEMINI_API_KEY")
    if gemini_key:
        genai.configure(api_key=gemini_key)
        return "text-embedding-004"
    raise ValueError("GEMINI_API_KEY not found in environment variables")

In [5]:
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from typing import List, Optional

def load_documents(data_dir: str, file_extension: str = ".txt") -> List:
    """
    Recursively load text documents from a directory.

    Args:
        data_dir: Directory handed to DirectoryLoader as the search root.
        file_extension: Extension of the files to load, with or without the
            leading dot (".txt" and "txt" are equivalent). Defaults to ".txt".

    Returns:
        The list returned by ``DirectoryLoader.load()``.
    """
    # Accept "md" as well as ".md" so the glob always ends in ".<ext>".
    suffix = file_extension if file_extension.startswith(".") else f".{file_extension}"
    loader = DirectoryLoader(
        data_dir,
        # Honour the requested extension instead of always matching *.txt.
        glob=f"**/*{suffix}",
        loader_cls=TextLoader,
        show_progress=True,
    )
    documents = loader.load()
    # f-string so the count and directory are interpolated, not printed literally.
    print(f"\nLoaded {len(documents)} documents from {data_dir}")
    return documents