<a href="https://colab.research.google.com/github/Arif-Kasim1/PIAIC-201/blob/main/201_PROJECT_02_EXPERIMENT_B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -Uq langchain==0.1.0 langchain-google-genai==0.0.6 pinecone-client==3.0.0 google-generativeai==0.3.2

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/798.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━[0m [32m471.0/798.0 kB[0m [31m14.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m798.0/798.0 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.9/199.9 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m146.9/146.9 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m598.7/598.7 kB[0m [31m36.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m56.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m241.2/241.2 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━

In [None]:
!pip install -Uq pinecone-client==3.0.0 langchain langchain-openai openai

In [None]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_community.vectorstores import Pinecone
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
import pinecone
import os
import textwrap
from typing import List, Dict
import time
from google.colab import userdata

class PineconeRAG:
    def __init__(self, api_key: str, environment: str, index_name: str):
        """
        Initialize the RAG system with Pinecone

        Args:
            api_key (str): Pinecone API key
            environment (str): Pinecone environment
            index_name (str): Name of the Pinecone index
        """
        # Set API keys
        os.environ["GOOGLE_API_KEY"] = userdata.get("GOOGLE_API_KEY")
        self.pinecone_api_key = api_key
        self.environment = environment
        self.index_name = index_name

        # Initialize embeddings and LLM
        self.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
        self.llm = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3)

        # Initialize Pinecone with the new client
        pc = pinecone.Pinecone(api_key=self.pinecone_api_key)

        # Create index if it doesn't exist
        if self.index_name not in pc.list_indexes().names():
            pc.create_index(
                name=self.index_name,
                spec=pinecone.Spec(
                    dimension=768,  # Dimension for Google's embedding model
                    metric="cosine"
                )
            )

        # Connect to Pinecone
        self.vector_store = Pinecone.from_existing_index(
            index_name=self.index_name,
            embedding=self.embeddings
        )

    def process_documents(self, file_path: str) -> List[str]:
        """
        Process documents and split into chunks

        Args:
            file_path (str): Path to the text file

        Returns:
            List[str]: List of text chunks
        """
        # Read the file
        with open(file_path, 'r', encoding='utf-8') as file:
            raw_text = file.read()

        # Create text splitter
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=50,
            separators=["\nTitle:", "\n\n", "\n", " ", ""]
        )

        # Split text into chunks
        return text_splitter.split_text(raw_text)

    def upload_to_pinecone(self, texts: List[str]) -> None:
        """
        Upload text chunks to Pinecone

        Args:
            texts (List[str]): List of text chunks to upload
        """
        # Create vector store from texts
        Pinecone.from_texts(
            texts,
            self.embeddings,
            index_name=self.index_name
        )
        print(f"Uploaded {len(texts)} chunks to Pinecone")

    def create_qa_chain(self) -> RetrievalQA:
        """
        Create the question-answering chain

        Returns:
            RetrievalQA: The QA chain
        """
        # Create prompt template
        prompt_template = """Use the following pieces of context to answer the question at the end.
        If you don't know the answer, just say that you don't know, don't try to make up an answer.

        Context: {context}

        Question: {question}

        Answer:"""

        PROMPT = PromptTemplate(
            template=prompt_template,
            input_variables=["context", "question"]
        )

        # Create chain
        chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=self.vector_store.as_retriever(
                search_kwargs={"k": 3}
            ),
            return_source_documents=True,
            chain_type_kwargs={"prompt": PROMPT}
        )

        return chain

    def ask_question(self, question: str, chain: RetrievalQA) -> Dict:
        """
        Ask a question and get response

        Args:
            question (str): Question to ask
            chain (RetrievalQA): The QA chain

        Returns:
            Dict: Response including answer and source documents
        """
        # Get response
        start_time = time.time()
        response = chain(question)
        end_time = time.time()

        # Format response
        result = {
            "question": question,
            "answer": response['result'],
            "sources": [doc.page_content for doc in response['source_documents']],
            "time_taken": f"{end_time - start_time:.2f} seconds"
        }

        return result

    def delete_index(self) -> None:
        """Delete the Pinecone index"""
        pc = pinecone.Pinecone(api_key=self.pinecone_api_key)
        if self.index_name in pc.list_indexes().names():
            pc.delete_index(self.index_name)
            print(f"Deleted index: {self.index_name}")

def main():
    # Initialize RAG system
    rag = PineconeRAG(
        api_key=userdata.get("PINECONE_API_KEY"),
        environment="gcp-starter",  # Make sure to use your actual environment
        index_name="tech-articles"
    )

    # Process and upload documents
    texts = rag.process_documents('/content/Data.txt')
    rag.upload_to_pinecone(texts)

    # Create QA chain
    qa_chain = rag.create_qa_chain()

    # Example questions
    questions = [
        "What are the main applications of blockchain?",
        "How does quantum computing differ from classical computing?",
        "What are the advantages of 5G networks?"
    ]

    # Ask questions
    for question in questions:
        result = rag.ask_question(question, qa_chain)

        print("\n" + "="*80)
        print(f"\nQuestion: {result['question']}")
        print(f"\nAnswer: {textwrap.fill(result['answer'], width=80)}")
        print(f"\nTime taken: {result['time_taken']}")
        print("\nSources used:")
        for i, source in enumerate(result['sources'], 1):
            print(f"\nSource {i}:")
            print(textwrap.fill(source, width=80))

if __name__ == "__main__":
    main()

AttributeError: module 'pinecone' has no attribute 'Spec'

In [None]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_community.vectorstores import Pinecone
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
import pinecone
import os
import textwrap
from typing import List, Dict
import time
from google.colab import userdata

class PineconeRAG:
    def __init__(self, api_key: str, environment: str, index_name: str):
        """
        Initialize the RAG system with Pinecone

        Args:
            api_key (str): Pinecone API key
            environment (str): Pinecone environment
            index_name (str): Name of the Pinecone index
        """
        # Set API keys
        os.environ["GOOGLE_API_KEY"] = userdata.get("GOOGLE_API_KEY")
        self.pinecone_api_key = api_key
        self.environment = environment
        self.index_name = index_name

        # Initialize embeddings and LLM
        self.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
        self.llm = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3)

        # Initialize Pinecone
        # pinecone.init(api_key=self.pinecone_api_key, environment=self.environment)
        pc = pinecone.Pinecone(api_key=self.pinecone_api_key)

        # Create index if it doesn't exist
        if self.index_name not in pinecone.list_indexes():
            pinecone.create_index(
                name=self.index_name,
                dimension=768,  # Dimension for Google's embedding model
                metric='cosine'
            )

        # Connect to Pinecone
        self.vector_store = Pinecone.from_existing_index(
            index_name=self.index_name,
            embedding=self.embeddings
        )

    def process_documents(self, file_path: str) -> List[str]:
        """
        Process documents and split into chunks

        Args:
            file_path (str): Path to the text file

        Returns:
            List[str]: List of text chunks
        """
        # Read the file
        with open(file_path, 'r', encoding='utf-8') as file:
            raw_text = file.read()

        # Create text splitter
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=50,
            separators=["\nTitle:", "\n\n", "\n", " ", ""]
        )

        # Split text into chunks
        return text_splitter.split_text(raw_text)

    def upload_to_pinecone(self, texts: List[str]) -> None:
        """
        Upload text chunks to Pinecone

        Args:
            texts (List[str]): List of text chunks to upload
        """
        # Create vector store from texts
        Pinecone.from_texts(
            texts,
            self.embeddings,
            index_name=self.index_name
        )
        print(f"Uploaded {len(texts)} chunks to Pinecone")

    def create_qa_chain(self) -> RetrievalQA:
        """
        Create the question-answering chain

        Returns:
            RetrievalQA: The QA chain
        """
        # Create prompt template
        prompt_template = """Use the following pieces of context to answer the question at the end.
        If you don't know the answer, just say that you don't know, don't try to make up an answer.

        Context: {context}

        Question: {question}

        Answer:"""

        PROMPT = PromptTemplate(
            template=prompt_template,
            input_variables=["context", "question"]
        )

        # Create chain
        chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=self.vector_store.as_retriever(
                search_kwargs={"k": 3}
            ),
            return_source_documents=True,
            chain_type_kwargs={"prompt": PROMPT}
        )

        return chain

    def ask_question(self, question: str, chain: RetrievalQA) -> Dict:
        """
        Ask a question and get response

        Args:
            question (str): Question to ask
            chain (RetrievalQA): The QA chain

        Returns:
            Dict: Response including answer and source documents
        """
        # Get response
        start_time = time.time()
        response = chain(question)
        end_time = time.time()

        # Format response
        result = {
            "question": question,
            "answer": response['result'],
            "sources": [doc.page_content for doc in response['source_documents']],
            "time_taken": f"{end_time - start_time:.2f} seconds"
        }

        return result

    def delete_index(self) -> None:
        """Delete the Pinecone index"""
        if self.index_name in pinecone.list_indexes():
            pinecone.delete_index(self.index_name)
            print(f"Deleted index: {self.index_name}")

def main():
    # Initialize RAG system
    rag = PineconeRAG(
        api_key=userdata.get("PINECONE_API_KEY"),
        environment="gcp-starter",  # Make sure to use your actual environment
        index_name="tech-articles"
    )

    # Process and upload documents
    texts = rag.process_documents('/content/Data.txt')
    rag.upload_to_pinecone(texts)

    # Create QA chain
    qa_chain = rag.create_qa_chain()

    # Example questions
    questions = [
        "What are the main applications of blockchain?",
        "How does quantum computing differ from classical computing?",
        "What are the advantages of 5G networks?"
    ]

    # Ask questions
    for question in questions:
        result = rag.ask_question(question, qa_chain)

        print("\n" + "="*80)
        print(f"\nQuestion: {result['question']}")
        print(f"\nAnswer: {textwrap.fill(result['answer'], width=80)}")
        print(f"\nTime taken: {result['time_taken']}")
        print("\nSources used:")
        for i, source in enumerate(result['sources'], 1):
            print(f"\nSource {i}:")
            print(textwrap.fill(source, width=80))

if __name__ == "__main__":
    main()

AttributeError: module 'pinecone' has no attribute 'list_indexes'

In [None]:
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Pinecone
from langchain_openai import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
import pinecone
import os
from typing import List

class SimpleRAG:
    def __init__(self, pinecone_api_key: str, openai_api_key: str, index_name: str):
        """
        Initialize the RAG system
        """
        # Set up API keys
        os.environ["OPENAI_API_KEY"] = openai_api_key

        # Initialize Pinecone client (v3)
        self.pc = pinecone.Pinecone(api_key=pinecone_api_key)
        self.index_name = index_name

        # Initialize OpenAI components
        self.embeddings = OpenAIEmbeddings()
        self.llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

        # Create index if it doesn't exist
        if self.index_name not in self.pc.list_indexes().names():
            self.pc.create_index(
                name=self.index_name,
                spec=pinecone.Spec(
                    dimension=1536,  # OpenAI embedding dimension
                    metric="cosine"
                )
            )

        # Initialize vector store
        self.vector_store = Pinecone.from_existing_index(
            index_name=self.index_name,
            embedding=self.embeddings
        )

    def load_documents(self, text: str) -> List[str]:
        """
        Split text into chunks
        """
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=100
        )
        return splitter.split_text(text)

    def add_texts(self, texts: List[str]) -> None:
        """
        Add texts to Pinecone
        """
        self.vector_store.add_texts(texts)
        print(f"Added {len(texts)} chunks to Pinecone")

    def query(self, question: str) -> str:
        """
        Query the RAG system
        """
        qa_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            retriever=self.vector_store.as_retriever(search_kwargs={"k": 3}),
            return_source_documents=True
        )
        response = qa_chain({"query": question})
        return {
            "answer": response["result"],
            "sources": [doc.page_content for doc in response["source_documents"]]
        }

    def cleanup(self) -> None:
        """
        Delete the Pinecone index
        """
        if self.index_name in self.pc.list_indexes().names():
            self.pc.delete_index(self.index_name)
            print(f"Deleted index: {self.index_name}")

# Example usage
def main():
    # Initialize the RAG system
    rag = SimpleRAG(
        pinecone_api_key="your-pinecone-api-key",
        openai_api_key="your-openai-api-key",
        index_name="test-index"
    )

    # Example text
    sample_text = """
    Artificial Intelligence (AI) is revolutionizing various industries.
    Machine Learning, a subset of AI, enables systems to learn from data.
    Deep Learning, a type of Machine Learning, uses neural networks with multiple layers.
    Natural Language Processing (NLP) allows computers to understand human language.
    Computer Vision helps machines interpret and analyze visual information.
    """

    # Process and add documents
    chunks = rag.load_documents(sample_text)
    rag.add_texts(chunks)

    # Ask a question
    question = "What is Machine Learning and how does it relate to AI?"
    result = rag.query(question)

    # Print results
    print("\nQuestion:", question)
    print("\nAnswer:", result["answer"])
    print("\nSources used:")
    for i, source in enumerate(result["sources"], 1):
        print(f"\nSource {i}:", source)

    # Optional: Clean up
    # rag.cleanup()

if __name__ == "__main__":
    main()

In [None]:

from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_community.vectorstores import Pinecone
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
import pinecone
import os
from typing import List
from google.colab import userdata

class SimpleRAG:
    def __init__(self, pinecone_api_key: str, google_api_key: str, index_name: str):
        """
        Initialize the RAG system
        """
        # Set up API keys
        os.environ["GOOGLE_API_KEY"] = userdata.get("GOOGLE_API_KEY")
        os.environ["PINECONE_API_KEY"] = userdata.get("PINECONE_API_KEY")

        # Initialize Pinecone client (v3)
        self.pc = pinecone.Pinecone(api_key=pinecone_api_key)
        self.index_name = index_name

        # Initialize Gemini components
        self.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
        self.llm = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0)

        # Create index if it doesn't exist
        if self.index_name not in self.pc.list_indexes().names():
            self.pc.create_index(
                name=self.index_name,
                spec=pinecone.Spec(
                    dimension=768,  # Gemini embedding dimension
                    metric="cosine"
                )
            )

        # Initialize vector store
        self.vector_store = Pinecone.from_existing_index(
            index_name=self.index_name,
            embedding=self.embeddings
        )

    def load_documents(self, text: str) -> List[str]:
        """
        Split text into chunks
        """
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=100
        )
        return splitter.split_text(text)

    def add_texts(self, texts: List[str]) -> None:
        """
        Add texts to Pinecone
        """
        self.vector_store.add_texts(texts)
        print(f"Added {len(texts)} chunks to Pinecone")

    def query(self, question: str) -> str:
        """
        Query the RAG system
        """
        qa_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            retriever=self.vector_store.as_retriever(search_kwargs={"k": 3}),
            return_source_documents=True
        )
        response = qa_chain({"query": question})
        return {
            "answer": response["result"],
            "sources": [doc.page_content for doc in response["source_documents"]]
        }

    def cleanup(self) -> None:
        """
        Delete the Pinecone index
        """
        if self.index_name in self.pc.list_indexes().names():
            self.pc.delete_index(self.index_name)
            print(f"Deleted index: {self.index_name}")

# Example usage
def main():
    # Initialize the RAG system
    rag = SimpleRAG(
        pinecone_api_key=userdata.get("PINECONE_API_KEY"),
        google_api_key=userdata.get("GOOGLE_API_KEY"),
        index_name="test-index"
    )

    # Example text
    sample_text = """
    Artificial Intelligence (AI) is revolutionizing various industries.
    Machine Learning, a subset of AI, enables systems to learn from data.
    Deep Learning, a type of Machine Learning, uses neural networks with multiple layers.
    Natural Language Processing (NLP) allows computers to understand human language.
    Computer Vision helps machines interpret and analyze visual information.
    """

    # Process and add documents
    chunks = rag.load_documents(sample_text)
    rag.add_texts(chunks)

    # Ask a question
    question = "What is Machine Learning and how does it relate to AI?"
    result = rag.query(question)

    # Print results
    print("\nQuestion:", question)
    print("\nAnswer:", result["answer"])
    print("\nSources used:")
    for i, source in enumerate(result["sources"], 1):
        print(f"\nSource {i}:", source)

    # Optional: Clean up
    # rag.cleanup()

if __name__ == "__main__":
    main()


AttributeError: module 'pinecone' has no attribute 'Spec'

In [None]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_community.vectorstores import Pinecone
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
import pinecone
import os
from typing import List
from google.colab import userdata
from pinecone import ServerlessSpec # Import the ServerlessSpec

class SimpleRAG:
    def __init__(self, pinecone_api_key: str, google_api_key: str, index_name: str):
        """
        Initialize the RAG system
        """
        # Set up API keys
        os.environ["GOOGLE_API_KEY"] = userdata.get("GOOGLE_API_KEY")
        os.environ["PINECONE_API_KEY"] = userdata.get("PINECONE_API_KEY")

        # Initialize Pinecone client (v3)
        self.pc = pinecone.Pinecone(api_key=pinecone_api_key)
        self.index_name = index_name

        # Initialize Gemini components
        self.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
        self.llm = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0)

        # Create index if it doesn't exist
        if self.index_name not in self.pc.list_indexes().names():
            self.pc.create_index(
                name=self.index_name,
                spec=ServerlessSpec( # Use ServerlessSpec
                    dimension=768,  # Gemini embedding dimension
                    metric="cosine"
                )
            )

        # Initialize vector store
        self.vector_store = Pinecone.from_existing_index(
            index_name=self.index_name,
            embedding=self.embeddings
        )

    def load_documents(self, text: str) -> List[str]:
        """
        Split text into chunks
        """
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=100
        )
        return splitter.split_text(text)

    def add_texts(self, texts: List[str]) -> None:
        """
        Add texts to Pinecone
        """
        self.vector_store.add_texts(texts)
        print(f"Added {len(texts)} chunks to Pinecone")

    def query(self, question: str) -> str:
        """
        Query the RAG system
        """
        qa_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            retriever=self.vector_store.as_retriever(search_kwargs={"k": 3}),
            return_source_documents=True
        )
        response = qa_chain({"query": question})
        return {
            "answer": response["result"],
            "sources": [doc.page_content for doc in response["source_documents"]]
        }

    def cleanup(self) -> None:
        """
        Delete the Pinecone index
        """
        if self.index_name in self.pc.list_indexes().names():
            self.pc.delete_index(self.index_name)
            print(f"Deleted index: {self.index_name}")

# Example usage
def main():
    # Initialize the RAG system
    rag = SimpleRAG(
        pinecone_api_key=userdata.get("PINECONE_API_KEY"),
        google_api_key=userdata.get("GOOGLE_API_KEY"),
        index_name="test-index"
    )

    # Example text
    sample_text = """
    Artificial Intelligence (AI) is revolutionizing various industries.
    Machine Learning, a subset of AI, enables systems to learn from data.
    Deep Learning, a type of Machine Learning, uses neural networks with multiple layers.
    Natural Language Processing (NLP) allows computers to understand human language.
    Computer Vision helps machines interpret and analyze visual information.
    """

    # Process and add documents
    chunks = rag.load_documents(sample_text)
    rag.add_texts(chunks)

    # Ask a question
    question = "What is Machine Learning and how does it relate to AI?"
    result = rag.query(question)

    # Print results
    print("\nQuestion:", question)
    print("\nAnswer:", result["answer"])
    print("\nSources used:")
    for i, source in enumerate(result["sources"], 1):
        print(f"\nSource {i}:", source)

    # Optional: Clean up
    # rag.cleanup()

if __name__ == "__main__":
    main()

TypeError: ServerlessSpec.__new__() got an unexpected keyword argument 'dimension'

In [None]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_community.vectorstores import Pinecone
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
import pinecone
import os
from typing import List
from google.colab import userdata
from pinecone import ServerlessSpec # Import the ServerlessSpec

class SimpleRAG:
    def __init__(self, pinecone_api_key: str, google_api_key: str, index_name: str):
        """
        Initialize the RAG system
        """
        # Set up API keys
        os.environ["GOOGLE_API_KEY"] = userdata.get("GOOGLE_API_KEY")
        os.environ["PINECONE_API_KEY"] = userdata.get("PINECONE_API_KEY")

        # Initialize Pinecone client (v3)
        self.pc = pinecone.Pinecone(api_key=pinecone_api_key)
        self.index_name = index_name

        # Initialize Gemini components
        self.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
        self.llm = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0)

        # Create index if it doesn't exist
        if self.index_name not in self.pc.list_indexes().names():
            self.pc.create_index(
                name=self.index_name,
                dimension=768,  # Gemini embedding dimension
                metric="cosine",
                spec=ServerlessSpec( # Use ServerlessSpec
                    cloud="aws",
                    region="us-west-2"
                )
            )

        # Initialize vector store
        self.vector_store = Pinecone.from_existing_index(
            index_name=self.index_name,
            embedding=self.embeddings
        )

    def load_documents(self, text: str) -> List[str]:
        """
        Split text into chunks
        """
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=100
        )
        return splitter.split_text(text)

    def add_texts(self, texts: List[str]) -> None:
        """
        Add texts to Pinecone
        """
        self.vector_store.add_texts(texts)
        print(f"Added {len(texts)} chunks to Pinecone")

    def query(self, question: str) -> str:
        """
        Query the RAG system
        """
        qa_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            retriever=self.vector_store.as_retriever(search_kwargs={"k": 3}),
            return_source_documents=True
        )
        response = qa_chain({"query": question})
        return {
            "answer": response["result"],
            "sources": [doc.page_content for doc in response["source_documents"]]
        }

    def cleanup(self) -> None:
        """
        Delete the Pinecone index
        """
        if self.index_name in self.pc.list_indexes().names():
            self.pc.delete_index(self.index_name)
            print(f"Deleted index: {self.index_name}")

# Example usage
def main():
    # Initialize the RAG system
    rag = SimpleRAG(
        pinecone_api_key=userdata.get("PINECONE_API_KEY"),
        google_api_key=userdata.get("GOOGLE_API_KEY"),
        index_name="test-index"
    )

    # Example text
    sample_text = """
    Artificial Intelligence (AI) is revolutionizing various industries.
    Machine Learning, a subset of AI, enables systems to learn from data.
    Deep Learning, a type of Machine Learning, uses neural networks with multiple layers.
    Natural Language Processing (NLP) allows computers to understand human language.
    Computer Vision helps machines interpret and analyze visual information.
    """

    # Process and add documents
    chunks = rag.load_documents(sample_text)
    rag.add_texts(chunks)

    # Ask a question
    question = "What is Machine Learning and how does it relate to AI?"
    result = rag.query(question)

    # Print results
    print("\nQuestion:", question)
    print("\nAnswer:", result["answer"])
    print("\nSources used:")
    for i, source in enumerate(result["sources"], 1):
        print(f"\nSource {i}:", source)

    # Optional: Clean up
    # rag.cleanup()

if __name__ == "__main__":
    main()

PineconeApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'content-type': 'text/plain; charset=utf-8', 'access-control-allow-origin': '*', 'vary': 'origin,access-control-request-method,access-control-request-headers', 'access-control-expose-headers': '*', 'x-pinecone-api-version': '2024-04', 'X-Cloud-Trace-Context': '4a343347a885ddee6fb0145bd8f21ff1', 'Date': 'Thu, 26 Dec 2024 11:47:02 GMT', 'Server': 'Google Frontend', 'Content-Length': '200', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"error":{"code":"INVALID_ARGUMENT","message":"Bad request: Your free plan does not support indexes in the us-west-2 region of aws. To create indexes in this region, upgrade your plan."},"status":400}


In [None]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_community.vectorstores import Pinecone
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
import pinecone
import os
from typing import List
from google.colab import userdata
from pinecone import ServerlessSpec # Import the ServerlessSpec

class SimpleRAG:
    def __init__(self, pinecone_api_key: str, google_api_key: str, index_name: str):
        """
        Initialize the RAG system
        """
        # Set up API keys
        os.environ["GOOGLE_API_KEY"] = userdata.get("GOOGLE_API_KEY")
        os.environ["PINECONE_API_KEY"] = userdata.get("PINECONE_API_KEY")

        # Initialize Pinecone client (v3)
        self.pc = pinecone.Pinecone(api_key=pinecone_api_key)
        self.index_name = index_name

        # Initialize Gemini components
        self.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
        self.llm = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0)

        # Create index if it doesn't exist
        if self.index_name not in self.pc.list_indexes().names():
            self.pc.create_index(
                name=self.index_name,
                dimension=768,  # Gemini embedding dimension
                metric="cosine",
                spec=ServerlessSpec( # Use ServerlessSpec
                    cloud="aws",
                    region="us-east-1" # Changed region to us-east-1
                )
            )

        # Initialize vector store
        self.vector_store = Pinecone.from_existing_index(
            index_name=self.index_name,
            embedding=self.embeddings
        )

    def load_documents(self, text: str) -> List[str]:
        """
        Split text into chunks
        """
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=100
        )
        return splitter.split_text(text)

    def add_texts(self, texts: List[str]) -> None:
        """
        Add texts to Pinecone
        """
        self.vector_store.add_texts(texts)
        print(f"Added {len(texts)} chunks to Pinecone")

    def query(self, question: str) -> str:
        """
        Query the RAG system
        """
        qa_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            retriever=self.vector_store.as_retriever(search_kwargs={"k": 3}),
            return_source_documents=True
        )
        response = qa_chain({"query": question})
        return {
            "answer": response["result"],
            "sources": [doc.page_content for doc in response["source_documents"]]
        }

    def cleanup(self) -> None:
        """
        Delete the Pinecone index
        """
        if self.index_name in self.pc.list_indexes().names():
            self.pc.delete_index(self.index_name)
            print(f"Deleted index: {self.index_name}")

# Example usage
def main():
    # Initialize the RAG system
    rag = SimpleRAG(
        pinecone_api_key=userdata.get("PINECONE_API_KEY"),
        google_api_key=userdata.get("GOOGLE_API_KEY"),
        index_name="test-index"
    )

    # Example text
    sample_text = """
    Artificial Intelligence (AI) is revolutionizing various industries.
    Machine Learning, a subset of AI, enables systems to learn from data.
    Deep Learning, a type of Machine Learning, uses neural networks with multiple layers.
    Natural Language Processing (NLP) allows computers to understand human language.
    Computer Vision helps machines interpret and analyze visual information.
    """

    # Process and add documents
    chunks = rag.load_documents(sample_text)
    rag.add_texts(chunks)

    # Ask a question
    question = "What is Machine Learning and how does it relate to AI?"
    result = rag.query(question)

    # Print results
    print("\nQuestion:", question)
    print("\nAnswer:", result["answer"])
    print("\nSources used:")
    for i, source in enumerate(result["sources"], 1):
        print(f"\nSource {i}:", source)

    # Optional: Clean up
    # rag.cleanup()

if __name__ == "__main__":
    main()

Added 1 chunks to Pinecone


  addendum : str, optional


ValueError: SystemMessages are not yet supported!

To automatically convert the leading SystemMessage to a HumanMessage,
set  `convert_system_message_to_human` to True. Example:

llm = ChatGoogleGenerativeAI(model="gemini-pro", convert_system_message_to_human=True)


In [None]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_community.vectorstores import Pinecone
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
import pinecone
import os
from typing import List
from google.colab import userdata
from pinecone import ServerlessSpec # Import the ServerlessSpec
import textwrap

class SimpleRAG:
    def __init__(self, pinecone_api_key: str, google_api_key: str, index_name: str):
        """
        Initialize the RAG system
        """
        # Set up API keys
        os.environ["GOOGLE_API_KEY"] = userdata.get("GOOGLE_API_KEY")
        os.environ["PINECONE_API_KEY"] = userdata.get("PINECONE_API_KEY")

        # Initialize Pinecone client (v3)
        self.pc = pinecone.Pinecone(api_key=pinecone_api_key)
        self.index_name = index_name

        # Initialize Gemini components
        self.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
        # Add the convert_system_message_to_human parameter here
        self.llm = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0, convert_system_message_to_human=True)

        # Create index if it doesn't exist
        if self.index_name not in self.pc.list_indexes().names():
            self.pc.create_index(
                name=self.index_name,
                dimension=768,  # Gemini embedding dimension
                metric="cosine",
                spec=ServerlessSpec( # Use ServerlessSpec
                    cloud="aws",
                    region="us-east-1" # Changed region to us-east-1
                )
            )

        # Initialize vector store
        self.vector_store = Pinecone.from_existing_index(
            index_name=self.index_name,
            embedding=self.embeddings
        )

    def load_documents(self, text: str) -> List[str]:
        """
        Split text into chunks
        """
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=100
        )
        return splitter.split_text(text)

    def add_texts(self, texts: List[str]) -> None:
        """
        Add texts to Pinecone
        """
        self.vector_store.add_texts(texts)
        print(f"Added {len(texts)} chunks to Pinecone")

    def query(self, question: str) -> str:
        """
        Query the RAG system
        """
        qa_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            retriever=self.vector_store.as_retriever(search_kwargs={"k": 3}),
            return_source_documents=True
        )
        response = qa_chain({"query": question})
        return {
            "answer": response["result"],
            "sources": [doc.page_content for doc in response["source_documents"]]
        }

    def cleanup(self) -> None:
        """
        Delete the Pinecone index
        """
        if self.index_name in self.pc.list_indexes().names():
            self.pc.delete_index(self.index_name)
            print(f"Deleted index: {self.index_name}")

# Example usage
def main():
    # Initialize the RAG system
    rag = SimpleRAG(
        pinecone_api_key=userdata.get("PINECONE_API_KEY"),
        google_api_key=userdata.get("GOOGLE_API_KEY"),
        index_name="test-index"
    )

    # Example text
    sample_text = """
    Artificial Intelligence (AI) is revolutionizing various industries.
    Machine Learning, a subset of AI, enables systems to learn from data.
    Deep Learning, a type of Machine Learning, uses neural networks with multiple layers.
    Natural Language Processing (NLP) allows computers to understand human language.
    Computer Vision helps machines interpret and analyze visual information.

    Introduction: The Pentium processor series, launched by Intel in 1993,
    marked a significant leap in performance over its predecessor, the 486 series.
    Architecture: Introduced advanced features like superscalar architecture,
    allowing multiple instructions per clock cycle.
    Variants: Evolved through multiple generations, including Pentium Pro,
    Pentium II, Pentium III, and Pentium 4, offering enhanced speed and functionality.
    Technology: Incorporated technologies like MMX, Hyper-Threading, and higher
    clock speeds to cater to evolving computing needs.
    Legacy: Paved the way for modern processors, blending performance and
    efficiency for desktops and laptops.

    The future of machine learning (ML) is poised for transformative growth,
    driving innovation across industries. Advances in neural networks, quantum
    computing, and explainable AI will make ML more powerful and transparent.
    It will revolutionize healthcare with precise diagnostics, enhance automation
    in manufacturing, and optimize personalized experiences in retail and
    entertainment. Ethical AI and robust frameworks will address challenges like
    bias and privacy. Seamless integration with IoT, robotics, and edge
    computing will bring AI closer to users. As ML democratizes through
    accessible tools, its potential to solve global challenges, from climate
    change to education, will shape a smarter, sustainable world.

    Pakistanis hold a special affection for their traditional dishes, with
    biryani, paye, and nihari reigning supreme. Biryani, a fragrant mix of rice,
    meat, and spices, is a celebratory dish enjoyed at weddings, festivals, and
    casual gatherings. Its rich flavors and endless variations make it a
    nationwide favorite. Paye, a slow-cooked delicacy made from trotters,
    offers a hearty, flavorful experience often relished during breakfast or
    family feasts. Nihari, a spicy stew of tender meat simmered overnight, is
    synonymous with comfort food, particularly loved in winters. These dishes
    are more than just meals—they represent Pakistan’s rich culinary heritage
    and the warmth of sharing food. Served with naan, parathas, or raita, they
    bring families and friends together, embodying a deep-rooted tradition of
    hospitality. Whether in bustling cities or quiet villages, the love for
    biryani, paye, and nihari reflects the soul of Pakistani cuisine.
    """

    # Process and add documents
    chunks = rag.load_documents(sample_text)
    rag.add_texts(chunks)

    # Ask a question
    # question = "What is Machine Learning and how does it relate to AI?"
    # question = "What is Machine Learning future and how does it relate to AI?"
    # question = "What is JSP and Servlet?"
    # question = "Difference between Pentium 2 and Pentium 4?"
    # question = "Difference between 486 and Pentium?"
    question = "Name some traditional dishes of pakistan?"

    result = rag.query(question)

    # Print results
    print("\nQuestion:", question)
    print("\nAnswer:", textwrap.fill(result["answer"], width=80))
    # textwrap.fill(response, width=80)
    print("\nSources used:")
    for i, source in enumerate(result["sources"], 1):
        print(f"\nSource {i}:", source)

    # Optional: Clean up
    # rag.cleanup()

if __name__ == "__main__":
    main()

Added 5 chunks to Pinecone


  warn_deprecated(



Question: Name some traditional dishes of pakistan?

Answer: - Biryani - Paye - Nihari

Sources used:

Source 1: hospitality. Whether in bustling cities or quiet villages, the love for 
    biryani, paye, and nihari reflects the soul of Pakistani cuisine.

Source 2: hospitality. Whether in bustling cities or quiet villages, the love for 
    biryani, paye, and nihari reflects the soul of Pakistani cuisine.

Source 3: hospitality. Whether in bustling cities or quiet villages, the love for 
    biryani, paye, and nihari reflects the soul of Pakistani cuisine.
