In [1]:
import os

# Pull the Gemini API key from the environment instead of hardcoding it;
# a missing GEMINI_API_KEY variable raises KeyError here.
key = os.environ["GEMINI_API_KEY"]

In [2]:
# Instantiate the Gemini chat model used to generate answers

from langchain_google_genai import ChatGoogleGenerativeAI

chat_model = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    google_api_key=key,
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the source PDF from ./assets and turn it into a list of documents.
# NOTE(review): load_and_split() also applies LangChain's default text
# splitter, so entries in `pages` may not map 1:1 to PDF pages — confirm;
# loader.load() is the call that returns one document per page.

from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("./assets/empirical_comparison.pdf")
pages = loader.load_and_split()

In [4]:
# Sanity check: number of documents produced by the loader
len(pages)

11

In [None]:
# Break the loaded documents into smaller, overlapping chunks

import nltk
from langchain_text_splitters import NLTKTextSplitter

# Fetch NLTK's 'punkt_tab' resource before splitting
nltk.download("punkt_tab")

# Splitter settings are the defaults suggested by Copilot;
# will experiment with different text splitters in the future.
text_splitter = NLTKTextSplitter(chunk_overlap=100, chunk_size=500)

chunks = text_splitter.split_documents(pages)

print(len(chunks))

38


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [5]:
# Build the embedding model used to vectorize the chunks

from langchain_google_genai import GoogleGenerativeAIEmbeddings

embedding_model = GoogleGenerativeAIEmbeddings(
    model="models/text-embedding-004",
    google_api_key=key,
)

In [6]:
# Store the chunk embeddings in a persistent vector database

# Chroma is used as the vector store
from langchain_community.vectorstores import Chroma

# Open (or create) the on-disk store first. Calling Chroma.from_documents()
# on every run would embed and insert all chunks again under fresh ids, so
# each re-run of this cell would leave duplicate entries in ./chroma_db_
# and the retriever would return the same passage several times.
db = Chroma(persist_directory="./chroma_db_", embedding_function=embedding_model)

# Embed and insert the chunks only when the store is still empty.
# Delete the ./chroma_db_ directory to force a rebuild after changing the
# PDF or the chunking settings.
if not db.get(limit=1)["ids"]:
    db.add_documents(chunks)

In [9]:
# Reconnect to the Chroma store persisted in ./chroma_db_
# TODO: this emits a LangChain deprecation warning; fix it later
db_connection = Chroma(
    embedding_function=embedding_model,
    persist_directory="./chroma_db_",
)

  db_connection = Chroma(persist_directory="./chroma_db_",


In [10]:
# Expose the Chroma connection as a retriever object, searching with k=5
retriever = db_connection.as_retriever(search_kwargs=dict(k=5))

print(type(retriever))

<class 'langchain_core.vectorstores.base.VectorStoreRetriever'>


In [11]:
from langchain_core.messages import SystemMessage
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate

In [12]:
# Two-message chat prompt: a fixed system instruction followed by a human
# message template with {context} and {question} placeholders.
chat_template = ChatPromptTemplate.from_messages([
    # System message: static instruction, no template variables
    SystemMessage(content="""
                  You are a Helpful AI Bot.
                  Given a context and question from user,
                  you should answer based on the given context.
                  """),
    # Human message: filled in per call with the context text and the question
    HumanMessagePromptTemplate.from_template(
        """
        Answer the question based on the given context.
        Context: {context}
        Question: {question}
        Answer: 
        """)
])

In [13]:
from langchain_core.output_parsers import StrOutputParser

# Parser placed at the end of the chain (after the chat model)
output_parser = StrOutputParser()

In [14]:
# Assemble the RAG pipeline: retrieve context, fill the prompt, call the chat model, parse the reply

from langchain_core.runnables import RunnablePassthrough

def format_docs(docs):
    """Join the page_content of each document into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The contents concatenated in order, separated by a blank line
        (an empty string when ``docs`` is empty).
    """
    page_texts = [doc.page_content for doc in docs]
    return "\n\n".join(page_texts)


# Chain stages, composed left to right with `|`:
#   1. build the prompt inputs: "context" comes from the retriever piped
#      through format_docs, "question" from RunnablePassthrough()
#   2. fill chat_template with those inputs
#   3. send the prompt to chat_model
#   4. run the model reply through output_parser
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | chat_template
    | chat_model
    | output_parser
)

In [64]:
# Ask the chain for a summary of the paper; the result is kept in `response`
response = rag_chain.invoke("""
                            Please summarize Empirical Comparison of Deep Learning Algorithm Performances on Rapidminer and Tensorflow for Classification Problems
                            """)

In [None]:
# Show the raw value of the chain's answer
response

In [65]:
from IPython.display import Markdown as md

# Render the answer as Markdown instead of a raw string repr
md(response)

This research empirically compares the performance of deep learning algorithms on RapidMiner and TensorFlow for classification problems.  The key finding is that RapidMiner significantly outperforms TensorFlow in model training and validation execution time.  RapidMiner achieved an average time of 1.01 seconds, while TensorFlow took 110.98 seconds.  This difference led to the rejection of the null hypothesis and acceptance of the alternative hypothesis, concluding a statistically significant performance difference between the two platforms.  The research offers valuable insights for selecting deep learning platforms based on specific needs.


In [15]:
# Ask a focused question about the paper and show the raw answer
response = rag_chain.invoke("What is RapidMiner?")

response

'Based on the provided text, RapidMiner is a data analysis platform that enables users to create and deploy machine learning models without needing extensive programming skills.  It uses the H2O deep learning algorithm, which is a multilayered feedforward neural network trained with stochastic gradient descent (SGD) and backpropagation.  The text highlights its speed advantage over TensorFlow in model training and validation.\n'

In [16]:
from IPython.display import Markdown as md

# Render the latest answer as Markdown for easier reading
md(response)

Based on the provided text, RapidMiner is a data analysis platform that enables users to create and deploy machine learning models without needing extensive programming skills.  It uses the H2O deep learning algorithm, which is a multilayered feedforward neural network trained with stochastic gradient descent (SGD) and backpropagation.  The text highlights its speed advantage over TensorFlow in model training and validation.
