In [51]:
import os
# Read the Gemini API key from the environment instead of hardcoding it.
# Raises KeyError if GEMINI_API_KEY is not set.
key = os.environ['GEMINI_API_KEY']

In [52]:
# Instantiate the Gemini chat model that generates the answers

from langchain_google_genai import ChatGoogleGenerativeAI

chat_model = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    google_api_key=key,
)

In [53]:
# Load a PDF file as one Document per page

from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("./assets/empirical_comparison.pdf")

# Use load() rather than load_and_split(): load_and_split() additionally runs
# a default text splitter over the loaded documents, so its result is not
# "pages". The chunking is done explicitly with NLTKTextSplitter further down,
# so splitting here as well would be redundant.
pages = loader.load()

In [54]:
# Number of Document objects returned by the loader
len(pages)

11

In [55]:
# Split document into chunks

from langchain_text_splitters import NLTKTextSplitter
import nltk

# Fetch the NLTK 'punkt_tab' tokenizer data needed by the splitter
# (NLTK reports the package as up to date when it is already installed).
nltk.download('punkt_tab')

# Splitter settings are the defaults suggested by Copilot.
# TODO: experiment with different text splitters in the future.
# Note: chunk_size is not a hard limit -- the splitter logs a message when it
# creates a chunk longer than the requested size.
text_splitter = NLTKTextSplitter(chunk_overlap=50, chunk_size=500)

# Break every loaded document into smaller chunks
chunks = text_splitter.split_documents(pages)

# Report how many chunks were produced
print(len(chunks))

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
Created a chunk of size 641, which is longer than the specified 500
Created a chunk of size 522, which is longer than the specified 500
Created a chunk of size 590, which is longer than the specified 500


72


In [56]:
# Set up the embedding model used to vectorize the chunks (and later the queries)

from langchain_google_genai import GoogleGenerativeAIEmbeddings

embedding_model = GoogleGenerativeAIEmbeddings(
    model="models/text-embedding-004",
    google_api_key=key,
)

In [57]:
# Store the chunk embeddings in a persistent vector database

# I use Chroma as the vector store
from langchain_community.vectorstores import Chroma

# Open (or create) the on-disk store first. Chroma.from_documents() appends to
# the persisted collection on every call, so unconditionally calling it would
# re-embed all chunks on each notebook run and leave duplicate entries in
# ./chroma_db_, which then crowd the retriever's top-k results.
db = Chroma(persist_directory="./chroma_db_", embedding_function=embedding_model)

# Embed each chunk and load it into Chroma only when the store is still empty.
# Delete the ./chroma_db_ directory to force a rebuild after changing the PDF
# or the chunking parameters.
if not db.get(limit=1)["ids"]:
    db = Chroma.from_documents(chunks, embedding_model, persist_directory="./chroma_db_")

In [58]:
# Open the persisted Chroma store from disk, passing the same embedding model
# that was used when the chunks were stored.
db_connection = Chroma(
    embedding_function=embedding_model,
    persist_directory="./chroma_db_",
)

# TODO: this currently triggers a LangChain deprecation warning; fix it later.

In [59]:
# Turn the Chroma connection into a retriever object; k=5 is passed as a
# search keyword argument (number of chunks to fetch per query).
retriever = db_connection.as_retriever(
    search_kwargs={"k": 5},
)

# Sanity check: show which retriever class was created
print(type(retriever))

<class 'langchain_core.vectorstores.base.VectorStoreRetriever'>


In [60]:
from langchain_core.messages import SystemMessage
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate

In [61]:
# Chat prompt made of two messages: a fixed system instruction and a human
# message whose {context} and {question} placeholders are filled in at run time.
chat_template = ChatPromptTemplate.from_messages([
    # System message: static text, contains no template variables
    SystemMessage(content="""
                  You are a Helpful AI Bot.
                  Given a context and question from user,
                  you should answer based on the given context.
                  """),
    # Human message template: expects the "context" and "question" inputs
    HumanMessagePromptTemplate.from_template(
        """
        Answer the question based on the given context.
        Context: {context}
        Question: {question}
        Answer: 
        """)
])

In [62]:
from langchain_core.output_parsers import StrOutputParser

# Parser used as the last stage of the chain so the answer comes back as a plain string
output_parser = StrOutputParser()

In [63]:
# Build the RAG chain: retrieve context, fill the prompt, call the model, parse the output

from langchain_core.runnables import RunnablePassthrough

def format_docs(docs):
    """Merge the text of several documents into a single string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` values in their original order, separated by a
        blank line. An empty iterable yields an empty string.
    """
    contents = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(contents)


# Compose the RAG pipeline with the `|` operator:
#   1. build the prompt inputs from the chain input (the question text):
#      "context"  = retriever results joined into one string by format_docs
#      "question" = the chain input, forwarded by RunnablePassthrough()
#   2. fill chat_template with those inputs
#   3. send the resulting prompt to chat_model
#   4. run the model output through output_parser
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | chat_template
    | chat_model
    | output_parser
)

In [64]:
# Ask the chain for a summary of the paper. The same string is used both as
# the retriever query and as the {question} value in the prompt.
response = rag_chain.invoke("""
                            Please summarize Empirical Comparison of Deep Learning Algorithm Performances on Rapidminer and Tensorflow for Classification Problems
                            """)

In [None]:
# Display the raw answer string (this cell has no recorded execution)
response

In [65]:
from IPython.display import Markdown as md

# Render the answer as Markdown instead of showing the raw string
md(response)

This research empirically compares the performance of deep learning algorithms on RapidMiner and TensorFlow for classification problems.  The key finding is that RapidMiner significantly outperforms TensorFlow in model training and validation execution time.  RapidMiner achieved an average time of 1.01 seconds, while TensorFlow took 110.98 seconds.  This difference led to the rejection of the null hypothesis and acceptance of the alternative hypothesis, concluding a statistically significant performance difference between the two platforms.  The research offers valuable insights for selecting deep learning platforms based on specific needs.


In [66]:
# Ask a second question; this overwrites the earlier `response` value
response = rag_chain.invoke("""Who are the author of this paper?""")

# Show the raw answer string
response

'The authors of the paper are Utomo Pujianto and Alvian Rahmadani Saputra.\n'

In [67]:
# Markdown was already imported as `md` in an earlier cell; the import is repeated here
from IPython.display import Markdown as md

# Render the latest answer as Markdown
md(response)

The authors of the paper are Utomo Pujianto and Alvian Rahmadani Saputra.
