In [12]:
import vertexai
from vertexai.language_models import TextGenerationModel
from dotenv import load_dotenv
import os

load_dotenv()

PROJECT = os.getenv("PROJECT")
LOCATION = "asia-southeast1"

vertexai.init(project=PROJECT, location=LOCATION)
# parameters = {
#     "candidate_count": 1,
#     "max_output_tokens": 1024,
#     "temperature": 0.9,
#     "top_p": 1
# }
model = TextGenerationModel.from_pretrained("text-bison")

In [13]:
# Utils
import time
from typing import List

# Langchain
import langchain

print(f"LangChain version: {langchain.__version__}")

# Vertex AI
# from langchain.llms import VertexAI

from google.cloud import aiplatform
from langchain_google_vertexai import ChatVertexAI
from langchain_google_vertexai import VertexAIEmbeddings
from langchain_google_vertexai import VertexAI
from langchain.schema import HumanMessage, SystemMessage

print(f"Vertex AI SDK version: {aiplatform.__version__}")

LangChain version: 0.1.12
Vertex AI SDK version: 1.44.0


# LLM
- temperature: Randomness/creativity in output
- token_limit: max amt of text output from one prompt
- top-k: token selected from top k options (based on probability)
- top-p: affects tokens selected for output as well -> select possible tokens to output till their probabilities sums to top-p. Choose among those tokens (based on temp)

In [14]:
# Utility functions for Embeddings API with rate limiting
def rate_limit(max_per_minute):
    period = 60 / max_per_minute
    print("Waiting")
    while True:
        before = time.time()
        yield
        after = time.time()
        elapsed = after - before
        sleep_time = max(0, period - elapsed)
        if sleep_time > 0:
            print(".", end="")
            time.sleep(sleep_time)


class CustomVertexAIEmbeddings(VertexAIEmbeddings):
    requests_per_minute: int
    num_instances_per_batch: int

    # Overriding embed_documents method
    def embed_documents(self, texts: List[str]):
        limiter = rate_limit(self.requests_per_minute)
        results = []
        docs = list(texts)

        while docs:
            # Working in batches because the API accepts maximum 5
            # documents per request to get embeddings
            head, docs = (
                docs[: self.num_instances_per_batch],
                docs[self.num_instances_per_batch :],
            )
            chunk = self.client.get_embeddings(head)
            results.extend(chunk)
            next(limiter)

        return [r.values for r in results]

In [52]:
# LLM model
ans_parameters = {
    # "candidate_count": 1,
    "model_name":"text-bison",
    "max_output_tokens": 2048,
    "temperature": 0.1,
    "top_p": 0.8,
    "top_k": 40,
    "verbose": True,
}

ques_parameters = {
    # "candidate_count": 1,
    "model_name":"text-bison",
    "max_output_tokens": 2048,
    "temperature": 0.3,
    "top_p": 0.8,
    "top_k": 40,
    "verbose": True,
}

# Minimise randomness
answer_llm = VertexAI( **ans_parameters)
#     model_name="text-bison",
#     max_output_tokens=512,
#     temperature=0.1,
#     top_p=0.8,
#     top_k=40,
#     verbose=True,
# )

# Ok to be random
question_llm = VertexAI(**ques_parameters)

# Chat
chat = ChatVertexAI()

# Embedding
EMBEDDING_QPM = 15
EMBEDDING_NUM_BATCH = 2
embeddings = CustomVertexAIEmbeddings(
    requests_per_minute=EMBEDDING_QPM,
    num_instances_per_batch=EMBEDDING_NUM_BATCH,
)

Model_name will become a required arg for VertexAIEmbeddings starting from Feb-01-2024. Currently the default is set to textembedding-gecko@001


# Reading from PDFs

In [71]:
# Ingest PDF files
from langchain.document_loaders import PyPDFLoader
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter

# file_path = "Lec2 DataCenterNetworking.pdf"
file_path = "Lec1.pdf"
loader = PyPDFLoader(file_path)
documents = loader.load()

In [72]:
len(documents)

54

In [73]:
documents[0]

Document(page_content='CE/CZ4052Cloud ComputingBasics, IaaS, PaaS, SaaSDr. Tan, Chee WeiEmail: cheewei.tan@ntu.edu.sgOffice: N4-02c-104', metadata={'source': 'Lec1.pdf', 'page': 0})

In [74]:
# split the documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=0)
docs = text_splitter.split_documents(documents)
print(f"# of documents = {len(docs)}")

# of documents = 54


In [75]:
docs

[Document(page_content='CE/CZ4052Cloud ComputingBasics, IaaS, PaaS, SaaSDr. Tan, Chee WeiEmail: cheewei.tan@ntu.edu.sgOffice: N4-02c-104', metadata={'source': 'Lec1.pdf', 'page': 0}),
 Document(page_content='Outline\n1Real-world examples of the cloud Definitions of cloud computingKey cloud concepts and characteristics Deployment scenarios', metadata={'source': 'Lec1.pdf', 'page': 1}),
 Document(page_content='Cloud:MassiveScale\n2Facebook[GigaOM,2012]30Kin2009->60Kin2010->100Kin2012Microsoft[DCknowledge]>1million,2013AWSEC2[RandyBias,2009]40K,8corespermachineGoogle[DCknowledge]>900K,2013', metadata={'source': 'Lec1.pdf', 'page': 2}),
 Document(page_content='Datacenter:outside\nCopyright:Google4', metadata={'source': 'Lec1.pdf', 'page': 3}),
 Document(page_content='Datacenter:outside\nCopyright:Googles', metadata={'source': 'Lec1.pdf', 'page': 4}),
 Document(page_content='Datacenter:inside\nCopyright:Google6', metadata={'source': 'Lec1.pdf', 'page': 5}),
 Document(page_content='Serverrac

In [76]:
question_gen = ''

for page in documents:
    question_gen += page.page_content

splitter_ques_gen = TokenTextSplitter(
model_name = 'gpt-3.5-turbo',
chunk_size = 1000,
chunk_overlap = 50
)

chunks_ques_gen = splitter_ques_gen.split_text(question_gen)

document_ques_gen = [Document(page_content=t) for t in chunks_ques_gen]

In [77]:
len(document_ques_gen)

4

In [78]:
document_ques_gen

[Document(page_content='CE/CZ4052Cloud ComputingBasics, IaaS, PaaS, SaaSDr. Tan, Chee WeiEmail: cheewei.tan@ntu.edu.sgOffice: N4-02c-104Outline\n1Real-world examples of the cloud Definitions of cloud computingKey cloud concepts and characteristics Deployment scenariosCloud:MassiveScale\n2Facebook[GigaOM,2012]30Kin2009->60Kin2010->100Kin2012Microsoft[DCknowledge]>1million,2013AWSEC2[RandyBias,2009]40K,8corespermachineGoogle[DCknowledge]>900K,2013Datacenter:outside\nCopyright:Google4Datacenter:outside\nCopyright:GooglesDatacenter:inside\nCopyright:Google6Serverracks\n7Photocredit:GoogleServer:inside\nCopyright:xsnetsServercage\n9Network room\nCopyright:Google10CoolingfunctionalityPowerfunctionality\n11Copyright:GigaOM\nCopyright: NationofChangehttps://www.youtube.com/watch?v=XZmGGAbHqa0Explore Google DatacenterCloudproviders\n12amazonwebservices™\nCloud-basedservices\n13\nCloudvendors\n14\nSowhatisacloud?\n15serv•ices--Cloud corecloud edgeData centers and InternetAdefinition\n16Cloud com

# VectorStores
- Index or DB to store vectors (our embeddings)
- Columns are: [Embedding Value, Metadata]

In [79]:
# from langchain_community.vectorstores import Chroma

In [80]:
# db = Chroma.from_documents(docs, embeddings)

# Prompts

In [81]:
from langchain.prompts import PromptTemplate

In [82]:
prompt_template = """
You are an expert at creating questions based on the content in documents.
Your goal is to prepare a student for their exam and tests.
You do this by asking questions about the text below:

------------
{text}
------------

Create questions in an MCQ format with 4 possible solutions that will prepare the students for
their tests. The output specifications are as follows:
'question', 'possible_answers', 'correct_answer', 'explanation'. Separate the questions with a *.
Make sure not to lose any important information.

QUESTIONS:
"""

PROMPT_QUESTIONS = PromptTemplate(template=prompt_template, input_variables=["text"])

refine_template = ("""
You are an expert at creating practice questions based on content in documents.
Your goal is to help a stuednt prepare for their exams and test.
We have received some practice questions to a certain extent: {existing_answer}.
The question, possible answers, correct answer and explanation can be found in the 'question',
'possible_answers','correct_answer' and 'explanation' fields respectively.
We have the option to refine the existing questions or add new ones. 
(only if necessary) with some more context below. Separate the questions with a '*'.
------------
{text}
------------

Given the new context, refine the original questions in English.
If the context is not helpful, please provide the original questions.
QUESTIONS:
"""
)

REFINE_PROMPT_QUESTIONS = PromptTemplate(
    input_variables=["existing_answer", "text"],
    template=refine_template,
)

In [98]:
map_template = """
You are an expert at summarising the important content in documents.
Your goal is to prepare a student for their exam and tests by ensuring the summaries of documents are concise and informative.
You do this by reading the document's text below carefully:

------------
{docs}
------------

Based on the text, please identify the main content

Main Content:
"""

map_prompt = PromptTemplate(template=map_template, input_variables=["docs"])

reduce_template = ("""
You are an expert at creating practice questions based on content in documents.
Your goal is to help a stuednt prepare for their exams and test.
We have received some summaries of the content that you need to prepare the students for:
{docs}

Create questions in an MCQ format with 4 possible solutions that will prepare the students for their tests. The output specifications are as follows:
'question', 'possible_answers', 'correct_answer', 'explanation'.
Make sure not to lose any important information.
"""
)

reduce_prompt  = PromptTemplate(
    input_variables=["docs"],
    template=reduce_template,
)

# LLM Pipeline
This section integrates the above components together and generates the questions and answers.

In [84]:
from langchain.chains.summarize import load_summarize_chain
from langchain.chains import RetrievalQA
from langchain_core.vectorstores import VectorStoreRetriever
from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain, LLMChain, StuffDocumentsChain

## Testing of MapReduce for Large text

In [99]:

map_chain = LLMChain(llm=answer_llm, prompt=map_prompt)
reduce_chain = LLMChain(llm=question_llm, prompt=reduce_prompt)

combine_documents_chain = StuffDocumentsChain(
    llm_chain=reduce_chain, document_variable_name="docs"
)

# Combines and iteratively reduces the mapped documents
reduce_documents_chain = ReduceDocumentsChain(
    # This is final chain that is called.
    combine_documents_chain=combine_documents_chain,
    # If documents exceed context for `StuffDocumentsChain`
    collapse_documents_chain=combine_documents_chain,
    # The maximum number of tokens to group documents into.
    token_max=4000,
)

# Combining documents by mapping a chain over them, then combining results
map_reduce_chain = MapReduceDocumentsChain(
    # Map chain
    llm_chain=map_chain,
    # Reduce chain
    reduce_documents_chain=reduce_documents_chain,
    # The variable name in the llm_chain to put the documents in
    document_variable_name="docs",
    # Return the results of the map steps in the output
    return_intermediate_steps=False,
)

In [100]:
res = map_reduce_chain.run(document_ques_gen)
print(res)

 **Question 1:**
**question**: What is cloud computing?
**possible_answers**:
a) A model for enabling ubiquitous, convenient, on-demand network access to a shared pool of configurable computing resources.
b) A type of software that allows users to access their files from any device.
c) A way to store data online.
d) A type of computer that is connected to the internet.
**correct_answer**: a) A model for enabling ubiquitous, convenient, on-demand network access to a shared pool of configurable computing resources.
**explanation**: Cloud computing is a model for enabling ubiquitous, convenient, on-demand network access to a shared pool of configurable computing resources. This means that users can access computing resources, such as storage, processing power, and software, over the internet on a pay-as-you-go basis.

**Question 2:**
**question**: Which of the following is not a key cloud concept?
**possible_answers**:
a) On-demand self-service
b) Broad network access
c) Resource pooling


In [104]:
# Save res to txt
output_file = "static/output/questions.md"
with open(output_file, "w") as f:
    f.write(res)

## Testing of Refine Chain for Smaller Text

In [88]:
import ast
ques_gen_chain = load_summarize_chain(llm = question_llm, 
                                            chain_type = "refine", 
                                            verbose = True, 
                                            question_prompt=PROMPT_QUESTIONS, 
                                            refine_prompt=REFINE_PROMPT_QUESTIONS)

# Outputs a list of questions
ques = ques_gen_chain.run([Document(page_content=question_gen)])

# ques = [" {\n 'question': 'Which of the following is not one of the three guarantees in the CAP theorem?',\n 'possible_answers': ['Consistency', 'Availability', 'Partition-tolerance', 'Reliability'],\n 'correct_answer': 'Reliability',\n 'explanation': 'The CAP theorem describes the trade-offs between Consistency, Availability, and Partition-tolerance in distributed systems.'\n},\n*\n{\n 'question': 'What does the CAP theorem state?',\n 'possible_answers': ['It is impossible for a distributed system to provide all three guarantees of Consistency, Availability, and Partition-tolerance at the same time.', 'A distributed system can satisfy any two of the three guarantees at the same time but not all three.', 'A distributed system can only satisfy one of the three guarantees at a time.', 'A distributed system can always satisfy all three guarantees at the same time.'],\n 'correct_answer': 'A distributed system can satisfy any two of the three guarantees at the same time but not all three.',\n 'explanation': 'The CAP theorem states that it is impossible for a distributed system to provide all three guarantees of Consistency, Availability, and Partition-tolerance at the same time.'\n},\n*\n{\n 'question': 'Which of the following is an example of an AP system?',\n 'possible_answers': ['Web Caching', 'DNS', 'Majority protocols', 'Distributed Locking'],\n 'correct_answer': 'Web Caching',\n 'explanation': 'AP systems relax consistency in favor of availability. Web Caching is an example of an AP system because it allows for stale data to be served in order to improve performance.'\n},\n*\n{\n 'question': 'Which of the following is an example of a CP system?',\n 'possible_answers': ['Web Caching', 'DNS', 'Majority protocols', 'Distributed Locking'],\n 'correct_answer': 'Distributed Locking',\n 'explanation': 'CP systems sacrifice availability for consistency. Distributed Locking is an example of a CP system because it ensures that only one node can access a shared resource at a time, even if it means that other nodes may be unavailable.'\n},\n*\n{\n 'question': 'What is eventual consistency?',\n 'possible_answers': ['A specific form of weak consistency', 'A type of strong consistency', 'A type of partition-tolerance', 'A type of availability'],\n 'correct_answer': 'A specific form of weak consistency',\n 'explanation': 'Eventual consistency is a specific form of weak consistency that guarantees that if no new updates are made to an object, eventually all accesses will return the last updated value.'\n},\n*\n{\n 'question': 'Why does Facebook use an eventually consistent model?',\n 'possible_answers': ['To reduce the load and improve availability', 'To ensure strong consistency', 'To improve partition-tolerance', 'To reduce the cost of storage'],\n 'correct_answer': 'To reduce the load and improve availability',\n 'explanation': 'Facebook uses an eventually consistent model to reduce the load and improve availability. With over 1 billion active users, it is non-trivial to efficiently and reliably store the huge amount of data generated at any given time.'\n},\n*\n{\n 'question': 'What is the trade-off between consistency and availability in an airline reservation system?',\n 'possible_answers': ['When most of the seats are available, availability is more critical; when the plane is close to being filled, consistency is more critical.', 'When most of the seats are available, consistency is more critical; when the plane is close to being filled, availability is more critical.', 'There is no trade-off between consistency and availability in an airline reservation system.', 'The trade-off between consistency and availability is the same regardless of the number of seats available.'],\n 'correct_answer': 'When most of the seats are available, availability is more critical; when the plane is close to being filled, consistency is more critical.',\n 'explanation': 'When most of the seats are available, it is ok to rely on somewhat out-of-date data, availability is more critical. When the plane is close to be filled, it needs more accurate data to ensure the plane is not overbooked, consistency is more critical.'\n}"]
ques_list = ques.split(",\n'*',\n")
formatted_ques_list = []
for i, q in enumerate(ques_list):
    f_q = ast.literal_eval(q)
    formatted_ques_list.append(f_q)
    # print(f_q)

    
# ques_list = ques.split("\n")
# filtered_ques_list = [element for element in ques_list if element.endswith('?') or element.endswith('.')]



[1m> Entering new RefineDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
You are an expert at creating questions based on the content in documents.
Your goal is to prepare a student for their exam and tests.
You do this by asking questions about the text below:

------------
CE/CZ4052Cloud ComputingBasics, IaaS, PaaS, SaaSDr. Tan, Chee WeiEmail: cheewei.tan@ntu.edu.sgOffice: N4-02c-104Outline
1Real-world examples of the cloud Definitions of cloud computingKey cloud concepts and characteristics Deployment scenariosCloud:MassiveScale
2Facebook[GigaOM,2012]30Kin2009->60Kin2010->100Kin2012Microsoft[DCknowledge]>1million,2013AWSEC2[RandyBias,2009]40K,8corespermachineGoogle[DCknowledge]>900K,2013Datacenter:outside
Copyright:Google4Datacenter:outside
Copyright:GooglesDatacenter:inside
Copyright:Google6Serverracks
7Photocredit:GoogleServer:inside
Copyright:xsnetsServercage
9Network room
Copyright:Google10CoolingfunctionalityPowerf

In [89]:
# Answer generation
# retriever = VectorStoreRetriever(vectorstore=db)
# answer_gen_chain = RetrievalQA.from_llm(llm = answer_llm, retriever=db.as_retriever())

# Output CSV

In [90]:
import os
import csv

fields = formatted_ques_list[0].keys()

base_folder = 'static/output/'
if not os.path.isdir(base_folder):
    os.mkdir(base_folder)
output_file = base_folder+"QA.csv"
with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fields)

    writer.writeheader()

    for q in formatted_ques_list:
        writer.writerow(q)


    # csv_writer = csv.writer(csvfile)
    # csv_writer.writerow(["Question", "Answer"])  # Writing the header row

    # for question in filtered_ques_list:
    #     print("Question: ", question)
    #     answer = answer_gen_chain.run(question)
    #     print("Answer: ", answer)
    #     print("--------------------------------------------------\n\n")

    #     # Save answer to CSV file
    #     csv_writer.writerow([question, answer])

In [91]:
import json
with open("static/output/process_txt.json", "r") as f:
        d = json.load(f)
type(d)

dict

In [92]:
d

{'text': 'CE/CZ4052Cloud ComputingDistributed Computingvia SaaS: API, RESTDr. Tan, Chee WeiEmail: cheewei.tan@ntu.edu.sgOffice: N4-02c-104Traditional web applications\nClientServerGET /the-resource...200 OK<html>Code...</html>\nDisplays the page, then user clickson link.GET /another-resource...200 OK<html>Code...</html>\nDisplays the other page, ...Traditional web applicationsThe interface is built on HTML & HTTP.•Drawbacks:–The client must understand both HTTP and HTML.–The entire webpage is replaced with another one.•No way to animate transitions between webpages.–Same data is usually sent in multiple responses.•E.g. HTML code for the layout.Traditional web applications\nClientServer\nHTTP & HTMLClient???•HTTP & HTML can be used, but is not optimal.•The GUI on smartphones does not use HTML.•E.g. GET /users/3:\n<h1>Claire</h1><p>Claire is 24 years old and lives in Boston.</p>NameAgeCityApplication Programming InterfaceAn API is an interface for Machine ↔ Machine communication.•An API 