In [153]:
import vertexai
from vertexai.language_models import TextGenerationModel
from dotenv import load_dotenv
import os

load_dotenv()

PROJECT = os.getenv("PROJECT")
LOCATION = "asia-southeast1"

vertexai.init(project=PROJECT, location=LOCATION)
# parameters = {
#     "candidate_count": 1,
#     "max_output_tokens": 1024,
#     "temperature": 0.9,
#     "top_p": 1
# }
model = TextGenerationModel.from_pretrained("text-bison")

In [60]:
# Utils
import time
from typing import List

# Langchain
import langchain

print(f"LangChain version: {langchain.__version__}")

# Vertex AI
# from langchain.llms import VertexAI

from google.cloud import aiplatform
from langchain_google_vertexai import ChatVertexAI
from langchain_google_vertexai import VertexAIEmbeddings
from langchain_google_vertexai import VertexAI
from langchain.schema import HumanMessage, SystemMessage

print(f"Vertex AI SDK version: {aiplatform.__version__}")

LangChain version: 0.1.12
Vertex AI SDK version: 1.44.0


# LLM
- temperature: Randomness/creativity in output
- token_limit: max amt of text output from one prompt
- top-k: token selected from top k options (based on probability)
- top-p: affects tokens selected for output as well -> select possible tokens to output till their probabilities sums to top-p. Choose among those tokens (based on temp)

In [61]:
# Utility functions for Embeddings API with rate limiting
def rate_limit(max_per_minute):
    period = 60 / max_per_minute
    print("Waiting")
    while True:
        before = time.time()
        yield
        after = time.time()
        elapsed = after - before
        sleep_time = max(0, period - elapsed)
        if sleep_time > 0:
            print(".", end="")
            time.sleep(sleep_time)


class CustomVertexAIEmbeddings(VertexAIEmbeddings):
    requests_per_minute: int
    num_instances_per_batch: int

    # Overriding embed_documents method
    def embed_documents(self, texts: List[str]):
        limiter = rate_limit(self.requests_per_minute)
        results = []
        docs = list(texts)

        while docs:
            # Working in batches because the API accepts maximum 5
            # documents per request to get embeddings
            head, docs = (
                docs[: self.num_instances_per_batch],
                docs[self.num_instances_per_batch :],
            )
            chunk = self.client.get_embeddings(head)
            results.extend(chunk)
            next(limiter)

        return [r.values for r in results]

In [146]:
# LLM model
ans_parameters = {
    # "candidate_count": 1,
    "model_name":"text-bison",
    "max_output_tokens": 512,
    "temperature": 0.1,
    "top_p": 0.8,
    "top_k": 40,
    "verbose": True,
}

ques_parameters = {
    # "candidate_count": 1,
    "model_name":"text-bison",
    "max_output_tokens": 2048,
    "temperature": 0.3,
    "top_p": 0.8,
    "top_k": 40,
    "verbose": True,
}

# Minimise randomness
answer_llm = VertexAI( **ans_parameters)
#     model_name="text-bison",
#     max_output_tokens=512,
#     temperature=0.1,
#     top_p=0.8,
#     top_k=40,
#     verbose=True,
# )

# Ok to be random
question_llm = VertexAI(**ques_parameters)

# Chat
chat = ChatVertexAI()

# Embedding
EMBEDDING_QPM = 15
EMBEDDING_NUM_BATCH = 2
embeddings = CustomVertexAIEmbeddings(
    requests_per_minute=EMBEDDING_QPM,
    num_instances_per_batch=EMBEDDING_NUM_BATCH,
)

Model_name will become a required arg for VertexAIEmbeddings starting from Feb-01-2024. Currently the default is set to textembedding-gecko@001


# Reading from PDFs

In [63]:
# Ingest PDF files
from langchain.document_loaders import PyPDFLoader
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter

file_path = "Lec1.pdf"
loader = PyPDFLoader(file_path)
documents = loader.load()

In [64]:
len(documents)

54

In [65]:
# split the documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=0)
docs = text_splitter.split_documents(documents)
print(f"# of documents = {len(docs)}")

# of documents = 54


In [66]:
docs

[Document(page_content='CE/CZ4052Cloud ComputingBasics, IaaS, PaaS, SaaSDr. Tan, Chee WeiEmail: cheewei.tan@ntu.edu.sgOffice: N4-02c-104', metadata={'source': 'Lec1.pdf', 'page': 0}),
 Document(page_content='Outline\n1Real-world examples of the cloud Definitions of cloud computingKey cloud concepts and characteristics Deployment scenarios', metadata={'source': 'Lec1.pdf', 'page': 1}),
 Document(page_content='Cloud:MassiveScale\n2Facebook[GigaOM,2012]30Kin2009->60Kin2010->100Kin2012Microsoft[DCknowledge]>1million,2013AWSEC2[RandyBias,2009]40K,8corespermachineGoogle[DCknowledge]>900K,2013', metadata={'source': 'Lec1.pdf', 'page': 2}),
 Document(page_content='Datacenter:outside\nCopyright:Google4', metadata={'source': 'Lec1.pdf', 'page': 3}),
 Document(page_content='Datacenter:outside\nCopyright:Googles', metadata={'source': 'Lec1.pdf', 'page': 4}),
 Document(page_content='Datacenter:inside\nCopyright:Google6', metadata={'source': 'Lec1.pdf', 'page': 5}),
 Document(page_content='Serverrac

In [67]:
question_gen = ''

for page in documents:
    question_gen += page.page_content

splitter_ques_gen = TokenTextSplitter(
model_name = 'gpt-3.5-turbo',
chunk_size = 10000,
chunk_overlap = 0
)

chunks_ques_gen = splitter_ques_gen.split_text(question_gen)

document_ques_gen = [Document(page_content=t) for t in chunks_ques_gen]

# VectorStores
- Index or DB to store vectors (our embeddings)
- Columns are: [Embedding Value, Metadata]

In [None]:
from langchain_community.vectorstores import Chroma

In [None]:
db = Chroma.from_documents(docs, embeddings)

Waiting
..........................

# Prompts

In [75]:
from langchain.prompts import PromptTemplate

In [137]:
prompt_template = """
You are an expert at creating questions based on the content in documents.
Your goal is to prepare a student for their exam and tests.
You do this by asking questions about the text below:

------------
{text}
------------

Create questions in an MCQ format with 4 possible solutions that will prepare the students for their tests. The output specifications are as follows:
'question', 'possible_answers', 'correct_answer', 'explanation'. Separate the questions with a '*'.
Make sure not to lose any important information.

QUESTIONS:
"""

PROMPT_QUESTIONS = PromptTemplate(template=prompt_template, input_variables=["text"])

refine_template = ("""
You are an expert at creating practice questions based on content in documents.
Your goal is to help a stuednt prepare for their exams and test.
We have received some practice questions to a certain extent: {existing_answer}. The question, possible answers, correct answer and explanation can be found in the 'question', 'possible_answers', 'correct_answer' and 'explanation' fields respectively.
We have the option to refine the existing questions or add new ones. 
(only if necessary) with some more context below. Separate the questions with a '*'.
------------
{text}
------------

Given the new context, refine the original questions in English.
If the context is not helpful, please provide the original questions.
QUESTIONS:
"""
)

REFINE_PROMPT_QUESTIONS = PromptTemplate(
    input_variables=["existing_answer", "text"],
    template=refine_template,
)

# LLM Pipeline
This section integrates the above components together and generates the questions and answers.

In [103]:
from langchain.chains.summarize import load_summarize_chain
from langchain.chains import RetrievalQA
from langchain_core.vectorstores import VectorStoreRetriever

In [104]:
docs

[Document(page_content='CE/CZ4052Cloud ComputingBasics, IaaS, PaaS, SaaSDr. Tan, Chee WeiEmail: cheewei.tan@ntu.edu.sgOffice: N4-02c-104', metadata={'source': 'Lec1.pdf', 'page': 0}),
 Document(page_content='Outline\n1Real-world examples of the cloud Definitions of cloud computingKey cloud concepts and characteristics Deployment scenarios', metadata={'source': 'Lec1.pdf', 'page': 1}),
 Document(page_content='Cloud:MassiveScale\n2Facebook[GigaOM,2012]30Kin2009->60Kin2010->100Kin2012Microsoft[DCknowledge]>1million,2013AWSEC2[RandyBias,2009]40K,8corespermachineGoogle[DCknowledge]>900K,2013', metadata={'source': 'Lec1.pdf', 'page': 2}),
 Document(page_content='Datacenter:outside\nCopyright:Google4', metadata={'source': 'Lec1.pdf', 'page': 3}),
 Document(page_content='Datacenter:outside\nCopyright:Googles', metadata={'source': 'Lec1.pdf', 'page': 4}),
 Document(page_content='Datacenter:inside\nCopyright:Google6', metadata={'source': 'Lec1.pdf', 'page': 5}),
 Document(page_content='Serverrac

In [147]:
import ast
ques_gen_chain = load_summarize_chain(llm = question_llm, 
                                            chain_type = "refine", 
                                            verbose = True, 
                                            question_prompt=PROMPT_QUESTIONS, 
                                            refine_prompt=REFINE_PROMPT_QUESTIONS)

# Outputs a list of questions
ques = ques_gen_chain.run(document_ques_gen)
ques_list = ques.split(",\n'*',\n")
formatted_ques_list = []
for i, q in enumerate(ques_list):
    f_q = ast.literal_eval(q)
    formatted_ques_list.append(f_q)
    # print(f_q)

    
# ques_list = ques.split("\n")
# filtered_ques_list = [element for element in ques_list if element.endswith('?') or element.endswith('.')]



[1m> Entering new RefineDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
You are an expert at creating questions based on the content in documents.
Your goal is to prepare a student for their exam and tests.
You do this by asking questions about the text below:

------------
CE/CZ4052Cloud ComputingBasics, IaaS, PaaS, SaaSDr. Tan, Chee WeiEmail: cheewei.tan@ntu.edu.sgOffice: N4-02c-104Outline
1Real-world examples of the cloud Definitions of cloud computingKey cloud concepts and characteristics Deployment scenariosCloud:MassiveScale
2Facebook[GigaOM,2012]30Kin2009->60Kin2010->100Kin2012Microsoft[DCknowledge]>1million,2013AWSEC2[RandyBias,2009]40K,8corespermachineGoogle[DCknowledge]>900K,2013Datacenter:outside
Copyright:Google4Datacenter:outside
Copyright:GooglesDatacenter:inside
Copyright:Google6Serverracks
7Photocredit:GoogleServer:inside
Copyright:xsnetsServercage
9Network room
Copyright:Google10CoolingfunctionalityPowerf

In [107]:
# Answer generation
# retriever = VectorStoreRetriever(vectorstore=db)
# answer_gen_chain = RetrievalQA.from_llm(llm = answer_llm, retriever=db.as_retriever())

# Output CSV

In [152]:
import os
import csv

fields = formatted_ques_list[0].keys()

base_folder = 'static/output/'
if not os.path.isdir(base_folder):
    os.mkdir(base_folder)
output_file = base_folder+"QA.csv"
with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fields)

    writer.writeheader()

    for q in formatted_ques_list:
        writer.writerow(q)


    # csv_writer = csv.writer(csvfile)
    # csv_writer.writerow(["Question", "Answer"])  # Writing the header row

    # for question in filtered_ques_list:
    #     print("Question: ", question)
    #     answer = answer_gen_chain.run(question)
    #     print("Answer: ", answer)
    #     print("--------------------------------------------------\n\n")

    #     # Save answer to CSV file
    #     csv_writer.writerow([question, answer])