In [13]:
import os
from dotenv import load_dotenv
import zipfile

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains.question_answering import load_qa_chain
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import Chroma
import chromadb
from dotenv import load_dotenv
import pickle

load_dotenv()

True

In [12]:
def load_chunk_persist_pdf() -> Chroma:
    pdf_folder_path = "./pdfs/"
    documents = []
    for file in os.listdir(pdf_folder_path):
        if file.endswith('.pdf'):
            pdf_path = os.path.join(pdf_folder_path, file)
            loader = PyPDFLoader(pdf_path)
            documents.extend(loader.load())
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
    chunked_documents = text_splitter.split_documents(documents)
    client = chromadb.Client()
    if client.list_collections():
        consent_collection = client.create_collection("consent_collection")
    else:
        print("Collection already exists")
    vectordb = Chroma.from_documents(
        documents=chunked_documents,
        embedding=OpenAIEmbeddings(),
        persist_directory="./chroma_store"
    )
    vectordb.persist()
    return vectordb

In [None]:
def create_agent_chain():
    model_name = "gpt-3.5-turbo"
    llm = ChatOpenAI(model_name=model_name)
    chain = load_qa_chain(llm, chain_type="stuff")
    return chain

In [None]:
def get_llm_response(query, vectordb):
    chain = create_agent_chain()
    matching_docs = vectordb.similarity_search(query)
    answer = chain.run(input_documents=matching_docs, question=query)
    return answer

In [None]:
def zip_folder(folder_path, output_path):
    """
    Compresses a folder into a ZIP file.

    Parameters:
    - folder_path: The path to the folder that should be compressed.
    - output_path: The path where the output ZIP file should be saved.
    """
    # Create a ZIP file for writing compressed data
    with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        # Walk through the directory
        for root, dirs, files in os.walk(folder_path):
            for file in files:
                # Create a relative path for files to maintain the directory structure
                rel_path = os.path.relpath(os.path.join(root, file), os.path.dirname(folder_path))
                zipf.write(os.path.join(root, file), arcname=rel_path)

In [None]:
zip_folder("./chroma_store", "chroma_store.zip")

In [None]:
res = load_chunk_persist_pdf() 

In [None]:
# with open('vector_db.pkl', 'wb') as f:
#     pickle.dump(res, f)

In [None]:
get_llm_response("What is the purpose of this study?", res)


In [28]:
import requests
from langchain.vectorstores import Chroma
import shutil
import zipfile

zip_path = './temp_storage/chroma_store_1_pdf.zip'
db_path = './temp_storage/chroma_store_1'
url = "http://ajuq4-ruaaa-aaaaa-qaaga-cai.localhost:4943/chroma_store_1_pdf.zip"

def run_query(query):
    os.makedirs('./temp_storage', exist_ok=True)

    response = requests.get(url)
    if response.status_code == 200:
        with open(zip_path, 'wb') as f:
            f.write(response.content)
        message = "File downloaded successfully."
    else:
        message = f"Failed to download the file. Status code: {response.status_code}"
        return 0

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(db_path)
    print(message)
    db = Chroma(persist_directory=db_path, embedding_function=OpenAIEmbeddings())


    model_name = "gpt-3.5-turbo"
    llm = ChatOpenAI(model_name=model_name)
    chain = load_qa_chain(llm, chain_type="stuff")

    matching_docs = db.similarity_search(query)
    answer = chain.run(input_documents=matching_docs, question=query)    

    return answer



In [46]:
db_path = './temp_storage/chroma_store_1/chroma_store/'
db = Chroma(persist_directory=db_path, embedding_function=OpenAIEmbeddings())

model_name = "gpt-3.5-turbo"
llm = ChatOpenAI(model_name=model_name)
chain = load_qa_chain(llm, chain_type="stuff")

# model_name2 = "gpt-3.5-turbo"
# llm2 = ChatOpenAI(model_name=model_name2)
# chain2 = load_qa_chain(llm2, chain_type="stuff")


information = "what is curroption?"
query = f"Give content from the docuemnts provided for the following query. If the query doesn't make sense or doesn't relate to the document provided give a general overview of the pdf. Make sure the content is of atleast 300 words: {information}"

matching_docs = db.similarity_search(query)
answer = chain.run(input_documents=matching_docs, question=query)    

# query = "From the given context: {answer}. Make 3 Quesions. The question should be in this format #questionstart#question 1, question 2, question 3#questionend#"
# answer2 = chain.run(question=query)    

answer

"Corruption, as described in the documents provided, is referred to as an insidious plague that has a wide range of corrosive effects on societies. It is a phenomenon found in all countries, big and small, rich and poor, but its effects are most destructive in the developing world. The detrimental impacts of corruption include undermining democracy and the rule of law, leading to human rights violations, distorting markets, eroding the quality of life, and allowing organized crime, terrorism, and other threats to human security to flourish.\n\nOne significant consequence of corruption highlighted in the documents is its disproportionate impact on the poor. It diverts funds intended for development, undermines a government's ability to provide basic services, feeds inequality and injustice, and discourages foreign aid and investment. Moreover, corruption is identified as a key element in economic underperformance and a major obstacle to poverty alleviation and overall development.\n\nTo

In [51]:
from langchain.chat_models import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
import re
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are world class quiz writer who creates questions and answers from content."),
    ("user", "{input}")
])
llm = ChatOpenAI(model_name="gpt-3.5-turbo")
output_parser = StrOutputParser()
chain = prompt|llm|output_parser 


In [53]:
res = chain.invoke({"input": f"From the given context: {answer}."+" Make 3 Quesions. The question should be in this format #questionstart#{question-content}#questionend# for each question. For a total of 3 times."})
pattern = r"#questionstart#(.*?)#questionend#"
questions = re.findall(pattern, res)


In [55]:
# Define a new prompt template for generating quiz answers
prompt2 = ChatPromptTemplate.from_messages([
    ("system", "You are a world-class quiz writer who creates one correct and three incorrect answers for quiz questions."),
    ("user", "{input}")
])

# Still using the StrOutputParser for parsing the output to a string
output_parser = StrOutputParser()

# Chain the components
chain2 = prompt2 | llm | output_parser 

In [56]:
for question in questions:
    res = chain.invoke({
    "input": f"Based on the question: {question}." + 
    " Generate one correct answer and three incorrect answers for a quiz. " +
    "Format the answers as follows: #correctanswer#Correct Answer#end# " +
    "and #wronganswer#Wrong Answer 1#end# #wronganswer#Wrong Answer 2#end# #wronganswer#Wrong Answer 3#end#."
    })
    # Define patterns to extract correct and incorrect answers
    correct_pattern = r"#correctanswer#(.*?)#end#"
    wrong_pattern = r"#wronganswer#(.*?)#end#"

    # Extracting the correct and incorrect answers
    correct_answer = re.findall(correct_pattern, res)
    incorrect_answers = re.findall(wrong_pattern, res)

    print(correct_answer[0], incorrect_answers[:3])

['Some of the detrimental impacts of corruption as described in the documents provided include a loss of public trust in government institutions, hindrance of economic development, and exacerbation of inequality.'] ['Corruption has no significant impact on public trust in government institutions.', 'Corruption leads to increased transparency in economic development.', 'Corruption helps reduce inequality within society.']
['Corruption diverts resources away from essential services, making it harder for the poor to access things like healthcare and education.'] ['Corruption benefits the poor by providing them with easy access to resources.', 'Corruption has no impact on the poor as they are already disadvantaged.', 'Corruption only affects the wealthy and not the poor.']
['The convention provides a framework for countries to develop and implement measures to prevent corruption, promote integrity, and hold individuals and entities accountable for corrupt practices globally.'] ['It is a tr

In [69]:
from langchain.chat_models import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
import re
from langchain.chains.question_answering import load_qa_chain
from langchain.vectorstores.chroma import Chroma
import random

def generate_quiz_from_query(query, db_path='./temp_storage/chroma_store_1/chroma_store/'):
    # Initialize the Chroma DB and the language model
    db = Chroma(persist_directory=db_path, embedding_function=OpenAIEmbeddings())
    llm = ChatOpenAI(model_name="gpt-3.5-turbo")
    
    # Assuming `load_qa_chain()` is a defined function that loads the QA chain
    chain = load_qa_chain(llm, chain_type="stuff")
    
    content_query = (f"Give content from the documents provided for the following query. If the query doesn't make sense "
                     f"or doesn't relate to the document provided give a general overview of the pdf. Make sure the content "
                     f"is of at least 300 words: {query}")
    
    matching_docs = db.similarity_search(content_query)
    answer = chain.run(input_documents=matching_docs, question=content_query)
    
    question_prompt = ChatPromptTemplate.from_messages([
        ("system", "You are a world-class quiz writer who creates questions and answers from content."),
        ("user", "{input}")
    ])
    question_chain = question_prompt | llm | StrOutputParser()
    question_res = question_chain.invoke({
        "input": f"From the given context: {answer}. Make 3 Questions."+ "The question should be in this format #questionstart#{question-content}#questionend# for each question. For a total of 3 times."
    })
    
    question_pattern = r"#questionstart#(.*?)#questionend#"
    questions = re.findall(question_pattern, question_res)
    
    answer_prompt = ChatPromptTemplate.from_messages([
        ("system", "You are a world-class quiz writer who creates one correct and three incorrect answers for quiz questions."),
        ("user", "{input}")
    ])
    answer_chain = answer_prompt | llm | StrOutputParser()
    
    placeholder = "Placeholder"
    quiz_results = []
    
    for question in questions:
        res = answer_chain.invoke({
            "input": f"Based on the question: {question}. Generate one correct answer and three incorrect answers for a quiz. "
                     "Format the answers as follows: #correctanswer#Correct Answer#end# "
                     "and #wronganswer#Wrong Answer 1#end# #wronganswer#Wrong Answer 2#end# #wronganswer#Wrong Answer 3#end#."
        })
        
        correct_pattern = r"#correctanswer#(.*?)#end#"
        wrong_pattern = r"#wronganswer#(.*?)#end#"
        
        correct_answer = re.findall(correct_pattern, res)
        incorrect_answers = re.findall(wrong_pattern, res)
        
        # Remove "Wrong Answer" and "Correct Answer" text from the answers
        correct_answer = [answer.replace("Correct Answer", "").strip() for answer in correct_answer]
        incorrect_answers = [answer.replace("Wrong Answer", "").strip() for answer in incorrect_answers]
        
        if not correct_answer or len(incorrect_answers) < 3:
            correct_answer = [placeholder]
            incorrect_answers = [placeholder for _ in range(3)]

        answers = correct_answer + incorrect_answers[:3]
        random.shuffle(answers)
        correct_index = answers.index(correct_answer[0])

        # Clean the question if necessary
        question = question if question and question.strip() != "#questionstart##questionend#" else placeholder

        quiz_results.append({
            "question": question,
            "answers": answers,
            "correct_index": correct_index
        })
    
    return quiz_results


In [71]:
query = "What is corruption?"
quiz_results = generate_quiz_from_query(query)
print(quiz_results)

[{'question': 'What are some of the corrosive effects of corruption on societies, as mentioned in the text?', 'answers': ['Promoting ethical behavior and good governance', 'Increasing transparency and accountability', 'Undermining trust in institutions and eroding social cohesion#', 'Boosting economic growth and development'], 'correct_index': 2}, {'question': 'How does corruption disproportionately impact the poor, according to the text?', 'answers': ['Corruption diverts resources meant for public services, affecting the poor who rely on these services', 'Corruption does not impact the poor differently than other social classes', 'Corruption leads to economic growth, benefiting the poor', 'Corruption benefits the poor by providing them with more opportunities'], 'correct_index': 0}, {'question': 'What key values does the United Nations Convention against Corruption emphasize in its efforts to combat corruption globally?', 'answers': ['Corruption, Bribery, Collusion', 'Transparency, Ac

In [72]:
for i in quiz_results:
    print(i['question'])
    print(i['answers'])
    print(i['correct_index'])

What are some of the corrosive effects of corruption on societies, as mentioned in the text?
['Promoting ethical behavior and good governance', 'Increasing transparency and accountability', 'Undermining trust in institutions and eroding social cohesion#', 'Boosting economic growth and development']
2
How does corruption disproportionately impact the poor, according to the text?
['Corruption diverts resources meant for public services, affecting the poor who rely on these services', 'Corruption does not impact the poor differently than other social classes', 'Corruption leads to economic growth, benefiting the poor', 'Corruption benefits the poor by providing them with more opportunities']
0
What key values does the United Nations Convention against Corruption emphasize in its efforts to combat corruption globally?
['Corruption, Bribery, Collusion', 'Transparency, Accountability, Integrity', 'Secrecy, Dishonesty, Greed', 'Anarchy, Deception, Fraud']
1
