<a href="https://colab.research.google.com/github/ArunMunagala7/MCQ-Generator-/blob/main/LDA_Implementation_with_MCQs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Installing libraries

In [1]:
!pip install -q pypdf langchain_community langchain openai sentence-transformers chromadb langchain_cohere tiktoken gdown

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.1/106.1 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m698.9/698.9 kB[0m [31m40.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m36.9 MB/s[0m et

In [45]:
# NOTE(review): pypdf is already in the package list of the first install cell;
# this cell looks redundant on a fresh run - confirm before removing.
!pip install pypdf



Importing needed libraries and the LdaModel

In [2]:
import gensim
import nltk
from gensim import corpora
from gensim.models import LdaModel
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from pypdf import PdfReader
from langchain.chains import LLMChain
from langchain.prompts import ChatPromptTemplate
from langchain.llms import OpenAI

Preprocessing function to remove stopwords and unnecessary tokens from the text

In [22]:
def preprocess(text, stop_words):
    """
    Tokenize ``text`` and keep only the tokens worth modelling.

    The text is tokenized with ``simple_preprocess`` (called with
    ``deacc=True``). A token survives only when it is longer than three
    characters and is not listed in ``stop_words``.

    Parameters:
        text (str): The input text to preprocess.
        stop_words (set): Stopwords to drop from the token stream.
    Returns:
        list: The surviving tokens, in their original order.
    """
    tokens = simple_preprocess(text, deacc=True)
    return [tok for tok in tokens if len(tok) > 3 and tok not in stop_words]


Used to get a list of topics from pdf

In [23]:
def get_topic_lists_from_pdf(file, num_topics, words_per_topic, random_state=None):
    """
    Extracts topics and their associated words from a PDF document using the
    Latent Dirichlet Allocation (LDA) algorithm.

    Each page of the PDF is treated as one document. Pages are tokenized with
    `preprocess` (English and Spanish stopwords removed) before the model is
    trained.

    Parameters:
        file (str): The path to the PDF file for topic extraction.
        num_topics (int): The number of topics to discover.
        words_per_topic (int): The number of words to include per topic.
        random_state (int | None): Optional seed forwarded to the LDA model so
            that runs are reproducible. Defaults to None (unseeded).

    Returns:
        list: A list of sublists (one per topic, up to num_topics), each
        containing up to words_per_topic relevant words for that topic.

    Raises:
        ValueError: If no usable tokens could be extracted from the PDF.
    """
    # Load the pdf file
    reader = PdfReader(file)

    # Each page is one document; `or ""` guards against pages for which
    # no text could be extracted.
    documents = [page.extract_text() or "" for page in reader.pages]

    # Preprocess the documents
    nltk.download('stopwords')
    stop_words = set(stopwords.words(['english','spanish']))
    processed_documents = [preprocess(doc, stop_words) for doc in documents]

    # Create a dictionary and a corpus
    dictionary = corpora.Dictionary(processed_documents)
    if len(dictionary) == 0:
        raise ValueError(
            f"No usable text could be extracted from {file!r}; "
            "cannot build a topic model."
        )
    corpus = [dictionary.doc2bow(doc) for doc in processed_documents]

    # Build the LDA model with the requested number of topics
    lda_model = LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=15,
        random_state=random_state,
    )

    # formatted=False yields (topic_id, [(word, weight), ...]) tuples, so the
    # words can be read directly instead of parsing a formatted string.
    topics = lda_model.show_topics(
        num_topics=num_topics,
        num_words=words_per_topic,
        formatted=False,
    )

    return [[word for word, _weight in word_weights]
            for _topic_id, word_weights in topics]

Generates a description along with each major topic in the document

In [25]:
def topics_from_pdf(llm, file, num_topics, words_per_topic):
    """
    Generates descriptive prompts for LLM based on topic words extracted from a
    PDF document.

    This function takes the output of `get_topic_lists_from_pdf` function,
    which consists of a list of topic-related words for each topic, and
    generates an output string in bulleted nested list format.

    Parameters:
        llm (LLM): An instance of the Large Language Model (LLM) for generating
        responses.
        file (str): The path to the PDF file for extracting topic-related words.
        num_topics (int): The number of topics to request from the LDA step.
        words_per_topic (int): The number of words to request per topic.
    Returns:
        str: A response generated by the language model based on the provided
        topic words.
    """

    # Extract topics and convert them to string (one list per line)
    list_of_topicwords = get_topic_lists_from_pdf(file, num_topics,
                                                  words_per_topic)
    string_lda = "".join(str(topic_words) + "\n"
                         for topic_words in list_of_topicwords)

    # Create the template
    template_string = '''Describe the topic of each of the {num_topics}
        double-quote delimited lists in a simple sentence and also write down
        three possible different subthemes. The lists are the result of an
        algorithm for topic discovery.
        Do not provide an introduction or a conclusion, only describe the
        topics. Do not mention the word "topic" when describing the topics.
        Use the following template for the response.

        1: <<<(sentence describing the topic)>>>
        - <<<(Phrase describing the first subtheme)>>>
        - <<<(Phrase describing the second subtheme)>>>
        - <<<(Phrase describing the third subtheme)>>>

        2: <<<(sentence describing the topic)>>>
        - <<<(Phrase describing the first subtheme)>>>
        - <<<(Phrase describing the second subtheme)>>>
        - <<<(Phrase describing the third subtheme)>>>

        ...

        n: <<<(sentence describing the topic)>>>
        - <<<(Phrase describing the first subtheme)>>>
        - <<<(Phrase describing the second subtheme)>>>
        - <<<(Phrase describing the third subtheme)>>>

        Lists: """{string_lda}""" '''

    # LLM call. The prompt is told how many lists it actually receives, which
    # may differ from the requested num_topics.
    prompt_template = ChatPromptTemplate.from_template(template_string)
    chain = LLMChain(llm=llm, prompt=prompt_template)
    response = chain.run({
        "string_lda" : string_lda,
        "num_topics" : len(list_of_topicwords)
        })

    return response

API for gpt LLM

In [26]:
import os
from getpass import getpass

# SECURITY: never hardcode API keys in a notebook. The key that was previously
# committed here must be treated as leaked and revoked.
# Read the key from the environment, falling back to an interactive prompt.
openai_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
llm = OpenAI(openai_api_key=openai_key, max_tokens=-1)

Extracting the text and preprocessing the pdf

In [31]:
# Input document and LDA settings for this run
file = "/content/Nutrition-and-India.pdf"
num_topics = 25
words_per_topic = 30

# Run topic extraction plus LLM description, then show the resulting outline
summary = topics_from_pdf(
    llm,
    file,
    num_topics=num_topics,
    words_per_topic=words_per_topic,
)
print(summary)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!




1: Interventions focusing on lactation
- Improving breastfeeding techniques
- Encouraging breastfeeding education
- Promoting skin-to-skin contact with newborns

2: Interventions to improve maternal health
- Providing access to prenatal care
- Educating mothers on proper nutrition during pregnancy
- Offering support for postpartum depression

3: Levels of intervention for mental health
- Individual therapy
- Group therapy
- Medication management

4: Lays food products
- Potato chips
- Sunflower seeds
- Puffed snacks

5: Lactation and its benefits for infants
- Improved immune system
- Proper nutrition for growth and development
- Bonding between mother and baby

6: Lactating mothers and their dietary needs
- Adequate calorie intake
- Consuming nutrient-rich foods
- Staying hydrated

7: Knowledge acquisition in education
- Active learning strategies
- Critical thinking skills
- Utilizing technology in the classroom

8: Janani Suraksha Yojana (JSY)
- Promoting institutional deliveries


Using this summary to generate MCQs

In [32]:
# Check the type of the LLM summary before it is passed into the next prompt
type(summary)

str

In [33]:
from langchain.prompts import PromptTemplate
def generate_list(text):
  """
  Ask the LLM to turn a topic outline into a list of strings.

  Each topic and its subpoints are meant to be merged into a single list
  element. The raw completion text is returned unparsed.

  Parameters:
      text (str): The outline of topics and subpoints, inserted into the prompt
      as CONTEXT.
  Returns:
      The model's response for the filled-in prompt (the caller parses it as a
      list literal).
  """
  # The template opens with exactly three quotes; a fourth would put a stray
  # '"' character at the start of the prompt.
  template="""
  Store the topics and their respective subpoints combined into one element each of a list from the given CONTEXT.\
  In this way, format all the topics and their subpoints each into an element of a list.\
  Output the final list as the answer.

  The final response should be a list of Strings

  CONTEXT: {text}
  """
  # Uses the module-level `openai_key` defined in the API setup cell
  model = OpenAI(openai_api_key=openai_key, max_tokens=-1)
  prompt = PromptTemplate(
    template= template,
    input_variables=["text"],
    )

  chain = prompt | model
  response = chain.invoke({"text":text})
  return response


Storing the output in a topics_list

In [38]:
import ast

topics = generate_list(summary)

# The LLM is asked for a list literal but may wrap it in extra prose, so keep
# only the outermost [...] span before parsing it.
_list_start = topics.find("[")
_list_end = topics.rfind("]")
if _list_start == -1 or _list_end <= _list_start:
    raise ValueError(f"LLM response does not contain a list literal: {topics!r}")

# Convert the string to a list of strings (literal_eval only evaluates literals)
topics_list = ast.literal_eval(topics[_list_start:_list_end + 1])
if not isinstance(topics_list, list):
    raise ValueError(f"Expected a list from the LLM, got {type(topics_list).__name__}")

In [64]:
# Spot-check one parsed entry; index 15 requires at least 16 items in topics_list
topics_list[15]

'Interventions for addressing malnutrition - Nutrient supplementation programs - Promoting diverse and balanced diets - Addressing food insecurity'

In [63]:
from langchain_community.document_loaders import PyPDFLoader

# Reuse the `file` path defined for topic extraction instead of repeating the
# literal, so both stages always read the same PDF.
loader = PyPDFLoader(file_path=file)
data = loader.load()

Create a vector database to store the document chunks used for question generation

In [41]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter used to cut the document text into chunks for question generation:
# 3000-character chunks, with 200 characters shared between neighbours.
text_splitter_question_gen = RecursiveCharacterTextSplitter(
    chunk_size=3000,
    chunk_overlap=200,
)

In [48]:
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
# Concatenate the text of every loaded page, then split it into chunks for
# question generation
text_question_gen = ''.join(page.page_content for page in data)
text_chunks_question_gen = text_splitter_question_gen.split_text(text_question_gen)

In [50]:
# Wrap every text chunk in a Document so it can be indexed for question generation
docs_question_gen = [
    Document(page_content=chunk)
    for chunk in text_chunks_question_gen
]

In [51]:
import os
from getpass import getpass

# Local alternative (no API token needed):
# embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings

# SECURITY: never hardcode API tokens in a notebook. The token that was
# previously committed here must be treated as leaked and revoked.
hf_api_key = os.environ.get("HUGGINGFACEHUB_API_TOKEN") or getpass("Hugging Face API token: ")
embeddings = HuggingFaceInferenceAPIEmbeddings(api_key=hf_api_key, model_name="BAAI/bge-small-en-v1.5")

# Embed the document chunks and index them in a Chroma vector store
vector_store = Chroma.from_documents(docs_question_gen, embeddings)

Using Retrieval Chain and Multiple API calls to generate MCQs. Implementing reranking too

In [58]:
# Initialize retrieval chain for answer generation
import os
from getpass import getpass

from langchain.chains import RetrievalQA
llm = OpenAI(openai_api_key=openai_key, max_tokens=-1)
from langchain_cohere import CohereRerank
from langchain.retrievers import ContextualCompressionRetriever

# SECURITY: never hardcode API keys in a notebook. The key that was previously
# committed here must be treated as leaked and revoked.
if not os.environ.get("COHERE_API_KEY"):
    os.environ["COHERE_API_KEY"] = getpass("Cohere API key: ")

compressor = CohereRerank()

# The number of retrieved documents must be passed through `search_kwargs`;
# a bare `k=5` keyword does not configure the search.
base_retriever = vector_store.as_retriever(search_kwargs={"k": 5})

# Rerank the base retriever's candidates with Cohere
retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=base_retriever)

# Use the reranking retriever (not the base one) so reranking actually runs
question_gen_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff",
                                                retriever=retriever)

Prompting to generate MCQs

In [59]:
# Accumulates the MCQs generated for every topic, in topic order
text = ""
for topic_summary in topics_list:
  # Build the per-topic prompt
  question_string = f"""
  You are a UPSC Exam Expert in India. Your job is to test the knowledge of aspirants.\
  Generate 5-6 possible MCQ questions along with their options on the topic and summary {topic_summary}.\
  Here is the format which must be strictly followed for the generated MCQs:

      Q. The Question for the MCQ Framed
      (a) Option 1
      (b) Option 2
      (c) Option 3
      (d) Option 4


  """

  # One retrieval-QA call per topic; append its answer to the running text
  text += question_gen_chain.run(question_string)



In [60]:
# Show the concatenated MCQs generated for all topics
print(text)


1. Q. Which of the following is NOT a major cause of malnutrition in India?
(a) Lack of a balanced diet
(b) Inefficient PDS system
(c) Availability of diverse and nutritious crops
(d) Lack of awareness about nutritional requirements 

2. Q. According to the National Family Health Survey -4, what percentage of children in India are stunted?
(a) 33%
(b) 50%
(c) 19%
(d) 10%

3. Q. Which of the following is NOT a goal of the POSHAN Abhiyan?
(a) Reducing stunting in children
(b) Improving the nutritional status of pregnant women
(c) Increasing obesity rates in adults 
(d) Reducing low birth weight in newborns 

4. Q. The National Nutrition Strategy aims to decrease all forms of malnutrition by which year?
(a) 2020
(b) 2025
(c) 2030
(d) 2050

5. Q. Which intervention is NOT a part of the POSHAN Abhiyan?
(a) Encouraging skin-to-skin contact with newborns 
(b) Providing bi-annual vitamin A supplements for children 
(c) Promoting the consumption of fortified wheat flour 
(d) Training Auxiliary

In [None]:
|