In [1]:
from dotenv import load_dotenv

load_dotenv()

import os

os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
os.environ["LANGCHAIN_API_KEY"] = os.getenv("LANGCHAIN_API_KEY")
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANG_CHAIN_PROJECT"] = "test-preparer"


In [2]:
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_community.document_loaders import PyPDFLoader,TextLoader
from langchain_community.document_loaders import Docx2txtLoader,UnstructuredPowerPointLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import LLMChain

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

In [3]:
def get_documents(file_path:str):
      if file_path.endswith('.pdf'):
            return PyPDFLoader(file_path).load()
      elif file_path.endswith('.docx'):
            return Docx2txtLoader(file_path).load()
      elif file_path.endswith('.pptx'):
            return UnstructuredPowerPointLoader(file_path).load()
      elif file_path.endswith('.txt'):
            return TextLoader(file_path).load()
      else:
            raise ValueError(f"Unsupported file type: {file_path}")

In [24]:
doc=('../Unit 1.4 copy.pdf')
doc

'../Unit 1.4 copy.pdf'

In [7]:
RESPONSE_JSON = """{
      "mcq":{
            "1": {
                  "question": "multiple choice question here",
                  "options": {
                      "a": "choice here",
                      "b": "choice here",
                      "c": "choice here",
                      "d": "choice here",
                  },
                  "correct": "correct answer here like a,b,c,d",
            },
            "2": {
                  "question": "multiple choice question here",
                  "options": {
                      "a": "choice here",
                      "b": "choice here",
                      "c": "choice here",
                      "d": "choice here",
                },
                "correct": "correct answer here like a,b,c,d",
            },
      "subjective":{
            "1": {
                  "question": "subjective question here",
                  "expected_answer": "subjective answer here",
            },
            "2": {
                  "question": "subjective question here",
                  "expected_answer": "subjective answer here",
            },
      }
      }"""

In [9]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
text_splitter

<langchain_text_splitters.character.RecursiveCharacterTextSplitter at 0x1b0e01f02f0>

In [10]:
splitted_docs = text_splitter.split_documents(doc)
splitted_docs

[Document(metadata={'producer': 'macOS Version 13.4.1 (c) (Build 22F770820d) Quartz PDFContext', 'creator': 'PyPDF', 'creationdate': "D:20240829053237Z00'00'", 'moddate': "D:20240829053237Z00'00'", 'source': '../Unit 1.4.pdf', 'total_pages': 11, 'page': 0, 'page_label': '1'}, page_content='Industrial Uses of Programming'),
 Document(metadata={'producer': 'macOS Version 13.4.1 (c) (Build 22F770820d) Quartz PDFContext', 'creator': 'PyPDF', 'creationdate': "D:20240829053237Z00'00'", 'moddate': "D:20240829053237Z00'00'", 'source': '../Unit 1.4.pdf', 'total_pages': 11, 'page': 1, 'page_label': '2'}, page_content='INDUSTRIAL USES OF PROGRAMMING •Scientific applications – typically require simple data structures but a large number of floating-point operations •– typical data structures: arrays and matrices; •typical control constructs: selection and loop – •earliest high-level language: FORTRAN •other language: ALGOL 60\nRead about Python Applications in Industry: https://junilearning.com/blo

In [22]:
prompt_text="""
            Text:{text}
            You are an expert MCQ and Subjective question maker. Given the above text, it is your job to
            create a quiz  of {mcq_number} multiple choice questions and {subjective_number} subjective questions for students. 
            Make sure to format your response like  RESPONSE_JSON below  and use it as a guide.
            ### RESPONSE_JSON
            {RESPONSE_JSON}"""

In [41]:
from typing import List
def generate_mcq_question_based_on_documents(file_path:str, no_of_mcqs: int, no_of_subjective: int):
    text=''.join([item.page_content for item in get_documents(file_path)])
    prompt = PromptTemplate.from_template(template=prompt_text, partial_variables={"RESPONSE_JSON": RESPONSE_JSON})
    chain = LLMChain(llm=llm, prompt=prompt, output_parser=JsonOutputParser())
    return chain.invoke({"text":text,"mcq_number":no_of_mcqs,"subjective_number":no_of_subjective,"response_json":RESPONSE_JSON})

In [19]:
len(doc)

22

In [35]:
get_documents(doc)[0].page_content

Ignoring wrong pointing object 7 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 21 0 (offset 0)
Ignoring wrong pointing object 35 0 (offset 0)
Ignoring wrong pointing object 38 0 (offset 0)
Ignoring wrong pointing object 41 0 (offset 0)
Ignoring wrong pointing object 50 0 (offset 0)
Ignoring wrong pointing object 78 0 (offset 0)


'Industrial Uses of Programming'

In [36]:
text=''.join([item.page_content for item in get_documents(doc)])
text

Ignoring wrong pointing object 7 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 21 0 (offset 0)
Ignoring wrong pointing object 35 0 (offset 0)
Ignoring wrong pointing object 38 0 (offset 0)
Ignoring wrong pointing object 41 0 (offset 0)
Ignoring wrong pointing object 50 0 (offset 0)
Ignoring wrong pointing object 78 0 (offset 0)


'Industrial Uses of ProgrammingINDUSTRIAL USES OF PROGRAMMING •Scientific applications – typically require simple data structures but a large number of floating-point operations •– typical data structures: arrays and matrices; •typical control constructs: selection and loop – •earliest high-level language: FORTRAN •other language: ALGOL 60\nRead about Python Applications in Industry: https://junilearning.com/blog/guide/what-is-python-used-for-major-industries/INDUSTRIAL USES OF PROGRAMMING •Business applications – requires preparation of detailed, complex reports and precise ways of handling decimal numbers and character data •– most popular language: COBOL•Systems programming applications – systems software is used almost continuously and so a language for systems programming must have fast execution. •– a language used for systems programming must have low-level featurs to access external devices. •– popular language: C (almost the whole of UNIX is written in C)INDUSTRIAL USES OF PRO

In [37]:
prompt = PromptTemplate.from_template(template=prompt_text, partial_variables={"RESPONSE_JSON": RESPONSE_JSON})

In [38]:
chain = LLMChain(llm=llm, prompt=prompt, output_parser=JsonOutputParser())

In [39]:
response=chain.invoke({"text":text,"mcq_number":10,"subjective_number":10,"response_json":RESPONSE_JSON})
response


{'text': {'mcq': {'1': {'question': 'Which programming language is primarily used for scientific applications?',
    'options': {'a': 'Python', 'b': 'FORTRAN', 'c': 'Java', 'd': 'C++'},
    'correct': 'b'},
   '2': {'question': 'What is the most popular language for business applications?',
    'options': {'a': 'Python', 'b': 'COBOL', 'c': 'Java', 'd': 'C'},
    'correct': 'b'},
   '3': {'question': 'Which programming language is known for its use in systems programming?',
    'options': {'a': 'Python', 'b': 'C', 'c': 'Java', 'd': 'Ruby'},
    'correct': 'b'},
   '4': {'question': 'Which language is commonly used for artificial intelligence applications?',
    'options': {'a': 'Python', 'b': 'LISP', 'c': 'Java', 'd': 'C++'},
    'correct': 'b'},
   '5': {'question': 'What is the primary programming language used in the automotive sector?',
    'options': {'a': 'Python', 'b': 'C', 'c': 'Java', 'd': 'JavaScript'},
    'correct': 'b'},
   '6': {'question': 'Which Python framework is menti

In [None]:
text=''.join([item.page_content for item in get_documents(doc)])
prompt = PromptTemplate.from_template(template=prompt_text, partial_variables={"RESPONSE_JSON": RESPONSE_JSON})
chain = LLMChain(llm=llm, prompt=prompt, output_parser=JsonOutputParser())
chain.invoke({"text":text,"mcq_number":10,"subjective_number":10,"response_json":RESPONSE_JSON})

In [31]:
generate_mcq_question_based_on_documents(doc,10,10)

Ignoring wrong pointing object 7 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 21 0 (offset 0)
Ignoring wrong pointing object 35 0 (offset 0)
Ignoring wrong pointing object 38 0 (offset 0)
Ignoring wrong pointing object 41 0 (offset 0)
Ignoring wrong pointing object 50 0 (offset 0)
Ignoring wrong pointing object 78 0 (offset 0)


TypeError: the JSON object must be str, bytes or bytearray, not dict