In [2]:
from pathlib import Path
from typing import Iterable, List
from langchain_core.documents import Document
from langchain_community.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader
from fastapi import UploadFile



In [3]:
# Lower-case file suffixes (with the leading dot) that have a loader branch in
# load_documents: .pdf, .docx and .txt.
# NOTE(review): no visible cell reads this constant; load_documents compares
# against the same three suffixes directly.
SUPPORTED_EXTENSIONS = {".pdf", ".docx", ".txt"}

In [11]:
def load_documents(paths: Iterable[Path]) -> List[Document]:
    """Load every supported file in ``paths`` into LangChain documents.

    The loader is picked from the lower-cased file suffix: ``.pdf`` uses
    ``PyPDFLoader``, ``.docx`` uses ``Docx2txtLoader`` and ``.txt`` uses
    ``TextLoader`` (read as UTF-8). Any other suffix is reported and skipped.

    Args:
        paths: Paths of the files to load. Plain strings are accepted too.

    Returns:
        The loaded documents, in input order. Loading is best-effort: if a
        loader raises, the error is printed, the remaining paths are not
        processed, and the documents collected before the failure are still
        returned. The list is empty when nothing was loaded.
    """
    print("documents loading ...")
    print(f"documents about to load \n {str(paths)}")

    docs: List[Document] = []

    try:
        for raw_path in paths:
            # Normalise so that callers passing plain strings also work.
            path = Path(raw_path)
            ext = path.suffix.lower()
            print(f"trying to load {path} with {ext}")
            if ext == ".pdf":
                loader = PyPDFLoader(str(path))
            elif ext == ".docx":
                loader = Docx2txtLoader(str(path))
            elif ext == ".txt":
                loader = TextLoader(str(path), encoding="utf-8")
            else:
                print(f"Unsupported extension skipped, path={str(path)}")
                continue
            docs.extend(loader.load())
        print(f"{len(docs)} Documents loaded")
    except Exception as e:
        # Deliberately best-effort: report the failure and fall through so the
        # caller still receives whatever was loaded before the error.
        print(f"Failed loading documents, error={str(e)}")

    return docs

    

In [12]:
# Smoke test: run load_documents on a single local .txt file.
# NOTE(review): absolute, machine-specific Windows path; consider building it
# from a configurable data directory so the cell works on other machines.
test_files = [Path("E:/Project/MCQ-Generator/test_data/test.txt")]
load_documents(test_files)
    

documents loading ...
documents about to load 
 [WindowsPath('E:/Project/MCQ-Generator/test_data/test.txt')]
trying to load E:\Project\MCQ-Generator\test_data\test.txt with .txt
[Document(metadata={'source': 'E:\\Project\\MCQ-Generator\\test_data\\test.txt'}, page_content='some test data here\n')]
1 Documents loaded


In [14]:
import uuid
import re
from pathlib import Path
from typing import Iterable, List


# Rebinds SUPPORTED_EXTENSIONS to a wider set than the three-suffix value
# assigned in the earlier cell; whichever cell ran last wins in the kernel.
# NOTE(review): load_documents only has loader branches for .pdf, .docx and
# .txt, so the other suffixes listed here end up on its
# "Unsupported extension skipped" path.
SUPPORTED_EXTENSIONS = {".pdf", ".docx", ".txt", ".pptx", ".md", ".csv", ".xlsx", ".xls", ".db", ".sqlite", ".sqlite3"}

In [None]:
def save_uploaded_files(uploaded_files: Iterable, target_dir: Path) -> List[Path]:
    """Persist uploaded files into ``target_dir`` and return the saved paths.

    Uploads whose suffix is not in ``SUPPORTED_EXTENSIONS`` are reported and
    skipped. Stored names are rebuilt from a sanitised stem plus a short
    random suffix, so a client-supplied name can neither point outside
    ``target_dir`` nor overwrite an earlier upload with the same name.

    NOTE(review): this body completes an unfinished cell. It assumes each
    upload exposes its original name as ``filename`` (as FastAPI's
    ``UploadFile`` does) or ``name``, and its bytes through a ``file`` object,
    a synchronous ``read()`` or ``getbuffer()`` -- confirm against the real
    caller.

    Args:
        uploaded_files: Upload objects to store.
        target_dir: Directory to write into; created if it does not exist.

    Returns:
        Paths of the files that were written, in input order.
    """
    target_dir.mkdir(parents=True, exist_ok=True)
    saved: list[Path] = []

    for uf in uploaded_files:
        original_name = getattr(uf, "filename", None) or getattr(uf, "name", None) or "file"
        ext = Path(original_name).suffix.lower()
        if ext not in SUPPORTED_EXTENSIONS:
            print(f"Unsupported extension skipped, filename={original_name}")
            continue

        # Keep only [a-zA-Z0-9_-] from the stem; path separators and dots are
        # replaced, so the result is always a plain file name.
        safe_stem = re.sub(r"[^a-zA-Z0-9_-]", "_", Path(original_name).stem).lower() or "file"
        destination = target_dir / f"{safe_stem}_{uuid.uuid4().hex[:8]}{ext}"

        # Prefer the underlying synchronous file object: on FastAPI's
        # UploadFile, `read()` itself is a coroutine and would not yield bytes.
        stream = getattr(uf, "file", None)
        if stream is not None:
            data = stream.read()
        elif hasattr(uf, "read"):
            data = uf.read()
        else:
            data = uf.getbuffer()

        destination.write_bytes(data)
        saved.append(destination)

    print(f"{len(saved)} files saved to {target_dir}")
    return saved













In [5]:
from langchain_core.prompts import ChatPromptTemplate

from langchain_mistralai import ChatMistralAI, MistralAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# --- 0. Setup Vector Store and Retriever (prerequisite RAG steps) ---
# Load the source PDF, split it into overlapping chunks, embed the chunks with
# Mistral embeddings and index them in a FAISS vector store.
# NOTE(review): absolute, machine-specific path -- replace with your document.
loader = PyPDFLoader("E:/Project/MCQ-Generator/data/test_data/nlp.pdf")
docs = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
vectorstore = FAISS.from_documents(
    documents=splits,
    embedding=MistralAIEmbeddings(model="mistral-embed"),
)

# Define the retriever once. The number of chunks to return is a search
# parameter, so it is passed through `search_kwargs` rather than as a bare
# `k=` keyword argument.
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})  # Retrieve 4 relevant chunks

# --- 1. Define the MCQ Prompt Template ---
# System instructions for the model. The text contains two template variables,
# {topic} and {context}, which must both be supplied when the prompt is
# rendered. The adjacent string literals are concatenated into one string.
mcq_system_prompt = (
    "You are a helpful assistant that generates multiple-choice questions (MCQs) "
    "based on provided context and a given topic.\n"
    "Generate exactly 3 unique MCQs about the topic '{topic}'. "
    "For each MCQ, provide 4 options (A, B, C, D) and specify the correct answer.\n"
    "Use only the following context:\n\n"
    "{context}\n\n"
    "Format your output clearly, with each question numbered."
)

# Two-message chat prompt: the system instructions above followed by a short
# human turn that repeats {topic}.
mcq_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", mcq_system_prompt),
        ("human", "Generate the MCQs now based on the topic: '{topic}'"),
    ]
)

# --- 2. Initialize the LLM ---
# Use a slightly higher temperature for creativity in generating options
llm = ChatMistralAI(model="mistral-large-latest", temperature=0.5)

# --- 3. Build the MCQ Generation Chain using LCEL ---


def format_docs(retrieved_docs):
    """Join the text of the retrieved chunks into a single context string."""
    return "\n\n".join(doc.page_content for doc in retrieved_docs)


# The chain is invoked with a dict such as {"topic": "nlp"}. RunnableParallel
# runs both branches on that same dict and produces the {"context", "topic"}
# mapping the prompt template expects.
mcq_chain = (
    RunnableParallel(
        # Look up chunks for the topic and pass only their text to the prompt,
        # not the repr of the Document objects.
        context=lambda x: format_docs(retriever.invoke(x["topic"])),
        # Extract the topic string itself; forwarding the whole input dict
        # would render "{'topic': ...}" inside the prompt.
        topic=lambda x: x["topic"],
    )
    | mcq_prompt
    | llm
    | StrOutputParser()
)

# --- 4. Invoke the chain with a user-specified topic ---
user_topic = "nlp"
response = mcq_chain.invoke({"topic": user_topic})

print(response)

Here are 3 unique multiple-choice questions (MCQs) based on the provided context about **Natural Language Processing (NLP)**:

---

### **Question 1**
What was the **highest F1 score** achieved in the **CoNLL 2003 Named Entity Recognition (NER) challenge** as mentioned in the text?

**A)** 86.84%
**B)** 88.31%
**C)** 88.76%
**D)** 89.31%

✅ **Correct Answer: D) 89.31%**

---

### **Question 2**
According to the text, what is a **common traditional approach** in NLP for tasks like POS tagging, chunking, and NER?

**A)** Using deep neural networks without feature engineering
**B)** Extracting hand-designed features and feeding them into shallow classifiers (e.g., SVM)
**C)** Relying solely on unlabeled data without any feature selection
**D)** Applying rule-based systems without machine learning

✅ **Correct Answer: B) Extracting hand-designed features and feeding them into shallow classifiers (e.g., SVM)**

---

### **Question 3**
Which of the following was **NOT** mentioned as a featur