Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# MinIO access
MINIO_ENDPOINT=""
MINIO_ACCESS_KEY=""
MINIO_SECRET_KEY=""
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,10 @@ See below for an example (may not be exactly the same):

![python-env-2](./docs/images/python-env-2.png)

## Environment Variables for MinIO
Exam questions will be stored in a MinIO bucket. <br>

Instructions for obtaining the environment variables needed to configure the connection to MinIO can be found on the FEPrep Notion page.

## Common Issues

Expand Down
66 changes: 66 additions & 0 deletions parser/minio_store.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Creates a PDF for each FE exam question and stores it in a MinIO file storage bucket

import pymupdf
import io
import os
import sys
from dotenv import load_dotenv
from minio import Minio
from parser.dataset.exam import Exam


class Bucket:
    """Stores one PDF per exam question in a MinIO bucket.

    On construction the class connects to MinIO using credentials read
    from the environment (a ``.env`` file is loaded first) and creates the
    target bucket if it does not exist yet.
    """

    client: Minio
    # Name of the bucket every question PDF is uploaded to.
    bucket_name: str = "fe-pdfs"

    def __init__(self):
        """Configure the MinIO client and make sure the bucket exists.

        Reads ``MINIO_ENDPOINT``, ``MINIO_ACCESS_KEY`` and
        ``MINIO_SECRET_KEY`` from the environment.

        Raises:
            ValueError: If ``MINIO_ENDPOINT`` is unset or empty.
        """
        load_dotenv()

        endpoint = os.getenv("MINIO_ENDPOINT")
        if not endpoint:
            # Fail with an actionable message instead of letting the
            # client choke on a None/empty endpoint.
            raise ValueError(
                "MINIO_ENDPOINT is not set; see .env.example for the "
                "required MinIO environment variables"
            )

        self.client = Minio(
            endpoint,
            access_key=os.getenv("MINIO_ACCESS_KEY"),
            secret_key=os.getenv("MINIO_SECRET_KEY"),
        )

        if not self.client.bucket_exists(self.bucket_name):
            self.client.make_bucket(self.bucket_name)

    def create_exam_objs(self, exam: Exam):
        """Create a PDF for every question of ``exam`` and upload it.

        For each question, the pages listed in ``question.pages`` are
        copied from the exam document into a fresh PDF, which is uploaded
        as ``"<semester>-<year>/<question.id>.pdf"``.

        Args:
            exam: Parsed exam; ``exam.exam_path`` is the source PDF, and
                ``exam.sections`` / ``section.questions`` must be
                non-empty.

        Raises:
            AssertionError: If the exam has no sections, or a section has
                no questions.
            SystemExit: With exit code 1 if an upload fails (the error and
                traceback are printed to stderr first).
        """
        # Context managers guarantee the PyMuPDF documents are closed even
        # when an assertion fails or an upload aborts the process.
        with pymupdf.open(exam.exam_path) as document:
            assert exam.sections

            # Every question of an exam shares the same object-name prefix.
            pdf_dir = f"{exam.semester}-{exam.year}"

            for section in exam.sections:
                assert section.questions
                for question in section.questions:
                    # Object name is unique per question via its ID.
                    question_pdf_name = f"{pdf_dir}/{question.id}.pdf"

                    # Build a new, initially empty PDF for this question.
                    with pymupdf.open() as question_pdf:
                        for page in question.pages:
                            # Draw the source page onto a new page of the
                            # same size in the question PDF.
                            rect = document[page].rect
                            question_pdf_page = question_pdf.new_page(
                                width=rect.width, height=rect.height
                            )
                            question_pdf_page.show_pdf_page(
                                rect, document, page
                            )

                        pdf_bytes = question_pdf.write()

                    pdf_stream = io.BytesIO(pdf_bytes)

                    try:
                        # Add the file to the bucket
                        self.client.put_object(
                            bucket_name=self.bucket_name,
                            object_name=question_pdf_name,
                            data=pdf_stream,
                            length=len(pdf_bytes),
                            content_type="application/pdf",
                        )
                    except Exception as e:
                        print(f"An error occurred: {e}", file=sys.stderr)
                        import traceback

                        traceback.print_exc()
                        sys.exit(1)
1 change: 1 addition & 0 deletions parser/model/page_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,7 @@ class Question(BaseModel, strict=True):
max_points: int
category: str
sub_category: str
id: str

# text excluding the question number, category, max points,
original_text: str
Expand Down
6 changes: 6 additions & 0 deletions parser/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
from parser.dataset.exam import Exam
from parser.model.page_model import Section

from parser.minio_store import Bucket


class PreProcessedExam(BaseModel):
sections: List[Section]
Expand All @@ -20,6 +22,10 @@ def main(input_file: str, output_file: str, verbose: bool = False):
exam.load_data(verbose)
exam.write(output_file)

# Store each question in the MinIO bucket
minio_bucket = Bucket()
minio_bucket.create_exam_objs(exam)


def write_to_file(filename: str, content: str):
try:
Expand Down
2 changes: 2 additions & 0 deletions parser/question_extraction.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import re
import sys
import uuid
from copy import copy
from typing import Dict, List

Expand Down Expand Up @@ -128,6 +129,7 @@ def extract_questions(text: str, section_type: SectionType) -> List[Question]:
original_text=original_text,
sub_questions=sub_questions,
metadata=Metadata(),
id=str(uuid.uuid4()),
)

questions.append(question)
Expand Down
1,634 changes: 883 additions & 751 deletions poetry.lock

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ pymupdf = "^1.24.10"
nltk = "^3.9.1"
requests = "^2.32.3"
bs4 = "^0.0.2"
minio = "^7.2.10"
uuid = "^1.30"
python-dotenv = "^1.0.1"

[tool.poetry.group.dev.dependencies]
ipykernel = "^6.29.5"
Expand Down