In [None]:
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

# SQLALCHEMY_DATABASE_URL = "sqlite:///./sql_app.db"
SQLALCHEMY_DATABASE_URL = "postgresql://user:password@postgresserver/db"

engine = create_engine(
    SQLALCHEMY_DATABASE_URL
)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)


def get_db():
    db = SessionLocal()
    try:
        yield db
    finally:
        db.close()


Base = declarative_base()

## Models

In [None]:
from sqlalchemy import Column, Integer, String, Float, Boolean, ForeignKey, LargeBinary, Table, Date, DateTime
from sqlalchemy.orm import relationship

class Question(Base):
    __tablename__ = "questions"

    id = Column(Integer, primary_key = True, index = True)
    text = Column(String)
    correct_answer = Column(String)
    mark = Column(Float)
    unit = Column(String)
    mix_choice = Column(Boolean, default = True)
    image = Column(LargeBinary, nullable=True)
    choices = Column(String)

    subject_id = Column(Integer, ForeignKey("subject.id"))
    subject = relationship("Subject", back_populates="questions")



class Subject(Base):
    __tablename__ = "subjects"

    id = Column(Integer, primary_key=True, index=True)
    name = Column(String, unique=True, index=True)
    lecturer = Column(String)
    date = Column(Date)

    questions = relationship("Question", back_populates="subject")
    quiz_sets = relationship("QuizSet", back_populates="subject")

quiz_set_questions = Table('quiz_set_questions', Base.metadata,
    Column('quiz_set_id', Integer, ForeignKey('quiz_sets.id')),
    Column('question_id', Integer, ForeignKey('questions.id'))
)


class QuizSet(Base):
    __tablename__ = "quiz_sets"

    id = Column(Integer, primary_key=True, index=True)
    name = Column(String, index=True)
    subject_id = Column(Integer, ForeignKey("subjects.id"))
    
    subject = relationship("Subject", back_populates="quiz_sets")
    questions = relationship("Question", secondary=quiz_set_questions, back_populates="quiz_sets")
    exams = relationship("Exam", back_populates="quiz_set")



class Exam(Base):
    __tablename__ = "exams"

    id = Column(Integer, primary_key=True, index=True)
    name = Column(String, index=True)
    quiz_set_id = Column(Integer, ForeignKey("quiz_sets.id"))
    user_id = Column(Integer, ForeignKey("users.id"))
    start_time = Column(DateTime)
    end_time = Column(DateTime)
    duration = Column(Integer)  # in minutes

    quiz_set = relationship("QuizSet", back_populates="exams")
    user = relationship("User", back_populates="exams")

In [13]:
import docx
import base64
from datetime import datetime
from docx.oxml.shape import CT_Picture


def parse_docx(file_path):
    doc = docx.Document(file_path)
    info = {}
    questions = []
    warnings = []

    # Process metadata
    for para in doc.paragraphs:
        text = para.text.strip()
        if text.startswith("Subject:"):
            try:
                info['subject'] = text.split(":", 1)[1].strip()
            except IndexError:
                warnings.append("Subject format is incorrect.")
        elif text.startswith("Number of Quiz:"):
            try:
                info['num_quiz'] = int(text.split(":", 1)[1].strip())
            except (ValueError, IndexError):
                warnings.append("Number of Quiz format is incorrect.")
        elif text.startswith("Lecturer:"):
            info['lecturer'] = text.split(":", 1)[1].strip()
        elif text.startswith("Date:"):
            try:
                date_str = text.split(":", 1)[1].strip()
                info['date'] = datetime.strptime(date_str, "%d-%m-%Y").date()
            except (ValueError, IndexError):
                warnings.append("Date format is incorrect.")
    if len(info) != 4:
        warnings.append("Missing header information.")

    expected_order = ['qn', 'a.', 'b.', 'c.', 'd.', 'answer:', 'mark:', 'unit:', 'mix choices:']
    table_index = 0

    for table in doc.tables:
        question = {}
        order = []

        for row in table.rows:
            if len(row.cells) != 2:
                warnings.append(f"Invalid table format in table {table_index + 1}")
                continue

            key = row.cells[0].text.strip().lower()
            value = row.cells[1].text.strip()
            order.append(key)

            if key.strip().lower().startswith('qn'):
                question['text'] = value
                # Handle image
                for paragraph in row.cells[1].paragraphs:
                    for run in paragraph.runs:
                        image_parts = []
                        for element in run._element.iter():
                            if isinstance(element, CT_Picture):
                                rId = element.xpath('.//a:blip/@r:embed')[0]
                                image_part = doc.part.related_parts[rId]
                                image_parts.append(image_part)
                    
                        if image_parts:
                            image_bytes = image_parts[0].blob
                            question['image'] = base64.b64encode(image_bytes).decode('utf-8')
                            # print('question image: ', question['image'])
                question['choices'] = [value]
            elif key.strip().lower() in ['b.', 'c.', 'd.']:
                if 'choices' in question:
                    question['choices'].append(value)
                else:
                    warnings.append(f"Choice {key} found before 'a.' in table {table_index + 1}")
            elif key.strip().lower() == 'answer:':
                question['correct_answer'] = value
            elif key.strip().lower() == 'mark:':
                try:
                    question['mark'] = float(value)
                except ValueError:
                    warnings.append(f"Invalid mark value in table {table_index + 1}")
            elif key.strip().lower() == 'unit:':
                question['unit'] = value
            elif key.strip().lower() == 'mix choices:':
                question['mix_choices'] = value.lower() == 'yes'

        # Ensure keys are in the expected order
        order = list(map(lambda x: x.strip().lower(), order))
        order[0] = order[0][:2]
        
        if order != expected_order:
            warnings.append(f"Row names are not in the expected order in table {table_index + 1}, {order}")

        if 'text' not in question:
            warnings.append(f"Question text is missing in table {table_index + 1}")
        if 'choices' not in question or len(question.get('choices', [])) < 2:
            warnings.append(f"Insufficient choices in table {table_index + 1}")
        if 'correct_answer' not in question:
            warnings.append(f"Correct answer is missing in table {table_index + 1}")

        if 'text' in question and 'choices' in question and 'correct_answer' in question:
            questions.append(question)
        
        table_index += 1

    
    for question in enumerate(questions,1):
        print(question[0], '-----------------------------------------------')
        for k,v in question[1].items():
            if k == 'image':
                print(k, v[:50])
            else:
                print(k,v)

    return info, questions, warnings


In [14]:
parse_docx("F:\FSB\Python for Engineer\Week 2 - 1\Template 2.docx")

1 -----------------------------------------------
text See the figure and choose the right type of B2B E-Commerce
[file:8435.jpg]
image iVBORw0KGgoAAAANSUhEUgAAATQAAAC7CAYAAAGcF1KwAAAAAX
choices ['See the figure and choose the right type of B2B E-Commerce\n[file:8435.jpg]', 'Electronic Exchange', 'Buy-side B2B', 'Supply Chain Improvements and Collaborative Commerce']
correct_answer B
mark 0.5
unit Chapter1
mix_choices True
2 -----------------------------------------------
text See the figure and choose the right type of B2B E-Commerce
choices ['See the figure and choose the right type of B2B E-Commerce', 'Electronic', 'B2B', 'Supply Commerce']
correct_answer B
mark 0.5
unit Chapt1
mix_choices True
3 -----------------------------------------------
text See the figure and choose the right type of B2B E-Commerce
image iVBORw0KGgoAAAANSUhEUgAAATQAAAC7CAYAAAGcF1KwAAAAAX
choices ['See the figure and choose the right type of B2B E-Commerce', 'Electronic Exchange', 'Buy-side B2B', 'Supply Chai

({'subject': 'ISC',
  'num_quiz': 30,
  'lecturer': 'hungpd2',
  'date': datetime.date(1999, 8, 22)},
 [{'text': 'See the figure and choose the right type of B2B E-Commerce\n[file:8435.jpg]',
   'image': 'iVBORw0KGgoAAAANSUhEUgAAATQAAAC7CAYAAAGcF1KwAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAADsQAAA7EAZUrDhsAADw+SURBVHhe7Z1ndBTXlu9nPrw3s+at9/m9mTdrXd87tkESQiQBCki2CTY2OYogEEkEBQRCQkgCARI5CJGzyCCREaCAkEQQEhmTTDTBAbAxNtjXvvfOzN2v/lvVcqup6q6urg7qPr+19uqqruru6lO79tnnnH32+QfyYLz74l6/fk379u6lB/fv0+3bt+nu3bv0ww8/0H//93/LZ+jH4Yvbv28fLVm0iLe//PJLuvPFHX799ddf+T1H8L7bWlhQUC8mLN+z3NeDvot7/Ld6MWH5nuW+Huov7vHjx7Rm9Wo6ceIExU+YQNXV1fKRt1H6Ycv3LPfVSIiLo2lpabR2zRp6+vQpjYuNlY+YXVzRkSNviaMsz8uTt5S5dvWq1d9tPA/EPxz4sV60kLt0qbz1NlpNSbp0S//jD+/UizmqF3fx4kX5XX1oMcIlxcXaL06N//qv/5K33ua+VDPYy8ysLHnLOnbdVlRPwL/7ILvExIMHD+St39F1W4dEDaKSkhL+wj//+c/yGXV8UPuLXWLJd999R6dPneKnuW+v3tou7p8XlteLNfCDHcq/oVazN/F2+82VFLLnErWeu5U6VDznY+HHv6SwI3co8txr+VPKaC65f4vsWS/m/PTTT7Ri+XJ5T1vJRZz9kS8M2yad/dvf/kYnysp424Su23ru3Dn53d+

## Test

In [None]:
import pytest
import docx
from io import BytesIO
from datetime import datetime

def test_parse_docx_with_sample_file():
    file_path = r"F:\FSB\Python for Engineer\Week 2 - 1\Template 2.docx"
    
    # Call the parse_docx function with the sample file
    info, questions, warnings = parse_docx(file_path)
    
    # Print results for debugging purposes
    print("info:", info)
    print("Questions:", questions)
    print("Warnings:", warnings)
    
    # Check info (adjust the expected values based on the actual content of your template)
    assert info['subject'] == 'ISC'
    assert info['num_quiz'] == 30
    assert info['lecturer'] == 'hungpd2'
    assert info['date'] == datetime.strptime('22-08-1999', '%d-%m-%Y').date()  # Adjust date as per your template
    
    # Check questions
    assert len(questions) == 26  # Ensure there's at least one question
  
  
    # Validate the first question (adjust the expected values based on the actual content of your template)
    question = questions[2]
    print("Image: ", question['image'][:50])
    assert info['num_quiz'] == 3
    assert 'text' in question
    assert 'choices' in question
    assert 'correct_answer' in question
    assert 'mark' in question
    assert 'unit' in question
    assert 'mix_choices' in question
    assert 'image' in question

    
    # Check warnings
    assert len(warnings) == 0  # Ensure there are no warnings for a correctly formatted document


