In [6]:
import cv2
import numpy as np
import pytesseract
import re
import pandas as pd
from PIL import Image
import fitz  # PyMuPDF
from typing import List, Dict, Tuple, Optional
import os
import io

class OCRQuestionExtractor:
    def __init__(self):
        # Configure Tesseract for Vietnamese
        self.tesseract_config = r'--oem 3 --psm 6 -l vie+eng'
        
        # Question patterns for different formats
        self.question_patterns = [
            r'Câu\s*(\d+)[:\.\)\s]+(.+?)(?=Câu\s*\d+|A[\.\)\s]|$)',
            r'Question\s*(\d+)[:\.\)\s]+(.+?)(?=Question\s*\d+|A[\.\)\s]|$)',
            r'(\d+)[:\.\)\s]+(.+?)(?=\d+[:\.\)]|A[\.\)\s]|$)',
        ]
        
        # Answer patterns
        self.answer_patterns = [
            r'([A-D])[\.\)\s]+(.+?)(?=[A-D][\.\)]|\n|$)',
            r'([a-d])[\.\)\s]+(.+?)(?=[a-d][\.\)]|\n|$)',
        ]
    
    def preprocess_image(self, image_path: str) -> np.ndarray:
        """Enhanced image preprocessing for better OCR accuracy"""
        
        # Read image
        img = cv2.imread(image_path)
        if img is None:
            raise ValueError(f"Cannot read image: {image_path}")
        
        # Convert to grayscale
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        
        # Noise reduction
        denoised = cv2.medianBlur(gray, 3)
        
        # Contrast enhancement using CLAHE
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
        enhanced = clahe.apply(denoised)
        
        # Deskewing (basic rotation correction)
        coords = np.column_stack(np.where(enhanced > 0))
        if len(coords) > 0:
            angle = cv2.minAreaRect(coords)[-1]
            if angle < -45:
                angle = -(90 + angle)
            else:
                angle = -angle
            
            if abs(angle) > 0.5:  # Only rotate if significant skew
                (h, w) = enhanced.shape[:2]
                center = (w // 2, h // 2)
                M = cv2.getRotationMatrix2D(center, angle, 1.0)
                enhanced = cv2.warpAffine(enhanced, M, (w, h), 
                                        flags=cv2.INTER_CUBIC, 
                                        borderMode=cv2.BORDER_REPLICATE)
        
        # Morphological operations to clean up text
        kernel = np.ones((1,1), np.uint8)
        enhanced = cv2.morphologyEx(enhanced, cv2.MORPH_CLOSE, kernel)
        
        # Binary thresholding
        _, binary = cv2.threshold(enhanced, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        
        return binary
    
    def extract_text_from_image(self, image_path: str) -> str:
        """Extract text from image using OCR"""
        try:
            # Preprocess image
            processed_img = self.preprocess_image(image_path)
            
            # Convert to PIL Image for Tesseract
            pil_img = Image.fromarray(processed_img)
            
            # OCR with Vietnamese support
            text = pytesseract.image_to_string(pil_img, config=self.tesseract_config)
            
            return text
        except Exception as e:
            print(f"Error extracting text from image: {e}")
            return ""
    
    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """Extract text from PDF file"""
        try:
            doc = fitz.open(pdf_path)
            text = ""
            ocr_text = ""
            
            for page_num in range(doc.page_count):
                page = doc[page_num]
                page_text = page.get_text()
                
                # If no text found, try OCR on page image
                print("Using OCR")
                pix = page.get_pixmap()
                img_data = pix.tobytes("png")
                img = Image.open(io.BytesIO(img_data))
                ocr_text += pytesseract.image_to_string(img, config=self.tesseract_config)
            
                text += page_text + "\n"
            
            doc.close()
            return text, ocr_text
        except Exception as e:
            print(f"Error extracting text from PDF: {e}")
            return ""
    
    def clean_text(self, text: str) -> str:
        """Clean and normalize extracted text"""
        if not text:
            return ""
        
        # Remove extra whitespaces
        text = re.sub(r'\s+', ' ', text)
        
        # Fix common OCR errors for Vietnamese
        replacements = {
            'ă': 'ă', 'â': 'â', 'ê': 'ê', 'ô': 'ô', 'ơ': 'ơ', 'ư': 'ư',
            'à': 'à', 'á': 'á', 'ả': 'ả', 'ã': 'ã', 'ạ': 'ạ',
            'è': 'è', 'é': 'é', 'ẻ': 'ẻ', 'ẽ': 'ẽ', 'ẹ': 'ẹ',
            'ì': 'ì', 'í': 'í', 'ỉ': 'ỉ', 'ĩ': 'ĩ', 'ị': 'ị',
            'ò': 'ò', 'ó': 'ó', 'ỏ': 'ỏ', 'õ': 'õ', 'ọ': 'ọ',
            'ù': 'ù', 'ú': 'ú', 'ủ': 'ủ', 'ũ': 'ũ', 'ụ': 'ụ',
            'ỳ': 'ỳ', 'ý': 'ý', 'ỷ': 'ỷ', 'ỹ': 'ỹ', 'ỵ': 'ỵ',
        }
        
        for old, new in replacements.items():
            text = text.replace(old, new)
        
        # Remove special characters that might be OCR noise
        text = re.sub(r'[^\w\s\.\,\?\!\:\;\(\)\-\+\=\[\]\"\'\/\\\<\>]', '', text)
        
        # Normalize line breaks
        text = re.sub(r'\n+', '\n', text)
        
        return text.strip()
    
    def detect_question_structure(self, text: str) -> Dict:
        """Detect the structure and format of questions in the text"""
        structure_info = {
            'question_format': None,
            'answer_format': None,
            'total_questions': 0,
            'sample_matches': []
        }
        
        # Try different question patterns
        for i, pattern in enumerate(self.question_patterns):
            matches = re.findall(pattern, text, re.DOTALL | re.IGNORECASE)
            if matches:
                structure_info['question_format'] = f'Pattern_{i+1}'
                structure_info['total_questions'] = len(matches)
                structure_info['sample_matches'] = matches[:3]  # First 3 samples
                break
        
        # Try different answer patterns
        for i, pattern in enumerate(self.answer_patterns):
            matches = re.findall(pattern, text, re.DOTALL | re.IGNORECASE)
            if matches:
                structure_info['answer_format'] = f'Answer_Pattern_{i+1}'
                break
        
        return structure_info
    
    def parse_questions_basic(self, text: str) -> List[Dict]:
        """Basic question parsing using regex patterns"""
        questions = []
        
        # Detect structure first
        structure = self.detect_question_structure(text)
        
        if structure['total_questions'] == 0:
            print("No questions detected with current patterns")
            return questions
        
        # Use appropriate pattern based on detection
        pattern_idx = int(structure['question_format'].split('_')[1]) - 1
        question_pattern = self.question_patterns[pattern_idx]
        
        # Extract questions
        question_matches = re.findall(question_pattern, text, re.DOTALL | re.IGNORECASE)
        
        for match in question_matches:
            if len(match) == 2:  # (number, question_text)
                q_num, q_text = match
                
                # Extract answers following this question
                answers = self.extract_answers_for_question(text, q_text)
                
                if answers:
                    question_data = {
                        'number': q_num.strip(),
                        'question': q_text.strip(),
                        'answers': answers,
                        'correct_answer': self.detect_correct_answer(answers)
                    }
                    questions.append(question_data)
        
        return questions
    
    def extract_answers_for_question(self, text: str, question_text: str) -> List[Dict]:
        """Extract answers for a specific question"""
        answers = []
        
        # Find the position of the question in text
        q_pos = text.find(question_text)
        if q_pos == -1:
            return answers
        
        # Look for answers after the question
        remaining_text = text[q_pos + len(question_text):]
        
        # Try different answer patterns
        for pattern in self.answer_patterns:
            matches = re.findall(pattern, remaining_text[:500], re.DOTALL)  # Limit search range
            if matches:
                for match in matches:
                    if len(match) == 2:  # (letter, answer_text)
                        letter, answer_text = match
                        answers.append({
                            'letter': letter.upper(),
                            'text': answer_text.strip()
                        })
                break
        
        return answers[:4]  # Maximum 4 answers (A, B, C, D)
    
    def detect_correct_answer(self, answers: List[Dict]) -> str:
        """Simple heuristic to detect correct answer (placeholder)"""
        # This is a basic implementation - in reality, you'd need:
        # 1. Visual cues (bold, underlined, different color)
        # 2. Explicit marking in text
        # 3. Manual annotation
        
        if answers:
            return answers[0]['letter']  # Default to first answer
        return 'A'
    
    def export_to_excel(self, questions: List[Dict], output_path: str):
        """Export questions to Excel format compatible with Quizizz"""
        if not questions:
            print("No questions to export")
            return
        
        # Prepare data for Excel
        excel_data = []
        
        for q in questions:
            row = {
                'Question': q['question'],
                'Answer 1': '',
                'Answer 2': '',
                'Answer 3': '',
                'Answer 4': '',
                'Correct Answer(s)': q['correct_answer'],
                'Time (seconds)': 30  # Default time
            }
            
            # Fill answers
            for i, answer in enumerate(q['answers']):
                if i < 4:
                    row[f'Answer {i+1}'] = answer['text']
            
            excel_data.append(row)
        
        # Create DataFrame and export
        df = pd.DataFrame(excel_data)
        df.to_excel(output_path, index=False)
        print(f"Exported {len(questions)} questions to {output_path}")
    
    def process_file(self, file_path: str, output_path: str = None):
        """Main processing function"""
        if not os.path.exists(file_path):
            print(f"File not found: {file_path}")
            return
        
        print(f"Processing file: {file_path}")
        
        # Extract text based on file type
        file_ext = os.path.splitext(file_path)[1].lower()
        
        if file_ext == '.pdf':
            raw_text = self.extract_text_from_pdf(file_path)
        elif file_ext in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff']:
            raw_text = self.extract_text_from_image(file_path)
        else:
            print(f"Unsupported file type: {file_ext}")
            return
        
        if not raw_text.strip():
            print("No text extracted from file")
            return
        
        print("Raw text extracted, cleaning...")
        clean_text = self.clean_text(raw_text)
        
        print("Analyzing text structure...")
        structure = self.detect_question_structure(clean_text)
        print(f"Structure detected: {structure}")
        
        print("Parsing questions...")
        questions = self.parse_questions_basic(clean_text)
        
        print(f"Found {len(questions)} questions")
        
        # Show sample results
        if questions:
            print("\nSample question:")
            sample = questions[0]
            print(f"Q{sample['number']}: {sample['question'][:100]}...")
            for ans in sample['answers']:
                print(f"  {ans['letter']}. {ans['text'][:50]}...")
        
        # Export to Excel
        if output_path is None:
            output_path = os.path.splitext(file_path)[0] + '_questions.xlsx'
        
        self.export_to_excel(questions, output_path)
        
        return {
            'raw_text': raw_text,
            'clean_text': clean_text,
            'structure': structure,
            'questions': questions
        }

# Demo usage
def demo_usage():
    """Demo function showing how to use the OCR Question Extractor"""
    
    extractor = OCRQuestionExtractor()
    
    # Example usage
    print("=== OCR Question Extractor Demo ===")
    print("Supported file types: PDF, JPG, PNG, BMP, TIFF")
    print("Usage examples:")
    print("1. Process an image: extractor.process_file('questions.jpg')")
    print("2. Process a PDF: extractor.process_file('exam.pdf')")
    print("3. Custom output: extractor.process_file('input.pdf', 'output.xlsx')")
    
    # Test with sample text
    sample_text = """
    Câu 1: Thủ đô của Việt Nam là gì?
    A. Hồ Chí Minh
    B. Hà Nội
    C. Đà Nẵng
    D. Cần Thơ
    
    Câu 2: Năm Việt Nam gia nhập ASEAN?
    A. 1995
    B. 1996
    C. 1997
    D. 1998
    """
    
    print("\n=== Testing with sample text ===")
    structure = extractor.detect_question_structure(sample_text)
    print(f"Detected structure: {structure}")
    
    questions = extractor.parse_questions_basic(sample_text)
    print(f"Parsed {len(questions)} questions")
    
    for q in questions:
        print(f"\nQ{q['number']}: {q['question']}")
        for ans in q['answers']:
            print(f"  {ans['letter']}. {ans['text']}")

if __name__ == "__main__":
    demo_usage()

=== OCR Question Extractor Demo ===
Supported file types: PDF, JPG, PNG, BMP, TIFF
Usage examples:
1. Process an image: extractor.process_file('questions.jpg')
2. Process a PDF: extractor.process_file('exam.pdf')
3. Custom output: extractor.process_file('input.pdf', 'output.xlsx')

=== Testing with sample text ===
Detected structure: {'question_format': 'Pattern_1', 'answer_format': 'Answer_Pattern_1', 'total_questions': 2, 'sample_matches': [('1', 'Thủ đô củ'), ('2', 'Năm Việt Nam gi')]}
Parsed 2 questions

Q1: Thủ đô củ
  A. Hồ Chí Minh
  B. Hà Nội
  C. Đà Nẵng
  D. Cần Thơ

Q2: Năm Việt Nam gi
  A. 1995
  B. 1996
  C. 1997
  D. 1998


In [2]:
import pytesseract
from PIL import Image

# Set đường dẫn trực tiếp cho Tesseract
pytesseract.pytesseract.tesseract_cmd = r'D://Tesseract//tesseract.exe'

try:
    img = Image.open("testocr.png")
    text = pytesseract.image_to_string(img, lang='vie+eng')
    print("OCR Result:")
    print("="*50)
    print(text)
except FileNotFoundError:
    print("Tesseract not found at D:\\Tesseract\\tesseract.exe")
    print("Check if tesseract.exe exists in that folder")
except Exception as e:
    print(f"Error: {e}")

OCR Result:
This is a lot of 12 point text to test the
ocr code and see if it works on all types
of file format.

The quick brown dog jumped over the
lazy fox. The quick brown dog jumped
over the lazy fox. The quick brown dog
jumped over the lazy fox. The quick
brown dog jumped over the lazy fox.



In [7]:
extractor = OCRQuestionExtractor()
text, ocr_text = extractor.extract_text_from_pdf("C://Users//hokta//Downloads//Chuong 3 Tong quan ve Dart va Flutter.pptx-trang-2.pdf")

Using OCR


In [8]:
print(text)

TRƯỜNG CÔNG NGHỆ THÔNG TIN VÀ TRUYỀN THÔNG
School of Information and Communication Technology
1.2 Các nền tảng (platforms) thực thi
▪Hai nền tảng: Dart Native và Dart Web
6
Các ứng dụng di động và máy tính để 
bàn, bao gồm máy ảo Dart (Dart VM) 
với trình biên dịch just-in-time (JIT) và 
trình ahead-of-time (AOT) 
Các ứng dụng trên nền Web.
Mã nguồn Dart được dịch thành mã 
JavaScript
https://dart.dev/overview




In [9]:
print(ocr_text)

1.2 Cac nén tang (platforms) thwe thi
- Hai nén tang: Dart Native va Dart Web
Statful hot relood
EE

ARM32 'ARM64 18664 JavaScript

Dart Native Dart Web
Cac img dung di d6ng va may tinh dé Cae img dung trén nén Web.
ban, bao gom may do Dart (Dart VM) ‘MA nguon Dart duge dich thinh ma
v6i trinh bién dich just-in-time (JIT) va JavaScript
trinh ahead-of-time (AOT)

Esoictsmenmemenz ye iran orion



## Cách 2

In [1]:
import cv2
import numpy as np
import easyocr
import re
import json

class SimpleQuestionExtractor:
    def __init__(self):
        # Khởi tạo EasyOCR với tiếng Việt
        self.reader = easyocr.Reader(['vi', 'en'], gpu=True)
    
    def extract_from_image(self, image_path):
        """Extract câu hỏi từ ảnh - version đơn giản nhất"""
        
        # 1. Đọc và preprocess ảnh
        img = cv2.imread(image_path)
        
        # 2. OCR để lấy text
        results = self.reader.readtext(img)
        
        # 3. Ghép tất cả text lại
        full_text = ""
        for (bbox, text, confidence) in results:
            if confidence > 0.5:  # Chỉ lấy text có confidence cao
                full_text += text + "\n"
        
        print("=== RAW OCR TEXT ===")
        print(full_text)
        print("==================")
        
        # 4. Parse câu hỏi và options
        question_data = self._parse_text(full_text)
        
        # 5. Detect checkbox (đơn giản)
        correct_idx = self._detect_checked_box(img)
        if correct_idx is not None:
            question_data['correct'] = correct_idx
        
        return question_data
    
    def _parse_text(self, text):
        """Parse text thành structure câu hỏi"""
        lines = [line.strip() for line in text.split('\n') if line.strip()]
        
        question = ""
        options = []
        
        # Tìm câu hỏi (có "BQ:" hoặc "Câu")
        for i, line in enumerate(lines):
            if 'BQ:' in line or 'Câu' in line:
                question = line.replace('BQ:', '').replace('Câu', '').strip()
                # Lấy các dòng tiếp theo làm options
                for j in range(i+1, min(i+5, len(lines))):
                    if lines[j] and not lines[j].startswith('BQ:'):
                        # Loại bỏ checkbox symbols
                        clean_option = re.sub(r'^[\□☐☑✓✔]\s*', '', lines[j])
                        if clean_option:
                            options.append(clean_option)
                break
        
        return {
            'question': question,
            'options': options,
            'correct': 0,  # default
            'type': 'multiple_choice'
        }
    
    def _detect_checked_box(self, img):
        """Detect checkbox nào được tick - version siêu đơn giản"""
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        
        # Template matching cho checkbox được tick
        # Tạo template đơn giản cho checkbox
        template = np.ones((20, 20), dtype=np.uint8) * 255
        cv2.rectangle(template, (2, 2), (18, 18), 0, 2)
        cv2.line(template, (5, 10), (8, 13), 0, 2)
        cv2.line(template, (8, 13), (15, 6), 0, 2)
        
        # Match template
        res = cv2.matchTemplate(gray, template, cv2.TM_CCOEFF_NORMED)
        threshold = 0.6
        locations = np.where(res >= threshold)
        
        if len(locations[0]) > 0:
            # Trả về index của checkbox đầu tiên tìm thấy
            y_positions = locations[0]
            # Sắp xếp theo y coordinate để tìm option nào
            sorted_indices = np.argsort(y_positions)
            return len(sorted_indices) - 1  # Giả sử checkbox cuối cùng là đáp án
        
        return None
    
    def save_to_quizizz_format(self, question_data, output_file="questions.json"):
        """Lưu ra format Quizizz"""
        quizizz_format = {
            "questions": [question_data]
        }
        
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(quizizz_format, f, ensure_ascii=False, indent=2)
        
        print(f"Đã lưu vào {output_file}")

# Usage example
if __name__ == "__main__":
    extractor = SimpleQuestionExtractor()
    
    # Test với ảnh
    image_path = "C://Users//hokta//Downloads//PDF.png"  # Thay bằng path ảnh của bạn
    
    try:
        result = extractor.extract_from_image(image_path)
        print("\n=== EXTRACTED QUESTION ===")
        print(f"Question: {result['question']}")
        print(f"Options: {result['options']}")
        print(f"Correct: {result['correct']}")
        
        # Lưu file
        extractor.save_to_quizizz_format(result)
        
    except Exception as e:
        print(f"Error: {e}")

=== RAW OCR TEXT ===
#Hàm nào của chương trình Dart được gọi là top-level function?
Hàm main
Hàm nextInt
OHàm constructor
OHam call
Hàm build
#Hàm nào sau dây
phải là một top-level function của Dart
OHàm constructor
OHàm var
Hàm mixin
Tát cả các dáp án còn lai
không


=== EXTRACTED QUESTION ===
Question: 
Options: []
Correct: 0
Đã lưu vào questions.json


In [1]:
import torch
print(torch.version.cuda)       # Phiên bản CUDA mà PyTorch build
print(torch.cuda.is_available())  # True nếu GPU dùng được


12.8
True
