# AP Shah College ID Card Information Extractor

This notebook extracts the following information from college ID cards:
1. **Photo** (middle of the card)
2. **Name** (below photo)
3. **Department** (below name)
4. **Moodle ID** (8-digit code at bottom, format: 2XXXXXXX)

**Ignored regions:**
- Top ribbon (college name)
- Bottom right (principal's signature)
- Placeholder areas

## Setup and Imports

In [None]:
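# Install required packages (run once)
# Note: pytesseract is only a wrapper - the Tesseract OCR engine itself must also be installed on the system.
# !pip install opencv-python pytesseract pillow numpy pandas matplotlib easyocr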

In [None]:
import cv2
import numpy as np
import pytesseract
from PIL import Image
import matplotlib.pyplot as plt
import pandas as pd
import os
import re
from pathlib import Path

# If on Windows, set tesseract path
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Printed last in the cell, so seeing it confirms every import above succeeded.
print("✅ Libraries imported successfully!")

## Configuration

In [None]:
# Input and output folders, relative to the notebook's working directory.
DATA_DIR = Path("data")
OUTPUT_DIR = Path("output")
OUTPUT_DIR.mkdir(exist_ok=True)  # no error if the folder already exists

# Card layout. Every entry is (x1, y1, x2, y2): the top-left and bottom-right
# corners of a rectangle, expressed as fractions (0 to 1) of the image width
# (x) and height (y). The values are starting points and must be tuned to the
# actual ID card layout.
REGIONS = dict(
    # Top ribbon - ignore
    ignore_top=(0, 0, 1, 0.15),
    # Photo in middle
    photo=(0.3, 0.2, 0.7, 0.5),
    # Name below photo
    name=(0.1, 0.52, 0.9, 0.62),
    # Department below name
    department=(0.1, 0.63, 0.9, 0.73),
    # Moodle ID at bottom
    moodle_id=(0.1, 0.85, 0.6, 0.95),
    # Principal signature - ignore
    ignore_signature=(0.7, 0.85, 1, 0.95),
)

print("✅ Configuration set!")

## Helper Functions

In [None]:
def load_image(image_path):
    """Read an image file and return it in both BGR and RGB channel order.

    Args:
        image_path: Location of the image file (str or Path).

    Returns:
        tuple: ``(img, img_rgb)`` where ``img`` is the array returned by
        ``cv2.imread`` and ``img_rgb`` is the same image converted with
        ``cv2.COLOR_BGR2RGB`` (the order matplotlib expects).

    Raises:
        FileNotFoundError: If ``image_path`` does not point to a file.
        ValueError: If the file exists but OpenCV could not decode it.
    """
    img = cv2.imread(str(image_path))

    # cv2.imread signals failure by returning None instead of raising, which
    # would otherwise surface as an opaque error inside cvtColor below.
    if img is None:
        if not Path(image_path).is_file():
            raise FileNotFoundError(f"Image file not found: {image_path}")
        raise ValueError(f"Could not decode image file: {image_path}")

    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    return img, img_rgb

def get_region(img, region_coords):
    """Crop a rectangular region given as fractions of the image size.

    Args:
        img: Image array whose first two dimensions are (height, width).
        region_coords: ``(x1, y1, x2, y2)`` as fractions (0 to 1) of the
            image width (x) and height (y); (x1, y1) is the top-left corner
            and (x2, y2) the bottom-right corner.

    Returns:
        The slice ``img[y1:y2, x1:x2]`` in pixel coordinates. Coordinates
        that fall outside the image are clamped to its borders.
    """
    h, w = img.shape[:2]
    x1, y1, x2, y2 = region_coords

    def to_pixels(fraction, size):
        # Clamp to [0, size]: a negative index would otherwise wrap around
        # in the slice below and silently yield an empty or wrong crop.
        return min(max(int(fraction * size), 0), size)

    left, right = to_pixels(x1, w), to_pixels(x2, w)
    top, bottom = to_pixels(y1, h), to_pixels(y2, h)

    return img[top:bottom, left:right]

def preprocess_for_ocr(img):
    """Turn an image region into a binarised, denoised image for OCR.

    Args:
        img: Image array; a 3-dimensional array is converted from BGR to
            grayscale, any other array is used as-is.

    Returns:
        The thresholded image after non-local-means denoising.
    """
    # Only arrays with a channel axis need the colour conversion
    if len(img.shape) == 3:
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    else:
        gray = img

    # With THRESH_OTSU the threshold is chosen automatically, so the 0 passed
    # as the threshold argument is only a placeholder
    otsu_mode = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    binary = cv2.threshold(gray, 0, 255, otsu_mode)[1]

    return cv2.fastNlMeansDenoising(binary, h=10)

def extract_text(img_region, config=''):
    """Run Tesseract OCR on an image region and return the stripped text.

    Args:
        img_region: Image array to read text from.
        config: Extra Tesseract options, passed through to pytesseract.

    Returns:
        str: The recognised text without leading/trailing whitespace.
    """
    ocr_input = preprocess_for_ocr(img_region)
    return pytesseract.image_to_string(ocr_input, config=config).strip()

def extract_moodle_id(text):
    """Find a Moodle ID (exactly 8 digits, the first being 2) in OCR text.

    Args:
        text: Text to search; may be empty or None.

    Returns:
        str | None: The first standalone 8-digit number starting with 2,
        or None when there is no such number.
    """
    if not text:
        return None

    # The lookarounds require that the 8 digits are not part of a longer
    # digit run; without them "9821104042" would yield the bogus ID
    # "21104042".
    match = re.search(r'(?<!\d)2\d{7}(?!\d)', text)
    return match.group(0) if match else None

def clean_text(text):
    """Normalise whitespace in OCR output.

    Splits on any run of whitespace (spaces, tabs, newlines) and rejoins the
    pieces with single spaces, which also drops leading/trailing whitespace.
    """
    words = text.split()
    return ' '.join(words)

# Printed last in the cell, so seeing it confirms the helper definitions above were executed.
print("✅ Helper functions defined!")

## Visualization Functions

In [None]:
def visualize_regions(img_rgb, regions):
    """Draw every configured region as a labelled rectangle and show the result.

    Args:
        img_rgb: Image array in RGB order; it is copied, not modified.
        regions: Mapping of region name to ``(x1, y1, x2, y2)`` given as
            fractions (0 to 1) of the image width and height.
    """
    h, w = img_rgb.shape[:2]
    img_copy = img_rgb.copy()

    # Colours are RGB because the image is drawn in RGB order
    colors = {
        'photo': (0, 255, 0),
        'name': (255, 0, 0),
        'department': (0, 0, 255),
        'moodle_id': (255, 255, 0),
        'ignore_top': (128, 128, 128),
        'ignore_signature': (128, 128, 128)
    }

    # Minimum baseline (in pixels from the top) at which a label drawn with
    # this font scale is still fully inside the image.
    min_label_y = 12

    for region_name, (x1, y1, x2, y2) in regions.items():
        x1_px, y1_px = int(x1 * w), int(y1 * h)
        x2_px, y2_px = int(x2 * w), int(y2 * h)

        color = colors.get(region_name, (255, 255, 255))  # white for unknown names
        cv2.rectangle(img_copy, (x1_px, y1_px), (x2_px, y2_px), color, 2)

        # putText positions the text by its bottom-left corner. Normally the
        # label sits just above the box, but for a box touching the top edge
        # that would put it off-image, so draw it just inside the box instead.
        label_y = y1_px - 5
        if label_y < min_label_y:
            label_y = y1_px + 15
        cv2.putText(img_copy, region_name, (x1_px, label_y),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1)

    plt.figure(figsize=(12, 8))
    plt.imshow(img_copy)
    plt.title("ID Card Regions")
    plt.axis('off')
    plt.show()

def display_extracted_info(img_rgb, photo, name, department, moodle_id):
    """Show the card next to the cropped photo and print the text fields.

    Args:
        img_rgb: Full ID card image in RGB order.
        photo: Cropped photo in BGR order; may be None or empty, in which
            case the right-hand panel stays blank.
        name: Extracted name text.
        department: Extracted department text.
        moodle_id: Extracted Moodle ID (or None).
    """
    _, (card_ax, photo_ax) = plt.subplots(1, 2, figsize=(15, 6))

    # Left panel: the full card
    card_ax.imshow(img_rgb)
    card_ax.set_title("Original ID Card")
    card_ax.axis('off')

    # Right panel: the cropped photo, when there is one to show
    has_photo = photo is not None and photo.size > 0
    if has_photo:
        photo_ax.imshow(cv2.cvtColor(photo, cv2.COLOR_BGR2RGB))
        photo_ax.set_title("Extracted Photo")
    photo_ax.axis('off')

    plt.tight_layout()
    plt.show()

    # Text summary framed by two separator lines
    separator = "="*50
    print("\n" + separator)
    print("EXTRACTED INFORMATION")
    print(separator)
    print(f"Name:       {name}")
    print(f"Department: {department}")
    print(f"Moodle ID:  {moodle_id}")
    print(separator)

# Printed last in the cell, so seeing it confirms the visualization functions above were defined.
print("✅ Visualization functions defined!")

## Main Extraction Function

In [None]:
def extract_id_card_info(image_path, visualize=True):
    """
    Extract all information from an ID card

    Crops the photo region and saves it to OUTPUT_DIR as
    ``<image stem>_photo.jpg``, then runs OCR on the name, department and
    Moodle ID regions defined in REGIONS.

    Args:
        image_path: Path to ID card image (str or Path)
        visualize: Whether to display visualizations

    Returns:
        dict: Extracted information with the keys ``image_name``, ``name``,
        ``department``, ``moodle_id`` (None when no ID was recognised) and
        ``photo_path`` (None when the photo could not be saved)
    """
    # Accept plain strings too; .name and .stem below need a Path
    image_path = Path(image_path)
    print(f"\n🔍 Processing: {image_path.name}")

    # Load image
    img, img_rgb = load_image(image_path)

    if visualize:
        visualize_regions(img_rgb, REGIONS)

    # Extract photo region
    photo = get_region(img, REGIONS['photo'])
    photo_path = OUTPUT_DIR / f"{image_path.stem}_photo.jpg"
    # cv2.imwrite rejects an empty crop and reports other failures by
    # returning False. A photo that cannot be saved should not discard the
    # text fields, so warn and carry on without a photo path.
    if photo.size == 0 or not cv2.imwrite(str(photo_path), photo):
        print(f"⚠️ Could not save photo for {image_path.name}")
        photo_path = None

    # Extract name
    name = clean_text(extract_text(get_region(img, REGIONS['name'])))

    # Extract department
    department = clean_text(extract_text(get_region(img, REGIONS['department'])))

    # Extract Moodle ID (Tesseract is asked to recognise digits only)
    moodle_region = get_region(img, REGIONS['moodle_id'])
    moodle_text = extract_text(moodle_region, config='--psm 6 digits')
    moodle_id = extract_moodle_id(moodle_text)

    # Display results
    if visualize:
        display_extracted_info(img_rgb, photo, name, department, moodle_id)

    # Return structured data
    return {
        'image_name': image_path.name,
        'name': name,
        'department': department,
        'moodle_id': moodle_id,
        'photo_path': str(photo_path) if photo_path is not None else None
    }

# Printed last in the cell, so seeing it confirms extract_id_card_info was defined.
print("✅ Main extraction function defined!")

## Process Single ID Card

In [None]:
# Collect every .jpg and .png file in the data folder; the first one found
# is used as the sample card for a single, visualised extraction run.
image_files = [*DATA_DIR.glob("*.jpg"), *DATA_DIR.glob("*.png")]

if image_files:
    print(f"Found {len(image_files)} image(s)")

    # Run the full pipeline, with plots, on the first image only
    sample_image = image_files[0]
    result = extract_id_card_info(sample_image, visualize=True)
else:
    print("⚠️ No images found in 'data' folder!")
    print("Please add ID card images to the 'data' folder.")

## Process Multiple ID Cards

In [None]:
def process_all_id_cards(data_dir, output_csv='extracted_data.csv'):
    """
    Process all ID cards in the data directory

    Every ``*.jpg`` and ``*.png`` file directly inside ``data_dir`` is run
    through ``extract_id_card_info`` without visualizations. A card that
    raises is reported and skipped so one bad image does not stop the batch.

    Args:
        data_dir: Directory containing ID card images (str or Path)
        output_csv: Output CSV filename, created inside OUTPUT_DIR

    Returns:
        DataFrame with all extracted information, or None when the
        directory contains no images
    """
    # Accept plain strings too; .glob below needs a Path
    data_dir = Path(data_dir)

    # Sorted so the CSV row order is the same on every run and platform
    # (glob itself yields files in arbitrary order).
    image_files = sorted([*data_dir.glob("*.jpg"), *data_dir.glob("*.png")])

    if not image_files:
        print("⚠️ No images found!")
        return None

    print(f"\n📂 Processing {len(image_files)} ID cards...\n")

    results = []

    for img_path in image_files:
        # Deliberately broad: this is the batch boundary, and any failure on
        # one card should be reported and skipped rather than abort the run.
        try:
            result = extract_id_card_info(img_path, visualize=False)
        except Exception as e:
            print(f"❌ Error processing {img_path.name}: {e}")
        else:
            results.append(result)
            print(f"✅ {img_path.name}")

    # Create DataFrame
    df = pd.DataFrame(results)

    # Save to CSV
    csv_path = OUTPUT_DIR / output_csv
    df.to_csv(csv_path, index=False)
    print(f"\n💾 Data saved to: {csv_path}")

    return df

# Run the batch extraction, but only when the earlier cell found images
if image_files:
    df_results = process_all_id_cards(DATA_DIR)

    # process_all_id_cards returns None when it finds no images itself
    if df_results is not None:
        print("\n📊 Summary:")
        display(df_results)

## Fine-tune Region Coordinates

In [None]:
# Use this cell to fine-tune the region coordinates
# Adjust REGIONS dictionary values and re-run extraction

def test_region_adjustment(image_path, region_name, new_coords):
    """
    Preview a candidate set of coordinates for one region

    Shows the full card beside the crop produced by ``new_coords`` and, for
    every region except 'photo', prints the OCR result for that crop.

    Args:
        image_path: Path to test image
        region_name: Name of region to test (used for the plot title and to
            decide whether OCR is run)
        new_coords: New coordinates (x1, y1, x2, y2) as fractions (0 to 1)
            of the image width and height
    """
    img, img_rgb = load_image(image_path)

    # Crop using the candidate coordinates rather than the REGIONS entry
    crop = get_region(img, new_coords)

    def show_panel(position, picture, title):
        # Draw one image, without axes, in a 1x2 grid
        plt.subplot(1, 2, position)
        plt.imshow(picture)
        plt.title(title)
        plt.axis('off')

    plt.figure(figsize=(10, 4))
    show_panel(1, img_rgb, "Full ID Card")
    show_panel(2, cv2.cvtColor(crop, cv2.COLOR_BGR2RGB), f"{region_name} Region")

    plt.tight_layout()
    plt.show()

    # The photo region holds no text, so OCR is skipped for it
    if region_name == 'photo':
        return

    text = extract_text(crop)
    print(f"Extracted text: {text}")

# Example usage:
# test_region_adjustment(image_files[0], 'name', (0.1, 0.52, 0.9, 0.62))

## Export Results

In [None]:
# Write the batch results to CSV and JSON, provided the batch cell has run
# and produced a DataFrame
if 'df_results' not in locals() or df_results is None:
    print("⚠️ No results to export. Process some ID cards first!")
else:
    csv_target = OUTPUT_DIR / 'id_card_data.csv'
    json_target = OUTPUT_DIR / 'id_card_data.json'

    # One row per card in the CSV; one object per card in the JSON
    df_results.to_csv(csv_target, index=False)
    df_results.to_json(json_target, orient='records', indent=2)

    print("✅ Results exported to:")
    print(f"   - {csv_target}")
    print(f"   - {json_target}")

## Notes & Tips

### Improving Accuracy:
1. **Adjust Region Coordinates**: Use the `test_region_adjustment()` function to fine-tune coordinates
2. **Image Quality**: Ensure high-resolution, well-lit images
3. **Preprocessing**: If OCR accuracy is poor, adjust the steps in `preprocess_for_ocr()` (the thresholding method and the denoising strength `h`)
4. **OCR Config**: Try different Tesseract PSM modes for better text extraction

### Region Coordinate Format:
- `(x1, y1, x2, y2)` where values are fractions (0 to 1) of the image width (x) and height (y)
- x1, y1 = top-left corner
- x2, y2 = bottom-right corner

### Troubleshooting:
- If the Moodle ID is not detected, check that the ID on the card is 8 digits starting with 2 (pattern `2XXXXXXX`) and that the `moodle_id` region fully covers it
- If text is garbled, adjust preprocessing parameters
- Use `visualize=True` to see which regions are being extracted

### Next Steps:
1. Place ID card images in the `data/` folder
2. Run the notebook cells sequentially
3. Adjust region coordinates if needed
4. Process all cards and export results