In [1]:
!pip install -q packaging ninja

[0m

In [2]:
!pip install flash-attn --no-build-isolation

[0m

In [5]:
# Cell 1: Install Dependencies and import

In [4]:
# Install system dependencies
!sudo apt-get update -qq
!sudo apt-get install -y -qq poppler-utils tesseract-ocr

# Install Python packages
!pip install -q pdf2image Pillow pytesseract opencv-python-headless pandas numpy tqdm
!pip install -q torch torchvision --upgrade
!pip install -q transformers==4.46.3 tokenizers==0.20.3 --upgrade
!pip install -q einops addict easydict safetensors accelerate


print("="*50)
print("✓ All dependencies installed!")
print("="*50)

✓ All dependencies installed!


In [1]:
import warnings
import logging
import os
import sys
import re
from io import StringIO
from pathlib import Path
from datetime import datetime

import torch
import pandas as pd
from PIL import Image
from pdf2image import convert_from_path
import pytesseract
from transformers import AutoModel, AutoTokenizer

warnings.filterwarnings("ignore")
logging.getLogger("transformers").setLevel(logging.ERROR)

print("✓ Imports complete")

✓ Imports complete


In [2]:
# Cell 2: Load Model

In [3]:
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
MODEL_NAME = 'deepseek-ai/DeepSeek-OCR'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModel.from_pretrained(MODEL_NAME, _attn_implementation='flash_attention_2', trust_remote_code=True, use_safetensors=True)
model = model.eval().cuda().to(torch.bfloat16)

print("✓ Model loaded")

2025-12-24 22:09:31.563122: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766614171.576284   11258 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766614171.582348   11258 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-12-24 22:09:31.598744: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


✓ Model loaded


In [None]:
# Cell 3: Configuration

In [5]:
# ============================================================
# CONFIGURATION - Modify these as needed
# ============================================================

CONFIG = {
    'dpi': 400,
    'base_size': 1024,
    'image_size': 640,
    'output_dir': './output',
    'images_dir': './images'
}

ELEMENT_NAMES = {
    'AL': 'Aluminum', 'V': 'Vanadium', 'FE': 'Iron', 'C': 'Carbon',
    'N': 'Nitrogen', 'O': 'Oxygen', 'Y': 'Yttrium', 'H': 'Hydrogen',
    'TI': 'Titanium', 'SI': 'Silicon', 'MN': 'Manganese', 'P': 'Phosphorus',
    'S': 'Sulfur', 'CR': 'Chromium', 'MO': 'Molybdenum', 'NI': 'Nickel',
    'CU': 'Copper', 'W': 'Tungsten', 'CO': 'Cobalt', 'NB': 'Niobium',
    'B': 'Boron', 'SN': 'Tin', 'ZN': 'Zinc', 'PB': 'Lead', 'ZR': 'Zirconium',
    'TA': 'Tantalum', 'HF': 'Hafnium', 'MG': 'Magnesium', 'CA': 'Calcium'
}
ELEMENTS = set(ELEMENT_NAMES.keys())

print("✓ Config loaded")

✓ Config loaded


In [4]:
# Cell 4: PDF to Images Pipeline

In [6]:
def fix_orientation(img):
    """Fix image orientation using Tesseract OSD"""
    try:
        osd = pytesseract.image_to_osd(img)
        rotation = 0
        for line in osd.split('\n'):
            if 'Rotate:' in line:
                rotation = int(line.split(':')[1].strip())
                break
        
        if rotation == 90:
            img = img.rotate(-90, expand=True)
        elif rotation == 180:
            img = img.rotate(180, expand=True)
        elif rotation == 270:
            img = img.rotate(-270, expand=True)
        
        return img
    except:
        return img


def pdf_to_images(pdf_path, dpi=None, output_dir=None):
    """Convert any PDF to correctly oriented images"""
    dpi = dpi or CONFIG['dpi']
    output_dir = output_dir or CONFIG['images_dir']
    
    os.makedirs(output_dir, exist_ok=True)
    pdf_name = Path(pdf_path).stem
    
    print(f"Converting: {pdf_path} (DPI: {dpi})")
    
    images = convert_from_path(pdf_path, dpi=dpi)
    print(f"  Pages: {len(images)}")
    
    saved_paths = []
    for i, img in enumerate(images, 1):
        img = fix_orientation(img)
        img_path = f"{output_dir}/{pdf_name}_page_{i}.png"
        img.save(img_path, 'PNG')
        saved_paths.append(img_path)
        print(f"  ✓ Page {i} saved")
    
    return saved_paths


print("✓ PDF pipeline defined")

✓ PDF pipeline defined


In [21]:
# Cell 5: OCR Pipeline

In [7]:
def run_ocr(image_path):
    """Run DeepSeek OCR on a single image"""
    prompt = "<image>\nFree OCR."
    os.makedirs(CONFIG['output_dir'], exist_ok=True)
    
    old_stdout = sys.stdout
    sys.stdout = captured = StringIO()
    
    try:
        result = model.infer(
            tokenizer, 
            prompt=prompt, 
            image_file=image_path, 
            output_path=CONFIG['output_dir'],
            base_size=CONFIG['base_size'], 
            image_size=CONFIG['image_size'], 
            crop_mode=True,
            save_results=True, 
            test_compress=True
        )
    finally:
        sys.stdout = old_stdout
    
    return result if result else captured.getvalue()


def run_ocr_on_all_images(image_paths):
    """Run OCR on all images and merge results"""
    all_text = []
    
    for i, img_path in enumerate(image_paths, 1):
        print(f"  OCR Page {i}/{len(image_paths)}...", end=" ")
        text = run_ocr(img_path)
        all_text.append(f"\n{'='*50}\nPAGE {i}\n{'='*50}\n{text}")
        print(f"✓ {len(text)} chars")
    
    return '\n'.join(all_text)


print("✓ OCR pipeline defined")

✓ OCR pipeline defined


In [23]:
# Cell 5: Parser Pipeline

In [8]:
def german_to_float(text):
    """Convert German number format"""
    if not text:
        return None
    text = str(text).strip().replace(' ', '')
    if '.' in text and ',' in text:
        text = text.replace('.', '').replace(',', '.')
    elif ',' in text:
        text = text.replace(',', '.')
    try:
        return float(text)
    except:
        return None


def parse_value(text):
    """Parse numeric value from text"""
    if not text:
        return None, None, None
    
    text = str(text).strip()
    text = re.sub(r'(\d),(\d)', r'\1.\2', text)
    
    m = re.search(r'(\d+\.?\d*)\s*[-–]\s*(\d+\.?\d*)', text)
    if m:
        return float(m.group(1)), float(m.group(2)), 'range'
    
    m = re.search(r'max\s*[:\s]*(\d+\.?\d*)', text, re.I)
    if m:
        return float(m.group(1)), None, 'max'
    
    m = re.search(r'[<≤]\s*(\d+\.?\d*)', text)
    if m:
        return float(m.group(1)), None, 'less_than'
    
    m = re.search(r'(\d+\.?\d*)', text)
    if m:
        return float(m.group(1)), None, 'exact'
    
    return None, None, None


def extract_composition(text):
    """Extract chemical composition from OCR text"""
    results = []
    
    # Find markdown table rows
    rows = []
    for line in text.split('\n'):
        if '|' in line and not re.match(r'^[\|\s\-:]+$', line.strip()):
            cells = [c.strip() for c in line.split('|') if c.strip()]
            if cells:
                rows.append(cells)
    
    # Find element header row
    header_idx, header_row = None, None
    for i, row in enumerate(rows):
        if sum(1 for c in row if c.upper().strip() in ELEMENTS) >= 3:
            header_idx, header_row = i, row
            break
    
    # Extract values
    if header_row:
        elem_map = {j: c.upper().strip() for j, c in enumerate(header_row) if c.upper().strip() in ELEMENTS}
        
        for row_idx in range(header_idx + 1, len(rows)):
            row = rows[row_idx]
            if sum(1 for c in row if c.upper().strip() in ELEMENTS) >= 3:
                continue
            
            row_type = 'actual'
            first = row[0].upper() if row else ''
            if 'TOP' in first:
                row_type = 'top'
            elif 'BOTTOM' in first:
                row_type = 'bottom'
            elif any(x in first for x in ['REQ', 'MIN', 'MAX', 'SPEC']):
                row_type = 'requirement'
            
            for col_idx, elem in elem_map.items():
                if col_idx < len(row):
                    val, max_val, vtype = parse_value(row[col_idx])
                    if val is not None:
                        entry = {
                            'element_symbol': elem,
                            'element_name': ELEMENT_NAMES.get(elem, elem),
                            'value': val,
                            'unit': 'wt.%',
                            'value_type': vtype,
                            'sample_position': row_type
                        }
                        if max_val is not None:
                            entry['max_value'] = max_val
                        results.append(entry)
    
    # Ti-remainder
    if re.search(r'ti[-\s]?remainder', text, re.I):
        results.append({
            'element_symbol': 'TI', 'element_name': 'Titanium',
            'value': None, 'unit': 'wt.%', 'value_type': 'balance', 'sample_position': 'actual'
        })
    
    return results


def extract_metadata(text):
    """Extract certificate metadata"""
    info = {}
    m = re.search(r'(Ti-6Al-4V|TI-6AL-4V)', text, re.I)
    if m:
        info['alloy'] = m.group(1).upper()
    m = re.search(r'Heat\s*(?:№|No\.?)[:\s]*([0-9\-]+)', text, re.I)
    if m:
        info['heat_no'] = m.group(1)
    return info


print("✓ Parser pipeline defined")

✓ Parser pipeline defined


In [25]:
# Cell 6: Main Pipeline Function

In [9]:
def extract_chemical_composition(pdf_path, output_csv=None):
    """
    MAIN PIPELINE: Extract chemical composition from any PDF
    
    Args:
        pdf_path: Path to PDF file
        output_csv: Output CSV filename (optional)
    
    Returns:
        DataFrame with extracted composition
    """
    print("="*60)
    print("CHEMICAL COMPOSITION EXTRACTION PIPELINE")
    print("="*60)
    
    # Step 1: Convert PDF to images
    print("\n[Step 1] Converting PDF to images...")
    image_paths = pdf_to_images(pdf_path)
    
    # Step 2: Run OCR on all images
    print("\n[Step 2] Running OCR...")
    ocr_text = run_ocr_on_all_images(image_paths)
    print(f"  Total: {len(ocr_text)} chars")
    
    # Save raw OCR
    with open('merged_ocr.txt', 'w', encoding='utf-8') as f:
        f.write(ocr_text)
    
    # Step 3: Parse composition
    print("\n[Step 3] Parsing chemical composition...")
    composition = extract_composition(ocr_text)
    metadata = extract_metadata(ocr_text)
    
    if not composition:
        print("  ⚠ No composition found")
        return pd.DataFrame()
    
    for item in composition:
        item.update(metadata)
    
    df = pd.DataFrame(composition)
    
    # Order columns
    cols = ['element_symbol', 'element_name', 'value', 'max_value', 'unit', 
            'value_type', 'sample_position', 'alloy', 'heat_no']
    df = df[[c for c in cols if c in df.columns] + [c for c in df.columns if c not in cols]]
    df = df.sort_values(['element_symbol', 'sample_position']).reset_index(drop=True)
    
    # Step 4: Save CSV
    if output_csv is None:
        output_csv = f"chemical_composition_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
    
    df.to_csv(output_csv, index=False)
    
    print(f"\n[Step 4] Results saved: {output_csv}")
    print(f"  Entries: {len(df)}")
    print(f"  Elements: {', '.join(sorted(df['element_symbol'].unique()))}")
    
    print("\n" + "="*60)
    print("✓ PIPELINE COMPLETE")
    print("="*60)
    
    return df


print("✓ Main pipeline defined")

✓ Main pipeline defined


In [27]:
# Cell 7: Run Pipeline

In [10]:
# ============================================================
# USAGE: Just change the PDF path and run!
# ============================================================

PDF_PATH = "matcert _AM_ 8_31_04157.pdf"

df = extract_chemical_composition(PDF_PATH)
display(df)

CHEMICAL COMPOSITION EXTRACTION PIPELINE

[Step 1] Converting PDF to images...
Converting: matcert _AM_ 8_31_04157.pdf (DPI: 400)
  Pages: 2
  ✓ Page 1 saved
  ✓ Page 2 saved

[Step 2] Running OCR...
  OCR Page 1/2... 

image: 0it [00:00, ?it/s]
other: 0it [00:00, ?it/s]


✓ 2045 chars
  OCR Page 2/2... 

image: 0it [00:00, ?it/s]
other: 0it [00:00, ?it/s]

✓ 1829 chars
  Total: 4095 chars

[Step 3] Parsing chemical composition...

[Step 4] Results saved: chemical_composition_20251224_221305.csv
  Entries: 61
  Elements: AL, C, FE, H, N, O, V, Y

✓ PIPELINE COMPLETE





Unnamed: 0,element_symbol,element_name,value,max_value,unit,value_type,sample_position,alloy
0,AL,Aluminum,11.0000,,wt.%,exact,actual,TI-6AL-4V
1,AL,Aluminum,203.0000,,wt.%,exact,actual,TI-6AL-4V
2,AL,Aluminum,891.0000,,wt.%,exact,actual,TI-6AL-4V
3,AL,Aluminum,930.0000,,wt.%,exact,actual,TI-6AL-4V
4,AL,Aluminum,891.0000,,wt.%,exact,actual,TI-6AL-4V
...,...,...,...,...,...,...,...,...
56,V,Vanadium,4.0800,4.090,wt.%,range,bottom,TI-6AL-4V
57,V,Vanadium,3.5000,4.500,wt.%,range,top,TI-6AL-4V
58,Y,Yttrium,0.0036,,wt.%,exact,actual,TI-6AL-4V
59,Y,Yttrium,0.0034,,wt.%,exact,actual,TI-6AL-4V


In [29]:
# Cell 8: Summary View

In [11]:
if not df.empty:
    print("SUMMARY BY ELEMENT:")
    summary = df.groupby('element_symbol').agg({
        'value': ['min', 'max', 'mean'],
        'value_type': 'first'
    }).round(4)
    display(summary)

SUMMARY BY ELEMENT:


Unnamed: 0_level_0,value,value,value,value_type
Unnamed: 0_level_1,min,max,mean,first
element_symbol,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
AL,0.2,930.0,469.7858,exact
C,0.006,46.4,30.4229,exact
FE,0.19,9183.0,1251.5989,exact
H,0.0004,0.0004,0.0004,range
N,0.002,46.4,21.7128,exact
O,0.0034,18.2,3.7502,exact
V,3.5,1017.0,681.568,exact
Y,0.0034,0.194,0.067,exact


In [31]:
# Cell 9: Analysis

In [12]:
df[df['element_name']=='Aluminum']

Unnamed: 0,element_symbol,element_name,value,max_value,unit,value_type,sample_position,alloy
0,AL,Aluminum,11.0,,wt.%,exact,actual,TI-6AL-4V
1,AL,Aluminum,203.0,,wt.%,exact,actual,TI-6AL-4V
2,AL,Aluminum,891.0,,wt.%,exact,actual,TI-6AL-4V
3,AL,Aluminum,930.0,,wt.%,exact,actual,TI-6AL-4V
4,AL,Aluminum,891.0,,wt.%,exact,actual,TI-6AL-4V
5,AL,Aluminum,930.0,,wt.%,exact,actual,TI-6AL-4V
6,AL,Aluminum,891.0,,wt.%,exact,actual,TI-6AL-4V
7,AL,Aluminum,878.0,,wt.%,exact,actual,TI-6AL-4V
8,AL,Aluminum,6.53,6.54,wt.%,range,bottom,TI-6AL-4V
9,AL,Aluminum,0.2,,wt.%,exact,requirement,TI-6AL-4V


In [13]:
df.to_csv("submission.csv")