# 라인 추출

In [22]:
import torch

############################################
# 0) GPU setup (CUDA device to use when the model runs in parallel)
############################################
# Make CUDA device index 3 the current device; the author's stated intent is
# to use only physical GPU #3.
# NOTE(review): set_device only selects the current device. Whether the other
# GPUs are actually left unused is not established here -- confirm, and
# consider restricting visibility via CUDA_VISIBLE_DEVICES if that is required.
torch.cuda.set_device(3)

import os
import cv2
import numpy as np
import easyocr
import pandas as pd
# Sanity check: report CUDA availability and the index of the active device.
print(f"PyTorch CUDA Available: {torch.cuda.is_available()}")
print(f"Current Device Index: {torch.cuda.current_device()}")

############################################
# 1) 라인 세그먼트 함수
############################################
def segment_lines_by_projection(
    image_path: str,
    output_dir: str,
    prefix: str = "line_seg",
    threshold_val: int = 150,
    min_line_height: int = 10
):
    """Split a page image into text lines with a horizontal projection profile.

    The image is loaded as grayscale and binarized with an inverted threshold
    so that text pixels become non-zero. Every maximal run of consecutive rows
    that contains at least one text pixel is treated as one line. Each kept
    line is cropped (full width) from the grayscale image and written to
    ``output_dir`` as ``{prefix}_line_{i}.png``.

    Args:
        image_path: Path of the page image to segment.
        output_dir: Directory that receives the cropped line images; it is
            created if it does not exist.
        prefix: File-name prefix for the saved line images.
        threshold_val: Threshold passed to ``cv2.threshold``.
        min_line_height: A run of rows is kept only when its height is
            strictly greater than this value.

    Returns:
        List of ``(y_start, y_end)`` row ranges (``y_end`` exclusive), ordered
        top to bottom. Index ``i`` in the list matches the ``i`` in the saved
        file name.

    Raises:
        FileNotFoundError: If the image could not be loaded.
        OSError: If a cropped line image could not be written.
    """
    os.makedirs(output_dir, exist_ok=True)

    # 1) Load the page as grayscale.
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        raise FileNotFoundError(f"Could not read the image file: {image_path}")

    # 2) Binarize with inversion so that text becomes white (non-zero).
    _, bin_img = cv2.threshold(img, threshold_val, 255, cv2.THRESH_BINARY_INV)

    # 3) Horizontal projection: per-row sum of the binarized pixel values.
    h = bin_img.shape[0]
    horizontal_projection = np.sum(bin_img, axis=1)

    # 4) Find line boundaries as runs of rows with a non-zero projection.
    line_boxes = []
    in_line = False
    line_start = 0
    for y in range(h):
        if horizontal_projection[y] > 0:  # row contains text
            if not in_line:
                in_line = True
                line_start = y
        elif in_line:
            if (y - line_start) > min_line_height:
                line_boxes.append((line_start, y))
            in_line = False

    # A run that reaches the bottom edge is closed at the image height.
    if in_line and (h - line_start) > min_line_height:
        line_boxes.append((line_start, h))

    # 5) Crop each line from the grayscale image and save it.
    for i, (ystart, yend) in enumerate(line_boxes):
        line_path = os.path.join(output_dir, f"{prefix}_line_{i}.png")
        # cv2.imwrite signals failure through its return value; surface it
        # here instead of letting a later read of the missing file fail.
        if not cv2.imwrite(line_path, img[ystart:yend]):
            raise OSError(f"Could not write line image: {line_path}")

    return line_boxes

############################################
# 2) 라인 이미지를 EasyOCR Recognizer만 사용하여 OCR
############################################
def recognize_line_image(line_image_path: str, reader) -> str:
    """Run only the EasyOCR recognizer on an already-cropped line image file.

    No text detector is involved: the whole image is handed to
    ``reader.recognize`` with ``horizontal_list`` and ``free_list`` left as
    ``None``.

    Args:
        line_image_path: Path of the line image to recognize.
        reader: Object exposing ``recognize(...)`` (an ``easyocr.Reader`` in
            this notebook).

    Returns:
        The recognized text pieces joined with single spaces.

    Raises:
        FileNotFoundError: If the path does not exist or the image cannot be
            loaded.
    """
    if not os.path.exists(line_image_path):
        raise FileNotFoundError(f"Could not read line image: {line_image_path}")

    # Load the line as grayscale; imread returns None on failure.
    gray_line = cv2.imread(line_image_path, cv2.IMREAD_GRAYSCALE)
    if gray_line is None:
        raise FileNotFoundError(f"Could not read line image: {line_image_path}")

    # The image is already a single line, so the detector is unnecessary.
    recognized = reader.recognize(
        img_cv_grey=gray_line,
        horizontal_list=None,
        free_list=None,
        detail=1
    )

    # With detail=1 each entry is expected to be (bbox, text, confidence):
    # index -2 is the text and index -1 the confidence.
    return " ".join(entry[-2] for entry in recognized)

############################################
# 3) 문서 전체 라인 세그먼트 -> Recognizer
############################################
def process_document_lines_no_detector(
    doc_image_path: str,
    seg_output_dir: str,
    prefix: str = "doc",
    line_threshold: int = 150,
    min_line_h: int = 10,
    reader=None
):
    """Segment a document into lines and OCR each line without a detector.

    1) ``segment_lines_by_projection`` splits the page and saves every line
       as ``{prefix}_line_{i}.png`` in ``seg_output_dir``.
    2) Each saved line image is passed to ``recognize_line_image``, which
       uses only the recognizer.

    Args:
        doc_image_path: Path of the document image.
        seg_output_dir: Directory where the line images are written and then
            read back from.
        prefix: File-name prefix for the line images.
        line_threshold: Binarization threshold forwarded to the segmenter.
        min_line_h: Minimum line height forwarded to the segmenter.
        reader: OCR reader used for recognition. Required despite the
            ``None`` default.

    Returns:
        List with the recognized text of each line, top to bottom.

    Raises:
        ValueError: If ``reader`` is ``None``.
    """
    # Fail fast: without a reader the function would first write all line
    # images and only then crash with an opaque AttributeError.
    if reader is None:
        raise ValueError("reader must be an initialized OCR reader, got None")

    # 1) Line segmentation (also writes the line images to disk).
    line_boxes = segment_lines_by_projection(
        image_path=doc_image_path,
        output_dir=seg_output_dir,
        prefix=prefix,
        threshold_val=line_threshold,
        min_line_height=min_line_h
    )

    # 2) Recognize each saved line image; only the index is needed because
    #    the file name is derived from it.
    ocr_results = []
    for i, _ in enumerate(line_boxes):
        line_file = os.path.join(seg_output_dir, f"{prefix}_line_{i}.png")
        ocr_results.append(recognize_line_image(line_file, reader=reader))

    return ocr_results


############################################
# 4) 메인 실행부
############################################
if __name__ == "__main__":
    # 1) Create the EasyOCR reader with the recognizer only.
    reader = easyocr.Reader(
        lang_list=["ko", "en"],  # languages to recognize
        gpu=True,                # use the GPU (set gpu=False if memory is short)
        detector=False,          # do not load the CRAFT detector
        recognizer=True
    )

    # 2) Unwrap the recognizer if it exposes a `.module` attribute
    #    (presumably a DataParallel wrapper -- the original note says the
    #    recognizer model "may have been wrapped").
    if hasattr(reader.recognizer, 'module'):
        reader.recognizer = reader.recognizer.module

    # Input / output folders.
    input_dir = "/workspace/Hand/original"
    seg_dir   = "/workspace/MIL/line_segments"
    os.makedirs(seg_dir, exist_ok=True)

    # PNG file list. os.listdir returns entries in arbitrary order, so sort
    # to make the "first 30" sample reproducible between runs and machines.
    all_files = sorted(
        f for f in os.listdir(input_dir) if f.lower().endswith(".png")
    )

    # With many files, only take the first 30 (adjust as needed).
    sample_files = all_files[:30]

    # Process file by file.
    for filename in sample_files:
        doc_image = os.path.join(input_dir, filename)
        prefix = os.path.splitext(filename)[0]

        print(f"Processing: {doc_image}")
        results = process_document_lines_no_detector(
            doc_image_path=doc_image,
            seg_output_dir=seg_dir,
            prefix=prefix,
            line_threshold=150,
            min_line_h=10,
            reader=reader
        )

        # Show the recognized text of every line.
        for idx, line_text in enumerate(results):
            print(f"File {filename} - Line {idx}: {line_text}")


PyTorch CUDA Available: True
Current Device Index: 3
Processing: /workspace/Hand/original/w0001_s01_pLND_r01.png
File w0001_s01_pLND_r01.png - Line 0: ~염소 하팔다하 } 하4
File w0001_s01_pLND_r01.png - Line 1:    cKes +v Tisn Cun  Ko cncl Lll 값 (05
File w0001_s01_pLND_r01.png - Line 2:   Loldl   Ra<니 cndl cru 2 A4ns   Griece,
File w0001_s01_pLND_r01.png - Line 3: ' Uewhev   87 c   Uecexbi ?   (ettvs
File w0001_s01_pLND_r01.png - Line 4: '
File w0001_s01_pLND_r01.png - Line 5: 13586 Gdspitf "thaiis YgilifriBwani]:
File w0001_s01_pLND_r01.png - Line 6: ' Ly- L  Wclucicl   cwcl   Kokei+   Unoks , Es9: ,
File w0001_s01_pLND_r01.png - Line 7: L++ w T 'YY' Exprcss  Krot:
Processing: /workspace/Hand/original/w0001_s01_pLND_r02.png
File w0001_s01_pLND_r02.png - Line 0: ~n미 다"%:평"
File w0001_s01_pLND_r02.png - Line 1: '  1나9 (   Zervo+   SNce+ Cvid  huv %koes 4
File w0001_s01_pLND_r02.png - Line 2: 'Tn cwcl Pxe nal   wuYl jOikColoit
File w0001_s01_pLND_r02.png - Line 3:  Rvpjoscfewrhi sta Afry Cveis N

File w0001_s03_pWOZ_r02.png - Line 0:   Wi a Sha4 +v Sh WiS   Wil나ka]
File w0001_s03_pWOZ_r02.png - Line 1: biezly +cvtid fu   Emxzl4 Cj (he
File w0001_s03_pWOZ_r02.png - Line 2: 젊 하하
File w0001_s03_pWOZ_r02.png - Line 3: 'jns Bitao"irntaig rriina"}
File w0001_s03_pWOZ_r02.png - Line 4: '
Processing: /workspace/Hand/original/w0001_s03_pWOZ_r03.png
File w0001_s03_pWOZ_r03.png - Line 0: ' |ti 0 CV+ Ie SVC   WaS   Wa(kivy]
File w0001_s03_pWOZ_r03.png - Line 1: Drvii _ hwwwdhrin manai Ch'hai
File w0001_s03_pWOZ_r03.png - Line 2:   navd   uellov   Yoadhecl' TKt Suv Shon
File w0001_s03_pWOZ_r03.png - Line 3: '  briowt and tu   bids   Sano   (we
File w0001_s03_pWOZ_r03.png - Line 4: ~무"sryd nr iin #
File w0001_s03_pWOZ_r03.png - Line 5: ')
File w0001_s03_pWOZ_r03.png - Line 6: '
File w0001_s03_pWOZ_r03.png - Line 7: '   (WSLid   aW추   fva   Yev   Dww Lau Kul
File w0001_s03_pWOZ_r03.png - Line 8:  ard s dov iv hu widst  4 9 SYanck
File w0001_s03_pWOZ_r03.png - Line 9: ' ( (uCl ,
Processing: /w

반복문을 통해 하나의 문서에서 라인(문장)을 잘라내어 여러 개의 파일로 저장하고 있음.
즉, 한 문서에서 여러 파일을 만들고, 다 자르고 나면 그다음 문서로 넘어가 같은 작업을 수행함.
2줄 이상으로 이루어진 문장 파일만 얻으려면, 2줄 분량의 문장에 해당하는 검정색 픽셀 수를 기준값으로 두고
그 기준 미만인 조각은 skip 하고, 기준 이상인 조각만 골라내어 저장한다.

# EasyOCR 2줄 이상으로 이루어진 문장 저장 (min_black_pixels 옵션 설정)

In [39]:
import torch

############################################
# 0) GPU setup (CUDA device to use when the model runs in parallel)
############################################
# Make CUDA device index 3 the current device; the author's stated intent is
# to use only physical GPU #3.
# NOTE(review): set_device only selects the current device. Whether the other
# GPUs are actually left unused is not established here -- confirm, and
# consider restricting visibility via CUDA_VISIBLE_DEVICES if that is required.
torch.cuda.set_device(3)

import os
import cv2
import numpy as np
import easyocr
import pandas as pd

# Sanity check: report CUDA availability and the index of the active device.
print(f"PyTorch CUDA Available: {torch.cuda.is_available()}")
print(f"Current Device Index: {torch.cuda.current_device()}")

############################################
# 1) 라인 세그먼트 (원본 vs 이진화)
############################################
def segment_lines_by_projection(
    image_path: str,
    threshold_val: int = 150,
    min_line_height: int = 10
):
    """Locate text-line row ranges in a page image; nothing is written to disk.

    The page is loaded as grayscale and binarized with ``THRESH_BINARY``
    (text = 0, background = 255). A row counts as text when it contains at
    least one 0 pixel, and each maximal run of text rows forms a candidate
    line. A candidate is kept only when its height is strictly greater than
    ``min_line_height``.

    Args:
        image_path: Path of the page image.
        threshold_val: Threshold passed to ``cv2.threshold``.
        min_line_height: Height a run of rows must exceed to be kept.

    Returns:
        Tuple ``(line_boxes, img_gray, bin_img)`` where ``line_boxes`` is a
        list of ``(y_start, y_end)`` pairs (``y_end`` exclusive), ``img_gray``
        is the grayscale page and ``bin_img`` the binarized page.

    Raises:
        FileNotFoundError: If the image could not be loaded.
    """
    # (A) Grayscale original.
    img_gray = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if img_gray is None:
        raise FileNotFoundError(f"Could not read the image file: {image_path}")

    # (B) Binarize: text = 0, background = 255.
    _, bin_img = cv2.threshold(img_gray, threshold_val, 255, cv2.THRESH_BINARY)

    # (C) Per-row "has text" flags. bin_img holds only 0 and 255, so a row
    #     has a positive inverted projection exactly when it contains a 0.
    row_has_text = (bin_img == 0).any(axis=1).tolist()
    # Sentinel blank row so a run touching the bottom edge closes at y == h.
    row_has_text.append(False)

    line_boxes = []
    run_start = None
    for y, has_text in enumerate(row_has_text):
        if has_text:
            if run_start is None:
                run_start = y
            continue
        if run_start is None:
            continue
        # A blank row ends the current run; keep it only if tall enough.
        if y - run_start > min_line_height:
            line_boxes.append((run_start, y))
        run_start = None

    # Return the boxes together with the grayscale and binarized images.
    return line_boxes, img_gray, bin_img

############################################
# 2) OCR Recognizer (회색조 라인) 
############################################
def recognize_line_image(line_image: np.ndarray, reader) -> str:
    """Recognize the text of one line image with the given reader.

    Args:
        line_image: Line image handed to ``reader.recognize`` (a grayscale
            crop in this notebook).
        reader: Object exposing ``recognize(image, detail=...)``.

    Returns:
        The text field (second-to-last item) of every result, joined with
        single spaces; an empty string when there are no results.
    """
    recognized = reader.recognize(line_image, detail=1)
    # Each entry is indexed from the end: [-2] is the text.
    return " ".join(entry[-2] for entry in recognized)

############################################
# 3) 문서 전체 -> 라인 분할 -> (검정 픽셀 수 체크) -> Crop(원본) -> 저장 + OCR
############################################
def process_document_lines_no_detector(
    doc_image_path: str,
    seg_output_dir: str,
    prefix: str = "doc",
    threshold_val: int = 150,
    min_line_h: int = 10,
    min_black_pixels: int = 12000,  # example threshold for "two or more lines"
    reader=None
):
    """Segment a page, keep only ink-heavy lines, then save and OCR them.

    1) ``segment_lines_by_projection`` returns the line boxes together with
       the grayscale and binarized page.
    2) For each box the black (== 0) pixels of the binarized crop are
       counted; boxes with fewer than ``min_black_pixels`` are skipped.
    3) Kept boxes are cropped from the grayscale page, saved as
       ``{prefix}_line_{k}.png`` (``k`` counts kept lines only) and passed
       to ``recognize_line_image``.

    Args:
        doc_image_path: Path of the document image.
        seg_output_dir: Directory for the saved line crops; created if needed.
        prefix: File-name prefix for the saved crops.
        threshold_val: Binarization threshold forwarded to the segmenter.
        min_line_h: Minimum line height forwarded to the segmenter.
        min_black_pixels: Minimum number of black pixels a crop must contain
            to be kept. Used by the notebook as a proxy for "at least two
            lines of writing"; the right value depends on the images.
        reader: OCR reader used for recognition. Required despite the
            ``None`` default.

    Returns:
        List with the recognized text of each kept line, top to bottom.

    Raises:
        ValueError: If ``reader`` is ``None``.
        OSError: If a line crop could not be written.
    """
    # Fail fast: without a reader the function would save a crop and then
    # crash with an opaque AttributeError on the first kept line.
    if reader is None:
        raise ValueError("reader must be an initialized OCR reader, got None")

    # (A) Line boundaries plus the grayscale / binarized images.
    line_boxes, img_gray, bin_img = segment_lines_by_projection(
        image_path=doc_image_path,
        threshold_val=threshold_val,
        min_line_height=min_line_h
    )

    os.makedirs(seg_output_dir, exist_ok=True)

    ocr_results = []
    valid_idx = 0  # numbering of kept lines only, so file names have no gaps

    # (B) Visit every candidate line.
    for ystart, yend in line_boxes:
        # Black pixels = all pixels minus the non-zero (background) ones.
        cropped_bin = bin_img[ystart:yend]
        black_pixels = cropped_bin.size - cv2.countNonZero(cropped_bin)

        # Below the threshold: not enough ink, skip this line.
        if black_pixels < min_black_pixels:
            continue

        # Crop the same rows from the grayscale page to keep original tones.
        cropped_gray = img_gray[ystart:yend]

        save_path = os.path.join(seg_output_dir, f"{prefix}_line_{valid_idx}.png")
        # OCR below works on the in-memory crop, so a failed write would
        # otherwise go completely unnoticed.
        if not cv2.imwrite(save_path, cropped_gray):
            raise OSError(f"Could not write line image: {save_path}")

        ocr_results.append(recognize_line_image(cropped_gray, reader=reader))
        valid_idx += 1

    return ocr_results


############################################
# 4) 메인
############################################
if __name__ == "__main__":
    # Recognizer-only EasyOCR reader (the detector is not loaded).
    reader = easyocr.Reader(["ko","en"], gpu=True, detector=False, recognizer=True)
    # Unwrap the recognizer if it exposes a `.module` attribute
    # (presumably a DataParallel wrapper -- confirm).
    if hasattr(reader.recognizer, 'module'):
        reader.recognizer = reader.recognizer.module

    input_dir = "/workspace/Hand/original"
    seg_dir   = "/workspace/MIL/line_segments"
    os.makedirs(seg_dir, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sort so that the
    # 30-file sample is reproducible between runs and machines.
    all_files = sorted(
        f for f in os.listdir(input_dir) if f.lower().endswith(".png")
    )
    sample_files = all_files[:30]

    # A line crop is kept only when it has at least this many black pixels
    # (used as a rough proxy for "two or more lines of writing").
    MIN_BLACK_PIXELS = 12000

    for filename in sample_files:
        doc_image = os.path.join(input_dir, filename)
        prefix = os.path.splitext(filename)[0]

        print(f"Processing: {doc_image}")

        results = process_document_lines_no_detector(
            doc_image_path=doc_image,
            seg_output_dir=seg_dir,
            prefix=prefix,
            threshold_val=150,    # binarization threshold
            min_line_h=10,        # minimum line height
            min_black_pixels=MIN_BLACK_PIXELS,
            reader=reader
        )

        for idx, line_text in enumerate(results):
            print(f"File {filename} - FilteredLine {idx}: {line_text}")

    # Release cached GPU memory held by PyTorch once all files are done.
    torch.cuda.empty_cache()


PyTorch CUDA Available: True
Current Device Index: 3
Processing: /workspace/Hand/original/w0001_s01_pLND_r01.png
File w0001_s01_pLND_r01.png - FilteredLine 0: ~염소 하팔다하 } 하4
Processing: /workspace/Hand/original/w0001_s01_pLND_r02.png
File w0001_s01_pLND_r02.png - FilteredLine 0: ~n미 다"%:평"
File w0001_s01_pLND_r02.png - FilteredLine 1: K: iaa" 언다
Processing: /workspace/Hand/original/w0001_s01_pLND_r03.png
File w0001_s01_pLND_r03.png - FilteredLine 0: 모자 자다?
Processing: /workspace/Hand/original/w0001_s01_pPHR_r01.png
File w0001_s01_pPHR_r01.png - FilteredLine 0:  Tw 오arwalvefom도" %atr  ju Uluw' bur
Processing: /workspace/Hand/original/w0001_s01_pPHR_r02.png
Processing: /workspace/Hand/original/w0001_s01_pPHR_r03.png
Processing: /workspace/Hand/original/w0001_s01_pWOZ_r01.png
File w0001_s01_pWOZ_r01.png - FilteredLine 0: #업하_철 표;하i"하다
Processing: /workspace/Hand/original/w0001_s01_pWOZ_r02.png
File w0001_s01_pWOZ_r02.png - FilteredLine 0: ; } 춤" 6 물보.
File w0001_s01_pWOZ_r02.png - Filtered

# 좀 더 다양한 작성자에 대해 얻을 필요가 있을 듯. 현재는 한 작성자(w0001)에 대해서만 너무 많이 얻은 것 같음.
# 이 기준(min_black_pixels)이 맞는지 아닌지도 검토해야 할 듯.