# 라이브러리 임포트

In [None]:
import os
import cv2
import easyocr
import itertools
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [28]:
from paddleocr import PaddleOCR

# OCR 모델 초기화 (angle_cls: 회전된 글씨도 잡아줌)
ocr = PaddleOCR(det_db_box_thresh=0.4, use_angle_cls=True, lang='korean')

# 이미지 경로
img_path = '/Users/chami/Desktop/monu/data/frames/20250321_1.0/frame_00016.jpg'

# OCR 수행
results = ocr.ocr(img_path)[0]

merged_line = ''
for res in results:
    text = res[1][0]
    merged_line += text + ' '  # 박스별 텍스트 합치기

print(merged_line.strip())

[2025/04/23 15:12:50] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, use_gcu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/Users/chami/.paddleocr/whl/det/ml/Multilingual_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.4, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/Users/chami/.paddleocr/whl/rec/korean/korean_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 3

In [None]:
import cv2
import matplotlib.pyplot as plt

# 이미지 불러오기
img = cv2.imread('/Users/chami/Desktop/monu/data/frames/20250321_1.0/frame_00016.jpg')

gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

# 하단 30%만 크롭 (자막 영역)
h, w = gray.shape
cropped = gray[int(h*0.7):h, :]

# 대비 조정
alpha = 2.0  # contrast
beta = 0
adjusted = cv2.convertScaleAbs(cropped, alpha=alpha, beta=beta)

# Otsu thresholding
_, thresh = cv2.threshold(adjusted, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

# Morphology 팽창
kernel = np.ones((3,3), np.uint8)
dilated = cv2.dilate(thresh, kernel, iterations=1)

# 시각화
import matplotlib.pyplot as plt
plt.imshow(dilated, cmap='gray')
plt.axis('off')
plt.show()

In [None]:
# EasyOCR 초기화 (한글 + 영어)
reader = easyocr.Reader(['ko', 'en'], recog_network='korean_g2', gpu=False)

# 프레임 이미지 폴더 경로
frame_folder = '/Users/chami/Desktop/monu/data/frames/20250321_1.0'  # 저장한 경로로 바꿔줘

# 결과 저장할 리스트
ocr_results = []

for filename in sorted(os.listdir(frame_folder)):
    if filename.endswith('.jpg') or filename.endswith('.png'):
        img_path = os.path.join(frame_folder, filename)
        image = cv2.imread(img_path)

        # 1. 자막 영역만 크롭 (하단 30%)
        h, w, _ = image.shape
        cropped = image[int(h*0.7):h, :]

        # 2. 그레이스케일 + 대비 강화 + 블러
        gray = cv2.cvtColor(cropped, cv2.COLOR_BGR2GRAY)
        enhanced = cv2.equalizeHist(gray)  # 대비 강화
        blurred = cv2.GaussianBlur(enhanced, (1, 1), 0)  # 노이즈 제거

        # 3. OCR 수행 (confidence 포함)
        results = reader.readtext(blurred, detail=1)

        # 4. confidence 0.5 이상만 텍스트 추출
        filtered_texts = [text for _, text, conf in results if conf > 0.5]
        text = ' '.join(filtered_texts).strip()

        print(f"{filename}: {text}")
        ocr_results.append((filename, text))

In [50]:
import cv2
from paddleocr import PaddleOCR
import os

# 1️⃣ PaddleOCR 초기화
ocr = PaddleOCR(use_angle_cls=True, lang='korean')

# 2️⃣ 프레임 경로
frame_folder = "/Users/chami/Desktop/monu/data/frames/20250321_1.0"
test_files = ["frame_00015.jpg", "frame_00020.jpg", "frame_00029.jpg"]  # 테스트 프레임

for filename in test_files:
    img_path = os.path.join(frame_folder, filename)
    img = cv2.imread(img_path)

    # 3️⃣ 하단 30% 크롭
    h, w = img.shape[:2]
    cropped = img[int(h*0.7):h, :]

    # 4️⃣ 그레이스케일 + 대비 조정
    gray = cv2.cvtColor(cropped, cv2.COLOR_BGR2GRAY)
    adjusted = cv2.equalizeHist(gray)

    # 5️⃣ Adaptive Thresholding
    thresh = cv2.adaptiveThreshold(adjusted, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 15, 10)

    # 6️⃣ OCR 수행
    result = ocr.ocr(thresh, cls=False)[0]

    if result:  # 결과 있을 때만 처리
        texts = [line[1][0] for line in result if line[1][1] > 0.5]  # confidence > 0.5
    else:
        texts = []

    print(f"[{filename}] OCR 결과: {' '.join(texts) if texts else '자막 없음!'}")

[2025/04/23 16:01:07] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, use_gcu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/Users/chami/.paddleocr/whl/det/ml/Multilingual_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/Users/chami/.paddleocr/whl/rec/korean/korean_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 3

# GPT

In [44]:
from openai import OpenAI
import base64
import os
import datetime

# 1️⃣ OpenAI client 초기화
client = OpenAI(api_key='REMOVED')

# 2️⃣ SRT 시간 포맷 함수
def seconds_to_srt_time(seconds):
    td = datetime.timedelta(seconds=seconds)
    return str(td)[:-3].replace('.', ',')

# 3️⃣ 이미지 base64 인코딩 함수
def encode_image(image_path):
    with open(image_path, "rb") as img_file:
        return base64.b64encode(img_file.read()).decode('utf-8')

# 4️⃣ 프레임 폴더 경로
frame_folder = "/Users/chami/Desktop/monu/data/frames/20250321_1.0"
output_srt = "/Users/chami/Desktop/monu/data/srt/output_gpt4v.srt"

# 5️⃣ 테스트할 프레임만 선택
frame_files = sorted([f for f in os.listdir(frame_folder) if f.endswith('.jpg') or f.endswith('.png')])
test_files = [frame_files[0], frame_files[15], frame_files[20]]

with open(output_srt, 'w', encoding='utf-8') as f:
    for idx, filename in enumerate(test_files):
        img_path = os.path.join(frame_folder, filename)
        image_base64 = encode_image(img_path)

        # GPT-4V OCR 요청
        response = client.chat.completions.create(
        model="gpt-4-turbo-2024-04-09",
        messages=[
            {"role": "system", "content": "You are a helpful translator."},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "이 이미지에서 영상 자막에 해당하는 텍스트를 **OCR 방식으로 그대로 추출** 추출해줘. 자막이 없으면 '없음'이라고 답해줘."},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}}
                ],
            },
        ],
        max_tokens=300,
)

        ocr_text = response.choices[0].message.content.strip()
        if ocr_text.lower() != '없음':
            start_time = idx
            end_time = idx + 1
            f.write(f"{idx+1}\n")
            f.write(f"{seconds_to_srt_time(start_time)} --> {seconds_to_srt_time(end_time)}\n")
            f.write(f"{ocr_text}\n\n")
            print(f"[{filename}] OCR: {ocr_text}")
        else:
            print(f"[{filename}] 자막 없음!")

[frame_00001.jpg] OCR: 이미지에 있는 자막은 다음과 같습니다:

"옷들도 날씬핏 자체 움직이라고 자다가.."
[frame_00016.jpg] OCR: 이미지에 있는 자막은 "볼링라인 엉덩이너무좋죠( ͡° ͜ʖ ͡°)" 입니다.
[frame_00021.jpg] OCR: 이미지에 나타난 자막은 다음과 같습니다: "그리고 저는 카디건에 단추를 중요한데"


In [None]:
with open('/Users/chami/Desktop/monu/data/srt/output_gpt4v.srt', 'r', encoding='utf-8') as f:
    content = f.read()
    print(content)