In [None]:
!pip install --upgrade google-cloud-storage transformers accelerate torch torchvision matplotlib

Collecting google-cloud-storage
  Downloading google_cloud_storage-3.1.0-py2.py3-none-any.whl.metadata (12 kB)
Collecting torch
  Downloading torch-2.7.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting torchvision
  Downloading torchvision-0.22.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (6.1 kB)
Collecting matplotlib
  Downloading matplotlib-3.10.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.6.77 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.6.77 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.6.77-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.6.80 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.6.80-py3-none-manylinu

In [None]:
import csv
import io
import os
import re
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
import requests
import torch
from google.cloud import storage
from google.colab import auth, drive
from PIL import Image
from tqdm import tqdm
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

In [None]:
# ─── Configuration ────────────────────────────────────────────────────────────
BATCH_SIZE = 10_000   # not referenced by the cells visible in this notebook
MAX_WORKERS = 8       # size of the thread pool that runs process_row

# GCS object names: GCS_PREFIX is prepended to both the input and the output file.
GCS_PREFIX = "kyobo/csv/combined/"
INPUT_CSV = "after_preprocessing_novel.csv"
OUTPUT_CSV = "object_detection_novel_20250614_MJ.csv"

# Text prompts handed to the zero-shot detector; every phrase ends with a period.
TITLE_TEXT = "title."
GENERAL_TEXT = (
    "person. hand. robot. "
    "animal. insect. plant. flower. "
    "landscape. cloud. star. river. sea. "
    "building. house. castle. palace. "
    "food. drink. "
    "clock. mirror. chair. instrument. electronic device. pencil. desk. "
    "vehicle. airplane. helicopter. "
    "planet. lightbulb. flag. weapon."
)

In [None]:
# 1) Authenticate against Google Cloud and open the project bucket
auth.authenticate_user()
client = storage.Client()
bucket = client.bucket("de-project2-bucket-1")

# 2) Load the source DataFrame: download the input CSV as text and parse it in memory
blob = bucket.blob(GCS_PREFIX + INPUT_CSV)
data = blob.download_as_text()
df = pd.read_csv(io.StringIO(data))

# 3) URL extraction helper
def extract_image_url(cell):
    """Pull a single https URL out of a raw ``image_url`` cell.

    The first double-quoted ``https://...`` substring wins. If there is none,
    the cell itself is returned when it starts with ``https://``.

    Returns:
        The URL string, or ``None`` when ``cell`` is not a string or holds no
        https URL in either of those two forms.
    """
    if not isinstance(cell, str):
        return None

    quoted = re.search(r'"(https://[^"]+)"', cell)
    if quoted is not None:
        return quoted.group(1)

    return cell if cell.startswith("https://") else None

# Normalise every raw image_url cell to a bare https URL (None when nothing usable is found).
df["image_url_clean"] = df["image_url"].map(extract_image_url)

In [None]:
# 4) Load the detector once; the same processor/model pair is shared by all rows
MODEL_ID = "IDEA-Research/grounding-dino-base"

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForZeroShotObjectDetection.from_pretrained(MODEL_ID).to(device)

# 5) Prepare the Google Drive backup location (optional)
drive.mount('/content/drive')
DRIVE_BACKUP_DIR = "/content/drive/MyDrive/checkpoints_MJ"
os.makedirs(DRIVE_BACKUP_DIR, exist_ok=True)
DRIVE_BACKUP_PATH = os.path.join(DRIVE_BACKUP_DIR, OUTPUT_CSV)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/457 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.74k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/933M [00:00<?, ?B/s]

Mounted at /content/drive


In [None]:
# 6) Per-row processing
def _detect(img, text):
    """Run the shared detector on ``img`` with prompt ``text``; return the post-processed result."""
    w, h = img.size
    inputs = processor(images=img, text=text, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    return processor.post_process_grounded_object_detection(
        outputs, inputs.input_ids,
        box_threshold=0.3, text_threshold=0.25,
        target_sizes=[(h, w)],  # the post-processor expects (height, width)
    )[0]


def _csv_line(fields):
    """Serialise ``fields`` as one CSV record without a trailing line terminator."""
    buf = io.StringIO()
    # Keep the default "\r\n" terminator so that fields containing CR/LF are
    # quoted by the writer, then strip the terminator from the finished record.
    csv.writer(buf).writerow(fields)
    return buf.getvalue().rstrip("\r\n")


def process_row(item):
    """Detect the title placement and the largest object for one image row.

    Args:
        item: An ``(index, row)`` pair such as ``DataFrame.iterrows()`` yields.
            ``row`` must provide ``image_url_clean`` and may provide
            ``product_id``.

    Returns:
        One CSV record ``index,product_id,position,direction,object`` with no
        trailing newline. Fields containing commas, quotes or line breaks are
        quoted so that the record always has exactly five columns.

        * position: 3x3 grid cell of the highest-scoring "title" box centre
          (row label followed by column label), or "감지안됨" if none is found.
        * direction: "가로" when the box is more than 1.5x wider than tall,
          "세로" when more than 1.3x taller than wide, otherwise "알수없음".
        * object: first word of the label of the largest-area box found with
          ``GENERAL_TEXT`` ("electronic device" is kept whole), or "없음".

        Any exception (bad URL, HTTP error, undecodable image, model failure)
        is reported in all three result fields as ``에러:<message>`` rather than
        raised, so one bad row never aborts the whole run.
    """
    idx, row = item
    pid = row.get("product_id", "")
    url = row["image_url_clean"]
    pos = dir_ = obj = ""

    try:
        if not isinstance(url, str):
            # Rows whose URL could not be extracted carry None/NaN here.
            raise ValueError(f"invalid image URL: {url!r}")

        # Download the full body and close the response so the connection is
        # released; fail early on HTTP errors instead of feeding an error page
        # to PIL.
        with requests.get(url, timeout=10) as resp:
            resp.raise_for_status()
            img = Image.open(io.BytesIO(resp.content)).convert("RGB")
        w, h = img.size

        # Title detection: keep only the highest-scoring box.
        res_t = _detect(img, TITLE_TEXT)
        if len(res_t["scores"]) > 0:
            i_max = res_t["scores"].argmax()
            x0, y0, x1, y1 = res_t["boxes"][i_max].tolist()
            cx, cy = (x0 + x1) / 2, (y0 + y1) / 2
            col = ["왼", "중", "오"][0 if cx < w / 3 else 1 if cx < 2 * w / 3 else 2]
            row_lbl = ["상", "중", "하"][0 if cy < h / 3 else 1 if cy < 2 * h / 3 else 2]
            pos = f"{row_lbl}{col}"
            bw, bh = x1 - x0, y1 - y0
            dir_ = "가로" if bw > bh * 1.5 else "세로" if bh > bw * 1.3 else "알수없음"
        else:
            pos, dir_ = "감지안됨", "감지안됨"

        # General detection: report the labelled box with the largest area.
        res_o = _detect(img, GENERAL_TEXT)
        max_area = 0
        obj = "없음"
        for box, lbl in zip(res_o["boxes"], res_o["text_labels"]):
            words = lbl.split()
            if not words:
                # A box without any label text cannot be named; skip it
                # instead of failing the whole row on words[0].
                continue
            x0, y0, x1, y1 = box.tolist()
            area = (x1 - x0) * (y1 - y0)
            if area > max_area:
                max_area = area
                obj = "electronic device" if lbl.strip() == "electronic device" else words[0]

    except Exception as e:
        pos = dir_ = obj = f"에러:{e}"

    return _csv_line([idx, pid, pos, dir_, obj])

In [None]:
# 7) Run every row through process_row on a thread pool and collect the CSV records.
#    exe.map yields results in input order, so records line up with df's row order.
items = list(df.iterrows())
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as exe:
    all_results = list(
        tqdm(exe.map(process_row, items), total=len(items), desc="전체 병렬 처리")
    )

# 8) Upload the combined result to GCS as a single CSV.
#    Every record starts with the DataFrame index, so the header names that
#    column too; header and records must declare the same five columns.
header      = "index,product_id,title_position,title_direction,detected_objects\n"
csv_content = header + "\n".join(all_results)
out_blob    = bucket.blob(GCS_PREFIX + OUTPUT_CSV)
out_blob.upload_from_string(csv_content, content_type="text/csv")
print(f"✅ GCS에 단일 파일 업로드 완료: {GCS_PREFIX + OUTPUT_CSV}")

# 9) Also write a backup copy to Google Drive (optional)
with open(DRIVE_BACKUP_PATH, "w", encoding="utf-8") as f:
    f.write(csv_content)
print(f"✅ Drive 백업 완료: {DRIVE_BACKUP_PATH}")

전체 병렬 처리: 100%|██████████| 28286/28286 [3:23:31<00:00,  2.32it/s]


✅ GCS에 단일 파일 업로드 완료: kyobo/csv/combined/object_detection_novel_20250614_MJ.csv
✅ Drive 백업 완료: /content/drive/MyDrive/checkpoints_MJ/object_detection_novel_20250614_MJ.csv
