In [1]:
!pip install --upgrade google-cloud-storage transformers accelerate torch torchvision matplotlib

Collecting torch
  Using cached torch-2.7.1-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting torchvision
  Using cached torchvision-0.22.1-cp312-cp312-win_amd64.whl.metadata (6.1 kB)
Collecting sympy>=1.13.3 (from torch)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Using cached torch-2.7.1-cp312-cp312-win_amd64.whl (216.1 MB)
Using cached torchvision-0.22.1-cp312-cp312-win_amd64.whl (1.7 MB)
Using cached sympy-1.14.0-py3-none-any.whl (6.3 MB)
Installing collected packages: sympy, torch, torchvision
  Attempting uninstall: sympy
    Found existing installation: sympy 1.13.1
    Uninstalling sympy-1.13.1:
      Successfully uninstalled sympy-1.13.1
  Attempting uninstall: torch
    Found existing installation: torch 2.5.1+cu121
    Uninstalling torch-2.5.1+cu121:
      Successfully uninstalled torch-2.5.1+cu121
  Attempting uninstall: torchvision
    Found existing installation: torchvision 0.20.1+cu121
    Uninstalling torchvision-0.20.1+cu121:
      Successfully u

  You can safely remove it manually.
  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchaudio 2.5.1+cu121 requires torch==2.5.1+cu121, but you have torch 2.7.1 which is incompatible.

[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import pandas as pd
import torch
import requests
import json
import io
import re
from PIL import Image
from torchvision import transforms
from transformers import ViTForImageClassification
from google.cloud import storage
import os
# Tell the Google client libraries which service-account key file to use.
# NOTE(review): the key file name is hard-coded and overrides any value already present in
# the environment — consider supplying it from outside the notebook.
os.environ.update(GOOGLE_APPLICATION_CREDENTIALS="valid-might-460212-k2-aa639e904de6.json")

# Create the storage client and a handle to the project bucket
client = storage.Client()
bucket = client.bucket("de-project2-bucket-1")

# 데이터셋 준비

In [6]:

# [2] Download the preprocessed CSV from the bucket and parse it into a DataFrame
csv_blob = bucket.blob("kyobo/csv/combined/after_preprocessing_novel.csv")
csv_data = csv_blob.download_as_text()
with io.StringIO(csv_data) as csv_buffer:
    df = pd.read_csv(csv_buffer)

# [3] Image URL extraction
def extract_image_url(cell):
    """Return the image URL contained in a raw ``image_url`` cell, or ``None``.

    The first double-quoted ``https://`` URL found anywhere in the string wins.
    If there is none, the whole string is returned when it starts with
    ``https://``. Non-string input and strings matching neither form yield ``None``.
    """
    if not isinstance(cell, str):
        return None

    quoted = re.search(r'"(https://[^"]+)"', cell)
    if quoted is not None:
        return quoted.group(1)

    return cell if cell.startswith("https://") else None

# Derive a cleaned URL column (None where no usable URL was found)
df["image_url_clean"] = df["image_url"].map(extract_image_url)

# 모델 불러오기

In [8]:
# [4] Load the fine-tuned model, the label names and the preprocessing pipeline
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Instantiate ViT with a 5-class head. ignore_mismatched_sizes allows the 1000-class head of
# the base checkpoint to be dropped; the local state dict loaded next is presumably the
# fine-tuned checkpoint that supplies the real head weights — TODO confirm.
model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224", num_labels=5, ignore_mismatched_sizes=True)
# weights_only=True limits unpickling to tensors and plain containers, so a tampered
# checkpoint file cannot execute arbitrary code while being loaded.
model.load_state_dict(torch.load("best_model_vit_mood2.pt", map_location=device, weights_only=True))

model.to(device)
model.eval()

# Explicit UTF-8 so decoding does not depend on the platform's default code page.
with open("label_classes_final2.json", "r", encoding="utf-8") as f:
    class_names = json.load(f)

# Resize to 224x224 and scale each channel to [-1, 1] (mean 0.5, std 0.5).
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
])

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([5]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# [5] Image mood prediction
def predict_image_mood(image_url):
    """Download an image and return the label the model predicts for it.

    Args:
        image_url: URL of the image; any falsy value (None, "") is treated as missing.

    Returns:
        ``class_names[k]`` where ``k`` is the argmax of the model logits, or the
        fallback string "제품 이미지 없음" when the URL is missing, the HTTP status
        is not 200, or any exception is raised (the exception is printed).
    """
    fallback = "제품 이미지 없음"
    if not image_url:
        return fallback

    try:
        response = requests.get(image_url, timeout=10)
        if response.status_code == 200:
            picture = Image.open(io.BytesIO(response.content)).convert("RGB")
            # Add a leading batch dimension and move the tensor to the model's device.
            batch = transform(picture).unsqueeze(0).to(device)
            with torch.no_grad():
                predicted_class = torch.argmax(model(batch).logits, dim=1).item()
            return class_names[predicted_class]
        return fallback
    except Exception as e:
        print(f"❌ 예외 발생: {image_url} | {e}")
        return fallback

# [6] Per-row processing
def process_row(i_row):
    """Turn one ``(index, row)`` pair into a CSV line ``index,product_id,mood``.

    ``row`` must provide ``"image_url_clean"``; ``"product_id"`` is optional and
    defaults to an empty string. Commas and newlines inside the product id are
    replaced by spaces so they cannot break the hand-built CSV line.
    """
    index, record = i_row
    url = record["image_url_clean"]
    raw_id = str(record.get("product_id", ""))
    product_id = raw_id.replace(",", " ").replace("\n", " ")
    mood = predict_image_mood(url)
    return f"{index},{product_id},{mood}"


In [14]:
# [7] Local backup file and GCS destination
DRIVE_BACKUP_PATH = "mood_inference_backup_product_id.txt"
GCS_PATH_PREFIX = "kyobo/csv/combined/inference_mood_batches/"
GCS_FILE_NAME = "mood_classification.csv"

# Start the backup file from scratch, containing only the CSV header row
with open(DRIVE_BACKUP_PATH, mode="w", encoding="utf-8") as backup:
    print("index,product_id,mood", file=backup)


# [8] Threaded inference; completed lines are appended to the backup file in chunks
batch_results = []
buffer_size = 50  # number of completed lines between two appends to the backup file

with ThreadPoolExecutor(max_workers=16) as executor:
    futures = [executor.submit(process_row, item) for item in df.iterrows()]
    progress = tqdm(as_completed(futures), total=len(df))
    for future in progress:
        try:
            batch_results.append(future.result())

            # Nothing to write until another full chunk has accumulated.
            if len(batch_results) % buffer_size:
                continue

            newest_chunk = batch_results[-buffer_size:]
            with open(DRIVE_BACKUP_PATH, "a", encoding="utf-8") as backup:
                backup.write("\n".join(newest_chunk) + "\n")

        except Exception as e:
            print(f"❌ 처리 실패: {e}")

# [9] Append whatever is left over after the last full chunk
leftover = len(batch_results) % buffer_size
with open(DRIVE_BACKUP_PATH, "a", encoding="utf-8") as backup:
    if leftover:
        backup.writelines(f"{entry}\n" for entry in batch_results[-leftover:])

100%|██████████| 28286/28286 [37:38<00:00, 12.52it/s]  


In [17]:
# [10] Upload the combined result to GCS as a single CSV object
destination = f"{GCS_PATH_PREFIX}{GCS_FILE_NAME}"
csv_header = "index,product_id,mood\n"
csv_string = csv_header + "\n".join(batch_results)

blob = bucket.blob(destination)
blob.upload_from_string(csv_string, content_type="text/csv")

print(f"✅ GCS 업로드 완료: {destination} ({len(batch_results)}개)")
print("🎉 전체 병렬 inference 및 저장 완료!")

✅ GCS 업로드 완료: kyobo/csv/combined/inference_mood_batches/mood_classification.csv (28286개)
🎉 전체 병렬 inference 및 저장 완료!
