In [1]:
import pandas as pd
import numpy as np
from PIL import Image
import requests
from io import BytesIO
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torchvision.models import densenet121, DenseNet121_Weights
from torch.nn.functional import cosine_similarity
from tqdm.notebook import tqdm

In [2]:
# Load the tourism photo metadata table from the CSV file next to the notebook.
df = pd.read_csv(filepath_or_buffer='./tourism_data.csv')

In [3]:
# Print a summary of the frame: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3043 entries, 0 to 3042
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   galContentId            3041 non-null   float64
 1   galTitle                3043 non-null   object 
 2   galWebImageUrl          3043 non-null   object 
 3   galSearchKeyword        3042 non-null   object 
 4   processedLocation       3043 non-null   object 
 5   galPhotographyLocation  3043 non-null   object 
dtypes: float64(1), object(5)
memory usage: 142.8+ KB


In [4]:
# Load a DenseNet-121 model with ImageNet weights and configure it as a feature extractor.
weights = DenseNet121_Weights.IMAGENET1K_V1
model = densenet121(weights=weights)
model.classifier = nn.Identity()  # Drop the classification head: the model now returns the features that fed it.
model.eval()  # Evaluation mode, so the forward pass is run for inference rather than training.

# Image preprocessing.
# The transform pipeline is stateless, so it is built once here instead of on
# every call. The mean/std values are the per-channel ImageNet statistics
# conventionally paired with torchvision's pretrained weights.
_IMAGE_TRANSFORM = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])


def preprocess_image(image_path, is_url=False, timeout=10):
    """Load an image and convert it into a normalized, batched tensor.

    Args:
        image_path: Local file path, or an HTTP(S) URL when ``is_url`` is True.
        is_url: When True, download the image with ``requests`` instead of
            reading it from disk.
        timeout: Seconds to wait for the remote server when downloading.
            Without a timeout a single stalled server would block the caller
            indefinitely. Ignored for local files.

    Returns:
        The transformed RGB image with a leading batch axis of size 1
        (added by ``unsqueeze(0)``).

    Raises:
        requests.RequestException: The download timed out, the connection
            failed, or the server answered with an error status.
        OSError: The file or downloaded payload could not be opened or
            decoded as an image.
    """
    if is_url:
        response = requests.get(image_path, timeout=timeout)
        # Fail with the real HTTP error instead of handing an error page to PIL.
        response.raise_for_status()
        source = BytesIO(response.content)
    else:
        source = image_path

    # convert() returns an independent, fully loaded copy, so the underlying
    # file handle can be released as soon as the conversion is done.
    with Image.open(source) as raw_image:
        image = raw_image.convert('RGB')

    return _IMAGE_TRANSFORM(image).unsqueeze(0)

# Image embedding helper.
def get_image_embedding(image_path, is_url=False):
    """Return the feature vector of one image as a NumPy array.

    The image is loaded and preprocessed by ``preprocess_image`` (from a URL
    when ``is_url`` is True, otherwise from a local path), passed through the
    module-level ``model`` with gradient tracking disabled, and the resulting
    tensor is converted to NumPy.
    """
    batch = preprocess_image(image_path, is_url)
    # No gradients are needed for pure inference.
    with torch.no_grad():
        features = model(batch)
        return features.numpy()

In [None]:
# Compute an embedding for every image in the dataset and store the results.
# Only the URL column is needed, so iterate that Series directly instead of
# building a full row object per record with iterrows().
embeddings = []
image_urls = df['galWebImageUrl']
for idx, db_image_url in tqdm(image_urls.items(), total=len(image_urls)):
    try:
        embedding = get_image_embedding(db_image_url, is_url=True)
    except Exception as e:
        # Best effort: one unreachable or corrupt image must not abort the
        # whole run. Record None so the list stays aligned with the rows.
        print(f"Error processing image at index {idx}: {e}")
        embedding = None
    embeddings.append(embedding)

df['embedding'] = embeddings

# Persist the frame together with its embeddings.
df.to_pickle('densenet_embedding.pkl')

  0%|          | 0/3043 [00:00<?, ?it/s]

In [24]:
# Load the DataFrame that holds the precomputed embeddings.
# NOTE: unpickling can execute arbitrary code; only load files produced by this notebook.
df = pd.read_pickle('densenet_embedding.pkl')

# Query image supplied by the user (local file path).
input_image_path = './몽마르트.jpg'
input_image_embedding = get_image_embedding(input_image_path, is_url=False)

# Keep only the rows whose embedding was computed; failed images were stored as None.
has_embedding = df['embedding'].map(lambda emb: emb is not None)
candidates = df[has_embedding]

# Score every stored image against the query in a single broadcasted call
# instead of converting and comparing tensors row by row.
similarities = []
if len(candidates) > 0:
    # Each stored embedding carries a leading batch axis of size 1, so
    # concatenating along axis 0 yields one row per image.
    db_embeddings = torch.from_numpy(np.concatenate(candidates['embedding'].tolist(), axis=0))
    query_embedding = torch.from_numpy(input_image_embedding)
    scores = cosine_similarity(query_embedding, db_embeddings, dim=1).tolist()
    similarities = [
        (score * 100, location, image_url)  # express the similarity as a percentage
        for score, location, image_url in zip(
            scores,
            candidates['galPhotographyLocation'],
            candidates['galWebImageUrl'],
        )
    ]

# Sort by similarity, highest first, and keep the three best matches.
similarities.sort(reverse=True, key=lambda x: x[0])
top_similarities = similarities[:3]

# Print the results.
for similarity, photo_location, image_url in top_similarities:
    print(f"Similarity: {similarity:.2f}%, Location: {photo_location}, Image URL: {image_url}")


Similarity: 70.84%, Location: 강원도 춘천시 죽림동, Image URL: http://tong.visitkorea.or.kr/cms2/website/32/2802432.jpeg
Similarity: 70.50%, Location: 대구광역시 달서구 신당동, Image URL: http://tong.visitkorea.or.kr/cms2/website/54/2951954.jpg
Similarity: 70.43%, Location: 경상북도 칠곡군, Image URL: http://tong.visitkorea.or.kr/cms2/website/65/1053465.jpg
