# In [3]:
import pandas as pd
import numpy as np
from PIL import Image
from pathlib import Path
from collections import Counter
from tqdm import tqdm
import cv2
import os

# Locations used by this cell.
# The project root is taken to be two levels above the current working directory.
file_location_path = Path.cwd()
project_base_path = file_location_path.parent.parent
# Root folder that the `image_path` values from the score files are joined onto.
image_folder = Path('/home/ubuntu/landscape-aesthetics')
# Folder holding the per-file score tables (relative to the working directory).
ns6_wiki_paths = Path('../../data/processed/landscape_score')

def get_dominant_colors(image_path, bins_per_channel=8, top_colors=8):
    """Return the most frequent quantized RGB colors of an image.

    The image is loaded with OpenCV, converted to RGB and reduced to a 3-D
    color histogram with ``bins_per_channel`` uniform bins per channel over
    the range [0, 256). Each histogram bin is represented by the color at
    its center.

    Args:
        image_path: Path (or string) of the image file to analyse.
        bins_per_channel: Number of histogram bins along each of R, G and B.
        top_colors: Number of colors to return.

    Returns:
        A list of ``(r, g, b)`` tuples of bin-center values, ordered from the
        most to the least frequent bin. Bins with a zero count are still
        candidates, so the list has ``top_colors`` entries whenever the
        histogram has at least that many bins.

    Raises:
        ValueError: If the image cannot be read or decoded.
    """
    image = cv2.imread(str(image_path))
    if image is None:
        # cv2.imread reports failure by returning None rather than raising;
        # fail here with a clear message instead of inside cvtColor.
        raise ValueError(f"Could not read or decode image: {image_path}")

    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    hist = cv2.calcHist(
        [image], [0, 1, 2], None, [bins_per_channel] * 3, [0, 256] * 3
    ).flatten()

    # Derive centers from the true (possibly fractional) bin width so there is
    # exactly one center per bin even when 256 is not a multiple of
    # bins_per_channel. For the default of 8 this yields 16, 48, ..., 240.
    bin_width = 256 / bins_per_channel
    bin_centers = ((np.arange(bins_per_channel) + 0.5) * bin_width).astype(int)

    # The nesting order r -> g -> b matches the C-order flattening of the
    # histogram, whose axes follow the channel order [0, 1, 2] of the RGB image.
    color_bins = [
        (r, g, b)
        for r in bin_centers
        for g in bin_centers
        for b in bin_centers
    ]

    color_freq = Counter(dict(zip(color_bins, hist)))
    return [color for color, _ in color_freq.most_common(top_colors)]

# For every score file, compute the dominant colors of the images whose
# predicted_score lies in the half-open range [5, 6) and write one CSV per
# input file into data/processed/dominant_color_wikidata_5.
n_colors = 8  # number of dominant colors stored per image
# One R/G/B column triple per color: R1, G1, B1, R2, G2, B2, ...
color_columns = [
    f"{channel}{rank}"
    for rank in range(1, n_colors + 1)
    for channel in "RGB"
]
output_dir = project_base_path / 'data' / 'processed' / 'dominant_color_wikidata_5'

# Sorted so that files are processed in a reproducible order.
for file_path in sorted(ns6_wiki_paths.iterdir()):
    if not file_path.is_file():
        continue

    # The output keeps the input file's name, with a .csv suffix.
    result = output_dir / file_path.with_suffix('.csv').name
    result.parent.mkdir(parents=True, exist_ok=True)

    data = pd.read_csv(file_path, usecols=['image_path', 'predicted_score'])
    # Filter images with predicted_score in range 5-6
    data = data[(data['predicted_score'] >= 5) & (data['predicted_score'] < 6)]

    dominant_colors_data = []
    rows = zip(data['image_path'].tolist(), data['predicted_score'].tolist())
    # zip() has no length, so pass total explicitly to get percentage and ETA.
    for img_path, predicted_score in tqdm(rows, total=len(data), desc="Processing images (5-6)"):
        image_path = image_folder / img_path
        try:
            dominant_colors = get_dominant_colors(image_path, top_colors=n_colors)
        except Exception as e:
            # Best effort: report the failing image and continue with the rest.
            print(f"Error processing image {image_path}: {e}")
            continue
        dominant_colors_data.append(
            [str(image_path), predicted_score, *np.ravel(dominant_colors)]
        )

    output_df = pd.DataFrame(
        dominant_colors_data,
        columns=["image_path", "predicted_score", *color_columns],
    )
    output_df.to_csv(result, index=False)


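# Captured notebook output (the run ended with a KeyboardInterrupt):
# Processing CSV files:   0%|                             | 0/101 [00:06<?, ?it/s]
#
# KeyboardInterrupt:
# NOTE(review): the progress-bar label "Processing CSV files" does not match the
# tqdm desc used in the cell above, so this output presumably comes from a
# different version of the cell — confirm before relying on it.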