In [2]:
import pandas as pd
import numpy as np
from PIL import Image
from pathlib import Path
from collections import Counter
from tqdm import tqdm
import cv2
import os
# Set paths
file_location_path = Path.cwd()
ns6_wiki_paths = Path('../../data/processed/landscape_score')
image_folder = Path('/home/ubuntu/landscape-aesthetics')
project_base_path = file_location_path.parent.parent
def get_dominant_colors(image_path, bins_per_channel=8, top_colors=8):
    
    image = cv2.imread(str(image_path))
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    
    hist = cv2.calcHist([image], [0, 1, 2], None, 
                        [bins_per_channel] * 3, [0, 256] * 3)
    hist = hist.flatten()
    
    bin_size = 256 // bins_per_channel
    bin_centers = np.arange(bin_size // 2, 256, bin_size)
    color_bins = [(r, g, b) for r in bin_centers for g in bin_centers for b in bin_centers]
    
    
    color_freq = Counter({color_bins[i]: hist[i] for i in range(len(hist))})
    dominant_colors = [color for color, freq in color_freq.most_common(top_colors)]
    
    return dominant_colors

# Loop through each image with a progress bar
for file_name in os.listdir(ns6_wiki_paths):
    file_path = ns6_wiki_paths / file_name
    dominant_colors_data = []
    if file_path.is_file():
        label_file_path = Path(file_name)
        labeled_csv_name = label_file_path.with_suffix('.csv')
        result = project_base_path / 'data' / 'processed' / 'dominant_color_wikidata' / labeled_csv_name
        result.parent.mkdir(parents=True, exist_ok=True)

        results = []
        data = pd.read_csv(file_path, usecols=['image_path','predicted_score'])
        image_paths = data['image_path'].tolist()
        predicted_scores = data['predicted_score'].tolist()
        for img_path, predicted_score in tqdm(zip(image_paths,predicted_scores), desc="Processing images"):
            image_path = image_folder / img_path
            try:
                # Read and process the image
                dominant_colors = get_dominant_colors(image_path)
                # Add image path and dominant colors to results list
                dominant_colors_data.append([str(image_path), predicted_score, *np.ravel(dominant_colors)])
            except Exception as e:
                print(f"Error processing image {image_path}: {e}")
                continue  # Skip this image and proceed if an error occurs
        output_df = pd.DataFrame(dominant_colors_data, columns=["image_path", "predicted_score", 'R1', 'G1', 'B1', 'R2', 'G2', 'B2', 'R3', 'G3', 'B3', 'R4', 'G4', 'B4', 'R5', 'G5', 'B5', 'R6', 'G6', 'B6', 'R7', 'G7', 'B7', 'R8', 'G8', 'B8'])
        output_df.to_csv(result, index=False)


Processing images: 2192it [01:09, 31.76it/s]


KeyboardInterrupt: 

In [8]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from PIL import Image
from pathlib import Path
from collections import Counter
from tqdm import tqdm
import cv2

def get_dominant_colors(image_path, bins_per_channel=8, top_colors=8):
    
    image = cv2.imread(image_path)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    
    
    hist = cv2.calcHist([image], [0, 1, 2], None, 
                        [bins_per_channel] * 3, [0, 256] * 3)
    hist = hist.flatten()
    
    
    bin_size = 256 // bins_per_channel
    bin_centers = np.arange(bin_size // 2, 256, bin_size)
    color_bins = [(r, g, b) for r in bin_centers for g in bin_centers for b in bin_centers]
    
    
    color_freq = Counter({color_bins[i]: hist[i] for i in range(len(hist))})
    dominant_colors = [color for color, freq in color_freq.most_common(top_colors)]
    
    return dominant_colors


# Set paths
data_path = '/home/ubuntu/landscape-aesthetics/data/external/scenicornot/scenicornot.metadata.csv'
image_folder = Path('/home/ubuntu/landscape-aesthetics/data/external/scenicornot') 
data = pd.read_csv(data_path)

df_scores = data[['filename', 'average']].copy()

# Divide the scores into 10 equal-frequency intervals
df_scores['score_category'] = pd.qcut(df_scores['average'], q=10, labels=False)

sampled_df = df_scores.groupby('score_category', group_keys=False).apply(lambda x: x.sample(n=500, random_state=42)).reset_index(drop=True)

# List to store image paths and dominant colors
dominant_colors_data = []

# Loop through each image with a progress bar
for idx in tqdm(range(len(sampled_df)), desc="Processing Images"):
    img_name = sampled_df.iloc[idx]['filename']
    image_path = image_folder / Path(img_name)
    try:
        # Read and process the image
        dominant_colors = get_dominant_colors(image_path)
        # Add image path and dominant colors to results list
        dominant_colors_data.append([str(image_path), *np.ravel(dominant_colors)])
        
    except Exception as e:
        print(f"Error processing image {image_path}: {e}")
        continue  # Skip this image and proceed if an error occurs

# Save results to CSV file
output_df = pd.DataFrame(dominant_colors_data, columns=['image_path', 'R1', 'G1', 'B1', 'R2', 'G2', 'B2', 'R3', 'G3', 'B3', 'R4', 'G4', 'B4', 'R5', 'G5', 'B5', 'R6', 'G6', 'B6', 'R7', 'G7', 'B7', 'R8', 'G8', 'B8'])
output_df.to_csv('/home/ubuntu/landscape-aesthetics/reports/dominant_colors_hist_8.csv', index=False)


  sampled_df = df_scores.groupby('score_category', group_keys=False).apply(lambda x: x.sample(n=500, random_state=42)).reset_index(drop=True)
Processing Images: 100%|███████████████████| 5000/5000 [00:38<00:00, 128.53it/s]
