In [None]:
import pandas as pd
import os
import cv2
import numpy as np
import tqdm as tqdm
import matplotlib.pyplot as plt
from PIL import Image
from sklearn.cluster import KMeans

In [None]:
# Root folder of the DeepGlobe data; the split folders below hang off it.
output_folder = '/content/drive/MyDrive/Proyecto Cultivos/deepglobe/'
# Note: the train path carries no trailing slash, unlike val/test.
train_folder = output_folder + 'tiles'
val_folder = output_folder + 'val_tiles/'
test_folder = output_folder + 'test_tiles/'

In [None]:
# (R, G, B) colour associated with each class name.
class_to_rgb = dict(
    urban_land=(0, 255, 255),
    agriculture_land=(255, 255, 0),
    rangeland=(255, 0, 255),
    forest_land=(0, 255, 0),
    water=(0, 0, 255),
    barren_land=(255, 255, 255),
    unknown=(0, 0, 0),
)

In [None]:
# Array of the class colours, one row per class, in the same order as
# class_to_rgb. Passing the dict itself to np.array() would only wrap it in a
# 0-d object array, so the RGB tuples are extracted explicitly.
class_rgb = np.array(list(class_to_rgb.values()))

Mediante **k-Means** vamos a determinar el color dominante de cada máscara y, con ello, etiquetar cada imagen según la clase a la que pertenece dicho color:

In [None]:
import time
from tqdm import tqdm

# Compute the dominant colour of an image
def calculate_dominant_color(file, folder):
    """Return the dominant RGB colour of the image ``folder/file``.

    The image is read with OpenCV, converted from BGR to RGB, flattened to a
    list of pixels and fitted with a single-cluster KMeans; the centre of that
    cluster, truncated to integers, is the result.

    Parameters:
        file: file name of the image inside ``folder``.
        folder: directory that contains ``file``.

    Returns:
        A length-3 integer array (R, G, B), or ``None`` when the image could
        not be loaded (a message is printed in that case).
    """
    try:
        bgr_image = cv2.imread(os.path.join(folder, file))
    except Exception as e:
        print(f"An error occurred while loading the image: {str(e)}")
        return None

    # cv2.imread signals an unreadable file by returning None
    if bgr_image is None:
        print(f"Error loading image: {file}")
        return None

    # One row per pixel, columns in R, G, B order
    rgb_pixels = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB).reshape((-1, 3))

    # A single cluster: its centre summarises the whole image
    model = KMeans(n_clusters=1)
    model.fit(rgb_pixels)

    return model.cluster_centers_.astype(int)[0]

# Return the class whose colour is closest to the image's dominant colour
def dominant_class(file, folder, dict_classes, array_classes=None):
    """Map the dominant colour of an image to the nearest class name.

    Parameters:
        file: file name of the image inside ``folder``.
        folder: directory that contains ``file``.
        dict_classes: mapping ``class name -> (R, G, B)`` used as the palette.
        array_classes: unused; kept only so existing callers that pass a
            fourth argument keep working.

    Returns:
        The key of ``dict_classes`` whose colour has the smallest Euclidean
        distance to the dominant colour, or ``'unknown'`` when the image could
        not be loaded.
    """
    dominant_color = calculate_dominant_color(file, folder)
    if dominant_color is None:
        return 'unknown'  # Handle the case where loading the image failed

    target = np.asarray(dominant_color, dtype=float)
    # Use the palette that was passed in rather than a module-level global,
    # so the function honours its own arguments.
    distances = {
        class_name: np.linalg.norm(np.asarray(rgb, dtype=float) - target)
        for class_name, rgb in dict_classes.items()
    }
    return min(distances, key=distances.get)

# Build the labelled DataFrame
def labeling_dataset(file_path, dict_classes):
    """Label every mask file in a folder with its dominant class.

    Every file in ``file_path`` whose name contains ``"_mask"`` is classified
    with ``dominant_class``. The matching image name is derived from the mask
    name by replacing ``mask`` with ``sat`` and ``.png`` with ``.jpg``.

    Parameters:
        file_path: directory that holds the mask files.
        dict_classes: mapping ``class name -> (R, G, B)``.

    Returns:
        A DataFrame with columns ``image_name`` (derived file name),
        ``label`` (class name) and ``image_path`` (``file_path`` joined with
        the derived file name), one row per mask, sorted by mask file name.
        The elapsed time is printed as a side effect.
    """
    start_time = time.perf_counter()
    # One row per class colour; np.array(dict) would give a 0-d object array.
    class_rgb = np.array(list(dict_classes.values()))

    # Sorted so that the row order is reproducible (os.listdir order is arbitrary).
    mask_files = sorted(name for name in os.listdir(file_path) if "_mask" in name)

    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and growing a frame row by row is quadratic.
    records = []
    for filename in tqdm(mask_files, desc="Processing..."):
        dominant = dominant_class(filename, file_path, dict_classes, class_rgb)
        image_name = filename.replace('mask', 'sat').replace('.png', '.jpg')
        records.append({
            'image_name': image_name,
            'label': dominant,
            # Join with the folder that was actually scanned
            'image_path': os.path.join(file_path, image_name),
        })

    df = pd.DataFrame(records, columns=["image_name", "label", "image_path"])

    execution_time = time.perf_counter() - start_time
    print(f"Execution time: {execution_time} seconds")
    return df



In [None]:
# Label the masks found in the tiles-128 folder
df_labeled = labeling_dataset(
    file_path="/content/drive/MyDrive/Proyecto Cultivos/deepglobe/tiles-128/",
    dict_classes=class_to_rgb,
)

In [None]:
# Show full cell contents instead of truncating long strings
pd.set_option('display.max_colwidth', None)
# errors='ignore' keeps this cell safe to re-run: once 'image_name' has been
# dropped, a plain drop would raise KeyError on the second execution.
df_labeled = df_labeled.drop(columns='image_name', errors='ignore')
df_labeled.head()

In [None]:
# The file name is the last '/'-separated component of the image path
df_labeled['image_name'] = (
    df_labeled['image_path']
    .str.split('/')
    .str[-1]
)
df_labeled.tail(20)

In [None]:
# Persist the labelled frame as CSV, without the index column
df_labeled.to_csv(
    path_or_buf='/content/drive/MyDrive/Proyecto Cultivos/deepglobe/tiles-128/df_labeled-128.csv',
    index=False,
)