In [43]:
import pandas as pd

def extract_painting_name(filename):
    """
    Return the painting-name key encoded in an image file name.

    Everything up to and including the last ``/`` is discarded, the remaining
    base name is split on ``_`` and the SECOND segment is returned, e.g.
    ``"Realism/robert-brackman_pastel_resize1024.jpg"`` -> ``"pastel"``.
    Only that single segment is returned: if the base name contains further
    underscores, the text after the second one is not part of the key.

    Special case: a segment equal to ``"arnold-bocklin"`` is returned as
    ``"pan-arnold-bocklin"``.
    NOTE(review): the reason for this remapping is not visible in this file;
    presumably it repairs one specific mis-split file name -- confirm.

    Args:
        filename: File name or relative path as a string.

    Returns:
        The key as a ``str``, or ``None`` when ``filename`` is not a string
        (e.g. NaN coming from a CSV) or its base name contains no underscore.
    """
    # Explicit type guard instead of a bare ``except``: non-string cells still
    # yield None, but unrelated errors are no longer silently swallowed.
    if not isinstance(filename, str):
        return None

    segments = filename.rsplit('/', 1)[-1].split('_')
    if len(segments) < 2:
        return None

    painting_name = segments[1]
    if painting_name == "arnold-bocklin":
        painting_name = "pan-" + painting_name
    return painting_name

def merge_csv_files(metadata_blip2_path, dataset_path):
    """
    Merge BLIP2 captions into the dataset table, keyed by painting name.

    Both CSV files are read with pandas and a key is derived from each
    ``file_name`` via ``extract_painting_name``. For every dataset row whose
    key is present in the BLIP2 table, ``description`` is set to that table's
    ``blip2`` value. Rows without a match keep their current description
    (or ``''`` when the dataset has no ``description`` column) and their file
    names are printed, followed by match statistics.

    NOTE(review): the key is the painting name only. BLIP2 rows that share a
    key overwrite each other (the last one wins), so a dataset row can receive
    a caption from a different file with the same key. A warning with the
    number of such conflicting keys is printed; the matching rule itself is
    left unchanged.

    Args:
        metadata_blip2_path: Path of the CSV providing ``file_name`` and
            ``blip2`` columns.
        dataset_path: Path of the CSV providing ``file_name``, ``genre``,
            ``artist``, ``paint_name``, ``phash`` and ``subset`` columns
            (``description`` and ``paint_name.1`` are optional).

    Returns:
        pandas.DataFrame with the columns ``file_name, genre, artist,
        painting_name, phash, description, subset`` in that order.

    Raises:
        KeyError: If a required column is missing from either CSV.
    """
    blip2_df = pd.read_csv(metadata_blip2_path)
    dataset_df = pd.read_csv(dataset_path)

    blip2_keys = blip2_df['file_name'].apply(extract_painting_name)
    dataset_keys = dataset_df['file_name'].apply(extract_painting_name)

    # Surface keys whose captions disagree: dict(zip(...)) below silently
    # keeps only the last caption for each key.
    captions_per_key = blip2_df['blip2'].groupby(blip2_keys).nunique()
    ambiguous_keys = int((captions_per_key > 1).sum())
    if ambiguous_keys:
        print(f"\nAdvertencia: {ambiguous_keys} claves de pintura tienen "
              "descripciones BLIP2 distintas; se usa la última.")

    blip2_dict = dict(zip(blip2_keys, blip2_df['blip2']))

    result_df = dataset_df.copy()

    # Fallback description for unmatched rows: the existing value if the
    # column exists, otherwise an empty string.
    if 'description' in result_df.columns:
        current_descriptions = result_df['description'].tolist()
    else:
        current_descriptions = [''] * len(result_df)

    # Single pass over the needed columns instead of a row-wise
    # DataFrame.apply, which builds a Series object per row.
    unmatched_files = []
    descriptions = []
    for key, file_name, current in zip(dataset_keys, result_df['file_name'], current_descriptions):
        if key in blip2_dict:
            descriptions.append(blip2_dict[key])
        else:
            unmatched_files.append(file_name)
            descriptions.append(current)
    result_df['description'] = descriptions

    # 'paint_name.1' only exists when the source CSV had a duplicated
    # 'paint_name' header; tolerate its absence instead of raising KeyError.
    result_df = result_df.drop(columns=['paint_name.1'], errors='ignore')

    result_df = result_df.loc[:, ~result_df.columns.duplicated()]

    if unmatched_files:
        print("\nArchivos sin coincidencia encontrada:")
        for file in unmatched_files:
            print(f"- {file}")

    # Print statistics
    total_files = len(result_df)
    matched_files = total_files - len(unmatched_files)
    print("\nEstadísticas:")
    print(f"Total de archivos procesados: {total_files}")
    print(f"Archivos con coincidencia: {matched_files}")
    print(f"Archivos sin coincidencia: {len(unmatched_files)}")

    # Rename the paint_name column to painting_name
    result_df = result_df.rename(columns={'paint_name': 'painting_name'})

    # Reorder (and restrict to) the output columns
    result_df = result_df[['file_name', 'genre', 'artist', 'painting_name', 'phash', 'description', 'subset']]

    return result_df

# Example usage: merge the BLIP2 captions into the dataset table.
# Both paths are relative, so they resolve against the notebook's working directory.
metadata_blip2_path = 'metadata_blip2.csv'
dataset_path = 'newName.csv'

resultado = merge_csv_files(metadata_blip2_path, dataset_path)



Estadísticas:
Total de archivos procesados: 81444
Archivos con coincidencia: 81444
Archivos sin coincidencia: 0


In [44]:
# Show the merged table as the cell output (the notebook view truncates long frames).
resultado

Unnamed: 0,file_name,genre,artist,painting_name,phash,description,subset
0,Impressionism/william-merritt-chase_still-life...,Impressionism,william merritt chase,still life with cockatoo,b0e24b85961e6de9,a painting of a white bird sitting on a vase,train
1,Expressionism/pablo-picasso_study-to-two-siste...,Expressionism,pablo picasso,study to two sisters 1902,a73452d3366e86c9,a drawing of two people embracing each other,train
2,Post_Impressionism/pyotr-konchalovsky_dry-pain...,Post Impressionism,pyotr konchalovsky,dry paints 1913,d1512e3c94a7d1e3,a painting of a table with various items on it,train
3,High_Renaissance/pietro-perugino_madonna-with-...,High Renaissance,pietro perugino,madonna with child and little st john,dab3114c64cae72d,madonna and child with two children by luigi d...,train
4,Realism/john-everett-millais_portrait-of-mrs-j...,Realism,john everett millais,portrait of mrs james wyatt,9bd1e4acd211ad4b,a painting of a woman holding a child,train
...,...,...,...,...,...,...,...
81439,Early_Renaissance/carlo-crivelli_saint-john-th...,Early Renaissance,carlo crivelli,saint john the evangelist,aef969b1c224f283,a painting of a man holding a book,test
81440,Realism/robert-brackman_pastel_resize1024.jpg,Realism,robert brackman,pastel,edc5961bb2199a26,a drawing of a naked woman sitting on a towel,test
81441,Impressionism/kazimir-malevich_spring-landscap...,Impressionism,kazimir malevich,spring landscape,d1d096ab5be2926a,a painting of trees in a field with grass,test
81442,Fauvism/ion-pacea_marina-with-yellow-boat_resi...,Fauvism,ion pacea,marina with yellow boat,c0fe2f2d10402fbf,a painting of boats in the water with a blue sky,test


In [45]:
import os


# Check that every file listed in the merged table exists under the image root.
image_base_path = 'imagenes/resizeSD'

# Collect the file names whose joined path is not present on disk.
missing_images = [
    file_name
    for file_name in resultado['file_name']
    if not os.path.exists(os.path.join(image_base_path, file_name))
]

# Report the result: either the list of missing files or a confirmation.
if not missing_images:
    print("Todas las imágenes existen.")
else:
    print("Imágenes faltantes:")
    print("\n".join(missing_images))

Todas las imágenes existen.


In [46]:
# Write the merged table to dataset.csv in the working directory;
# index=False leaves the DataFrame index out of the file.
resultado.to_csv("dataset.csv", index=False)
