In [1]:
import os
import tarfile
import pandas as pd
import numpy as np
from PIL import Image
from io import BytesIO
from sklearn.decomposition import IncrementalPCA
from sklearn.preprocessing import StandardScaler

In [2]:
# === CONFIGURATION ===
tar_folder = "Images"               # Folder that holds the TAR archives
pca_output = "Images_PCA.parquet"   # Output file for the PCA scores
n_components = 100                  # Number of PCA components to keep
batch_size = 1000                   # Images held in memory per batch

# === COLLECT ALL TAR ARCHIVES ===
# os.listdir returns entries in arbitrary order. Sorting makes the batch
# composition seen by the incremental estimators (and the row order of the
# output table) the same on every run and every machine.
tar_files = sorted(
    os.path.join(tar_folder, f)
    for f in os.listdir(tar_folder)
    if f.lower().endswith(".tar")
)
print(f"Se encontraron {len(tar_files)} archivos TAR.")

# === INITIALISE THE SCALER AND THE INCREMENTAL PCA ===
# Both estimators are fitted later, batch by batch, through partial_fit.
scaler = StandardScaler()
ipca = IncrementalPCA(n_components=n_components, batch_size=batch_size)

Se encontraron 70 archivos TAR.


In [3]:
# === FIRST PASS: FIT THE SCALER IN BATCHES ===
# Every .jpg member of every TAR archive is decoded, converted to RGB,
# resized to 64x64, flattened and divided by 255. The vectors are handed to
# scaler.partial_fit in chunks of batch_size, so only one batch of images is
# held in memory at a time.
print("Calculando escalador incremental...")

X_batch = []
for tar_path in tar_files:
    with tarfile.open(tar_path, "r") as tar:
        members = [m for m in tar.getmembers() if m.name.lower().endswith(".jpg")]
        for member in members:
            f = tar.extractfile(member)
            if f is None:
                # extractfile returned nothing to read for this member
                continue

            # Only the decoding step is guarded: an unreadable image is
            # reported and skipped, while a failure inside partial_fit is a
            # real error and must not be disguised as a read error.
            try:
                img = Image.open(BytesIO(f.read())).convert("RGB").resize((64, 64))
                pixels = np.array(img).astype(np.float32).flatten() / 255.0
            except Exception as e:
                print(f"Error leyendo {member.name}: {e}")
                continue

            X_batch.append(pixels)
            if len(X_batch) >= batch_size:
                scaler.partial_fit(np.array(X_batch))
                X_batch = []

# Flush the final, possibly smaller, batch
if X_batch:
    scaler.partial_fit(np.array(X_batch))
    X_batch = []

Calculando escalador incremental...


In [4]:
# === SECOND PASS: FIT THE INCREMENTAL PCA IN BATCHES ===
# Re-reads the same images as the scaler pass, standardises each batch with
# the already-fitted scaler and feeds it to ipca.partial_fit.
print("Ajustando PCA incremental...")

X_batch = []
for tar_path in tar_files:
    with tarfile.open(tar_path, "r") as tar:
        members = [m for m in tar.getmembers() if m.name.lower().endswith(".jpg")]
        for member in members:
            f = tar.extractfile(member)
            if f is None:
                continue

            # Skip images that cannot be decoded. Catching Exception (instead
            # of a bare except) keeps KeyboardInterrupt working, and limiting
            # the guard to decoding means errors raised by the scaler or the
            # PCA are no longer silently discarded.
            try:
                img = Image.open(BytesIO(f.read())).convert("RGB").resize((64, 64))
                pixels = np.array(img).astype(np.float32).flatten() / 255.0
            except Exception:
                continue

            X_batch.append(pixels)
            if len(X_batch) >= batch_size:
                X_scaled = scaler.transform(np.array(X_batch))
                ipca.partial_fit(X_scaled)
                X_batch = []

# Final, possibly smaller, batch
# NOTE(review): depending on the scikit-learn version, partial_fit may reject
# a batch with fewer samples than n_components - confirm for small tails.
if X_batch:
    X_scaled = scaler.transform(np.array(X_batch))
    ipca.partial_fit(X_scaled)
    X_batch = []

Ajustando PCA incremental...


In [5]:
# === THIRD PASS: PROJECT EVERY IMAGE ONTO THE PCA COMPONENTS ===
# Re-reads the images once more, standardises and projects them batch by
# batch, and collects the scores in all_pca together with one entry in
# all_filenames per projected image (same order).
print("Transformando imágenes y guardando PCA...")

all_filenames = []
all_pca = []
X_batch = []

for tar_path in tar_files:
    with tarfile.open(tar_path, "r") as tar:
        members = [m for m in tar.getmembers() if m.name.lower().endswith(".jpg")]
        for member in members:
            f = tar.extractfile(member)
            if f is None:
                continue

            # Only decoding is guarded. A bare except around the whole step
            # would also swallow a failed transform and leave all_filenames
            # longer than the stacked PCA rows.
            try:
                img = Image.open(BytesIO(f.read())).convert("RGB").resize((64, 64))
                pixels = np.array(img).astype(np.float32).flatten() / 255.0
            except Exception:
                continue

            # Pixels and name are recorded together so both lists stay aligned
            X_batch.append(pixels)
            # Clean name: base name of the member without its extension
            clean_name = os.path.splitext(os.path.basename(member.name))[0]
            all_filenames.append(clean_name)

            if len(X_batch) >= batch_size:
                X_scaled = scaler.transform(np.array(X_batch))
                pcs = ipca.transform(X_scaled)
                all_pca.append(pcs)
                X_batch = []

# Flush the final, possibly smaller, batch
if X_batch:
    X_scaled = scaler.transform(np.array(X_batch))
    pcs = ipca.transform(X_scaled)
    all_pca.append(pcs)
    X_batch = []

Transformando imágenes y guardando PCA...


In [6]:
# Stack the per-batch score matrices into a single table, label the
# component columns PC1..PCn and put the image name in the first column.
X_pca = np.vstack(all_pca)
pca_columns = ["PC" + str(k) for k in range(1, n_components + 1)]
df_pca = pd.DataFrame(data=X_pca, columns=pca_columns)
df_pca.insert(loc=0, column="filename", value=all_filenames)

In [7]:
# Write the PCA table to disk, then report where it went, how many images
# it contains and how much variance the kept components explain.
df_pca.to_parquet(pca_output, index=False)

print(f"PCA completado y guardado en: {pca_output}")
print(f"Total de imágenes procesadas: {df_pca.shape[0]}")
print(f"Varianza explicada total: {np.sum(ipca.explained_variance_ratio_):.2%}")

PCA completado y guardado en: Images_PCA.parquet
Total de imágenes procesadas: 69352
Varianza explicada total: 89.16%


In [9]:
import pandas as pd  # kept so this cell can also run on its own in a fresh kernel

# Load the catalogue CSV and the PCA scores.
# objid is parsed as text right away. If it were parsed as a number and the
# column contained any missing value, pandas would fall back to float64, the
# 19-digit identifiers would lose precision, and the string join key built
# from them would no longer match the image names.
df_catalog = pd.read_csv("ZooSpecPhotoDR19_filtered.csv", dtype={"objid": str})
df_pca = pd.read_parquet("Images_PCA.parquet")

# Normalise both join keys to plain strings
df_catalog["objid"] = df_catalog["objid"].str.strip()
df_pca["filename"] = df_pca["filename"].astype(str)

# Inner join: keep only catalogue rows that have a matching image
df_final = pd.merge(df_catalog, df_pca, left_on="objid", right_on="filename", how="inner")

# filename duplicates objid after the join, so drop it
df_final = df_final.drop(columns=["filename"])

print(df_final.head())
print(df_final.shape)

# Save the combined dataset
df_final.to_parquet("Dataset_combinado.parquet", index=False)

   Unnamed: 0            specobjid                objid            dr7objid  \
0           1  1578598304118237184  1237661463301456237  587735742076551517   
1           3  1578599678507771904  1237661463301521650  587735742076616950   
2           7  1578583460711262208  1237661463301718141  587735742076813455   
3          20  1578572190717077504  1237661463302045898  587735742077141189   
4          29  1579718437544945664  1237661463302308093  587735742077403410   

         ra       dec  p_el_debiased  p_cs_debiased  spiral  elliptical  ...  \
0  233.7615  34.60428          0.000          1.000       1           0  ...   
1  233.9483  34.48045          0.069          0.931       1           0  ...   
2  234.3422  34.38433          0.015          0.985       1           0  ...   
3  235.0977  33.95142          0.040          0.939       1           0  ...   
4  235.8138  33.56461          0.000          1.000       1           0  ...   

       PC91      PC92      PC93      PC94   