In [15]:
import os
import tarfile
import pandas as pd
import numpy as np
from PIL import Image
from io import BytesIO
from sklearn.decomposition import IncrementalPCA
from sklearn.preprocessing import StandardScaler

In [16]:
tar_folder = "Images"  # Folder containing the TAR archives

# List of TAR archives to process. os.listdir() returns entries in arbitrary order, so the
# list is sorted: every pass then visits the archives (and therefore builds its batches and
# the output rows) in the same order on every run.
tar_files = sorted(
    os.path.join(tar_folder, f) for f in os.listdir(tar_folder) if f.endswith('.tar')
)
pca_output = "Images_PCA.parquet"  # Output file holding the PCA projection
n_components = 100  # Number of principal components kept: more components, less compression, better quality
batch_size = 1000  # Number of images held in memory per batch

# PCA depends on variance: a feature with larger values would dominate the components.
# Standardizing first makes every feature contribute on the same scale.
scaler = StandardScaler()

# Incremental PCA is fitted batch by batch, so large volumes of data can be processed
# without loading everything into memory at once.
ipca = IncrementalPCA(n_components=n_components, batch_size=batch_size)

In [17]:
# FIRST PASS (preprocessing before PCA): accumulate the global per-feature mean and
# standard deviation over all images, one batch at a time.

X_batch = []  # Pixel vectors collected for the current batch
for tar_path in tar_files:
    with tarfile.open(tar_path, "r") as tar:
        # Keep only the .jpg members of the archive.
        members = [m for m in tar.getmembers() if m.name.lower().endswith(".jpg")]
        for member in members:
            f = tar.extractfile(member)  # In-memory binary file object; may be None
            if f is None:
                continue

            # Only the decoding is guarded. A failure while fitting the scaler must not be
            # reported as a read error of whichever image happened to complete the batch.
            try:
                # Read the bytes, force 3 colour channels and resize to a fixed 64x64 shape.
                img = Image.open(BytesIO(f.read())).convert("RGB").resize((64, 64))
                # Flatten to a 64*64*3 = 12288-long float32 vector with values in [0, 1].
                pixels = np.array(img).astype(np.float32).flatten() / 255.0
            except Exception as e:
                print(f"Error reading {member.name}: {e}")
                continue

            X_batch.append(pixels)

            if len(X_batch) >= batch_size:
                # Batch is full: update the running mean / standard deviation with it,
                # then start collecting a new batch.
                scaler.partial_fit(np.array(X_batch))
                X_batch = []

# Fit on whatever is left over after the last full batch.
if X_batch:
    scaler.partial_fit(np.array(X_batch))
    X_batch = []

In [18]:
# SECOND PASS: standardize each batch with the scaler fitted in the first pass and use it
# to train the IncrementalPCA.

X_batch = []
for tar_path in tar_files:
    with tarfile.open(tar_path, "r") as tar:
        members = [m for m in tar.getmembers() if m.name.lower().endswith(".jpg")]
        for member in members:
            f = tar.extractfile(member)
            if f is None:
                continue

            # Guard only the image decoding and report what was skipped. A bare
            # `except: pass` would also hide scaler/PCA errors and swallow KeyboardInterrupt.
            try:
                img = Image.open(BytesIO(f.read())).convert("RGB").resize((64, 64))
                pixels = np.array(img).astype(np.float32).flatten() / 255.0
            except Exception as e:
                print(f"Error reading {member.name}: {e}")
                continue

            X_batch.append(pixels)

            if len(X_batch) >= batch_size:
                # Array of shape (n_samples_in_batch, n_features), standardized with the
                # global mean/std learned in the first pass.
                X_scaled = scaler.transform(np.array(X_batch))
                # partial_fit() incrementally refines the PCA state with this batch.
                ipca.partial_fit(X_scaled)
                X_batch = []

print(len(X_batch))  # Number of images left over for the final, partial batch

# Last set
if X_batch:
    if len(X_batch) >= n_components:
        X_scaled = scaler.transform(np.array(X_batch))
        ipca.partial_fit(X_scaled)
    else:
        # Some scikit-learn versions reject a partial_fit batch that has fewer samples
        # than n_components; skip such a tiny remainder instead of crashing at the very end.
        print(f"Skipping final batch of {len(X_batch)} images: fewer than n_components={n_components}")
    X_batch = []

0


In [19]:
#The first and second passes cannot be merged into one loop (even though they look very similar)

#When using "StandardScaler", it needs to know the global mean and global standard deviation of the entire dataset
#in order to scale the data correctly. The problem is that during the first pass, those statistics are not yet known.
#They are still being computed gradually. If "scaler.transform()" is applied at the same time as "scaler.partial_fit()",
#you would be scaling using partial and changing values, not the final mean and standard deviation.
#This would cause each batch to be scaled differently and if the data aren’t scaled consistently, the PCA would learn
#distorted or incorrect components.

In [20]:
# THIRD PASS: project every image into PCA space and collect the results.

all_filenames = []  # One clean image name per projected row, in the same order as the rows
all_pca = []        # One array of principal components per batch
X_batch = []

for tar_path in tar_files:
    with tarfile.open(tar_path, "r") as tar:
        members = [m for m in tar.getmembers() if m.name.lower().endswith(".jpg")]
        for member in members:
            f = tar.extractfile(member)
            if f is None:
                continue

            # Guard only the image decoding. If a transform error were swallowed here, the
            # name would already be recorded while its PCA rows were missing, leaving
            # all_filenames and all_pca out of step.
            try:
                img = Image.open(BytesIO(f.read())).convert("RGB").resize((64, 64))
                pixels = np.array(img).astype(np.float32).flatten() / 255.0
            except Exception as e:
                print(f"Error reading {member.name}: {e}")
                continue

            # Record the pixel vector and its identifier together so they stay aligned.
            X_batch.append(pixels)
            # Clean name: base name without directory or extension.
            clean_name = os.path.splitext(os.path.basename(member.name))[0]
            all_filenames.append(clean_name)

            if len(X_batch) >= batch_size:
                # Standardize with the scaler fitted in the FIRST PASS ...
                X_scaled = scaler.transform(np.array(X_batch))
                # ... and project with the IncrementalPCA fitted in the SECOND PASS.
                pcs = ipca.transform(X_scaled)
                all_pca.append(pcs)
                X_batch = []

# Last set
if X_batch:
    X_scaled = scaler.transform(np.array(X_batch))
    pcs = ipca.transform(X_scaled)
    all_pca.append(pcs)

In [21]:
# Combine the per-batch projections into a single table: one row per image,
# the image name first, followed by the columns PC1 ... PC<n_components>.
pca_columns = [f"PC{k}" for k in range(1, n_components + 1)]
X_pca = np.vstack(all_pca)
df_pca = pd.DataFrame(data=X_pca, columns=pca_columns)
df_pca.insert(loc=0, column="filename", value=all_filenames)

In [22]:
# Write the PCA table (filename + principal components) to the parquet file named by
# pca_output, without storing the DataFrame index.
df_pca.to_parquet(pca_output, index=False)

In [24]:
# Read the catalogue (CSV) and the PCA features (parquet).
# 'objid' is read directly as text. If pandas inferred a float dtype for it (for example
# because of a missing value), large integer ids would be rounded before being turned
# into strings and would no longer match the image names.
df_catalog = pd.read_csv(
    "ZooSpecPhotoDR19_filtered.csv.gz",
    compression='gzip',
    dtype={"objid": str},
)
df_pca = pd.read_parquet("Images_PCA.parquet")

# The image names (saved as 'filename' in the PCA table) must be strings as well so they
# can be compared with 'objid'.
df_pca["filename"] = df_pca["filename"].astype(str)

# Merge: keep only the catalogue rows that have a matching image.
df_final = pd.merge(df_catalog, df_pca, left_on="objid", right_on="filename", how="inner")

# Drop 'filename': after the merge it duplicates 'objid'.
df_final = df_final.drop(columns=["filename"])

# Save the combined dataset.
df_final.to_parquet("Dataset_combinado.parquet", index=False)