## Contrails Dataset Creation Using PCA
* Compress dataset from 9 dimensions (bands 08-16) to 3 dimensions ("rgb") using PCA.
* Save only the labeled frame (time index `N_TIMES_BEFORE = 4` of each band), which will be used for training.
* Save only the human_pixel_masks.
* Save the final numpy arrays in float16 dtype, as compressed `.npz` files (array key `data`), to reduce total data size.

In [None]:
# !pip install ipywidgets==8.1.5

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os
# from tqdm.notebook import tqdm
from tqdm import tqdm
from pathlib import Path

from sklearn.decomposition import PCA

In [None]:
# Root directory of the Kaggle competition input data; the 'train' and
# 'validation' sub-directories read by the cells below live under it.
data_dir = '/kaggle/input/google-research-identify-contrails-reduce-global-warming/'

## Make the DataFrames

We will create train and valid dataframes, which will contain the record ids for each image.

In [None]:
# os.listdir() returns entries in arbitrary order, so sort the record ids to
# keep the record lists (and anything written from them) reproducible.
train_rs = sorted(os.listdir(os.path.join(data_dir, 'train')))
valid_rs = sorted(os.listdir(os.path.join(data_dir, 'validation')))

# One row per record; each directory entry name is used as the record id.
train_df = pd.DataFrame(train_rs, columns=['record_id'])
valid_df = pd.DataFrame(valid_rs, columns=['record_id'])

# Tag every row with its split. Note that the column is named 'train' and the
# validation split is labelled 'valid' (not 'validation', the directory name).
train_df['train'] = 'train'
valid_df['train'] = 'valid'

In [None]:
# Show the (rows, columns) shape of each split's DataFrame.
train_df.shape, valid_df.shape

In [None]:
# Preview the first rows of the training DataFrame.
train_df.head()

In [None]:
# Write both record-id tables to the current working directory, omitting the
# pandas index column.
train_df.to_csv('train_df.csv', index=False)
valid_df.to_csv('valid_df.csv', index=False)

## Save the Images as Numpy arrays

In [None]:
# Output directory for the processed arrays (relative to the current working
# directory); created, together with any missing parents, if it does not exist.
path = Path('contrails')
path.mkdir(exist_ok=True, parents=True)

In [None]:
# Index along the last axis of every band array that is kept; all other
# entries along that axis are discarded.
N_TIMES_BEFORE = 4


def get_and_reshape_image(record_id, train_or_val):
    """Load bands 08-16 of one record and stack them as the last axis.

    For each band file ``band_08.npy`` ... ``band_16.npy`` inside the record's
    directory, only index ``N_TIMES_BEFORE`` of the last axis is kept. The
    nine resulting arrays are stacked along a new trailing axis.

    Args:
        record_id: Name of the record directory inside the split directory.
        train_or_val: Split directory name, ``"train"`` or ``"validation"``.

    Returns:
        Array whose last axis has length 9 (one entry per band, in band order).

    Raises:
        AssertionError: If ``train_or_val`` is not one of the two split names.
    """
    assert train_or_val in ("train", "validation"), "\"train_or_val\" should either be \"train\" or \"validation\""

    # Join every path component explicitly so the result does not depend on
    # ``data_dir`` ending with a path separator.
    record_dir = os.path.join(data_dir, train_or_val, record_id)
    bands = [
        np.load(os.path.join(record_dir, f"band_{band:02}.npy"))[..., N_TIMES_BEFORE]
        for band in range(8, 17)
    ]
    return np.stack(bands, axis=-1)

In [None]:
def pca_process_to_rgb(img):
    """Reduce a multi-band image to three PCA components scaled to [0, 1].

    Every pixel is treated as one sample and every band as one feature. A new
    PCA is fitted on the pixels of this image alone, so the component axes
    (and their signs) may differ from one image to the next.

    Args:
        img: Array whose last axis holds the bands, e.g. (256, 256, 9). The
            band count and the leading (spatial) dimensions are taken from
            the array itself rather than being hard-coded.

    Returns:
        Float array with the same leading dimensions as ``img`` and a last
        axis of length 3, min-max scaled with a single global minimum and
        maximum taken over all three components. An all-zero array is
        returned when the components have no spread (constant image).
    """
    n_bands = img.shape[-1]

    # Flatten the spatial dimensions: (pixels, bands).
    pixels = img.reshape(-1, n_bands)

    # Project every pixel onto the first three principal components.
    components = PCA(n_components=3).fit_transform(pixels)

    # Restore the spatial layout with 3 channels instead of n_bands.
    rgb_image = components.reshape(img.shape[:-1] + (3,))

    # Global (not per-channel) min-max normalisation.
    lowest = rgb_image.min()
    value_range = rgb_image.max() - lowest
    if value_range == 0:
        # Constant image: avoid 0/0, which would fill the output with NaN.
        return np.zeros_like(rgb_image)

    return (rgb_image - lowest) / value_range

In [None]:
def get_human_mask(record_id, train_or_val):
    """Load the ``human_pixel_masks.npy`` array of one record.

    Args:
        record_id: Name of the record directory inside the split directory.
        train_or_val: Split directory name, ``"train"`` or ``"validation"``.

    Returns:
        The array stored in the record's ``human_pixel_masks.npy`` file.

    Raises:
        AssertionError: If ``train_or_val`` is not one of the two split names.
    """
    assert train_or_val in ("train", "validation"), "\"train_or_val\" should either be \"train\" or \"validation\""
    # Join every path component explicitly so the result does not depend on
    # ``data_dir`` ending with a path separator.
    return np.load(os.path.join(data_dir, train_or_val, record_id, "human_pixel_masks.npy"))

In [None]:
# Delete the outputs of any previous run, then list the directory to confirm
# that it is empty.
# NOTE(review): the absolute path is hard-coded here, while the output
# directory is created as the relative 'contrails'; the two match only when
# the working directory is /kaggle/working - confirm.
!rm -rf /kaggle/working/contrails/*
!ls /kaggle/working/contrails

In [None]:
def process_data(train_or_val):
    """Convert every record of one split and save it as a compressed array.

    For each record id the 9-band image is loaded, reduced to 3 channels with
    PCA, stacked with the record's human pixel mask along the last axis, cast
    to float16 and written to ``path / "<record_id>.npz"`` under the key
    ``data``. Both splits are written into the same ``path`` directory.

    Args:
        train_or_val: ``"train"`` selects ``train_rs``; any other value
            selects ``valid_rs``. The value is also passed on as the split
            directory name, so it is expected to be ``"train"`` or
            ``"validation"``.
    """
    record_ids = train_rs if train_or_val == 'train' else valid_rs

    for record_id in tqdm(record_ids):
        # Load the 9-band image of the kept frame.
        img = get_and_reshape_image(str(record_id), train_or_val)

        # Convert from 9 channels to 3 channels (RGB) via PCA.
        rgb_img = pca_process_to_rgb(img)

        # Human mask for the corresponding image.
        human_mask = get_human_mask(str(record_id), train_or_val)

        # Stack the PCA-reduced image (not the raw 9-band image) with the
        # mask, so the saved array holds the 3 PCA channels plus the mask.
        final = np.dstack([rgb_img, human_mask]).astype(np.float16)
        np.savez_compressed(str(path / f"{record_id}.npz"), data=final)

In [None]:
# Convert and save every record of the training split.
process_data('train')

In [None]:
# Convert and save every record of the validation split.
process_data('validation')