# Exploratory Data Analysis

# 1. Imports

## 1.1 Packages

In [13]:
import os

import imageio.v3 as imageio
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


## 1.2 Options

In [14]:
path_data = '../data/04_feature'

In [15]:
cols_target = [
    'X4_mean', 'X11_mean', 'X18_mean', 'X50_mean', 'X26_mean', 'X3112_mean'
]

## 1.3 Data

In [16]:
df_train = pd.read_parquet(os.path.join(path_data, 'df_train.parquet'))

# 2. Analyze

In [17]:
df_train.sample(2)

Unnamed: 0,id,WORLDCLIM_BIO1_annual_mean_temperature,WORLDCLIM_BIO12_annual_precipitation,WORLDCLIM_BIO13.BIO14_delta_precipitation_of_wettest_and_dryest_month,WORLDCLIM_BIO15_precipitation_seasonality,WORLDCLIM_BIO4_temperature_seasonality,WORLDCLIM_BIO7_temperature_annual_range,SOIL_bdod_0.5cm_mean_0.01_deg,SOIL_bdod_100.200cm_mean_0.01_deg,SOIL_bdod_15.30cm_mean_0.01_deg,...,X50_mean,X3112_mean,X4_sd,X11_sd,X18_sd,X26_sd,X50_sd,X3112_sd,file_path,jpeg_bytes
39688,108593539,26.184694,1840.984741,485.112244,109.72081,117.118927,14.022959,123,129,125,...,2.575645,2344.857084,0.00375,0.199441,0.294814,17.800362,0.014335,51.459922,data/03_primary/train_images//108593539.jpeg,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
45145,173163113,22.877916,628.676208,103.985718,66.476944,296.025879,21.149048,126,137,132,...,2.605322,168.409963,0.007976,0.156414,0.328426,0.005863,0.051455,37.94906,data/03_primary/train_images//173163113.jpeg,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...


## 2.1 Plot example

In [18]:
def plot_example(df: pd.DataFrame, nrows: int=3, ncols: int=3) -> None:
    """Plot examples of images

    Args:
        df (pd.DataFrame): Input dataframe
        nrows (int): Number of rows for the subplots
        ncols (int): Number of columns for the subplots
    """
    nimgs = nrows * ncols
    df_to_plot = df.sample(n=nimgs, random_state=12)
    fig, axes = plt.subplots(nrows, ncols, figsize=(ncols*5, nrows*5))
    for i in range(nimgs):
        r = i // ncols
        c = i % ncols
        img = imageio.imread(df_to_plot.iloc[i]['jpeg_bytes'])
        image_id = df_to_plot.iloc[i]['id']
        axes[r, c].imshow(img)
        axes[r, c].set_title(f'{image_id} | shape: {img.shape}')
    plt.show()

# plot_example(df_train)

## 2.2 Analyze labels

In [19]:
# Labels Meta Data
target_name_meta = pd.read_csv('../data/03_primary/target_name_meta.tsv', delimiter='\t')
target_name_meta['trait_ID'] = target_name_meta['trait_ID'] + '_mean'
target_name_meta = target_name_meta.set_index('trait_ID').squeeze().to_dict()

pd.Series(target_name_meta).to_frame()

Unnamed: 0,0
X4_mean,Stem specific density (SSD) or wood density (s...
X11_mean,Leaf area per leaf dry mass (specific leaf are...
X18_mean,Plant height
X26_mean,Seed dry mass
X50_mean,Leaf nitrogen (N) content per leaf area
X3112_mean,"Leaf area (in case of compound leaves: leaf, u..."


In [20]:
# Minimum/Maximum Based On Train 0.1% and 99.9%
V_MIN = df_train[cols_target].quantile(0.001)
V_MAX = df_train[cols_target].quantile(0.999)

print('V_MIN:', V_MIN)  # noqa: T201
print('V_MAX', V_MAX)  # noqa: T201

# Percentiles of features to use
percentiles = [
    0.001,
    0.01,
    0.05,
    0.10,
    0.25,
    0.50,
    0.75,
    0.90,
    0.95,
    0.99,
    0.999,
]
labels_describe_df = pd.DataFrame()
for target in cols_target:
    labels_describe_df = pd.concat((
        labels_describe_df,
        df_train[target].describe(percentiles=percentiles).round(3)
    ), axis=1)

# Transpose DataFrame
labels_describe_df = labels_describe_df.T

# Minimum/Maximum Values
labels_describe_df.insert(4, 'v_min', V_MIN)
labels_describe_df.insert(16, 'v_max', V_MAX)

labels_describe_df

V_MIN: X4_mean      -0.443441
X11_mean      0.516698
X18_mean      0.005404
X50_mean      0.065051
X26_mean      0.000980
X3112_mean    1.108830
Name: 0.001, dtype: float64
V_MAX X4_mean            1.480369
X11_mean         534.948108
X18_mean         199.684349
X50_mean          39.876007
X26_mean        9845.905856
X3112_mean    453450.025900
Name: 0.999, dtype: float64


Unnamed: 0,count,mean,std,min,v_min,0.1%,1%,5%,10%,25%,50%,75%,90%,95%,99%,99.9%,v_max,max
X4_mean,44391.0,0.523,0.177,-2.431,-0.443441,-0.443,0.228,0.304,0.339,0.412,0.509,0.622,0.725,0.797,0.923,1.48,1.480369,4.475
X11_mean,44391.0,91.036,10106.92,0.0,0.516698,0.517,3.08,4.905,6.438,10.648,15.102,19.696,25.361,31.007,51.26,534.948,534.948108,1504254.0
X18_mean,44391.0,24602.09,2582362.0,0.0,0.005404,0.005,0.041,0.102,0.162,0.311,0.717,3.595,11.48,17.099,28.651,199.684,199.684349,272049400.0
X50_mean,44391.0,12.778,1313.408,0.0,0.065051,0.065,0.486,0.774,0.923,1.173,1.479,1.929,2.495,3.037,4.445,39.876,39.876007,159759.9
X26_mean,44391.0,3397.419,229890.0,0.0,0.00098,0.001,0.011,0.047,0.118,0.563,2.518,14.75,77.34,217.411,1394.459,9845.906,9845.905856,31065550.0
X3112_mean,44391.0,496199.996,102327900.0,0.0,1.10883,1.109,11.128,42.572,89.776,255.051,727.857,2152.198,4850.017,8169.295,24365.387,453450.026,453450.0259,21559110000.0


## 2.3 Outliers

In [21]:
def select_outliers(df: pd.DataFrame, list_target: list[str], n_outliers: int) -> pd.DataFrame:
    """Select rows with outliers

    Args:
        df (pd.DataFrame): Input DataFrame
        list_target (list[str]): List of the target columns
        n_outliers (int): Number of top outliers to select
    Returns:
        (pd.DataFrame): Dataframe of top n_outliers outliers
    """
    return df.sort_values(list_target, ascending=False).head(n_outliers)

In [22]:
df_out = select_outliers(df_train, cols_target, n_outliers=4)
# plot_example(df_train, nrows=2, ncols=2)

In [23]:
# Mask to exclude values outside of 0.1% - 99.9% range
def get_mask(df: pd.DataFrame, cols_target: list[str], v_min: pd.DataFrame, v_max: pd.DataFrame):
    """
    """
    mask = np.empty(shape=df[cols_target].shape, dtype=bool)
    # Fill mask based on minimum/maximum values of sample submission
    for idx, (t, v_min, v_max) in enumerate(zip(cols_target, v_min, v_max)):
        labels = df[t].values
        mask[:,idx] = ((labels > v_min) & (labels < v_max))
    return mask.min(axis=1)

In [24]:
# Masks
mask_train = get_mask(df_train, cols_target, V_MIN, V_MAX)
# mask_val = get_mask(df_val)
# Masked DataFrames
train_mask = df_train[mask_train].reset_index(drop=True)
# val_mask = df_val[mask_val].reset_index(drop=True)

## 2.4 Normalize labels

In [25]:
feat_log = [
    'X11_mean', 'X18_mean', 'X50_mean', 'X26_mean', 'X3112_mean'
]

# Fill labels using normalization tool
def fill_y(y, df, normalize=False):
    for target_idx, target in enumerate(cols_target):
        v = df[target]
        if normalize:
            # Log10 Transform
            if target in feat_log:
                v = np.log10(v)
            # Shift To Have Zero Median
            Y_SHIFT[target_idx] = np.mean(v)
            v = v - np.median(v)
            # Uniform Variance
            Y_STD[target_idx] = np.std(v)
            v = v / np.std(v)
        # Assign to y_train
        y[:,target_idx] = v

# Feature Scaler
Y_SHIFT = np.zeros(len(cols_target))
Y_STD = np.zeros(len(cols_target))
# Masked Labels
y_train_mask_raw = np.zeros_like(df_train[cols_target], dtype=np.float32)
y_train_mask = np.zeros_like(df_train[cols_target], dtype=np.float32)
# y_val_mask = np.zeros_like(df_val[CONFIG.TARGET_COLUMNS], dtype=np.float32)
# Fill Target Arrays
fill_y(y_train_mask_raw, df_train, normalize=False)
fill_y(y_train_mask, df_train, normalize=True)
# fill_y(y_val_mask, df_val, normalize=True)
# Values
display(pd.DataFrame({
    'y_shift': Y_SHIFT,
    'y_std': Y_STD
}, index=cols_target))

Unnamed: 0,y_shift,y_std
X4_mean,0.522745,0.176857
X11_mean,1.146663,0.273735
X18_mean,0.003926,0.720351
X50_mean,0.176696,0.213118
X26_mean,0.45302,1.103293
X3112_mean,2.846611,0.717447


In [26]:
def _plot_hist(ax: plt.axes, df: pd.DataFrame, name: str):
    """
    """
    ax.hist(df[target].values, bins=128)
    ax.set_title(
        f'{target} {name} min: {df[target].min():.3f}, max: {df[target].max():.2e}, µ: {df[target].mean():.2e}, σ: {df[target].std():.2f}',
        size=10
    )

def plot_label_distribution(
    df_raw: pd.DataFrame, df_mask: pd.DataFrame, df_norm: pd.DataFrame, cols_target: list[str]
) -> None:
    """
    """
    # Options
    n_targets = len(cols_target)
    # Create figure
    fig, ax = plt.subplots(nrows=n_targets, ncols=3, figsize=(20, 4*n_targets))
    for i, target in enumerate(cols_target):
        # Raw
        _plot_hist(ax[i, 0], df_raw, 'Raw')
        # Mask
        _plot_hist(ax[i, 1], df_mask, 'Mask')
        # Norm
        _plot_hist(ax[i, 2], df_norm, 'Norm')
    plt.subplots_adjust(hspace=0.25, wspace=0.30)
    plt.show()