# Exploratory Data Analysis

# 1. Imports

## 1.1 Packages

In [23]:
import os

import imageio.v3 as imageio
import matplotlib.pyplot as plt
import pandas as pd


## 1.2 Options

In [24]:
# Directory (relative path) that holds the parquet data loaded by this notebook
path_data = '../data/04_feature'

In [25]:
# Names of the target columns analysed in this notebook (order is preserved
# in every per-target table built from this list)
cols_target = [
    'X4_mean',
    'X11_mean',
    'X18_mean',
    'X50_mean',
    'X26_mean',
    'X3112_mean',
]

## 1.3 Data

In [26]:
# Read the training dataframe from the parquet file stored under `path_data`
df_train = pd.read_parquet(os.path.join(path_data, 'df_train.parquet'))

# 2. Analyze

In [27]:
# Preview two random rows; the fixed seed keeps the displayed rows stable
# across Restart & Run All
df_train.sample(2, random_state=12)

Unnamed: 0,id,WORLDCLIM_BIO1_annual_mean_temperature,WORLDCLIM_BIO12_annual_precipitation,WORLDCLIM_BIO13.BIO14_delta_precipitation_of_wettest_and_dryest_month,WORLDCLIM_BIO15_precipitation_seasonality,WORLDCLIM_BIO4_temperature_seasonality,WORLDCLIM_BIO7_temperature_annual_range,SOIL_bdod_0.5cm_mean_0.01_deg,SOIL_bdod_100.200cm_mean_0.01_deg,SOIL_bdod_15.30cm_mean_0.01_deg,...,X50_mean,X3112_mean,X4_sd,X11_sd,X18_sd,X26_sd,X50_sd,X3112_sd,file_path,jpeg_bytes
7516,196427071,14.046963,394.537781,38.799999,39.42939,681.608398,30.613777,130,150,138,...,2.80646,1424.570134,0.010268,0.358832,0.283994,13.533783,0.096891,70.892999,data/03_primary/train_images//196427071.jpeg,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
13255,194951674,19.730377,595.257141,72.995239,48.885834,555.705139,30.235714,138,150,141,...,2.584134,166.165708,0.00852,0.453257,1.431458,1.088286,0.126948,47.658843,data/03_primary/train_images//194951674.jpeg,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...


## 2.1 Plot example

In [28]:
def plot_example(df: pd.DataFrame, nrows: int = 3, ncols: int = 3) -> None:
    """Plot a grid of randomly sampled images from the dataframe.

    Rows are drawn with a fixed seed (``random_state=12``), so repeated calls
    on the same dataframe show the same images. Each panel is titled with the
    image id and the decoded array shape.

    Args:
        df (pd.DataFrame): Input dataframe; must provide the encoded image in
            a ``jpeg_bytes`` column and an identifier in an ``id`` column
        nrows (int): Number of rows for the subplots
        ncols (int): Number of columns for the subplots
    """
    nimgs = nrows * ncols
    # Cap the sample size: ``sample`` raises when asked for more rows than
    # the dataframe holds (without replacement).
    df_to_plot = df.sample(n=min(nimgs, len(df)), random_state=12)

    # squeeze=False keeps ``axes`` a 2-D array even when nrows or ncols is 1;
    # with the default squeezing, 2-D indexing fails for single-row/column grids.
    fig, axes = plt.subplots(
        nrows, ncols, figsize=(ncols * 5, nrows * 5), squeeze=False
    )
    # Row-major flattening: panel i sits at (i // ncols, i % ncols).
    flat_axes = axes.ravel()

    for ax, image_id, jpeg_bytes in zip(
        flat_axes, df_to_plot['id'], df_to_plot['jpeg_bytes']
    ):
        img = imageio.imread(jpeg_bytes)
        ax.imshow(img)
        ax.set_title(f'{image_id} | shape: {img.shape}')

    # Hide panels left empty when fewer rows than grid cells were available.
    for ax in flat_axes[len(df_to_plot):]:
        ax.set_visible(False)

    plt.show()

# plot_example(df_train)

## 2.2 Analyze labels

In [29]:
# Labels meta data: dictionary keyed by target column name (trait ID with the
# '_mean' suffix appended so the keys line up with the target columns).
# NOTE(review): squeeze() only yields a flat mapping if the TSV has exactly one
# column besides trait_ID — confirm against the file.
target_name_meta = (
    pd.read_csv('../data/03_primary/target_name_meta.tsv', delimiter='\t')
    .assign(trait_ID=lambda meta: meta['trait_ID'] + '_mean')
    .set_index('trait_ID')
    .squeeze()
    .to_dict()
)

pd.Series(target_name_meta).to_frame()

Unnamed: 0,0
X4_mean,Stem specific density (SSD) or wood density (s...
X11_mean,Leaf area per leaf dry mass (specific leaf are...
X18_mean,Plant height
X26_mean,Seed dry mass
X50_mean,Leaf nitrogen (N) content per leaf area
X3112_mean,"Leaf area (in case of compound leaves: leaf, u..."


In [31]:
# Per-target lower/upper bounds taken from the 0.1% and 99.9% quantiles of
# the training data
V_MIN = df_train[cols_target].quantile(0.001)
V_MAX = df_train[cols_target].quantile(0.999)

# Percentiles of the targets to report
percentiles = [
    0.001,
    0.01,
    0.05,
    0.10,
    0.25,
    0.50,
    0.75,
    0.90,
    0.95,
    0.99,
    0.999,
]

# Describe all targets in one call (instead of concatenating one Series per
# target in a loop), then transpose so each target is a row and each
# statistic a column.
labels_describe_df = (
    df_train[cols_target]
    .describe(percentiles=percentiles)
    .round(3)
    .T
)

# Place the (unrounded) bounds next to the matching extremes: v_min right
# after 'min', v_max right before 'max'. Positions are looked up by column
# name so they stay correct if the percentile list changes.
labels_describe_df.insert(
    labels_describe_df.columns.get_loc('min') + 1, 'v_min', V_MIN
)
labels_describe_df.insert(
    labels_describe_df.columns.get_loc('max'), 'v_max', V_MAX
)

labels_describe_df

Unnamed: 0,count,mean,std,min,v_min,0.1%,1%,5%,10%,25%,50%,75%,90%,95%,99%,99.9%,v_max,max
X4_mean,44391.0,0.523,0.177,-2.431,-0.443441,-0.443,0.228,0.304,0.339,0.412,0.509,0.622,0.725,0.797,0.923,1.48,1.480369,4.475
X11_mean,44391.0,91.036,10106.92,0.0,0.516698,0.517,3.08,4.905,6.438,10.648,15.102,19.696,25.361,31.007,51.26,534.948,534.948108,1504254.0
X18_mean,44391.0,24602.09,2582362.0,0.0,0.005404,0.005,0.041,0.102,0.162,0.311,0.717,3.595,11.48,17.099,28.651,199.684,199.684349,272049400.0
X50_mean,44391.0,12.778,1313.408,0.0,0.065051,0.065,0.486,0.774,0.923,1.173,1.479,1.929,2.495,3.037,4.445,39.876,39.876007,159759.9
X26_mean,44391.0,3397.419,229890.0,0.0,0.00098,0.001,0.011,0.047,0.118,0.563,2.518,14.75,77.34,217.411,1394.459,9845.906,9845.905856,31065550.0
X3112_mean,44391.0,496199.996,102327900.0,0.0,1.10883,1.109,11.128,42.572,89.776,255.051,727.857,2152.198,4850.017,8169.295,24365.387,453450.026,453450.0259,21559110000.0
