# Main Article

In [None]:
from utils import load_and_preprocess_table_data

config = "no_resample_cloud_disturbance_weights_3Y"
data = load_and_preprocess_table_data(config)

# Input columns, grouped by origin: four columns (amplitude, cos/sin of the
# phase, offset) for each of red, green, blue and crswir, then the terrain
# columns. The groups are flattened in order into a single list.
feature_groups = [
    ['amplitude_red', 'cos_phase_red', 'sin_phase_red', 'offset_red'],
    ['amplitude_green', 'cos_phase_green', 'sin_phase_green', 'offset_green'],
    ['amplitude_blue', 'cos_phase_blue', 'sin_phase_blue', 'offset_blue'],
    ['amplitude_crswir', 'cos_phase_crswir', 'sin_phase_crswir', 'offset_crswir'],
    ['elevation', 'sin_aspect', 'cos_aspect'],
]
features = [name for group in feature_groups for name in group]

# Column holding the class label.
target = 'phen'



## Distribution of features and deciduous/evergreen proportions

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
import numpy as np
from tqdm import tqdm 
import warnings
from matplotlib.lines import Line2D
from matplotlib.patches import Patch

warnings.filterwarnings("ignore")

# Function to compute KDE and color the area under the curve
def plot_kde_with_colored_area(data, feature, ax, show_legend=False):
    """Draw the density of ``feature`` with the area shaded by class share.

    A Gaussian KDE is fitted separately to the rows with ``phen == 1``
    (deciduous) and ``phen == 2`` (evergreen). The black curve is the sum of
    the two KDEs, rescaled so that its values sum to one over the evaluation
    grid. Each grid interval under the curve is filled with an RGBA colour
    whose red channel is the deciduous share and whose green channel is the
    evergreen share at that point.

    NOTE(review): each KDE is a normalised density, so the shares compare
    densities and do not account for the number of samples per class.

    Args:
        data: DataFrame containing a ``'phen'`` column and ``feature``.
        feature: Name of the column to plot.
        ax: Matplotlib axes to draw on.
        show_legend: If true, add a legend entry for the curve.
    """
    # Data for deciduous and evergreen
    data_deciduous = data[data['phen'] == 1][feature]
    data_evergreen = data[data['phen'] == 2][feature]

    # Calculate KDE
    kde_deciduous = gaussian_kde(data_deciduous)
    kde_evergreen = gaussian_kde(data_evergreen)

    # Evaluation range: amplitude/offset features are clipped to the
    # 1st-99th percentile, every other feature uses its full range.
    if 'amplitude' in feature or 'offset' in feature:
        lower_bound = np.percentile(data[feature], 1)
        upper_bound = np.percentile(data[feature], 99)
    else:
        lower_bound = data[feature].min()
        upper_bound = data[feature].max()

    x_vals = np.linspace(lower_bound, upper_bound, 1000)
    kde_vals_deciduous = kde_deciduous(x_vals)
    kde_vals_evergreen = kde_evergreen(x_vals)

    # Class shares at every grid point. Where both densities are zero the
    # share is left at 0 instead of 0/0 = NaN, which would otherwise end up
    # in the RGBA colour below.
    total_kde_vals = kde_vals_deciduous + kde_vals_evergreen
    has_density = total_kde_vals > 0
    prop_deciduous = np.divide(
        kde_vals_deciduous, total_kde_vals,
        out=np.zeros_like(total_kde_vals), where=has_density,
    )
    prop_evergreen = np.divide(
        kde_vals_evergreen, total_kde_vals,
        out=np.zeros_like(total_kde_vals), where=has_density,
    )

    # Rescale the summed density so that the plotted values sum to one
    total_kde_vals_percent = total_kde_vals / total_kde_vals.sum()

    # Plot KDE
    sns.lineplot(x=x_vals, y=total_kde_vals_percent, ax=ax, color='black', linewidth=2)

    if show_legend:
        ax.legend(['KDE'], loc='upper right')

    # Fill one grid interval at a time so each can have its own colour:
    # red = deciduous share, green = evergreen share, alpha fixed at 0.3.
    for i in range(len(x_vals) - 1):
        ax.fill_between(x_vals[i:i+2], 0, total_kde_vals_percent[i:i+2], color=(prop_deciduous[i], prop_evergreen[i], 0, 0.3))

    feature_name = feature.replace('_', ' ').capitalize()
    ax.set_xlabel(feature_name)
    ax.set_ylabel('Density')
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    # Restrict the x-axis to the range the KDE was evaluated on
    ax.set_xlim(lower_bound, upper_bound)

# Plot distributions for overall data using KDE with colored areas
fig = plt.figure(figsize=(12, 12))
n_features = len(features)

# One panel per feature on a 5 x 4 grid; subplot numbers are 1-based, so the
# counter doubles as the panel number.
idx = 0
for idx, feature in enumerate(tqdm(features), start=1):
    ax = plt.subplot(5, 4, idx)
    plot_kde_with_colored_area(data, feature, ax, show_legend=False)

# The panel after the last feature only hosts the shared legend: a black line
# for the KDE curve, a red patch for deciduous and a green patch for evergreen.
ax = plt.subplot(5, 4, idx + 1)
handles = [
    Line2D([0], [0], color='black', linewidth=2, label='KDE'),
    Patch(color=(1, 0, 0, 0.3), label='Deciduous'),
    Patch(color=(0, 1, 0, 0.3), label='Evergreen'),
]
ax.legend(handles=handles, loc='upper right')
ax.set_axis_off()

plt.tight_layout()
plt.show()
# NOTE(review): assumes the images/ directory already exists.
fig.savefig('images/kde_colored_area.png', dpi=300, bbox_inches='tight')


## Information Gain -> Mutual Information 

Information Gain is a metric used to measure the effectiveness of a feature in terms of its contribution to predicting the target variable. It is commonly used in decision trees and other machine learning algorithms to assess the relevance of different features.

The concept of Information Gain originates from information theory, where it is defined as the reduction in entropy, or uncertainty, of a target variable given the knowledge of a feature. In simpler terms, it quantifies how much knowing the value of a feature improves our ability to predict the target variable.

### Entropy refresher 

Entropy, denoted as  $H(T)$, is a measure of randomness or disorder in a set of data. It quantifies the impurity or unpredictability of the target variable. Entropy is highest when the probability distribution of the classes is uniform, indicating maximum uncertainty. Conversely, entropy is zero when the dataset is perfectly pure (i.e., all instances belong to a single class).

The entropy of a target variable $T$ with $c$ possible classes is calculated as:

$$ H(T) = -\sum_{i=1}^{c} p_i \log_2(p_i) $$ 

where:
- $p_i$ is the proportion of instances in class $i$.

### Information Gain definition 

Information Gain measures the reduction in entropy after the dataset is split based on a feature. The higher the Information Gain, the more the feature contributes to reducing uncertainty about the target variable.

Mathematically, Information Gain $IG(T, X)$ is computed as the difference between the entropy of the target variable before and after observing the feature:

$$IG(T, X) = H(T) - H(T | X)$$

where:
- $H(T)$ is the entropy of the target variable $T$.
- $H(T | X)$ is the conditional entropy of $T$ given the feature $X$.

The conditional entropy $H(T | X)$ is calculated as:

$$H(T | X) = \sum_{v \in \text{Values}(X)} \frac{|S_v|}{|S|} H(S_v)$$

where:
- $S$ is the full set of samples.
- $\text{Values}(X)$ represents the unique values of the feature $X$.
- $S_v$ is the subset of $S$ for which feature $X$ has value $v$, and $H(S_v)$ is the entropy of the target variable within that subset.
- $|S|$ and $|S_v|$ are the sizes of the sets $S$ and $S_v$, respectively.

By calculating the Information Gain for each feature, we can identify which features are the most informative for predicting the target variable. Features with higher Information Gain are considered more important as they contribute more to reducing the uncertainty about the target variable.


In [None]:
from sklearn.feature_selection import mutual_info_classif
import pandas as pd
import matplotlib.pyplot as plt

# Feature matrix and class labels.
X, y = data[features], data['phen']

# Mutual information between each feature (treated as continuous) and the
# class label; the fixed random_state makes the estimate repeatable.
mi = mutual_info_classif(X, y, discrete_features=False, random_state=42)

# Table of scores, most informative feature first.
mi_df = (
    pd.DataFrame({'Feature': features, 'Information Gain': mi})
    .sort_values(by='Information Gain', ascending=False)
)

# Horizontal bar chart; the y-axis is inverted so the top-ranked feature is
# drawn at the top.
fig, ax = plt.subplots(figsize=(8, 8))
bars = ax.barh(mi_df['Feature'], mi_df['Information Gain'], color='skyblue')
ax.set(
    xlabel='Information Gain',
    ylabel='Features',
    title='Information Gain of Features for Tree Phenology Classification',
)
ax.invert_yaxis()

# Dashed vertical guide lines at the major x ticks.
ax.xaxis.grid(True, which='major', linestyle='--', linewidth=0.5)

# Hide the top and right frame lines.
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

# Color-coding the y-tick labels
def color_code_ticks(ax, labels):
    """Colour each tick label according to the band keyword found in its text.

    The keywords are tried in the order red, green, blue, crswir and only the
    first match is applied; labels matching none keep their current colour.

    Args:
        ax: Not used by this function.
        labels: Iterable of objects exposing ``get_text()`` and
            ``set_color()`` (e.g. Matplotlib tick labels).
    """
    keyword_colors = (
        ('red', 'red'),
        ('green', 'green'),
        ('blue', 'blue'),
        ('crswir', 'darkred'),
    )
    for label in labels:
        label_text = label.get_text()
        for keyword, color in keyword_colors:
            if keyword in label_text:
                label.set_color(color)
                break

# Colour the feature names on the y-axis by the band keyword they contain.
color_code_ticks(ax, ax.get_yticklabels())

# Finalise the layout, write the figure to disk, then display it.
# NOTE(review): assumes the images/ directory already exists.
plt.tight_layout()
plt.savefig('images/information_gain.png', dpi=300)
plt.show()


In [None]:
# Render the ranked mutual-information table as LaTeX source and print it.
# index=True keeps the DataFrame's row index as a column; index_names=False
# omits the index name row.
latex_table = mi_df.to_latex(index=True, index_names=False)
print(latex_table)


## Fisher's Score

Fisher's Score, also known as Fisher Discriminant Ratio, is a method used in feature selection to evaluate the importance of individual features in distinguishing between different classes. It is based on the concept of maximizing the separation between different classes while minimizing the variation within each class.

The Fisher Score for a feature is calculated by taking the ratio of the variance between classes to the variance within classes. Features with higher Fisher Scores are considered more relevant for classification tasks as they provide better discrimination between classes.

Mathematically, the Fisher Score $F_i$ for a feature $i$ is defined as:

$$F_i = \frac{\sum_{c=1}^{C} n_c (\mu_{i,c} - \mu_i)^2}{\sum_{c=1}^{C} n_c \sigma_{i,c}^2}$$

where:
- $C$ is the number of classes.
- $n_c$ is the number of samples in class $c$.
- $\mu_{i,c}$ is the mean of the feature $i$ in class $c$.
- $\mu_i$ is the mean of the feature $i$ across all classes.
- $\sigma_{i,c}$ is the standard deviation of the feature $i$ in class $c$.

By computing the Fisher Scores for each feature, we can rank the features based on their ability to discriminate between classes. This helps in selecting the most informative features for building classification models.

In the context of our tree phenology classification task, we will use Fisher's Score to evaluate the importance of various spectral and topographical features in distinguishing between deciduous and evergreen trees. This analysis will guide us in selecting the most relevant features, thereby enhancing the performance of our classification models.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

# Calculate Fisher Score for each feature
def fisher_score(X: pd.DataFrame, y: pd.Series) -> np.ndarray:
    """Calculate Fisher Score for each feature in X with respect to target y.

    For every column the score is the class-size-weighted squared distance
    between the class means and the overall mean, divided by the
    class-size-weighted within-class variance (population variance, ddof=0).

    Args:
        X: Feature table, one column per feature.
        y: Class label of every row of ``X``.

    Returns:
        Array with one score per column of ``X``, in column order.
    """
    classes = np.unique(y)

    # The row selection of a class depends only on y, so build the masks once
    # rather than filtering the whole frame for every feature/class pair.
    class_masks = [y == cls for cls in classes]

    fisher_scores = np.zeros(X.shape[1])

    for idx, feature in enumerate(X.columns):
        column = X[feature]
        mean_overall = np.mean(column)
        numerator = 0
        denominator = 0

        for mask in class_masks:
            values = column[mask]
            n_cls = values.shape[0]

            # Between-class spread and within-class spread, both weighted by
            # the number of rows in the class.
            numerator += n_cls * (np.mean(values) - mean_overall) ** 2
            denominator += n_cls * np.var(values)

        fisher_scores[idx] = numerator / denominator

    return fisher_scores

# Compute Fisher Scores
# Score every feature against the class labels.
fisher_scores = fisher_score(X, y)

# Table of scores, highest first.
fisher_df = (
    pd.DataFrame({'Feature': features, 'Fisher Score': fisher_scores})
    .sort_values(by='Fisher Score', ascending=False)
)

# Horizontal bar chart; the y-axis is inverted so the top-ranked feature is
# drawn at the top.
fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(fisher_df['Feature'], fisher_df['Fisher Score'], color='skyblue')
ax.set(
    xlabel='Fisher Score',
    ylabel='Features',
    title='Fisher Score of Features for Tree Phenology Classification',
)
ax.invert_yaxis()

# Hide the top and right frame lines.
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

plt.tight_layout()
# NOTE(review): this figure is written to the working directory, whereas the
# other figures in this notebook are saved under images/ - confirm intended.
plt.savefig('fisher_score.png', dpi=300)
plt.show()

## Comparison of Fisher's Score and Information Gain


- **High Scores in Both Methods:** Features with high scores in both Information Gain and Fisher's Score are likely to be highly relevant for the classification task.
- **Discrepancies Between Methods:** If a feature has a high score in one method but not the other, this might indicate specific characteristics of the data or the type of relationship between the feature and the target variable. For example, Information Gain might highlight non-linear relationships that Fisher's Score does not capture.
- **Feature Selection:** By combining insights from both methods, you can make more informed decisions about which features to include in your model to potentially improve its performance.


In [None]:
from sklearn.preprocessing import MinMaxScaler
# Rescale both score vectors to [0, 1]. fit_transform re-fits the scaler on
# each call, so the two vectors are normalised independently of each other.
scaler = MinMaxScaler()
mi_normalized = scaler.fit_transform(mi.reshape(-1, 1)).ravel()
fisher_normalized = scaler.fit_transform(fisher_scores.reshape(-1, 1)).ravel()

# Side-by-side table, ordered by normalised information gain.
comparison_df = pd.DataFrame({
    'Feature': features,
    'Information Gain': mi_normalized,
    'Fisher Score': fisher_normalized,
}).sort_values(by='Information Gain', ascending=False)

fig, ax = plt.subplots(figsize=(10, 12))

# Two bars per feature: the information-gain bar sits at the integer
# position, the Fisher bar one bar-height further along the axis.
bar_width = 0.4
r1 = np.arange(len(comparison_df))
r2 = r1 + bar_width

ax.barh(r1, comparison_df['Information Gain'], color='skyblue', height=bar_width, label='Information Gain')
ax.barh(r2, comparison_df['Fisher Score'], color='lightgreen', height=bar_width, label='Fisher Score')

ax.set_xlabel('Normalized Score')
ax.set_ylabel('Features')
ax.set_title('Comparison of Information Gain and Fisher Score for Tree Phenology Classification')

# Put each feature name midway between its two bars.
ax.set_yticks(r1 + bar_width / 2)
ax.set_yticklabels(comparison_df['Feature'])

# Highest information gain at the top.
ax.invert_yaxis()

# Hide the top and right frame lines.
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

ax.legend()

plt.tight_layout()
# NOTE(review): assumes the images/ directory already exists.
plt.savefig('images/comparison_information_gain_fisher_score.png', dpi=300)
plt.show()

The figure shows that the features cos_phase_crswir, offset_crswir, and cos_phase_red have high importance in both metrics. Elevation has a low Fisher Score but a high information gain. If we look at the distribution of elevation across deciduous and evergreen trees, we can see that the relationship is non-linear: deciduous trees are strongly present in the 250–750 m segment. Because Fisher's Score only compares class means relative to within-class variance, this non-linearity can explain the low Fisher's Score.

Seeing the results, we can discard the following features: sin_aspect, amplitude_green, cos_aspect, sin_phase_blue, and amplitude_blue.

## Correlation 
Correlation analysis is a statistical method used to evaluate the strength and direction of the linear relationship between two variables. In the context of feature selection for machine learning, understanding the correlation between features and the target variable, as well as among the features themselves, is crucial for several reasons:

- Correlation with Target Variable: This helps to identify which features are most strongly associated with the target variable, providing insights into their potential importance in the model.
- Correlation Among Features: High correlation among features indicates redundancy, meaning that one feature can be predicted from another. Redundant features do not provide additional information and can lead to overfitting. By identifying these, we can simplify the model by removing or combining correlated features.

Importance of correlation analysis:
- **Identify Important Features:** Features that are highly correlated with the target variable are potential candidates for inclusion in the model.
- **Detect Redundancy:** Features that are highly correlated with each other may be redundant. Including redundant features can complicate the model without providing additional predictive power.
- **Simplify the Model:** By removing or combining correlated features, we can reduce the complexity of the model, which can help prevent overfitting and improve interpretability.

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
from matplotlib.colorbar import ColorbarBase

# Compute correlation matrix
corr_matrix = data[features + [target]].corr()

# Colormap name used for the heatmap.
cmap = 'coolwarm'

fig, ax = plt.subplots(figsize=(12, 10))

# True strictly above the diagonal: those cells are hidden, leaving the lower
# triangle and the diagonal visible.
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)

# Annotated heatmap on a fixed [-1, 1] colour scale, drawn without a colorbar.
heatmap_options = dict(
    mask=mask,
    annot=True,
    fmt=".2f",
    cmap=cmap,
    cbar=False,
    square=True,
    ax=ax,
    vmin=-1,
    vmax=1,
)
sns.heatmap(corr_matrix, **heatmap_options)

# Define a new axis for the colorbar
# cbar_ax = fig.add_axes([0.25, -0.15, 0.4, 0.02])

# Create a colorbar
# norm = Normalize(vmin=-1, vmax=1)
# cbar = ColorbarBase(cbar_ax, cmap=cmap, norm=norm, orientation='horizontal')
# cbar.set_ticks([-1, 0, 1])
# cbar.set_ticklabels([-1, 0, 1])

# Function to color-code the tick labels
def color_code_ticks(ax, corr_matrix):
    """Restyle the x and y tick labels of ``ax``.

    For both axes, underscores in the label text are replaced by spaces, the
    label equal to the module-level ``target`` (with the same replacement) is
    set in bold, and each label is coloured by the first band keyword it
    contains (red, green, blue, crswir -> darkred). The x labels are then
    re-applied rotated by 45 degrees and right-aligned, the y labels
    unrotated.

    Args:
        ax: Axes whose tick labels are restyled.
        corr_matrix: Not used by this function.
    """
    keyword_colors = (
        ('red', 'red'),
        ('green', 'green'),
        ('blue', 'blue'),
        ('crswir', 'darkred'),
    )

    def restyle(tick_labels):
        # Style the label objects in place and collect the cleaned-up texts.
        cleaned = []
        for label in tick_labels:
            label_text = label.get_text().replace('_', ' ')
            cleaned.append(label_text)
            if label_text == target.replace('_', ' '):
                label.set_fontweight('bold')
            for keyword, color in keyword_colors:
                if keyword in label_text:
                    label.set_color(color)
                    break
        return cleaned

    ax.set_xticklabels(restyle(ax.get_xticklabels()), rotation=45, ha='right')
    ax.set_yticklabels(restyle(ax.get_yticklabels()), rotation=0)

# Apply color coding to the tick labels
color_code_ticks(ax, corr_matrix)

# Annotation texts whose x or y position equals the target's column/row
# index + 0.5 lie in the target's column or row; set those in bold.
target_x = list(corr_matrix.columns).index(target) + 0.5
target_y = list(corr_matrix.index).index(target) + 0.5
for text in ax.texts:
    x_pos, y_pos = text.get_position()
    if x_pos == target_x or y_pos == target_y:
        text.set_fontweight('bold')

ax.set_title('Correlation Matrix of Features and Target')

# Finalise the layout, write the figure to disk, then display it.
# NOTE(review): assumes the images/ directory already exists.
plt.tight_layout()
plt.savefig('images/correlation_matrix.png', dpi=300)
plt.show()


We can first compare the features for the RGB channels and the CRSWIR. Offsets of the RGB channels are strongly correlated. We can observe the same pattern for amplitude and phase, though with slightly smaller correlations. The CRSWIR features have a small correlation with the RGB ones. Aspect and elevation have little correlation with other features. According to these results, it seems critical to keep the CRSWIR features while retaining only one of the RGB bands for the features (amplitude, offset, and phase).

The cos_phase_red, cos_phase_crswir, and offset_crswir are the features with the highest (negative) correlation with tree phenology.

# Supplementary materials

## Most deviated region per feature

In [None]:
# Function to calculate deviation score for each feature in each region
def calculate_deviation_scores(data, features):
    """Score how far each region's feature distribution is from the overall one.

    For every feature, the values belonging to each distinct ``greco_region``
    are compared with the values of the whole table using the two-sample
    Kolmogorov-Smirnov statistic returned by ``scipy.stats.ks_2samp``.

    Args:
        data: DataFrame with a ``'greco_region'`` column and the feature
            columns.
        features: Names of the columns to score.

    Returns:
        Float DataFrame indexed by region with one column per feature,
        holding the KS statistic of that region for that feature.
    """
    # Local import: ks_2samp is not imported by any visible notebook cell.
    from scipy.stats import ks_2samp

    regions = data['greco_region'].unique()
    # Numeric from the start so the caller does not have to cast before
    # calling idxmax().
    deviation_scores = pd.DataFrame(index=regions, columns=features, dtype=float)

    # The row selection of a region does not depend on the feature, so the
    # masks are built once.
    region_masks = [(region, data['greco_region'] == region) for region in regions]

    for feature in tqdm(features):
        overall_distribution = data[feature]

        for region, mask in region_masks:
            region_data = overall_distribution[mask]
            ks_stat, _ = ks_2samp(overall_distribution, region_data)
            deviation_scores.loc[region, feature] = ks_stat

    return deviation_scores

# Calculate deviation scores
# KS statistic for every (region, feature) pair, cast to float so that idxmax
# operates on numeric columns.
deviation_scores = calculate_deviation_scores(data, features).astype(float)

# For each feature (column), the region (row label) with the largest statistic.
most_deviated_region_per_feature = deviation_scores.idxmax()

print(f'Most deviated region per feature:\n{most_deviated_region_per_feature}')

## Variance Inflation Factor (VIF)

### What is VIF?

Variance Inflation Factor (VIF) is a measure of the amount of multicollinearity in a set of multiple regression variables. It quantifies how much the variance (the square of the standard error) of an estimated regression coefficient is increased because of collinearity. High multicollinearity increases the variance of the coefficient estimates and makes the estimates very sensitive to changes in the model, which can result in less reliable statistical inferences.

### How is VIF Computed?

1. **For each feature $i$ in the dataset, regress that feature on all the other features.** 
   
   For example, if you have features $X_1, X_2, X_3$, you would perform the following regressions:
   - $X_1 \sim X_2 + X_3$
   - $X_2 \sim X_1 + X_3$
   - $X_3 \sim X_1 + X_2$
   
2. **Compute the R-squared value ($R^2_i$) from this regression.**

   The R-squared value represents the proportion of the variance in the dependent variable that is predictable from the independent variables. In this context, it represents how well feature $i$ can be predicted by the other features.

3. **Calculate the VIF for each feature using the formula:**

   $$ \text{VIF}_i = \frac{1}{1 - R^2_i} $$

   Where:
   - $R^2_i$ is the R-squared value from regressing the $i$th feature on all the other features.
   - VIF values quantify how much the variance of a coefficient is inflated due to multicollinearity.

### Interpretation of VIF Values

- **VIF = 1**: No correlation between the $i$th feature and the other features. The feature is not collinear.
- **1 < VIF < 5**: Moderate correlation that may be acceptable depending on the context.
- **VIF ≥ 5**: High correlation and potentially problematic multicollinearity. In some contexts, a threshold of 10 is used instead of 5.

### Using VIF

1. **Identify multicollinearity**: Calculate VIF for all features to identify those with high VIF values, indicating high multicollinearity.
2. **Address multicollinearity**: Consider removing or combining highly collinear features to improve the model's stability and interpretability.


In [None]:
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
from utils import load_and_preprocess_table_data

def calculate_vif(data: pd.DataFrame, features: list[str], add_intercept: bool = True) -> pd.DataFrame:
    """
    Calculate the Variance Inflation Factor (VIF) for each feature.

    statsmodels' ``variance_inflation_factor`` regresses one column of the
    design matrix on the remaining columns exactly as given and does not add
    an intercept itself. Without a constant column the auxiliary regressions
    go through the origin, and the resulting values do not correspond to
    ``1 / (1 - R^2)`` with the usual centered R^2 for features whose mean is
    not zero. A column of ones is therefore prepended by default; it is only
    part of the regressions and is not reported in the result.

    Args:
        data (pd.DataFrame): The dataset.
        features (list[str]): List of feature names to calculate VIF for.
        add_intercept (bool): Prepend a constant column to the design matrix
            (default). Pass False to reproduce the intercept-free values.

    Returns:
        pd.DataFrame: DataFrame containing features and their corresponding VIF values.
    """
    X = data[features]
    exog = X.values
    first_feature_col = 0
    if add_intercept:
        exog = np.column_stack([np.ones(exog.shape[0]), exog])
        # Feature columns are shifted by one behind the constant column.
        first_feature_col = 1

    vif_values = [
        variance_inflation_factor(exog, first_feature_col + i)
        for i in range(X.shape[1])
    ]
    return pd.DataFrame({'Feature': features, 'VIF': vif_values})


config = "no_resample_cloud_disturbance_weights_3Y"
data = load_and_preprocess_table_data(config)

# Subset of red/green/blue columns and the full set of CRSWIR columns whose
# collinearity is examined.
rgb_features = ['amplitude_red', 'cos_phase_red', 'offset_green', 'offset_red', 'cos_phase_blue']
crswir_features = ['amplitude_crswir', 'cos_phase_crswir', 'sin_phase_crswir', 'offset_crswir']
relevant_features = [*rgb_features, *crswir_features]

# One VIF value per selected feature.
vif_df = calculate_vif(data, relevant_features)
print(vif_df)




In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def plot_correlation_matrix(data: pd.DataFrame, features: list[str]):
    """
    Show an annotated correlation heatmap for the given features.

    Args:
        data (pd.DataFrame): The dataset.
        features (list[str]): Columns to correlate with each other.

    Returns:
        pd.DataFrame: The correlation matrix that was plotted.
    """
    correlation_matrix = data[features].corr()

    # Fixed [-1, 1] colour scale; every cell is annotated with two decimals.
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(
        correlation_matrix,
        annot=True,
        cmap='coolwarm',
        fmt='.2f',
        vmin=-1,
        vmax=1,
        ax=ax,
    )
    ax.set_title('Correlation Matrix of Features')
    plt.show()
    return correlation_matrix

# Plot the correlation matrix and keep the returned table for the look-ups below
correlation_matrix = plot_correlation_matrix(data, relevant_features)

# List, in descending order, how `offset_green` and `offset_crswir` correlate
# with every relevant feature
print("\nCorrelations with offset_green:")
print(correlation_matrix['offset_green'].sort_values(ascending=False))

print("\nCorrelations with offset_crswir:")
print(correlation_matrix['offset_crswir'].sort_values(ascending=False))


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def plot_pairplot(data: pd.DataFrame, features: list[str]):
    """
    Show a seaborn pair plot of the selected columns.

    Args:
        data (pd.DataFrame): The dataset.
        features (list[str]): Columns to include in the pair plot.
    """
    selected = data[features]
    sns.pairplot(selected)
    # y slightly above 1 places the title above the grid of panels.
    plt.suptitle('Pair Plot of Relevant Features', y=1.02)
    plt.show()

# Pair plot of the features currently listed in relevant_features.
plot_pairplot(data, relevant_features)

In [None]:
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
import seaborn as sns
import matplotlib.pyplot as plt
from utils import load_and_preprocess_table_data

def calculate_vif(data: pd.DataFrame, features: list[str], add_intercept: bool = True) -> pd.DataFrame:
    """
    Calculate the Variance Inflation Factor (VIF) for each feature.

    statsmodels' ``variance_inflation_factor`` regresses one column of the
    design matrix on the remaining columns exactly as given and does not add
    an intercept itself. Without a constant column the auxiliary regressions
    go through the origin, and the resulting values do not correspond to
    ``1 / (1 - R^2)`` with the usual centered R^2 for features whose mean is
    not zero. A column of ones is therefore prepended by default; it is only
    part of the regressions and is not reported in the result.

    Args:
        data (pd.DataFrame): The dataset.
        features (list[str]): List of feature names to calculate VIF for.
        add_intercept (bool): Prepend a constant column to the design matrix
            (default). Pass False to reproduce the intercept-free values.

    Returns:
        pd.DataFrame: DataFrame containing features and their corresponding VIF values.
    """
    X = data[features]
    exog = X.values
    first_feature_col = 0
    if add_intercept:
        exog = np.column_stack([np.ones(exog.shape[0]), exog])
        # Feature columns are shifted by one behind the constant column.
        first_feature_col = 1

    vif_values = [
        variance_inflation_factor(exog, first_feature_col + i)
        for i in range(X.shape[1])
    ]
    return pd.DataFrame({'Feature': features, 'VIF': vif_values})

def plot_pairplot(data: pd.DataFrame, features: list[str]):
    """
    Show a seaborn pair plot of the selected columns.

    Args:
        data (pd.DataFrame): The dataset.
        features (list[str]): Columns to include in the pair plot.
    """
    selected = data[features]
    sns.pairplot(selected)
    # y slightly above 1 places the title above the grid of panels.
    plt.suptitle('Pair Plot of Relevant Features', y=1.02)
    plt.show()

# Load data
config = "no_resample_cloud_disturbance_weights_3Y"
data = load_and_preprocess_table_data(config)

# Features under study: the four red columns and the four CRSWIR columns.
red_features = ['amplitude_red', 'cos_phase_red', 'sin_phase_red', 'offset_red']
crswir_features = ['amplitude_crswir', 'cos_phase_crswir', 'sin_phase_crswir', 'offset_crswir']
relevant_features = [*red_features, *crswir_features]

# Refuse to continue if the inputs already contain missing or infinite values.
relevant_values = data[relevant_features]
if relevant_values.isnull().values.any():
    raise ValueError("Input data contains NaNs. Please handle them before proceeding.")
if np.isinf(relevant_values.values).any():
    raise ValueError("Input data contains Infs. Please handle them before proceeding.")

# Small constant added to the offsets before the log / sqrt / square transforms.
# NOTE(review): it does not make negative offsets valid - sqrt of a negative
# value is NaN, and such rows are removed by the dropna further down.
epsilon = 1e-6

# Candidate ways of merging `offset_red` and `offset_crswir` into one column.
offset_red = data['offset_red']
offset_crswir = data['offset_crswir']
data['offset_mean'] = data[['offset_red', 'offset_crswir']].mean(axis=1)
data['offset_interaction'] = offset_red * offset_crswir
data['offset_log'] = np.log1p(offset_red + epsilon) + np.log1p(offset_crswir + epsilon)
data['offset_sqrt'] = np.sqrt(offset_red + epsilon) + np.sqrt(offset_crswir + epsilon)
data['offset_poly'] = (offset_red + epsilon)**2 + (offset_crswir + epsilon)**2

# Report the rows for which a transform produced NaN or Inf.
combined_features = ['offset_mean', 'offset_interaction', 'offset_log', 'offset_sqrt', 'offset_poly']

for feature in combined_features:
    nan_rows = data[feature].isnull()
    if nan_rows.values.any():
        print(f"NaNs detected in {feature}.")
        print(data[['offset_red', 'offset_crswir', feature]].loc[nan_rows])
    inf_rows = np.isinf(data[feature].values)
    if inf_rows.any():
        print(f"Infs detected in {feature}.")
        print(data[['offset_red', 'offset_crswir', feature]].loc[inf_rows])

# Keep only rows where every combined feature is defined.
data = data.dropna(subset=combined_features)

# Each candidate set replaces the two raw offsets with one combined column.
# NOTE(review): `offset_poly` is computed and checked above but has no
# feature set / VIF run here - confirm whether that is intended.
base_features = [f for f in relevant_features if f not in ['offset_red', 'offset_crswir']]
new_features_mean = base_features + ['offset_mean']
new_features_interaction = base_features + ['offset_interaction']
new_features_log = base_features + ['offset_log']
new_features_sqrt = base_features + ['offset_sqrt']

# VIF of every candidate set.
vif_mean = calculate_vif(data, new_features_mean)
vif_interaction = calculate_vif(data, new_features_interaction)
vif_log = calculate_vif(data, new_features_log)
vif_sqrt = calculate_vif(data, new_features_sqrt)

print("VIF for Mean Combined Feature:\n", vif_mean)
print("VIF for Interaction Combined Feature:\n", vif_interaction)
print("VIF for Logarithmic Combined Feature:\n", vif_log)
print("VIF for Square Root Combined Feature:\n", vif_sqrt)

# Pair plot of one candidate set (here the one using the mean of the offsets).
plot_pairplot(data, new_features_mean)

In [None]:
# Display the VIF table of the feature set using `offset_mean`.
vif_mean

In [None]:
# Display the VIF table of the feature set using `offset_interaction`.
vif_interaction

In [None]:
# Display the VIF table of the feature set using `offset_log`.
vif_log

In [None]:
# Display the VIF table of the feature set using `offset_sqrt`.
vif_sqrt

## Difference between extraction methods and time series length

This section compares features extracted from Sentinel-2 time series pixels using various methods over different time spans. By loading and concatenating training and validation datasets, it calculates the mean and standard deviation of pixel-wise differences for amplitude, phase, and offset features. These metrics are stored in a DataFrame and visualized as heatmaps, providing a comprehensive comparison of feature extraction methods across regions. The results help assess the variability and consistency of different extraction methods.

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Function to compute differences between features
def compute_differences(df1: pd.DataFrame, df2: pd.DataFrame, feature: str):
    """Return the mean and standard deviation of ``df1[feature] - df2[feature]``.

    The two columns are subtracted as pandas Series, i.e. matched on their
    index labels; labels present in only one frame yield NaN, which the mean
    and standard deviation skip.

    Args:
        df1: Table providing the minuend column.
        df2: Table providing the subtrahend column.
        feature: Name of the column present in both tables.

    Returns:
        Tuple ``(mean_diff, std_diff)`` of the element-wise difference.
    """
    delta = df1[feature].sub(df2[feature])
    return delta.mean(), delta.std()

# Function to plot heatmap
def plot_heatmap(data, title, xlabel, ylabel, output_file):
    """Draw ``data`` as an annotated heatmap and save it to ``output_file``.

    Args:
        data: 2-D table passed straight to ``sns.heatmap``.
        title: Figure title.
        xlabel: Label for the horizontal axis.
        ylabel: Label for the vertical axis.
        output_file: Destination path handed to ``savefig``.

    The figure is closed after saving, so nothing is left open or displayed.
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(data, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    fig.savefig(output_file)
    plt.close(fig)

# Define the configurations and file paths
configs = ["no_resample_cloud_disturbance_weights", "no_resample_cloud_weights", "no_resample_no_weights", "resampled_no_weights"]
years = [1, 2, 3]
indices = ["red", "green", "blue", "crswir"]
# Deliberately NOT named `features`: the notebook-level `features` list (full
# column names) is still indexed into `data` by later cells (`X = data[features]`),
# and overwriting it with these four prefixes would break them.
feature_kinds = ['amplitude', 'cos_phase', 'sin_phase', 'offset']
all_data = {}

# Load the preprocessed table for every (method, time-series length) combination
for config in configs:
    for year in years:
        all_data[f'{config}_Y{year}'] = load_and_preprocess_table_data(f'{config}_{year}Y')

# Get unique regions (taken from the first configuration's 1-year table)
regions = all_data[f'{configs[0]}_Y{1}']['greco_region'].unique()

# Rows are collected in a plain list and turned into a DataFrame once at the end:
# `DataFrame.append` no longer exists in pandas >= 2.0 and copied the whole frame
# on every call.
metric_columns = ['method', 'year', 'feature', 'index', 'region', 'mean_diff', 'std_diff']
metric_rows = []

# Compute differences
for region in regions:
    # Slice every table by region once instead of once per feature/index/pair
    region_tables = {
        key: table[table['greco_region'] == region]
        for key, table in all_data.items()
    }

    for index in indices:
        for feature in feature_kinds:
            column = f'{feature}_{index}'

            # Compare the same method across different years
            # (self-comparisons such as 1_vs_1 are kept, as in the original output)
            for config in configs:
                for year1 in years:
                    for year2 in years:
                        mean_diff, std_diff = compute_differences(
                            region_tables[f'{config}_Y{year1}'],
                            region_tables[f'{config}_Y{year2}'],
                            column,
                        )
                        metric_rows.append({
                            'method': config,
                            'year': f'{year1}_vs_{year2}',
                            'feature': feature,
                            'index': index,
                            'region': region,
                            'mean_diff': mean_diff,
                            'std_diff': std_diff
                        })

            # Compare the same year across different methods
            for year in years:
                for config1 in configs:
                    for config2 in configs:
                        if config1 == config2:
                            continue
                        mean_diff, std_diff = compute_differences(
                            region_tables[f'{config1}_Y{year}'],
                            region_tables[f'{config2}_Y{year}'],
                            column,
                        )
                        metric_rows.append({
                            'method': f'{config1}_vs_{config2}',
                            'year': year,
                            'feature': feature,
                            'index': index,
                            'region': region,
                            'mean_diff': mean_diff,
                            'std_diff': std_diff
                        })

# DataFrame to store all metrics
metrics_df = pd.DataFrame(metric_rows, columns=metric_columns)

# Save the metrics DataFrame to a CSV file for further analysis
metrics_df.to_csv('metrics_comparison.csv', index=False)

# Rows whose `method` is a single configuration come from the year-vs-year loop;
# rows whose `method` is "<a>_vs_<b>" come from the method-vs-method loop. The two
# kinds must be pivoted separately, otherwise each heatmap mixes both comparisons
# and is mostly NaN.
is_year_comparison = metrics_df['method'].isin(configs)

# Aggregate and plot heatmaps
for index in indices:
    for feature in feature_kinds:
        for region in regions:
            # Select the rows for this specific feature, index, and region
            selected = ((metrics_df['feature'] == feature) &
                        (metrics_df['index'] == index) &
                        (metrics_df['region'] == region))
            year_rows = metrics_df[selected & is_year_comparison]
            method_rows = metrics_df[selected & ~is_year_comparison]

            # Pivot to get the matrices for the heatmaps (keyword arguments are
            # required by pandas >= 2.0). Rows end up on the y-axis, columns on the x-axis.
            mean_diff_matrix_years = year_rows.pivot(index='year', columns='method', values='mean_diff')
            std_diff_matrix_years = year_rows.pivot(index='year', columns='method', values='std_diff')

            mean_diff_matrix_methods = method_rows.pivot(index='method', columns='year', values='mean_diff')
            std_diff_matrix_methods = method_rows.pivot(index='method', columns='year', values='std_diff')

            # Plot the heatmaps for year comparison (x: method, y: year pair)
            plot_heatmap(mean_diff_matrix_years, f"Mean Difference ({feature} - {index} - {region} - Years)", "Method", "Year", f"mean_diff_{feature}_{index}_{region}_years.png")
            plot_heatmap(std_diff_matrix_years, f"Std Difference ({feature} - {index} - {region} - Years)", "Method", "Year", f"std_diff_{feature}_{index}_{region}_years.png")

            # Plot the heatmaps for method comparison (x: year, y: method pair)
            plot_heatmap(mean_diff_matrix_methods, f"Mean Difference ({feature} - {index} - {region} - Methods)", "Year", "Method", f"mean_diff_{feature}_{index}_{region}_methods.png")
            plot_heatmap(std_diff_matrix_methods, f"Std Difference ({feature} - {index} - {region} - Methods)", "Year", "Method", f"std_diff_{feature}_{index}_{region}_methods.png")

print("Processing complete.")


## Variance Threshold 

Variance Threshold is a simple feature selection method that removes features with low variance. A feature whose variance falls below a chosen threshold barely varies across samples and is therefore unlikely to carry information that helps differentiate between classes. Removing such low-variance features simplifies the model, can reduce overfitting, and may improve model performance. Note that variance depends on the scale of each feature, so a single threshold is only directly comparable across features that share similar units or have been normalized.

In this analysis, we will compute the variance of each feature and identify those that have a variance below a specified threshold. We will then visualize these variances to understand which features may be candidates for removal.

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Feature matrix restricted to the model features
X = data[features]

# Per-feature variance (pandas default estimator), sorted largest first for plotting
variances = X.var()
variance_df = pd.DataFrame({'Feature': features, 'Variance': variances}).sort_values(
    by='Variance', ascending=False
)

# Horizontal bar chart of the variances, largest at the top
fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(variance_df['Feature'], variance_df['Variance'], color='skyblue')
ax.set(
    xlabel='Variance',
    ylabel='Features',
    title='Variance of Features for Tree Phenology Classification',
)
ax.invert_yaxis()

# Hide the top and right frame lines
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

# Write the figure to disk, then display it
fig.tight_layout()
fig.savefig('images/variance_threshold.png', dpi=300)
plt.show()

# Keep only the features whose variance exceeds the threshold.
# NOTE(review): VarianceThreshold computes its own variances internally; they may
# differ slightly from the pandas values plotted above — confirm if that matters.
threshold = 0.5
selector = VarianceThreshold(threshold=threshold)
X_high_variance = selector.fit_transform(X)

# Boolean mask of retained columns, mapped back to their names
support_mask = selector.get_support()
selected_features = X.columns[support_mask]

print("Selected features with variance above threshold:")
print(selected_features)

In [None]:
# Display the per-feature variances stored on the fitted VarianceThreshold selector
selector.variances_

## Dispersion ratio

The Dispersion Ratio is a feature selection criterion that measures how spread out the values of each feature are relative to their average level. Here it is computed as the ratio between the variance of a feature and its mean (the variance-to-mean ratio). A high dispersion ratio indicates that the feature values are spread out over a wide range, suggesting that the feature may be informative. Conversely, a low dispersion ratio suggests that the feature values are tightly clustered, potentially indicating redundancy or lack of informativeness. Because the ratio divides by the mean, it is most meaningful for features whose mean is clearly positive.

By analyzing the dispersion ratio of each feature, we can identify features that have a wide range of values and are likely to provide useful information for the model.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Feature matrix restricted to the model features
X = data[features]

# Per-feature mean and variance
means = X.mean()
variances = X.var()

# Variance-to-mean ratio of every feature.
# NOTE(review): features whose mean is close to zero or negative (possibly the
# sin/cos columns) would give very large or negative ratios — confirm this is intended.
dispersion_ratio = variances / means

# Table for plotting, largest ratio first
dispersion_df = pd.DataFrame(
    {'Feature': features, 'Dispersion Ratio': dispersion_ratio}
).sort_values(by='Dispersion Ratio', ascending=False)

# Horizontal bar chart of the ratios, largest at the top
fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(dispersion_df['Feature'], dispersion_df['Dispersion Ratio'], color='skyblue')
ax.set(
    xlabel='Dispersion Ratio',
    ylabel='Features',
    title='Dispersion Ratio of Features for Tree Phenology Classification',
)
ax.invert_yaxis()

# Hide the top and right frame lines
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

# Write the figure to disk, then display it
fig.tight_layout()
fig.savefig('images/dispersion_ratio.png', dpi=300)
plt.show()


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the training table from disk
train_data = pd.read_csv('train_data.csv')

# (CRSWIR column, red column) pairs to compare
feature_pairs = [
    ('amplitude_crswir', 'amplitude_red'),
    ('phase_crswir', 'phase_red'),
    ('offset_crswir', 'offset_red')
]

# Correlation matrix over all CRSWIR columns followed by all red columns
crswir_columns = [crswir_name for crswir_name, red_name in feature_pairs]
red_columns = [red_name for crswir_name, red_name in feature_pairs]
correlation_data = train_data[crswir_columns + red_columns]
correlation_matrix = correlation_data.corr()

# Annotated heatmap of the correlation matrix on a fixed [-1, 1] colour scale
corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=corr_ax)
corr_ax.set_title('Correlation Matrix between CRSWIR and Red Channel Features')
plt.show()

# One scatter panel per pair, titled with the pairwise correlation coefficient
scatter_fig = plt.figure(figsize=(12, 8))
for position, (crswir_feature, red_feature) in enumerate(feature_pairs, start=1):
    panel = scatter_fig.add_subplot(2, 3, position)
    sns.scatterplot(x=train_data[crswir_feature], y=train_data[red_feature], ax=panel)
    r = train_data[crswir_feature].corr(train_data[red_feature])
    panel.set_title(f'{crswir_feature} vs {red_feature}\nCorrelation: {r:.2f}')
    panel.set_xlabel(crswir_feature)
    panel.set_ylabel(red_feature)

scatter_fig.tight_layout()
plt.show()


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import numpy as np

# Function to plot linear regression
def plot_linear_regression(data, x_feature, y_feature, ax):
    """Scatter ``y_feature`` against ``x_feature`` and overlay a fitted line.

    Args:
        data: Table containing both columns.
        x_feature: Name of the predictor column.
        y_feature: Name of the response column.
        ax: Matplotlib axes to draw on.

    A ``LinearRegression`` model is fitted on the single predictor, and the
    R² of its predictions on the same data is shown in the title. Axis
    labels and title use the column names with underscores replaced by
    spaces and the first letter capitalised.
    """
    X = data[x_feature].values.reshape(-1, 1)
    y = data[y_feature].values
    model = LinearRegression().fit(X, y)
    r2 = r2_score(y, model.predict(X))

    ax.scatter(X, y, alpha=0.5)

    # Draw the fit from its two end points only. Plotting the prediction for every
    # sample in data order makes the path run back and forth along the same line,
    # so the dash pattern overlaps itself and no longer looks dashed.
    x_line = np.array([X.min(), X.max()])
    y_line = model.predict(x_line.reshape(-1, 1))
    ax.plot(x_line, y_line, color='red', linewidth=2, linestyle='--')

    x_label = x_feature.replace('_', ' ').capitalize()
    y_label = y_feature.replace('_', ' ').capitalize()
    ax.set_title(f'{x_label} vs {y_label}\nR²: {r2:.2f}')
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.grid(True, linestyle='--', alpha=0.5)

# One regression panel per (CRSWIR, red) feature pair on a 2x3 grid
fig = plt.figure(figsize=(12, 6))
for position, (crswir_feature, red_feature) in enumerate(feature_pairs, start=1):
    ax = fig.add_subplot(2, 3, position)
    plot_linear_regression(train_data, crswir_feature, red_feature, ax)

# Display the figure, then write it to disk
fig.tight_layout()
plt.show()
fig.savefig('images/linear_regression.png', dpi=300, bbox_inches='tight')
