In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from collections import Counter

# File paths
input_path = "E:/Abroad period research/Phenology datasets/PHENOLOGY_H1/"
file_names = ["Jaen.csv", "Cadiz.csv", "Cordoba.csv", "Sevilla.csv"]

# Function to perform temporal sliding window resampling
def window_based_resampling(df, target_col='fenologia_h1', window_size=5, jitter_strength=0.1, desired_class_count=None):
    """
    Generate new synthetic samples using window-based resampling with jittering.
    """
    
    # Extract features (X) and target (y)
    X = df.drop(columns=[target_col, 'SPOT_ID', 'TIME'])
    y = df[target_col]
    time_col = df['TIME']  # Store the time column

    # Identify all class labels
    class_labels = y.unique()
    new_samples = []

    # Determine the maximum number of samples for each class if balancing
    if desired_class_count is None:
        desired_class_count = {class_label: len(df[df[target_col] == class_label]) for class_label in class_labels}

    # Resample for all classes, even if they are not minority classes
    for class_label in class_labels:
        class_samples = df[df[target_col] == class_label]
        class_count = desired_class_count[class_label]

        # Sliding window resampling for each class
        for index, sample in class_samples.iterrows():
            for start in range(len(sample) - window_size + 1):
                window = X.iloc[start:start+window_size].values  # Convert to NumPy array for flattening

                # Apply jittering or interpolation to create new sample
                jittered_window = window + np.random.normal(0, jitter_strength, window.shape)

                # Store the new sample along with the corresponding target and time
                new_sample = jittered_window.flatten().tolist()  # Flatten window and add to new sample list
                new_sample.append(sample[target_col])  # Add the target column
                new_sample.append(sample['TIME'])  # Keep the time intact
                
                new_samples.append(new_sample)

        # If the number of samples for a class is less than the desired, duplicate the samples
        current_class_count = len(class_samples) + len(new_samples)
        if current_class_count < desired_class_count[class_label]:
            additional_samples_needed = desired_class_count[class_label] - current_class_count
            for _ in range(additional_samples_needed):
                new_samples.append(new_samples[-1])  # Repeat last sample for balancing

    # Ensure that the number of columns in new_samples matches the original columns
    num_features = X.shape[1]
    column_names = X.columns.tolist() * window_size + [target_col, 'TIME']

    # Create a new DataFrame with the synthetic samples
    new_data = pd.DataFrame(new_samples, columns=column_names)

    # Ensure the DataFrame is correct
    return new_data

# Function to balance and save the datasets
def balance_and_save_dataset(file_name):
    # Load the dataset
    df = pd.read_csv(f"{input_path}{file_name}")

    # Display the class distribution before resampling
    target_col = 'fenologia_h1'
    print(f"Class distribution in {file_name} before resampling:")
    print(Counter(df[target_col]))
    print("\n")

    # Perform window-based resampling
    balanced_df = window_based_resampling(df, target_col=target_col)

    # Display the class distribution after resampling
    print(f"Class distribution in {file_name} after resampling:")
    print(Counter(balanced_df[target_col]))
    print("\n")

    # Save the balanced dataset to a new CSV file
    balanced_file_name = file_name.replace(".csv", "_balanced.csv")
    balanced_df.to_csv(f"{input_path}{balanced_file_name}", index=False)

    print(f"Balanced dataset saved as: {balanced_file_name}")
    print("="*50)

# Loop through the files and balance them
for file_name in file_names:
    balance_and_save_dataset(file_name)


Class distribution in Jaen.csv before resampling:
Counter({3: 262, 2: 103, 4: 35, 1: 31})


Class distribution in Jaen.csv after resampling:
Counter({3: 2358, 2: 927, 4: 315, 1: 279})


Balanced dataset saved as: Jaen_balanced.csv
Class distribution in Cadiz.csv before resampling:
Counter({3: 255, 2: 140, 4: 130, 1: 30})


Class distribution in Cadiz.csv after resampling:
Counter({3: 2295, 2: 1260, 4: 1170, 1: 270})


Balanced dataset saved as: Cadiz_balanced.csv
Class distribution in Cordoba.csv before resampling:
Counter({3: 268, 2: 107, 4: 82, 1: 27})


Class distribution in Cordoba.csv after resampling:
Counter({3: 2412, 2: 963, 4: 738, 1: 243})


Balanced dataset saved as: Cordoba_balanced.csv
Class distribution in Sevilla.csv before resampling:
Counter({3: 290, 2: 152, 4: 100, 1: 31})


Class distribution in Sevilla.csv after resampling:
Counter({3: 2610, 2: 1368, 4: 900, 1: 279})


Balanced dataset saved as: Sevilla_balanced.csv


In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from collections import Counter
import random

# File paths
input_path = "E:/Abroad period research/Phenology datasets/PHENOLOGY_H1/"
file_names = ["Jaen.csv", "Cadiz.csv", "Cordoba.csv", "Sevilla.csv"]

# Function to perform time-series augmentation
def time_series_augmentation(df, target_col='fenologia_h1', window_size=5, jitter_strength=0.1, time_warp_strength=0.05):
    """
    Generate new synthetic samples using time series augmentation techniques like jittering and time warping.
    """
    # Extract features (X) and target (y)
    X = df.drop(columns=[target_col, 'SPOT_ID', 'TIME'])
    y = df[target_col]
    time_col = df['TIME']  # Store the time column

    minority_class = [class_label for class_label in y.unique() if list(y).count(class_label) < len(y)/4]  # Identify minority class labels
    new_samples = []

    # Identify samples of the minority class
    minority_class_samples = df[df[target_col].isin(minority_class)]

    # Time-series augmentation for minority classes
    for index, sample in minority_class_samples.iterrows():
        for start in range(len(sample) - window_size + 1):
            window = X.iloc[start:start+window_size].values  # Extract the window of data
            
            # Apply jittering (adding noise)
            jittered_window = window + np.random.normal(0, jitter_strength, window.shape)

            # Apply time warping (shifting values randomly)
            time_warped_window = window + np.random.uniform(-time_warp_strength, time_warp_strength, window.shape)

            # Interpolate values between time points
            interpolated_window = []
            for i in range(window_size - 1):
                # Interpolating between adjacent time steps for each feature
                interp_vals = np.linspace(window[i], window[i + 1], num=3)[1:-1]  # Create interpolated values
                interpolated_window.extend(interp_vals)
            interpolated_window = np.array(interpolated_window)

            # Store the augmented samples along with the corresponding target and time
            new_sample_jitter = jittered_window.flatten().tolist()
            new_sample_jitter.append(sample[target_col])  # Add the target column
            new_sample_jitter.append(sample['TIME'])  # Keep the time intact

            new_sample_warp = time_warped_window.flatten().tolist()
            new_sample_warp.append(sample[target_col])  # Add the target column
            new_sample_warp.append(sample['TIME'])  # Keep the time intact

            new_sample_interp = interpolated_window.tolist()
            new_sample_interp.append(sample[target_col])  # Add the target column
            new_sample_interp.append(sample['TIME'])  # Keep the time intact

            # Append augmented samples to the new_samples list
            new_samples.append(new_sample_jitter)
            new_samples.append(new_sample_warp)
            new_samples.append(new_sample_interp)

    # Ensure that the number of columns in new_samples matches the original columns
    num_features = X.shape[1]
    column_names = X.columns.tolist() * window_size + [target_col, 'TIME']

    # Create a new DataFrame with the synthetic samples
    new_data = pd.DataFrame(new_samples, columns=column_names)

    # Ensure the DataFrame is correct
    return new_data

# Function to balance and save the datasets
def balance_and_save_dataset(file_name):
    # Load the dataset
    df = pd.read_csv(f"{input_path}{file_name}")

    # Display the class distribution before augmentation
    target_col = 'fenologia_h1'
    print(f"Class distribution in {file_name} before augmentation:")
    print(Counter(df[target_col]))
    print("\n")

    # Perform time-series augmentation
    balanced_df = time_series_augmentation(df, target_col=target_col)

    # Display the class distribution after augmentation
    print(f"Class distribution in {file_name} after augmentation:")
    print(Counter(balanced_df[target_col]))
    print("\n")

    # Save the augmented dataset to a new CSV file
    balanced_file_name = file_name.replace(".csv", "_augmented.csv")
    balanced_df.to_csv(f"{input_path}{balanced_file_name}", index=False)

    print(f"Augmented dataset saved as: {balanced_file_name}")
    print("="*50)

# Loop through the files and augment them
for file_name in file_names:
    balance_and_save_dataset(file_name)


Class distribution in Jaen.csv before augmentation:
Counter({3: 262, 2: 103, 4: 35, 1: 31})


Class distribution in Jaen.csv after augmentation:
Counter({2.0: 1854, 4.0: 630, 1.0: 558, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1,