In [2]:
import pandas as pd
import os

# Input CSV files to split. Each file is expected to contain a TIME column
# that pandas can parse as a date/datetime.
file_paths = [
    r"E:\Abroad period research\Phenology datasets\PHENOLOGY_H1.csv",
    r"E:\Abroad period research\Phenology datasets\PHENOLOGY_H2.csv",
    r"E:\Abroad period research\Phenology datasets\PHENOLOGY_H3.csv",
    r"E:\Abroad period research\Phenology datasets\PHENOLOGY_H4.csv"
]

# Calendar years assigned to each split. Rows whose year is in neither list
# (or whose TIME value cannot be parsed) are written to neither output file.
train_years = [2015, 2016, 2017]
test_years = [2018, 2019]


def split_by_year(df, train_years, test_years, time_col='TIME'):
    """Split *df* into train and test frames by the calendar year of *time_col*.

    The year is computed on a separate Series rather than stored in *df*,
    so the input frame is not modified and the returned frames keep every
    original column and value as read (the raw TIME text is preserved, and
    a pre-existing ``YEAR`` column in the data is neither overwritten nor
    dropped).

    Args:
        df: DataFrame containing *time_col*.
        train_years: Years whose rows go to the training split.
        test_years: Years whose rows go to the test split.
        time_col: Name of the date/datetime column used to derive the year.

    Returns:
        A ``(train_df, test_df, n_unassigned)`` tuple, where *n_unassigned*
        is the number of rows that fell into neither split (unparseable
        time value, or a year listed in neither *train_years* nor
        *test_years*).

    Raises:
        KeyError: If *time_col* is not a column of *df*.
    """
    if time_col not in df.columns:
        raise KeyError(f"required column {time_col!r} not found")

    # errors='coerce' maps unparseable values to NaT; their year is NaN and
    # therefore matches neither year list.
    years = pd.to_datetime(df[time_col], errors='coerce').dt.year

    train_mask = years.isin(train_years)
    test_mask = years.isin(test_years)
    n_unassigned = int((~(train_mask | test_mask)).sum())

    return df[train_mask], df[test_mask], n_unassigned


for file_path in file_paths:
    try:
        # Load dataset
        df = pd.read_csv(file_path)

        # Split based on year without altering any column of the data
        train_df, test_df, n_unassigned = split_by_year(
            df, train_years, test_years
        )

        # Outputs are written next to the input as <name>_train.csv / <name>_test.csv
        base_dir = os.path.dirname(file_path)
        base_name = os.path.splitext(os.path.basename(file_path))[0]

        train_file = os.path.join(base_dir, f"{base_name}_train.csv")
        test_file = os.path.join(base_dir, f"{base_name}_test.csv")

        # Save the splits
        train_df.to_csv(train_file, index=False)
        test_df.to_csv(test_file, index=False)

        # Output number of instances; the extra line appears only when rows
        # were left out, so the usual summary format is unchanged.
        print(f"{base_name}:")
        print(f"  Train instances: {len(train_df)}")
        summary_tail = f"  Test instances:  {len(test_df)}"
        if n_unassigned:
            summary_tail += (
                f"\n  Unassigned rows (TIME unparseable or year not in "
                f"train/test years): {n_unassigned}"
            )
        print(summary_tail + "\n")

    except Exception as e:
        # Deliberately broad: a failure on one file (missing file, bad CSV,
        # missing TIME column, unwritable output) is reported and the
        # remaining files are still processed.
        print(f"Error processing {file_path}: {e}")


PHENOLOGY_H1:
  Train instances: 1230
  Test instances:  813

PHENOLOGY_H2:
  Train instances: 1230
  Test instances:  813

PHENOLOGY_H3:
  Train instances: 1230
  Test instances:  813

PHENOLOGY_H4:
  Train instances: 1230
  Test instances:  813

