In [1]:
# =============================
# IMPORTS
# =============================
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Lasso
import os
import warnings
# Suppress every warning category for the rest of the process.
# NOTE(review): this also hides deprecation/future warnings emitted by the
# libraries used below, so API changes will not be announced at runtime.
warnings.filterwarnings('ignore')

# Seed NumPy's global random number generator.
# NOTE(review): nothing in the visible code draws random numbers — confirm
# whether the seed is still needed.
np.random.seed(42)

# =============================
# 1. Load + Preprocess Data
# =============================
def load_and_preprocess_data(filepath, target_col='fenologia_h2'):
    """Load a CSV time series and engineer features derived from the target.

    The CSV must contain a ``TIME`` column. Rows whose ``TIME`` cannot be
    parsed are dropped, every remaining column is coerced to numeric, and
    columns that end up entirely NaN are removed. Gaps in the target are
    linearly interpolated, then lag, rolling, calendar, EMA, rolling
    correlation and FFT columns are appended. Remaining NaNs are replaced
    with the per-column median.

    Args:
        filepath: Path of the CSV file (anything ``pandas.read_csv`` accepts).
        target_col: Name of the column the features are derived from.

    Returns:
        tuple: ``(df, target_col)`` where ``df`` holds ``TIME`` as a regular
        column (first position) followed by the original numeric columns and
        the engineered features.

    Raises:
        ValueError: If ``target_col`` is not present after cleaning (missing
            from the file, or dropped because it had no numeric values).
    """
    df = pd.read_csv(filepath)
    # Header names may carry stray double quotes / surrounding whitespace.
    df.columns = df.columns.str.replace('"', '').str.strip()
    df['TIME'] = pd.to_datetime(df['TIME'], errors='coerce')
    df.dropna(subset=['TIME'], inplace=True)
    df.set_index('TIME', inplace=True)
    df = df.apply(pd.to_numeric, errors='coerce')
    df.dropna(axis=1, how='all', inplace=True)

    if target_col not in df.columns:
        raise ValueError(f"Target column '{target_col}' not found in data.")

    # Assign the result back instead of calling interpolate(inplace=True) on
    # the column selection: the chained in-place form operates on a temporary
    # object under pandas Copy-on-Write and would leave ``df`` unchanged.
    df[target_col] = df[target_col].interpolate(method='linear')

    for i in range(1, 4):
        df[f'{target_col}_lag{i}'] = df[target_col].shift(i)

    # NOTE(review): the rolling and EMA windows include the current row's
    # target value — confirm this is intended for the downstream use.
    for window in [3, 6]:
        rolling = df[target_col].rolling(window)
        df[f'{target_col}_roll_mean_{window}'] = rolling.mean()
        df[f'{target_col}_roll_std_{window}'] = rolling.std()

    # isocalendar() yields nullable UInt32 columns; store plain int64 values
    # (the index has no NaT at this point) so the derived sin/cos columns are
    # ordinary float64.
    iso = df.index.isocalendar()
    df['month'] = df.index.month
    df['weekofyear'] = iso['week'].to_numpy(dtype='int64')
    df['year'] = iso['year'].to_numpy(dtype='int64')
    df['sin_week'] = np.sin(2 * np.pi * df['weekofyear'] / 52)
    df['cos_week'] = np.cos(2 * np.pi * df['weekofyear'] / 52)
    df['EMA_3'] = df[target_col].ewm(span=3, adjust=False).mean()
    df['EMA_6'] = df[target_col].ewm(span=6, adjust=False).mean()
    df['correlation_target_month'] = df[target_col].rolling(window=6).corr(df['month'])
    df['correlation_target_week'] = df[target_col].rolling(window=6).corr(df['weekofyear'])

    # FFT of the non-NaN target values. Coefficient k is written to row k and
    # the tail is NaN-padded when leading NaNs survived the interpolation.
    # NOTE(review): rows therefore hold frequency bins, not time-aligned values.
    spectrum = np.fft.fft(df[target_col].dropna().to_numpy())
    padding = np.full(len(df) - len(spectrum), np.nan)
    df['fft_real'] = np.concatenate([spectrum.real, padding])
    df['fft_imag'] = np.concatenate([spectrum.imag, padding])

    # Lags / rolling windows leave NaNs at the start of the series.
    df.fillna(df.median(), inplace=True)

    df.reset_index(inplace=True)  # restore TIME as column
    return df, target_col

# =============================
# 2. Feature Scaling
# =============================
def feature_scaling(df, target_col, scaler=None):
    """Split ``df`` into time, features and target, and scale the features.

    Args:
        df: DataFrame containing a ``TIME`` column, the target column and the
            feature columns.
        target_col: Name of the target column; it is returned unscaled.
        scaler: Optional already-fitted scaler exposing ``transform``. When
            given it is applied as-is, so held-out data can be scaled with
            the parameters learned on the training data. When ``None`` (the
            default) a new ``MinMaxScaler`` is fitted on ``df``'s features.

    Returns:
        tuple: ``(time_column, X_scaled_df, y, scaler)`` — the ``TIME`` column
        and the target as single-column DataFrames, the scaled features as a
        DataFrame with the original column names and index, and the scaler
        that was used.
    """
    time_column = df[['TIME']]
    y = df[[target_col]]  # keep as DataFrame for later concatenation
    X = df.drop(columns=['TIME', target_col])

    if scaler is None:
        scaler = MinMaxScaler()
        X_scaled = scaler.fit_transform(X)
    else:
        X_scaled = scaler.transform(X)

    # Keep X's index so the scaled frame stays aligned with time_column and y.
    X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

    return time_column, X_scaled_df, y, scaler

# =============================
# 3. Lasso Feature Selection
# =============================
def lasso_feature_selection(X, y, alpha=0.01, threshold=0.001):
    """Return the columns of ``X`` whose Lasso coefficient is non-negligible.

    Args:
        X: Feature DataFrame.
        y: Target; must expose ``.values`` (it is flattened to 1-D).
        alpha: Regularisation strength passed to ``Lasso``.
        threshold: A feature is kept when the absolute value of its fitted
            coefficient is strictly greater than this.

    Returns:
        The subset of ``X.columns`` that passed the threshold.
    """
    model = Lasso(alpha=alpha)
    model.fit(X, y.values.ravel())

    coef_magnitude = np.abs(model.coef_)
    return X.columns[coef_magnitude > threshold]

# =============================
# 4. Save Datasets With TIME + Target
# =============================
def save_selected_datasets(time_train, X_train, y_train, time_test, X_test, y_test, selected_features, target_col, save_dir):
    """Write the train and test splits, limited to the selected features, as CSV.

    Each output file contains the time column(s), the selected feature
    columns and the target column(s), in that order, without an index column.

    Args:
        time_train, time_test: DataFrames holding the time column.
        X_train, X_test: Feature DataFrames; only ``selected_features`` is kept.
        y_train, y_test: DataFrames holding the target.
        selected_features: Column labels to keep from the feature frames.
        target_col: Not used by this function.
        save_dir: Output directory; created if it does not exist.
    """
    def _assemble(time_part, feature_part, target_part):
        # Drop each piece's index so the three parts line up row by row.
        pieces = (time_part, feature_part[selected_features], target_part)
        return pd.concat([piece.reset_index(drop=True) for piece in pieces], axis=1)

    os.makedirs(save_dir, exist_ok=True)

    train_frame = _assemble(time_train, X_train, y_train)
    test_frame = _assemble(time_test, X_test, y_test)

    train_path = os.path.join(save_dir, "train_lasso_selected.csv")
    test_path = os.path.join(save_dir, "test_lasso_selected.csv")

    train_frame.to_csv(train_path, index=False)
    test_frame.to_csv(test_path, index=False)

    print(f"\n✅ Saved training dataset to: {train_path}")
    print(f"✅ Saved testing dataset to: {test_path}")

# =============================
# 5. Main
# =============================
def main(
    train_fp=r"E:\Abroad period research\Phenology datasets\PHENOLOGY_H2_train.csv",
    test_fp=r"E:\Abroad period research\Phenology datasets\PHENOLOGY_H2_test.csv",
    output_dir=r"E:\Abroad period research\Phenology datasets\Lasso_Selected_Features",
):
    """Run the pipeline: preprocess, scale, select features with Lasso, save.

    Args:
        train_fp: Path of the training CSV.
        test_fp: Path of the test CSV.
        output_dir: Directory that receives the two output CSV files.

    Raises:
        KeyError: If a feature selected on the training data does not exist
            in the test data.
    """
    # Load and preprocess
    df_train, target_col = load_and_preprocess_data(train_fp)
    df_test, _ = load_and_preprocess_data(test_fp, target_col=target_col)

    # Fit the scaler on the training features only (preserve TIME and target).
    time_train, X_train, y_train, scaler = feature_scaling(df_train, target_col)

    # Scale the test features with the scaler fitted on train. Fitting a
    # second scaler on the test split would map both splits to [0, 1]
    # independently and put their features on different scales.
    time_test = df_test[['TIME']]
    y_test = df_test[[target_col]]
    test_features = df_test.drop(columns=['TIME', target_col])
    X_test = pd.DataFrame(
        # Align to the training column order the scaler was fitted with.
        scaler.transform(test_features.reindex(columns=X_train.columns)),
        columns=X_train.columns,
        index=test_features.index,
    )

    # Feature selection
    selected_features = lasso_feature_selection(X_train, y_train)

    # reindex() above fills absent columns with NaN; fail loudly rather than
    # writing an all-NaN column for a selected feature.
    absent = selected_features.difference(test_features.columns)
    if len(absent) > 0:
        raise KeyError(f"Selected features missing from test data: {absent.tolist()}")

    print(f"\n🔍 Selected {len(selected_features)} features using Lasso:")
    print(selected_features.tolist())

    # Save with TIME + target
    save_selected_datasets(time_train, X_train, y_train, time_test, X_test, y_test, selected_features, target_col, output_dir)

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()



🔍 Selected 7 features using Lasso:
['fenologia_h2_lag2', 'fenologia_h2_lag3', 'fenologia_h2_roll_std_3', 'month', 'weekofyear', 'sin_week', 'EMA_3']

✅ Saved training dataset to: E:\Abroad period research\Phenology datasets\Lasso_Selected_Features\train_lasso_selected.csv
✅ Saved testing dataset to: E:\Abroad period research\Phenology datasets\Lasso_Selected_Features\test_lasso_selected.csv
