In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import KBinsDiscretizer, MinMaxScaler
from mlxtend.frequent_patterns import apriori, association_rules
from scipy.fftpack import fft
import tensorflow as tf
import warnings
import gc

# Seed NumPy's and TensorFlow's global random generators with the same fixed
# value so repeated runs of this cell draw the same random numbers.
np.random.seed(42)
tf.random.set_seed(42)
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# ============================== 1. Load + Preprocess ==============================
def load_and_preprocess_data(filepath):
    """Load a CSV time series and engineer features around the target column.

    Steps, in order:
      1. Read the CSV and strip double quotes / surrounding whitespace from
         the column names.
      2. Parse ``TIME`` as datetimes, drop rows whose timestamp cannot be
         parsed, and use ``TIME`` as the index.
      3. Coerce every remaining column to numeric and drop columns that are
         entirely NaN afterwards.
      4. Linearly interpolate the target ``fenologia_h1`` and subtract 1
         from it.
      5. Add lag (1-3), rolling mean/std (3, 6), calendar, cyclic week,
         EMA (3, 6), rolling-correlation and FFT features.
      6. Fill the remaining NaNs with each column's median.

    NOTE(review): the rolling, EMA and rolling-correlation features are
    computed on windows that include the current row's target value, so
    they carry information about the target itself. Confirm this is
    intended before using them as predictors.

    Args:
        filepath: Path (or any object accepted by ``pandas.read_csv``) of
            the CSV file. It must contain ``TIME`` and ``fenologia_h1``.

    Returns:
        tuple: ``(df, target_col)`` where ``df`` is the feature-engineered
        DataFrame indexed by ``TIME`` and ``target_col`` is the target
        column name.

    Raises:
        KeyError: If ``TIME`` or the target column is missing.
    """
    df = pd.read_csv(filepath)
    df.columns = df.columns.str.replace('"', '').str.strip()

    if 'TIME' not in df.columns:
        raise KeyError("The column 'TIME' does not exist in the dataset.")

    df['TIME'] = pd.to_datetime(df['TIME'], errors='coerce')
    df.dropna(subset=['TIME'], inplace=True)
    df.set_index('TIME', inplace=True)

    df = df.apply(pd.to_numeric, errors='coerce')
    df.dropna(axis=1, how='all', inplace=True)

    target_col = 'fenologia_h1'
    if target_col not in df.columns:
        raise KeyError(f"The target column '{target_col}' does not exist.")
    # Assign the interpolated series back instead of calling
    # ``interpolate(inplace=True)`` on ``df[target_col]``: with pandas
    # Copy-on-Write the chained in-place call only updates a temporary
    # object and the frame would keep its gaps.
    df[target_col] = df[target_col].interpolate(method='linear')
    df[target_col] = df[target_col] - 1

    for i in range(1, 4):
        df[f'{target_col}_lag{i}'] = df[target_col].shift(i)
    for window in [3, 6]:
        rolling = df[target_col].rolling(window)
        df[f'{target_col}_roll_mean_{window}'] = rolling.mean()
        df[f'{target_col}_roll_std_{window}'] = rolling.std()

    iso_calendar = df.index.isocalendar()
    df['month'] = df.index.month
    df['weekofyear'] = iso_calendar.week.astype(int)
    df['year'] = iso_calendar.year.astype(int)
    # Week number mapped onto a circle with a period of 52.
    week_angle = 2 * np.pi * df['weekofyear'] / 52
    df['sin_week'] = np.sin(week_angle)
    df['cos_week'] = np.cos(week_angle)
    df['EMA_3'] = df[target_col].ewm(span=3, adjust=False).mean()
    df['EMA_6'] = df[target_col].ewm(span=6, adjust=False).mean()
    df['correlation_target_month'] = df[target_col].rolling(6).corr(df['month'])
    df['correlation_target_week'] = df[target_col].rolling(6).corr(df['weekofyear'])

    # FFT of the non-NaN target values. The coefficients are stored row by
    # row and padded with NaN at the end when NaNs were dropped, so the
    # columns always have one entry per row of ``df``.
    fft_vals = fft(df[target_col].dropna().values)
    fft_real = np.real(fft_vals)[:len(df)]
    fft_imag = np.imag(fft_vals)[:len(df)]
    df['fft_real'] = np.pad(fft_real, (0, max(0, len(df) - len(fft_real))), constant_values=np.nan)
    df['fft_imag'] = np.pad(fft_imag, (0, max(0, len(df) - len(fft_imag))), constant_values=np.nan)

    # Warm-up NaNs from lags / rolling windows (and any other gaps) are
    # replaced with the per-column median.
    df.fillna(df.median(numeric_only=True), inplace=True)
    return df, target_col

# ============================== 2. Scaling ==============================
def feature_scaling(df, target_col):
    """Split the frame into min-max scaled features and the target.

    Args:
        df: DataFrame holding the feature columns and ``target_col``.
        target_col: Name of the column to separate out as the target.

    Returns:
        tuple: ``(X_scaled, y, scaler)`` where ``X_scaled`` is the output of
        ``MinMaxScaler.fit_transform`` on every column except the target,
        ``y`` is the target column with its index reset to a fresh
        ``RangeIndex``, and ``scaler`` is the fitted ``MinMaxScaler``.
    """
    features = df.drop(columns=[target_col])
    scaler = MinMaxScaler()
    scaled_features = scaler.fit_transform(features)
    # Drop the datetime index so the target lines up positionally with the
    # rows of the scaled array.
    target = df[target_col].reset_index(drop=True)
    return scaled_features, target, scaler

# ============================== 3. Symbolic Rule-Based Feature Importance ==============================
def rule_based_feature_importance(X, y, n_bins=3, min_support=0.1, min_conf=0.6, top_k=15, max_features=50, max_len=None):
    """Rank discretised feature bins by the lift of rules that predict the target.

    Each feature is quantile-binned, every row is turned into a set of
    symbolic items (``feature_<i>_bin<b>`` plus ``target_class_<c>``),
    frequent itemsets are mined with Apriori, and association rules whose
    consequent contains a target class are kept. The importance of a feature
    bin is the sum of the lift of every such rule in which the bin appears
    as an antecedent. The ``top_k`` bins are shown in a bar plot.

    Args:
        X: 2-D array-like of numeric features, shape (n_samples, n_features).
            Only the first ``max_features`` columns are used.
        y: 1-D array-like of target values, one per row of ``X``. Values are
            truncated to integers to form the class labels.
        n_bins: Number of quantile bins per feature.
        min_support: Minimum support passed to ``apriori``.
        min_conf: Minimum confidence passed to ``association_rules``.
        top_k: Number of feature bins shown in the plot.
        max_features: Maximum number of leading feature columns considered.
        max_len: Optional upper bound on the itemset length mined by
            ``apriori``. ``None`` (the default) leaves the length unbounded,
            as before. A small value keeps mining tractable when there are
            many features.

    Returns:
        pandas.DataFrame: Columns ``Feature_Bin`` and ``Importance``, sorted
        by ``Importance`` in descending order. Empty when no rule meets the
        thresholds.

    Raises:
        ValueError: If ``X`` and ``y`` have a different number of rows.
    """
    # Work on a plain array: features are addressed positionally, and a
    # DataFrame input would otherwise be reindexed to all-NaN columns when
    # the generic ``feature_<i>`` names are applied.
    X = np.asarray(X)
    if X.shape[1] > max_features:
        X = X[:, :max_features]

    feature_names = [f'feature_{i}' for i in range(X.shape[1])]
    features = pd.DataFrame(X, columns=feature_names)
    # Discard any index carried by ``y`` so it aligns row-by-row with ``X``.
    target = pd.Series(np.asarray(y), name='target')
    if len(target) != len(features):
        raise ValueError(
            f"X and y must have the same number of rows, got {len(features)} and {len(target)}."
        )

    # Discretize features
    kb = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='quantile')
    binned = pd.DataFrame(kb.fit_transform(features), columns=feature_names).astype(int)

    # Convert to symbolic format
    items = pd.DataFrame({col: f'{col}_bin' + binned[col].astype(str) for col in feature_names})
    items['target'] = 'target_class_' + target.astype(int).astype(str)

    # One-hot encode using memory-efficient sparse format. The item strings
    # already carry the column name, so no extra prefix is added (the
    # default would yield labels such as ``feature_0_feature_0_bin1``).
    ohe = pd.get_dummies(items, prefix='', prefix_sep='', sparse=True)

    # Mine frequent patterns
    frequent_itemsets = apriori(
        ohe, min_support=min_support, use_colnames=True, max_len=max_len, low_memory=True
    )

    importance_scores = {}
    # Without frequent itemsets there are no rules to derive, so rule mining
    # is skipped entirely.
    if not frequent_itemsets.empty:
        rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_conf)

        # Filter rules leading to class outcomes
        predicts_target = rules['consequents'].apply(
            lambda consequent: any('target_class' in item for item in consequent)
        )
        rules = rules[predicts_target]

        # Calculate feature importance via cumulative lift
        for antecedents, lift in zip(rules['antecedents'], rules['lift']):
            for antecedent in antecedents:
                if 'feature_' in antecedent:
                    importance_scores[antecedent] = importance_scores.get(antecedent, 0) + lift

    importance_df = pd.DataFrame(list(importance_scores.items()), columns=['Feature_Bin', 'Importance'])
    importance_df.sort_values('Importance', ascending=False, inplace=True)

    if importance_df.empty:
        print("No association rules met the thresholds; nothing to plot.")
    else:
        # Plot symbolic feature influence
        plt.figure(figsize=(12, 6))
        sns.barplot(data=importance_df.head(top_k), y='Feature_Bin', x='Importance', palette='magma')
        plt.title('Symbolic Rule-Based Feature Importance')
        plt.xlabel('Cumulative Lift Score')
        plt.ylabel('Feature Bin')
        plt.tight_layout()
        plt.show()

    gc.collect()
    return importance_df

# ============================== 4. Main ==============================
def main(train_path=None):
    """Run the pipeline: load data, scale features, mine rule-based importance.

    Args:
        train_path: Path of the training CSV. When ``None`` (the default)
            the original hard-coded location of ``Cordoba_train.csv`` is
            used, so calling ``main()`` without arguments behaves as before.
    """
    if train_path is None:
        train_path = r"E:\Abroad period research\Phenology datasets\PHENOLOGY_H1\Cordoba_train.csv"

    print("Loading and preprocessing data...")
    df_train, target_col = load_and_preprocess_data(train_path)
    # The fitted scaler is not needed for the attribution step.
    X_train, y_train, _ = feature_scaling(df_train, target_col)

    print("\nRunning Rule-Based Feature Attribution...")
    results = rule_based_feature_importance(X_train, y_train)

    print("\nTop symbolic feature patterns:")
    print(results.head(10))

# Run the pipeline only when this file is executed as the main module.
if __name__ == "__main__":
    main()


Loading and preprocessing data...

Running Rule-Based Feature Attribution...


Preprocessing and feature engineering 

Temporal Convolutional Network (TCN)

Novel Explainability Using Deep SHAP for Temporal Models