In [None]:
# ===========================================
#  IMPORTS
# ===========================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_selection import mutual_info_classif, RFE, VarianceThreshold
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

import warnings
# Silence every warning for cleaner output (this also hides deprecation notices).
warnings.filterwarnings('ignore')
# Seed the NumPy and TensorFlow global RNGs with the same fixed value.
np.random.seed(42)
tf.random.set_seed(42)

# ===========================================
#  DATA LOADING + PREPROCESSING
# ===========================================
def load_and_preprocess_data(filepath):
    """Load a phenology CSV and engineer time-series features from the target.

    The CSV must provide a ``TIME`` column (parsed to datetimes and used as
    the index) and the ``fenologia_h1`` target. All remaining columns are
    coerced to numeric and columns that are entirely NaN are dropped.

    Args:
        filepath: Path or buffer accepted by ``pandas.read_csv``.

    Returns:
        tuple: ``(df, target_col)`` -- the feature-augmented frame indexed by
        ``TIME`` (remaining NaNs replaced by column medians) and the name of
        the target column.
    """
    df = pd.read_csv(filepath)
    # Header names may carry stray double quotes / surrounding whitespace.
    df.columns = df.columns.str.replace('"', '').str.strip()
    df['TIME'] = pd.to_datetime(df['TIME'], errors='coerce')
    # Rows whose timestamp could not be parsed are discarded.
    df = df.dropna(subset=['TIME']).set_index('TIME')
    df = df.apply(pd.to_numeric, errors='coerce')
    df = df.dropna(axis=1, how='all')

    target_col = 'fenologia_h1'
    # Assign the interpolated series back explicitly: calling
    # ``df[col].interpolate(inplace=True)`` operates on a temporary under
    # pandas Copy-on-Write and would leave the gaps in ``df`` untouched.
    df[target_col] = df[target_col].interpolate(method='linear')
    # Shift labels down by one (presumably 1-based stages -> 0-based class ids).
    df[target_col] = df[target_col] - 1

    # NOTE(review): the rolling, EMA and FFT features below are derived from
    # the target series itself and their windows include the current row --
    # confirm this is not leaking the label into the features.
    for lag in range(1, 4):
        df[f'{target_col}_lag{lag}'] = df[target_col].shift(lag)

    for window in [3, 6]:
        rolling = df[target_col].rolling(window)
        df[f'{target_col}_roll_mean_{window}'] = rolling.mean()
        df[f'{target_col}_roll_std_{window}'] = rolling.std()

    # ``isocalendar()`` yields nullable UInt32 columns; convert to plain
    # int64 so the derived features are ordinary NumPy-backed columns.
    iso = df.index.isocalendar()
    df['month'] = df.index.month
    df['weekofyear'] = iso['week'].to_numpy(dtype='int64')
    df['year'] = iso['year'].to_numpy(dtype='int64')
    df['sin_week'] = np.sin(2 * np.pi * df['weekofyear'] / 52)
    df['cos_week'] = np.cos(2 * np.pi * df['weekofyear'] / 52)

    df['EMA_3'] = df[target_col].ewm(span=3, adjust=False).mean()
    df['EMA_6'] = df[target_col].ewm(span=6, adjust=False).mean()

    df['correlation_target_month'] = df[target_col].rolling(window=6).corr(df['month'])
    df['correlation_target_week'] = df[target_col].rolling(window=6).corr(df['weekofyear'])

    # FFT of the non-missing target values, NaN-padded back to the frame length.
    spectrum = np.fft.fft(df[target_col].dropna().to_numpy())
    padding = np.full(len(df) - len(spectrum), np.nan)
    df['fft_real'] = np.concatenate([spectrum.real, padding])
    df['fft_imag'] = np.concatenate([spectrum.imag, padding])

    # Whatever is still missing (lag/rolling warm-up rows, padding) gets the column median.
    df = df.fillna(df.median())
    return df, target_col

# ===========================================
#  SCALING
# ===========================================
def feature_scaling(df, target_col, scaler=None):
    """Min-max scale the feature columns and split off the target.

    Args:
        df: Frame holding the features plus ``target_col``.
        target_col: Name of the label column; it is returned unscaled.
        scaler: Optional already-fitted scaler. When supplied it is only used
            to ``transform`` the features, so a test set can be scaled with
            the statistics learned on the training set. When ``None`` a new
            ``MinMaxScaler`` is fitted on ``df`` (the original behaviour).

    Returns:
        tuple: ``(X_scaled, y, scaler)`` -- the scaled features as a
        DataFrame (same columns and index as the input features), the target
        series, and the scaler that was used.
    """
    X = df.drop(columns=[target_col])
    y = df[target_col]
    if scaler is None:
        scaler = MinMaxScaler()
        X_scaled = scaler.fit_transform(X)
    else:
        X_scaled = scaler.transform(X)
    # Keep the original index so X and y stay aligned by label.
    return pd.DataFrame(X_scaled, columns=X.columns, index=X.index), y, scaler

# ===========================================
#  MODEL
# ===========================================
def build_mlp_model(input_dim, num_classes=4, num_units=64, dropout_rate=0.2, activation='relu'):
    """Assemble and compile a small fully connected softmax classifier.

    Architecture: Dense(num_units) -> Dropout -> Dense(num_units // 2) ->
    Dense(num_classes, softmax), compiled with Adam and sparse categorical
    cross-entropy, tracking accuracy.
    """
    layer_stack = [
        Dense(num_units, activation=activation, input_dim=input_dim),
        Dropout(dropout_rate),
        Dense(num_units // 2, activation=activation),
        Dense(num_classes, activation='softmax'),
    ]
    network = Sequential(layer_stack)
    network.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

# ===========================================
#  FEATURE SELECTION
# ===========================================
def mutual_information_feature_selection(X, y):
    """Return a boolean mask of features whose mutual information with ``y`` exceeds 0.01."""
    mi_per_feature = mutual_info_classif(X, y)
    keep_mask = mi_per_feature > 0.01
    return keep_mask

def pearson_correlation_feature_selection(X, y, threshold=0.1):
    """Select features by absolute Pearson correlation with the target.

    Args:
        X: DataFrame of candidate features.
        y: Target values, one per row of ``X``.
        threshold: A feature is kept when ``|r|`` is strictly greater than
            this value (default 0.1, the previously hard-coded cutoff).

    Returns:
        numpy.ndarray: Boolean mask with one entry per column of ``X``. A
        column whose correlation is NaN (e.g. a constant column) is not
        selected because ``NaN > threshold`` is False.
    """
    scores = np.array([abs(pearsonr(X[col], y)[0]) for col in X.columns])
    return scores > threshold

def variance_threshold_feature_selection(X, threshold=0.01):
    """Return a boolean mask of the columns retained by ``VarianceThreshold(threshold)``."""
    return VarianceThreshold(threshold).fit(X).get_support()

def rfe_feature_selection(X, y, n_features_to_select=10):
    """Run recursive feature elimination with a linear-kernel SVC.

    Returns the boolean support mask of the ``n_features_to_select``
    features that RFE retains.
    """
    estimator = SVC(kernel="linear")
    eliminator = RFE(estimator, n_features_to_select=n_features_to_select).fit(X, y)
    return eliminator.support_

def pso_feature_selection(X, y):
    """Select a feature subset with particle swarm optimisation.

    Each particle is a vector in ``[0, 1]^n_features``; entries above 0.5
    mean "feature selected". A particle's cost is the out-of-bag error of a
    random forest trained on the selected columns.

    NOTE(review): ``pso`` is not imported or defined among the visible
    imports of this script; a pyswarm-compatible
    ``pso(func, lb, ub, swarmsize=..., maxiter=...)`` returning
    ``(best_position, best_cost)`` must be in scope before this is called.

    Args:
        X: DataFrame of candidate features.
        y: Class labels, one per row of ``X``.

    Returns:
        numpy.ndarray: Boolean mask over the columns of ``X``.
    """
    def fitness_function(features):
        mask = features > 0.5
        if not mask.any():
            # An empty subset cannot be evaluated: give it the worst cost.
            return 1.0
        # Score on out-of-bag samples. Accuracy on the training rows is close
        # to 1.0 for any subset with a random forest, so it cannot rank
        # candidate subsets.
        clf = RandomForestClassifier(oob_score=True)
        clf.fit(X.iloc[:, mask], y)
        return 1 - clf.oob_score_

    lb = np.zeros(X.shape[1])
    ub = np.ones(X.shape[1])
    best_pos, _ = pso(fitness_function, lb, ub, swarmsize=10, maxiter=3)
    return best_pos > 0.5

def lasso_feature_selection(X, y):
    """Fit ``Lasso(alpha=0.01)`` and keep features whose |coefficient| exceeds 0.001."""
    fitted = Lasso(alpha=0.01).fit(X, y)
    coef_magnitude = np.abs(fitted.coef_)
    return coef_magnitude > 0.001

def tree_based_feature_selection(X, y):
    """Keep features whose random-forest importance is above the mean importance."""
    forest = RandomForestClassifier()
    forest.fit(X, y)
    importances = forest.feature_importances_
    return importances > np.mean(importances)

# ===========================================
#  TRAINING AND EVALUATION
# ===========================================
def train_and_evaluate_model(X_train, X_test, y_train, y_test, selected_features):
    """Train the MLP on the selected columns and report test-set metrics.

    ``selected_features`` is a boolean mask over ``X_train.columns``. The
    accuracy, classification report and confusion matrix are printed.

    Returns:
        tuple: ``(accuracy, trained_model)``.
    """
    kept_columns = X_train.columns[selected_features]
    train_subset, test_subset = X_train[kept_columns], X_test[kept_columns]

    mlp = build_mlp_model(input_dim=train_subset.shape[1])
    mlp.fit(train_subset, y_train, epochs=30, batch_size=32, validation_split=0.2, verbose=0)

    # Predicted class = index of the largest softmax output.
    class_probabilities = mlp.predict(test_subset)
    y_pred = class_probabilities.argmax(axis=1)

    acc = accuracy_score(y_test, y_pred)
    print(f"\n Accuracy: {acc:.4f}")
    print(" Classification Report:\n", classification_report(y_test, y_pred))
    print(" Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    return acc, mlp

# ===========================================
#  MAIN
# ===========================================
def main():
    """Benchmark every feature-selection method with the MLP classifier.

    Loads the Cadiz train/test CSVs, scales both with the scaler fitted on
    the training set, trains/evaluates one MLP per selection method and
    prints a final accuracy comparison.
    """
    train_fp = r"E:\\Abroad period research\\Phenology datasets\\PHENOLOGY_H1\\Cadiz_train.csv"
    test_fp  = r"E:\\Abroad period research\\Phenology datasets\\PHENOLOGY_H1\\Cadiz_test.csv"

    df_train, target_col = load_and_preprocess_data(train_fp)
    X_train, y_train, scaler = feature_scaling(df_train, target_col)

    # Scale the test features with the *training* scaler. Fitting a second
    # scaler on the test set would map the two sets onto different ranges.
    df_test, _ = load_and_preprocess_data(test_fp)
    test_features = df_test.drop(columns=[target_col])[list(X_train.columns)]
    X_test = pd.DataFrame(scaler.transform(test_features), columns=X_train.columns)
    y_test = df_test[target_col]

    # (progress banner, label in the summary, deferred selector call)
    experiments = [
        (" Mutual Info...", "MI", lambda: mutual_information_feature_selection(X_train, y_train)),
        (" Pearson Corr...", "Pearson", lambda: pearson_correlation_feature_selection(X_train, y_train)),
        (" Variance Threshold...", "Variance", lambda: variance_threshold_feature_selection(X_train)),
        (" RFE...", "RFE", lambda: rfe_feature_selection(X_train, y_train)),
        (" PSO...", "PSO", lambda: pso_feature_selection(X_train, y_train)),
        (" Lasso...", "Lasso", lambda: lasso_feature_selection(X_train, y_train)),
        (" Tree-Based...", "Tree", lambda: tree_based_feature_selection(X_train, y_train)),
    ]

    results = []
    for banner, label, select in experiments:
        print(banner)
        acc, _ = train_and_evaluate_model(X_train, X_test, y_train, y_test, select())
        results.append((label, acc))

    print("\n FINAL COMPARISON:")
    for name, acc in results:
        print(f"{name:10s}: {acc:.4f}")

if __name__ == "__main__":
    main()


In [None]:
# ===========================================
#  IMPORTS
# ===========================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_selection import mutual_info_classif, RFE, VarianceThreshold, SelectKBest, f_classif
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

import warnings
# Silence every warning for cleaner output (this also hides deprecation notices).
warnings.filterwarnings('ignore')
# Seed the NumPy and TensorFlow global RNGs with the same fixed value.
np.random.seed(42)
tf.random.set_seed(42)

# ===========================================
#  DATA LOADING + PREPROCESSING
# ===========================================
def load_and_preprocess_data(filepath):
    """Load a phenology CSV and engineer time-series features from the target.

    The CSV must provide a ``TIME`` column (parsed to datetimes and used as
    the index) and the ``fenologia_h1`` target. All remaining columns are
    coerced to numeric and columns that are entirely NaN are dropped.

    Args:
        filepath: Path or buffer accepted by ``pandas.read_csv``.

    Returns:
        tuple: ``(df, target_col)`` -- the feature-augmented frame indexed by
        ``TIME`` (remaining NaNs replaced by column medians) and the name of
        the target column.
    """
    df = pd.read_csv(filepath)
    # Header names may carry stray double quotes / surrounding whitespace.
    df.columns = df.columns.str.replace('"', '').str.strip()
    df['TIME'] = pd.to_datetime(df['TIME'], errors='coerce')
    # Rows whose timestamp could not be parsed are discarded.
    df = df.dropna(subset=['TIME']).set_index('TIME')
    df = df.apply(pd.to_numeric, errors='coerce')
    df = df.dropna(axis=1, how='all')

    target_col = 'fenologia_h1'
    # Assign the interpolated series back explicitly: calling
    # ``df[col].interpolate(inplace=True)`` operates on a temporary under
    # pandas Copy-on-Write and would leave the gaps in ``df`` untouched.
    df[target_col] = df[target_col].interpolate(method='linear')
    # Shift labels down by one (presumably 1-based stages -> 0-based class ids).
    df[target_col] = df[target_col] - 1

    # NOTE(review): the rolling, EMA and FFT features below are derived from
    # the target series itself and their windows include the current row --
    # confirm this is not leaking the label into the features.
    for lag in range(1, 4):
        df[f'{target_col}_lag{lag}'] = df[target_col].shift(lag)

    for window in [3, 6]:
        rolling = df[target_col].rolling(window)
        df[f'{target_col}_roll_mean_{window}'] = rolling.mean()
        df[f'{target_col}_roll_std_{window}'] = rolling.std()

    # ``isocalendar()`` yields nullable UInt32 columns; convert to plain
    # int64 so the derived features are ordinary NumPy-backed columns.
    iso = df.index.isocalendar()
    df['month'] = df.index.month
    df['weekofyear'] = iso['week'].to_numpy(dtype='int64')
    df['year'] = iso['year'].to_numpy(dtype='int64')
    df['sin_week'] = np.sin(2 * np.pi * df['weekofyear'] / 52)
    df['cos_week'] = np.cos(2 * np.pi * df['weekofyear'] / 52)

    df['EMA_3'] = df[target_col].ewm(span=3, adjust=False).mean()
    df['EMA_6'] = df[target_col].ewm(span=6, adjust=False).mean()

    df['correlation_target_month'] = df[target_col].rolling(window=6).corr(df['month'])
    df['correlation_target_week'] = df[target_col].rolling(window=6).corr(df['weekofyear'])

    # FFT of the non-missing target values, NaN-padded back to the frame length.
    spectrum = np.fft.fft(df[target_col].dropna().to_numpy())
    padding = np.full(len(df) - len(spectrum), np.nan)
    df['fft_real'] = np.concatenate([spectrum.real, padding])
    df['fft_imag'] = np.concatenate([spectrum.imag, padding])

    # Whatever is still missing (lag/rolling warm-up rows, padding) gets the column median.
    df = df.fillna(df.median())
    return df, target_col

# ===========================================
#  SCALING
# ===========================================
def feature_scaling(df, target_col, scaler=None):
    """Min-max scale the feature columns and split off the target.

    Args:
        df: Frame holding the features plus ``target_col``.
        target_col: Name of the label column; it is returned unscaled.
        scaler: Optional already-fitted scaler. When supplied it is only used
            to ``transform`` the features, so a test set can be scaled with
            the statistics learned on the training set. When ``None`` a new
            ``MinMaxScaler`` is fitted on ``df`` (the original behaviour).

    Returns:
        tuple: ``(X_scaled, y, scaler)`` -- the scaled features as a
        DataFrame (same columns and index as the input features), the target
        series, and the scaler that was used.
    """
    X = df.drop(columns=[target_col])
    y = df[target_col]
    if scaler is None:
        scaler = MinMaxScaler()
        X_scaled = scaler.fit_transform(X)
    else:
        X_scaled = scaler.transform(X)
    # Keep the original index so X and y stay aligned by label.
    return pd.DataFrame(X_scaled, columns=X.columns, index=X.index), y, scaler

# ===========================================
#  MODEL
# ===========================================
def build_mlp_model(input_dim, num_classes=4, num_units=64, dropout_rate=0.2, activation='relu'):
    """Assemble and compile a small fully connected softmax classifier.

    Architecture: Dense(num_units) -> Dropout -> Dense(num_units // 2) ->
    Dense(num_classes, softmax), compiled with Adam and sparse categorical
    cross-entropy, tracking accuracy.
    """
    layer_stack = [
        Dense(num_units, activation=activation, input_dim=input_dim),
        Dropout(dropout_rate),
        Dense(num_units // 2, activation=activation),
        Dense(num_classes, activation='softmax'),
    ]
    network = Sequential(layer_stack)
    network.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

# ===========================================
#  FEATURE SELECTION
# ===========================================
def mutual_information_feature_selection(X, y):
    """Return a boolean mask of features whose mutual information with ``y`` exceeds 0.01."""
    mi_per_feature = mutual_info_classif(X, y)
    keep_mask = mi_per_feature > 0.01
    return keep_mask

def pearson_correlation_feature_selection(X, y, threshold=0.1):
    """Select features by absolute Pearson correlation with the target.

    Args:
        X: DataFrame of candidate features.
        y: Target values, one per row of ``X``.
        threshold: A feature is kept when ``|r|`` is strictly greater than
            this value (default 0.1, the previously hard-coded cutoff).

    Returns:
        numpy.ndarray: Boolean mask with one entry per column of ``X``. A
        column whose correlation is NaN (e.g. a constant column) is not
        selected because ``NaN > threshold`` is False.
    """
    scores = np.array([abs(pearsonr(X[col], y)[0]) for col in X.columns])
    return scores > threshold

def variance_threshold_feature_selection(X, threshold=0.01):
    """Return a boolean mask of the columns retained by ``VarianceThreshold(threshold)``."""
    return VarianceThreshold(threshold).fit(X).get_support()

def rfe_feature_selection(X, y, n_features_to_select=10):
    """Run recursive feature elimination with a linear-kernel SVC.

    Returns the boolean support mask of the ``n_features_to_select``
    features that RFE retains.
    """
    estimator = SVC(kernel="linear")
    eliminator = RFE(estimator, n_features_to_select=n_features_to_select).fit(X, y)
    return eliminator.support_

def logistic_regression_wrapper_selection(X, y):
    """Pick 10 features via RFE wrapped around a liblinear logistic regression.

    Returns the boolean support mask produced by the fitted RFE.
    """
    eliminator = RFE(LogisticRegression(solver='liblinear'), n_features_to_select=10)
    return eliminator.fit(X, y).support_

def f_classif_filter_selection(X, y):
    """Return the mask of the 10 features with the highest ANOVA F-score (``SelectKBest``)."""
    return SelectKBest(score_func=f_classif, k=10).fit(X, y).get_support()

def lasso_feature_selection(X, y):
    """Fit ``Lasso(alpha=0.01)`` and keep features whose |coefficient| exceeds 0.001."""
    fitted = Lasso(alpha=0.01).fit(X, y)
    coef_magnitude = np.abs(fitted.coef_)
    return coef_magnitude > 0.001

def tree_based_feature_selection(X, y):
    """Keep features whose random-forest importance is above the mean importance."""
    forest = RandomForestClassifier()
    forest.fit(X, y)
    importances = forest.feature_importances_
    return importances > np.mean(importances)

# ===========================================
#  TRAINING AND EVALUATION
# ===========================================
def train_and_evaluate_model(X_train, X_test, y_train, y_test, selected_features):
    """Train the MLP on the selected columns and report test-set metrics.

    ``selected_features`` is a boolean mask over ``X_train.columns``. The
    accuracy, classification report and confusion matrix are printed.

    Returns:
        tuple: ``(accuracy, trained_model)``.
    """
    kept_columns = X_train.columns[selected_features]
    train_subset, test_subset = X_train[kept_columns], X_test[kept_columns]

    mlp = build_mlp_model(input_dim=train_subset.shape[1])
    mlp.fit(train_subset, y_train, epochs=30, batch_size=32, validation_split=0.2, verbose=0)

    # Predicted class = index of the largest softmax output.
    class_probabilities = mlp.predict(test_subset)
    y_pred = class_probabilities.argmax(axis=1)

    acc = accuracy_score(y_test, y_pred)
    print(f"\n Accuracy: {acc:.4f}")
    print(" Classification Report:\n", classification_report(y_test, y_pred))
    print(" Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    return acc, mlp

# ===========================================
#  MAIN
# ===========================================
def main():
    """Benchmark filter, wrapper and embedded selectors with the MLP classifier.

    Loads the Cadiz train/test CSVs, scales both with the scaler fitted on
    the training set, trains/evaluates one MLP per selection method and
    prints a final accuracy comparison.
    """
    train_fp = r"E:\\Abroad period research\\Phenology datasets\\PHENOLOGY_H1\\Cadiz_train.csv"
    test_fp  = r"E:\\Abroad period research\\Phenology datasets\\PHENOLOGY_H1\\Cadiz_test.csv"

    df_train, target_col = load_and_preprocess_data(train_fp)
    X_train, y_train, scaler = feature_scaling(df_train, target_col)

    # Scale the test features with the *training* scaler. Fitting a second
    # scaler on the test set would map the two sets onto different ranges.
    df_test, _ = load_and_preprocess_data(test_fp)
    test_features = df_test.drop(columns=[target_col])[list(X_train.columns)]
    X_test = pd.DataFrame(scaler.transform(test_features), columns=X_train.columns)
    y_test = df_test[target_col]

    # (progress banner, label in the summary, selector function)
    experiments = [
        (" Mutual Info (Filter)...", "MI", mutual_information_feature_selection),
        (" F-ANOVA (Filter)...", "F-Classif", f_classif_filter_selection),
        (" RFE SVC (Wrapper)...", "RFE", rfe_feature_selection),
        (" Logistic Regression Wrapper...", "LogRegWrapper", logistic_regression_wrapper_selection),
        (" Lasso (Embedded)...", "Lasso", lasso_feature_selection),
        (" Tree-Based (Embedded)...", "Tree", tree_based_feature_selection),
    ]

    results = []
    for banner, label, selector in experiments:
        print(banner)
        mask = selector(X_train, y_train)
        acc, _ = train_and_evaluate_model(X_train, X_test, y_train, y_test, mask)
        results.append((label, acc))

    print("\n FINAL COMPARISON:")
    for name, acc in results:
        print(f"{name:15s}: {acc:.4f}")

if __name__ == "__main__":
    main()


In [None]:
# ===========================================
#  IMPORTS
# ===========================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_selection import mutual_info_classif, RFE, VarianceThreshold, SelectKBest, f_classif
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr
from sklearn.svm import SVC, LinearSVC
from sklearn.feature_selection import SelectFromModel
import time
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_selection import (mutual_info_classif, RFE, VarianceThreshold,
                                       SelectFromModel, f_classif)
from sklearn.base import clone


import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

import warnings
# Silence every warning for cleaner output (this also hides deprecation notices).
warnings.filterwarnings('ignore')
# Seed the NumPy and TensorFlow global RNGs with the same fixed value.
np.random.seed(42)
tf.random.set_seed(42)

# ===========================================
#  DATA LOADING + PREPROCESSING
# ===========================================
def load_and_preprocess_data(filepath):
    """Load a phenology CSV and engineer time-series features from the target.

    The CSV must provide a ``TIME`` column (parsed to datetimes and used as
    the index) and the ``fenologia_h1`` target. All remaining columns are
    coerced to numeric and columns that are entirely NaN are dropped.

    Args:
        filepath: Path or buffer accepted by ``pandas.read_csv``.

    Returns:
        tuple: ``(df, target_col)`` -- the feature-augmented frame indexed by
        ``TIME`` (remaining NaNs replaced by column medians) and the name of
        the target column.
    """
    df = pd.read_csv(filepath)
    # Header names may carry stray double quotes / surrounding whitespace.
    df.columns = df.columns.str.replace('"', '').str.strip()
    df['TIME'] = pd.to_datetime(df['TIME'], errors='coerce')
    # Rows whose timestamp could not be parsed are discarded.
    df = df.dropna(subset=['TIME']).set_index('TIME')
    df = df.apply(pd.to_numeric, errors='coerce')
    df = df.dropna(axis=1, how='all')

    target_col = 'fenologia_h1'
    # Assign the interpolated series back explicitly: calling
    # ``df[col].interpolate(inplace=True)`` operates on a temporary under
    # pandas Copy-on-Write and would leave the gaps in ``df`` untouched.
    df[target_col] = df[target_col].interpolate(method='linear')
    # Shift labels down by one (presumably 1-based stages -> 0-based class ids).
    df[target_col] = df[target_col] - 1

    # NOTE(review): the rolling, EMA and FFT features below are derived from
    # the target series itself and their windows include the current row --
    # confirm this is not leaking the label into the features.
    for lag in range(1, 4):
        df[f'{target_col}_lag{lag}'] = df[target_col].shift(lag)

    for window in [3, 6]:
        rolling = df[target_col].rolling(window)
        df[f'{target_col}_roll_mean_{window}'] = rolling.mean()
        df[f'{target_col}_roll_std_{window}'] = rolling.std()

    # ``isocalendar()`` yields nullable UInt32 columns; convert to plain
    # int64 so the derived features are ordinary NumPy-backed columns.
    iso = df.index.isocalendar()
    df['month'] = df.index.month
    df['weekofyear'] = iso['week'].to_numpy(dtype='int64')
    df['year'] = iso['year'].to_numpy(dtype='int64')
    df['sin_week'] = np.sin(2 * np.pi * df['weekofyear'] / 52)
    df['cos_week'] = np.cos(2 * np.pi * df['weekofyear'] / 52)

    df['EMA_3'] = df[target_col].ewm(span=3, adjust=False).mean()
    df['EMA_6'] = df[target_col].ewm(span=6, adjust=False).mean()

    df['correlation_target_month'] = df[target_col].rolling(window=6).corr(df['month'])
    df['correlation_target_week'] = df[target_col].rolling(window=6).corr(df['weekofyear'])

    # FFT of the non-missing target values, NaN-padded back to the frame length.
    spectrum = np.fft.fft(df[target_col].dropna().to_numpy())
    padding = np.full(len(df) - len(spectrum), np.nan)
    df['fft_real'] = np.concatenate([spectrum.real, padding])
    df['fft_imag'] = np.concatenate([spectrum.imag, padding])

    # Whatever is still missing (lag/rolling warm-up rows, padding) gets the column median.
    df = df.fillna(df.median())
    return df, target_col

# ===========================================
#  SCALING
# ===========================================
def feature_scaling(df, target_col, scaler=None):
    """Min-max scale the feature columns and split off the target.

    Args:
        df: Frame holding the features plus ``target_col``.
        target_col: Name of the label column; it is returned unscaled.
        scaler: Optional already-fitted scaler. When supplied it is only used
            to ``transform`` the features, so a test set can be scaled with
            the statistics learned on the training set. When ``None`` a new
            ``MinMaxScaler`` is fitted on ``df`` (the original behaviour).

    Returns:
        tuple: ``(X_scaled, y, scaler)`` -- the scaled features as a
        DataFrame (same columns and index as the input features), the target
        series, and the scaler that was used.
    """
    X = df.drop(columns=[target_col])
    y = df[target_col]
    if scaler is None:
        scaler = MinMaxScaler()
        X_scaled = scaler.fit_transform(X)
    else:
        X_scaled = scaler.transform(X)
    # Keep the original index so X and y stay aligned by label.
    return pd.DataFrame(X_scaled, columns=X.columns, index=X.index), y, scaler

# ===========================================
#  MODEL
# ===========================================
def build_mlp_model(input_dim, num_classes=4, num_units=64, dropout_rate=0.2, activation='relu'):
    """Assemble and compile a small fully connected softmax classifier.

    Architecture: Dense(num_units) -> Dropout -> Dense(num_units // 2) ->
    Dense(num_classes, softmax), compiled with Adam and sparse categorical
    cross-entropy, tracking accuracy.
    """
    layer_stack = [
        Dense(num_units, activation=activation, input_dim=input_dim),
        Dropout(dropout_rate),
        Dense(num_units // 2, activation=activation),
        Dense(num_classes, activation='softmax'),
    ]
    network = Sequential(layer_stack)
    network.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network


# ===========================================
#  FEATURE SELECTION
# ===========================================
def mutual_information_feature_selection(X, y):
    """Return a boolean mask of features whose mutual information with ``y`` exceeds 0.01."""
    mi_per_feature = mutual_info_classif(X, y)
    keep_mask = mi_per_feature > 0.01
    return keep_mask

def pearson_correlation_feature_selection(X, y, threshold=0.1):
    """Select features by absolute Pearson correlation with the target.

    Args:
        X: DataFrame of candidate features.
        y: Target values, one per row of ``X``.
        threshold: A feature is kept when ``|r|`` is strictly greater than
            this value (default 0.1, the previously hard-coded cutoff).

    Returns:
        numpy.ndarray: Boolean mask with one entry per column of ``X``. A
        column whose correlation is NaN (e.g. a constant column) is not
        selected because ``NaN > threshold`` is False.
    """
    scores = np.array([abs(pearsonr(X[col], y)[0]) for col in X.columns])
    return scores > threshold

def variance_threshold_feature_selection(X, threshold=0.01):
    """Return a boolean mask of the columns retained by ``VarianceThreshold(threshold)``."""
    return VarianceThreshold(threshold).fit(X).get_support()

def rfe_feature_selection(X, y, n_features_to_select=10):
    """Run recursive feature elimination with a linear-kernel SVC.

    Returns the boolean support mask of the ``n_features_to_select``
    features that RFE retains.
    """
    estimator = SVC(kernel="linear")
    eliminator = RFE(estimator, n_features_to_select=n_features_to_select).fit(X, y)
    return eliminator.support_

def logistic_regression_wrapper_selection(X, y):
    """Pick 10 features via RFE wrapped around a liblinear logistic regression.

    Returns the boolean support mask produced by the fitted RFE.
    """
    eliminator = RFE(LogisticRegression(solver='liblinear'), n_features_to_select=10)
    return eliminator.fit(X, y).support_

def f_classif_filter_selection(X, y):
    """Return the mask of the 10 features with the highest ANOVA F-score (``SelectKBest``)."""
    return SelectKBest(score_func=f_classif, k=10).fit(X, y).get_support()

def lasso_feature_selection(X, y):
    """Fit ``Lasso(alpha=0.01)`` and keep features whose |coefficient| exceeds 0.001."""
    fitted = Lasso(alpha=0.01).fit(X, y)
    coef_magnitude = np.abs(fitted.coef_)
    return coef_magnitude > 0.001

def tree_based_feature_selection(X, y):
    """Keep features whose random-forest importance is above the mean importance."""
    forest = RandomForestClassifier()
    forest.fit(X, y)
    importances = forest.feature_importances_
    return importances > np.mean(importances)

def variance_threshold_feature_selection(X, threshold=0.01):
    """Boolean mask of the columns whose variance passes ``VarianceThreshold(threshold)``.

    NOTE: this redefines the identically named function declared earlier in
    this script; both definitions implement the same logic.
    """
    fitted_selector = VarianceThreshold(threshold).fit(X)
    support_mask = fitted_selector.get_support()
    return support_mask

#  New Filter: Variance Threshold (variance_threshold_feature_selection, defined just above)

#  New Wrapper: Forward Selection (simple custom greedy implementation)
def forward_selection(X, y, n_features=10):
    """Greedy forward feature selection scored with a random forest.

    Starting from an empty set, repeatedly adds the remaining feature whose
    inclusion gives the highest out-of-bag accuracy, until ``n_features``
    features are chosen or no candidates remain.

    Args:
        X: DataFrame of candidate features.
        y: Class labels, one per row of ``X``.
        n_features: Number of features to select; values larger than the
            number of columns select every column.

    Returns:
        numpy.ndarray: Boolean mask over ``X.columns``.
    """
    from sklearn.base import clone

    # Out-of-bag scoring: accuracy measured on the training rows themselves
    # is close to 1.0 for any subset, so it cannot rank the candidates.
    base_model = RandomForestClassifier(oob_score=True)
    selected = []
    remaining = list(X.columns)
    # Never ask for more features than exist (would index an empty score list).
    target_count = min(n_features, len(remaining))
    while len(selected) < target_count:
        scored = []
        for feature in remaining:
            candidate = selected + [feature]
            fitted = clone(base_model).fit(X[candidate], y)
            scored.append((fitted.oob_score_, feature))
        # Highest score wins; ties fall back to the larger feature name.
        best_feature = max(scored)[1]
        selected.append(best_feature)
        remaining.remove(best_feature)
    return X.columns.isin(selected)

#  New Embedded: Linear SVM with L1 penalty

def linear_svc_l1_feature_selection(X, y):
    """Select features via an L1-penalised ``LinearSVC`` (C=0.01) and ``SelectFromModel``.

    Returns the boolean support mask reported by the prefit selector.
    """
    sparse_svc = LinearSVC(C=0.01, penalty="l1", dual=False)
    sparse_svc.fit(X, y)
    return SelectFromModel(sparse_svc, prefit=True).get_support()


# ===========================================
#  TRAINING AND EVALUATION
# ===========================================
def train_and_evaluate_model(X_train, X_test, y_train, y_test, selected_features):
    """Train the MLP on the selected columns and report test-set metrics.

    ``selected_features`` is a boolean mask over ``X_train.columns``. The
    accuracy, classification report and confusion matrix are printed.

    Returns:
        tuple: ``(accuracy, trained_model)``.
    """
    kept_columns = X_train.columns[selected_features]
    train_subset, test_subset = X_train[kept_columns], X_test[kept_columns]

    mlp = build_mlp_model(input_dim=train_subset.shape[1])
    mlp.fit(train_subset, y_train, epochs=30, batch_size=32, validation_split=0.2, verbose=0)

    # Predicted class = index of the largest softmax output.
    class_probabilities = mlp.predict(test_subset)
    y_pred = class_probabilities.argmax(axis=1)

    acc = accuracy_score(y_test, y_pred)
    print(f"\n Accuracy: {acc:.4f}")
    print(" Classification Report:\n", classification_report(y_test, y_pred))
    print(" Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    return acc, mlp

# ===========================================
#  MAIN
# ===========================================
def main():
    """Benchmark filter, wrapper and embedded selectors with the MLP classifier.

    Loads the Cadiz train/test CSVs, scales both with the scaler fitted on
    the training set, trains/evaluates one MLP per selection method and
    prints a final accuracy comparison.
    """
    train_fp = r"E:\\Abroad period research\\Phenology datasets\\PHENOLOGY_H1\\Cadiz_train.csv"
    test_fp  = r"E:\\Abroad period research\\Phenology datasets\\PHENOLOGY_H1\\Cadiz_test.csv"

    df_train, target_col = load_and_preprocess_data(train_fp)
    X_train, y_train, scaler = feature_scaling(df_train, target_col)

    # Scale the test features with the *training* scaler. Fitting a second
    # scaler on the test set would map the two sets onto different ranges.
    df_test, _ = load_and_preprocess_data(test_fp)
    test_features = df_test.drop(columns=[target_col])[list(X_train.columns)]
    X_test = pd.DataFrame(scaler.transform(test_features), columns=X_train.columns)
    y_test = df_test[target_col]

    # (progress banner, label in the summary, deferred selector call)
    experiments = [
        (" Mutual Info (Filter)...", "MI", lambda: mutual_information_feature_selection(X_train, y_train)),
        (" F-ANOVA (Filter)...", "F-Classif", lambda: f_classif_filter_selection(X_train, y_train)),
        (" Variance Threshold (Filter)...", "Variance", lambda: variance_threshold_feature_selection(X_train)),
        (" RFE SVC (Wrapper)...", "RFE", lambda: rfe_feature_selection(X_train, y_train)),
        (" Logistic Regression Wrapper...", "LogRegWrapper", lambda: logistic_regression_wrapper_selection(X_train, y_train)),
        (" Forward Selection (Wrapper)...", "ForwardSel", lambda: forward_selection(X_train, y_train)),
        (" Lasso (Embedded)...", "Lasso", lambda: lasso_feature_selection(X_train, y_train)),
        (" Tree-Based (Embedded)...", "Tree", lambda: tree_based_feature_selection(X_train, y_train)),
        (" Linear SVC L1 (Embedded)...", "LinearSVC-L1", lambda: linear_svc_l1_feature_selection(X_train, y_train)),
    ]

    results = []
    for banner, label, select in experiments:
        print(banner)
        acc, _ = train_and_evaluate_model(X_train, X_test, y_train, y_test, select())
        results.append((label, acc))

    print("\n FINAL COMPARISON:")
    for name, acc in results:
        print(f"{name:15s}: {acc:.4f}")

if __name__ == "__main__":
    main()


Feature selection algorithms with proper results for comparison

In [None]:
import pandas as pd
import numpy as np
import time
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_selection import mutual_info_classif, RFE, VarianceThreshold
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

import warnings
# Silence every warning for cleaner output (this also hides deprecation notices).
warnings.filterwarnings('ignore')
# Seed the NumPy and TensorFlow global RNGs with the same fixed value.
np.random.seed(42)
tf.random.set_seed(42)

# Minimal particle swarm optimiser (pyswarm-style call signature).
def pso(fitness_func, lb, ub, swarmsize=10, maxiter=3, omega=0.5, phip=0.5, phig=0.5):
    """Minimise ``fitness_func`` inside the box ``[lb, ub]`` with a particle swarm.

    Randomness comes from NumPy's global RNG, so results are reproducible
    after ``np.random.seed``.

    Args:
        fitness_func: Callable taking a 1-D position array and returning a
            scalar cost (lower is better).
        lb: Sequence of lower bounds, one per dimension.
        ub: Sequence of upper bounds, same length as ``lb``.
        swarmsize: Number of particles.
        maxiter: Number of velocity/position update rounds after the
            initial evaluation.
        omega: Inertia weight applied to the previous velocity.
        phip: Pull towards each particle's own best position.
        phig: Pull towards the swarm's best position.

    Returns:
        tuple: ``(best_position, best_cost)``.

    Raises:
        ValueError: If the bounds are malformed or ``swarmsize`` < 1.
    """
    lb = np.asarray(lb, dtype=float)
    ub = np.asarray(ub, dtype=float)
    if lb.ndim != 1 or lb.shape != ub.shape:
        raise ValueError("lb and ub must be 1-D sequences of equal length")
    if np.any(ub < lb):
        raise ValueError("every upper bound must be >= its lower bound")
    if swarmsize < 1:
        raise ValueError("swarmsize must be at least 1")

    dim = lb.size
    span = ub - lb
    positions = lb + np.random.rand(swarmsize, dim) * span
    velocities = (2 * np.random.rand(swarmsize, dim) - 1) * span

    def evaluate(points):
        return np.array([fitness_func(point) for point in points], dtype=float)

    # Per-particle memory of the best position seen so far.
    best_pos = positions.copy()
    best_fit = evaluate(positions)
    leader = int(np.argmin(best_fit))
    swarm_best_pos = best_pos[leader].copy()
    swarm_best_fit = float(best_fit[leader])

    for _ in range(maxiter):
        r_p = np.random.rand(swarmsize, dim)
        r_g = np.random.rand(swarmsize, dim)
        velocities = (omega * velocities
                      + phip * r_p * (best_pos - positions)
                      + phig * r_g * (swarm_best_pos - positions))
        # Particles that would leave the box are clamped onto its boundary.
        positions = np.clip(positions + velocities, lb, ub)

        fitness = evaluate(positions)
        improved = fitness < best_fit
        best_pos[improved] = positions[improved]
        best_fit[improved] = fitness[improved]

        leader = int(np.argmin(best_fit))
        if best_fit[leader] < swarm_best_fit:
            swarm_best_pos = best_pos[leader].copy()
            swarm_best_fit = float(best_fit[leader])

    return swarm_best_pos, swarm_best_fit

# ======================
# Load & Preprocess
# ======================
def load_and_preprocess_data(filepath):
    """Load a phenology CSV and engineer time-series features from the target.

    The CSV must provide a ``TIME`` column (parsed to datetimes and used as
    the index) and the ``fenologia_h1`` target. All remaining columns are
    coerced to numeric and columns that are entirely NaN are dropped.

    Args:
        filepath: Path or buffer accepted by ``pandas.read_csv``.

    Returns:
        tuple: ``(df, target_col)`` -- the feature-augmented frame indexed by
        ``TIME`` (remaining NaNs replaced by column medians) and the name of
        the target column.
    """
    df = pd.read_csv(filepath)
    # Header names may carry stray double quotes / surrounding whitespace.
    df.columns = df.columns.str.replace('"', '').str.strip()
    df['TIME'] = pd.to_datetime(df['TIME'], errors='coerce')
    # Rows whose timestamp could not be parsed are discarded.
    df = df.dropna(subset=['TIME']).set_index('TIME')
    df = df.apply(pd.to_numeric, errors='coerce')
    df = df.dropna(axis=1, how='all')

    target_col = 'fenologia_h1'
    # Assign the interpolated series back explicitly: calling
    # ``df[col].interpolate(inplace=True)`` operates on a temporary under
    # pandas Copy-on-Write and would leave the gaps in ``df`` untouched.
    df[target_col] = df[target_col].interpolate(method='linear')
    # Shift labels down by one (presumably 1-based stages -> 0-based class ids).
    df[target_col] = df[target_col] - 1

    # NOTE(review): the rolling, EMA and FFT features below are derived from
    # the target series itself and their windows include the current row --
    # confirm this is not leaking the label into the features.
    for lag in range(1, 4):
        df[f'{target_col}_lag{lag}'] = df[target_col].shift(lag)

    for window in [3, 6]:
        rolling = df[target_col].rolling(window)
        df[f'{target_col}_roll_mean_{window}'] = rolling.mean()
        df[f'{target_col}_roll_std_{window}'] = rolling.std()

    # ``isocalendar()`` yields nullable UInt32 columns; convert to plain
    # int64 so the derived features are ordinary NumPy-backed columns.
    iso = df.index.isocalendar()
    df['month'] = df.index.month
    df['weekofyear'] = iso['week'].to_numpy(dtype='int64')
    df['year'] = iso['year'].to_numpy(dtype='int64')
    df['sin_week'] = np.sin(2 * np.pi * df['weekofyear'] / 52)
    df['cos_week'] = np.cos(2 * np.pi * df['weekofyear'] / 52)

    df['EMA_3'] = df[target_col].ewm(span=3, adjust=False).mean()
    df['EMA_6'] = df[target_col].ewm(span=6, adjust=False).mean()
    df['correlation_target_month'] = df[target_col].rolling(window=6).corr(df['month'])
    df['correlation_target_week'] = df[target_col].rolling(window=6).corr(df['weekofyear'])

    # FFT of the non-missing target values, NaN-padded back to the frame length.
    spectrum = np.fft.fft(df[target_col].dropna().to_numpy())
    padding = np.full(len(df) - len(spectrum), np.nan)
    df['fft_real'] = np.concatenate([spectrum.real, padding])
    df['fft_imag'] = np.concatenate([spectrum.imag, padding])
    # Whatever is still missing (lag/rolling warm-up rows, padding) gets the column median.
    df = df.fillna(df.median())
    return df, target_col

# ======================
# Scaling
# ======================
def feature_scaling(df, target_col, scaler=None, fit=True):
    """Min-max scale the feature columns and split off the target.

    Args:
        df: Frame holding the features plus ``target_col``.
        target_col: Name of the label column; it is returned unscaled.
        scaler: Optional scaler instance to use. When ``None`` a new
            ``MinMaxScaler`` is created and fitted on ``df`` (the original
            behaviour).
        fit: Only meaningful when ``scaler`` is supplied. ``True`` fits the
            scaler on ``df`` before transforming; ``False`` applies an
            already-fitted scaler, e.g. to scale a test set with the
            statistics learned on the training set.

    Returns:
        tuple: ``(X_scaled, y)`` -- the scaled features as a DataFrame (same
        columns and index as the input features) and the target series.
    """
    X = df.drop(columns=[target_col])
    y = df[target_col]
    if scaler is None:
        scaler = MinMaxScaler()
        fit = True
    X_scaled = scaler.fit_transform(X) if fit else scaler.transform(X)
    # Keep the original index so X and y stay aligned by label.
    return pd.DataFrame(X_scaled, columns=X.columns, index=X.index), y

# ======================
# MLP Model
# ======================
def build_mlp_model(input_dim, num_classes=4, num_units=64, dropout_rate=0.2, activation='relu'):
    """Assemble and compile a small fully connected softmax classifier.

    Architecture: Dense(num_units) -> Dropout -> Dense(num_units // 2) ->
    Dense(num_classes, softmax), compiled with Adam and sparse categorical
    cross-entropy, tracking accuracy.
    """
    layer_stack = [
        Dense(num_units, activation=activation, input_dim=input_dim),
        Dropout(dropout_rate),
        Dense(num_units // 2, activation=activation),
        Dense(num_classes, activation='softmax'),
    ]
    network = Sequential(layer_stack)
    network.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

# ======================
# Feature Selection Methods
# ======================
def mutual_information_feature_selection(X, y):
    """Return a boolean mask of features whose mutual information with ``y`` exceeds 0.01."""
    mi_per_feature = mutual_info_classif(X, y)
    keep_mask = mi_per_feature > 0.01
    return keep_mask

def pearson_correlation_feature_selection(X, y, threshold=0.1):
    """Select features by absolute Pearson correlation with the target.

    Args:
        X: DataFrame of candidate features.
        y: Target values, one per row of ``X``.
        threshold: A feature is kept when ``|r|`` is strictly greater than
            this value (default 0.1, the previously hard-coded cutoff).

    Returns:
        numpy.ndarray: Boolean mask with one entry per column of ``X``. A
        column whose correlation is NaN (e.g. a constant column) is not
        selected because ``NaN > threshold`` is False.
    """
    scores = np.array([abs(pearsonr(X[col], y)[0]) for col in X.columns])
    return scores > threshold

def variance_threshold_feature_selection(X):
    """Return a boolean mask of the columns retained by ``VarianceThreshold(threshold=0.01)``."""
    return VarianceThreshold(threshold=0.01).fit(X).get_support()

def rfe_feature_selection(X, y, n_features_to_select=10):
    """Run recursive feature elimination with a linear-kernel SVC.

    Returns the boolean support mask of the ``n_features_to_select``
    features that RFE retains.
    """
    estimator = SVC(kernel="linear")
    eliminator = RFE(estimator, n_features_to_select=n_features_to_select).fit(X, y)
    return eliminator.support_

def pso_feature_selection(X, y):
    """Select a feature subset with particle swarm optimisation.

    Each particle is a vector in ``[0, 1]^n_features``; entries above 0.5
    mean "feature selected". A particle's cost is the out-of-bag error of a
    random forest trained on the selected columns. The search itself is
    delegated to the module-level ``pso`` function.

    Args:
        X: DataFrame of candidate features.
        y: Class labels, one per row of ``X``.

    Returns:
        numpy.ndarray: Boolean mask over the columns of ``X``.
    """
    def fitness_function(features):
        mask = features > 0.5
        if not mask.any():
            # An empty subset cannot be evaluated: give it the worst cost.
            return 1.0
        # Score on out-of-bag samples. Accuracy on the training rows is close
        # to 1.0 for any subset with a random forest, so it cannot rank
        # candidate subsets.
        clf = RandomForestClassifier(oob_score=True)
        clf.fit(X.iloc[:, mask], y)
        return 1 - clf.oob_score_

    lb = np.zeros(X.shape[1])
    ub = np.ones(X.shape[1])
    best_pos, _ = pso(fitness_function, lb, ub)
    return best_pos > 0.5

def lasso_feature_selection(X, y):
    """Fit ``Lasso(alpha=0.01)`` and keep features whose |coefficient| exceeds 0.001."""
    fitted = Lasso(alpha=0.01).fit(X, y)
    coef_magnitude = np.abs(fitted.coef_)
    return coef_magnitude > 0.001

def tree_based_feature_selection(X, y):
    """Keep features whose random-forest importance is above the mean importance."""
    forest = RandomForestClassifier()
    forest.fit(X, y)
    importances = forest.feature_importances_
    return importances > np.mean(importances)

# ======================
# Train & Evaluate
# ======================
def train_and_evaluate(X_train, X_test, y_train, y_test, selector_func, method_name):
    """Run one feature selector, train the MLP on its output and collect metrics.

    The selector is called with ``X_train`` only when ``method_name`` is
    ``"Variance"`` and with ``(X_train, y_train)`` otherwise. Selection,
    training and prediction are timed separately with ``time.time()``.

    Returns:
        dict: One result row holding the method name, weighted
        precision/recall/F1, accuracy, number of selected features, feature
        reduction percentage and the three timings.
    """
    # --- feature selection ---
    fs_started = time.time()
    selector_args = (X_train,) if method_name == "Variance" else (X_train, y_train)
    mask = selector_func(*selector_args)
    fs_time = time.time() - fs_started

    kept_columns = X_train.columns[mask]
    train_subset = X_train[kept_columns]
    test_subset = X_test[kept_columns]

    # --- training ---
    train_started = time.time()
    model = build_mlp_model(input_dim=train_subset.shape[1])
    model.fit(train_subset, y_train, epochs=30, batch_size=32, validation_split=0.2, verbose=0)
    train_time = time.time() - train_started

    # --- inference ---
    test_started = time.time()
    y_pred = model.predict(test_subset).argmax(axis=1)
    test_time = time.time() - test_started

    weighted = {"average": 'weighted', "zero_division": 0}
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, **weighted)
    rec = recall_score(y_test, y_pred, **weighted)
    f1 = f1_score(y_test, y_pred, **weighted)
    n_features = sum(mask)
    reduction_pct = 100 * (1 - n_features / X_train.shape[1])

    row = {}
    row["Method"] = method_name
    row["Accuracy"] = round(acc, 4)
    row["Precision"] = round(prec, 4)
    row["Recall"] = round(rec, 4)
    row["F1 Score"] = round(f1, 4)
    row["#Features"] = n_features
    row["Feature Reduction (%)"] = round(reduction_pct, 2)
    row["FS Time (s)"] = round(fs_time, 2)
    row["Train Time (s)"] = round(train_time, 2)
    row["Test Time (s)"] = round(test_time, 2)
    return row

# ======================
# Main
# ======================
def main():
    """Benchmark every feature-selection method on the Cadiz H1 train/test CSVs.

    Loads and scales both files, runs each selector through
    ``train_and_evaluate`` and prints one comparison row per method.
    """
    train_fp = r"E:\\Abroad period research\\Phenology datasets\\PHENOLOGY_H1\\Cadiz_train.csv"
    test_fp = r"E:\\Abroad period research\\Phenology datasets\\PHENOLOGY_H1\\Cadiz_test.csv"

    train_frame, target_col = load_and_preprocess_data(train_fp)
    X_train, y_train = feature_scaling(train_frame, target_col)

    # NOTE(review): feature_scaling is called again on the test frame; if it
    # fits a new scaler on each call, train and test end up on different
    # scales -- confirm against its definition.
    test_frame, _ = load_and_preprocess_data(test_fp)
    X_test, y_test = feature_scaling(test_frame, target_col)

    # Insertion order is the order in which the methods are run and reported.
    selectors = {
        "Mutual Info": mutual_information_feature_selection,
        "Pearson": pearson_correlation_feature_selection,
        "Variance": variance_threshold_feature_selection,
        "RFE": rfe_feature_selection,
        "PSO": pso_feature_selection,
        "Lasso": lasso_feature_selection,
        "Tree-Based": tree_based_feature_selection,
    }

    rows = []
    for label, selector in selectors.items():
        print(f" {label}...")
        rows.append(train_and_evaluate(X_train, X_test, y_train, y_test, selector, label))

    summary = pd.DataFrame(rows)
    print("\n FINAL COMPARISON TABLE:")
    print(summary.to_string(index=False))

# Run the comparison only when this code is executed as a script, not on import.
if __name__ == "__main__":
    main()


In [None]:
# ===========================================
# 📦 IMPORTS
# ===========================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_selection import mutual_info_classif, RFE, VarianceThreshold, SelectKBest, f_classif
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

import warnings
warnings.filterwarnings('ignore')
np.random.seed(42)
tf.random.set_seed(42)

# ===========================================
#  DATA LOADING + PREPROCESSING
# ===========================================
def load_and_preprocess_data(filepath, target_col='fenologia_h1'):
    """Load a phenology CSV and build the engineered feature table.

    The ``TIME`` column is parsed to datetimes and becomes the index (rows
    with an unparseable time are dropped), every other column is coerced to
    numeric, and columns that are entirely NaN are removed. The target is
    linearly interpolated and shifted by -1, then lag, rolling, calendar,
    EMA, rolling-correlation and FFT columns are appended. Remaining NaNs
    are filled with the column median.

    NOTE(review): the rolling, EMA, rolling-correlation and FFT features are
    all computed from the target column itself, and the rolling/EMA windows
    include the current row -- confirm this is intended, since these columns
    are later used as predictors of that same target.

    Args:
        filepath: path of the CSV file to read.
        target_col: name of the target column (defaults to the value that was
            previously hard-coded).

    Returns:
        Tuple ``(df, target_col)``.
    """
    df = pd.read_csv(filepath)
    df.columns = df.columns.str.replace('"', '').str.strip()
    df['TIME'] = pd.to_datetime(df['TIME'], errors='coerce')
    df = df.dropna(subset=['TIME']).set_index('TIME')
    df = df.apply(pd.to_numeric, errors='coerce')
    df = df.dropna(axis=1, how='all')

    # Assign the interpolated series back instead of calling
    # df[col].interpolate(inplace=True): the chained in-place form operates on
    # a temporary under pandas copy-on-write and silently leaves df unchanged.
    df[target_col] = df[target_col].interpolate(method='linear') - 1

    for i in range(1, 4):
        df[f'{target_col}_lag{i}'] = df[target_col].shift(i)

    for window in [3, 6]:
        rolling = df[target_col].rolling(window)
        df[f'{target_col}_roll_mean_{window}'] = rolling.mean()
        df[f'{target_col}_roll_std_{window}'] = rolling.std()

    # isocalendar() yields nullable UInt32 columns; cast to plain int so the
    # trigonometric and correlation features below are ordinary float64.
    iso = df.index.isocalendar()
    df['month'] = df.index.month
    df['weekofyear'] = iso['week'].to_numpy(dtype=int)
    df['year'] = iso['year'].to_numpy(dtype=int)
    df['sin_week'] = np.sin(2 * np.pi * df['weekofyear'] / 52)
    df['cos_week'] = np.cos(2 * np.pi * df['weekofyear'] / 52)

    df['EMA_3'] = df[target_col].ewm(span=3, adjust=False).mean()
    df['EMA_6'] = df[target_col].ewm(span=6, adjust=False).mean()

    df['correlation_target_month'] = df[target_col].rolling(window=6).corr(df['month'])
    df['correlation_target_week'] = df[target_col].rolling(window=6).corr(df['weekofyear'])

    # FFT over the non-NaN target values, stored positionally and NaN-padded
    # up to the frame length when values were dropped.
    spectrum = np.fft.fft(df[target_col].dropna().to_numpy())
    padding = np.full(len(df) - len(spectrum), np.nan)
    df['fft_real'] = np.concatenate([spectrum.real, padding])
    df['fft_imag'] = np.concatenate([spectrum.imag, padding])

    df.fillna(df.median(), inplace=True)
    return df, target_col

# ===========================================
#  SCALING
# ===========================================
def feature_scaling(df, target_col, scaler=None):
    """Split ``df`` into features and target and min-max scale the features.

    Args:
        df: DataFrame containing the feature columns and ``target_col``.
        target_col: name of the column returned as the target.
        scaler: optional already-fitted scaler. When given, it is only used
            to ``transform`` the features, so a test set can be scaled with
            the statistics learned on the training set. When omitted, a new
            ``MinMaxScaler`` is fitted on ``df`` (the original behaviour).

    Returns:
        Tuple ``(X_scaled, y, scaler)``: the scaled features as a DataFrame
        with the original column names and a fresh default index, the target
        column, and the scaler that was used.
    """
    features = df.drop(columns=[target_col])
    target = df[target_col]
    if scaler is None:
        scaler = MinMaxScaler()
        scaled = scaler.fit_transform(features)
    else:
        scaled = scaler.transform(features)
    return pd.DataFrame(scaled, columns=features.columns), target, scaler

# ===========================================
#  MODEL
# ===========================================
def build_mlp_model(input_dim, num_classes=4, num_units=64, dropout_rate=0.2, activation='relu'):
    """Build and compile the MLP classifier used for every comparison run.

    The network is Dense(num_units) -> Dropout -> Dense(num_units // 2) ->
    softmax over ``num_classes``, compiled with Adam and sparse categorical
    cross-entropy.
    """
    layers = [
        Dense(num_units, activation=activation, input_dim=input_dim),
        Dropout(dropout_rate),
        Dense(num_units // 2, activation=activation),
        Dense(num_classes, activation='softmax'),
    ]
    network = Sequential()
    for layer in layers:
        network.add(layer)
    network.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

# ===========================================
#  FEATURE SELECTION FUNCTIONS
# ===========================================
def timed_feature_selection(method, X, y):
    """Run ``method(X, y)`` and measure how long it takes.

    Returns:
        Tuple ``(mask, duration)`` where ``mask`` is whatever ``method``
        returned and ``duration`` is the elapsed time in seconds.
    """
    # perf_counter() is monotonic, so the duration can never come out negative
    # or skewed by a system clock adjustment as it can with time.time().
    start = time.perf_counter()
    mask = method(X, y)
    duration = time.perf_counter() - start
    return mask, duration

def mutual_information_feature_selection(X, y):
    """Return a boolean mask of features whose mutual information with ``y`` exceeds 0.01."""
    mi_scores = mutual_info_classif(X, y)
    return mi_scores > 0.01

def pearson_correlation_feature_selection(X, y):
    """Return a boolean mask of columns whose absolute Pearson r with ``y`` exceeds 0.1."""
    abs_r = [abs(pearsonr(X[name], y)[0]) for name in X.columns]
    return np.array(abs_r) > 0.1

def variance_threshold_feature_selection(X, y, threshold=0.01):
    """Return the support mask of a ``VarianceThreshold(threshold)`` fitted on ``X``.

    ``y`` is accepted so the function can be called like the other selectors;
    it is not used.
    """
    return VarianceThreshold(threshold).fit(X).get_support()

def rfe_feature_selection(X, y, n_features_to_select=10):
    """Return the RFE support mask obtained with a linear-kernel SVC estimator."""
    estimator = SVC(kernel="linear")
    return RFE(estimator, n_features_to_select=n_features_to_select).fit(X, y).support_

def logistic_regression_wrapper_selection(X, y):
    """Return the support mask of a 10-feature RFE wrapped around liblinear logistic regression."""
    estimator = LogisticRegression(solver='liblinear')
    return RFE(estimator, n_features_to_select=10).fit(X, y).support_

def f_classif_filter_selection(X, y):
    """Return the support mask of ``SelectKBest`` with the ANOVA F-score and ``k=10``."""
    return SelectKBest(score_func=f_classif, k=10).fit(X, y).get_support()

def lasso_feature_selection(X, y):
    """Return a mask of features whose ``Lasso(alpha=0.01)`` coefficient magnitude exceeds 0.001."""
    coefficients = Lasso(alpha=0.01).fit(X, y).coef_
    return np.abs(coefficients) > 0.001

def tree_based_feature_selection(X, y):
    """Return a mask of features whose random-forest importance is above the mean importance."""
    forest = RandomForestClassifier().fit(X, y)
    importances = forest.feature_importances_
    return importances > np.mean(importances)

# ===========================================
#  TRAINING AND EVALUATION
# ===========================================
def train_and_evaluate_model(X_train, X_test, y_train, y_test, selected_features):
    """Train the MLP on the selected columns and score it on the test set.

    Args:
        X_train, X_test: feature DataFrames; both are indexed with
            ``.loc[:, selected_features]``.
        y_train, y_test: targets passed to ``model.fit`` and to the metric
            functions.
        selected_features: column selector for ``.loc`` identifying the
            features to keep.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, train_time, test_time)``.
        The precision/recall/F1 scores are weighted averages and the times
        are in seconds.
    """
    X_train_sel = X_train.loc[:, selected_features]
    X_test_sel = X_test.loc[:, selected_features]

    model = build_mlp_model(input_dim=X_train_sel.shape[1])

    # perf_counter() is monotonic and high-resolution, which is what elapsed
    # time measurement needs; time.time() follows the adjustable wall clock.
    start_train = time.perf_counter()
    model.fit(X_train_sel, y_train, epochs=30, batch_size=32, validation_split=0.2, verbose=0)
    train_time = time.perf_counter() - start_train

    start_test = time.perf_counter()
    y_pred = model.predict(X_test_sel).argmax(axis=1)
    test_time = time.perf_counter() - start_test

    # zero_division=0 is applied to all three averaged metrics so that classes
    # with no samples or no predictions are handled the same way everywhere.
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    rec = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    return acc, prec, rec, f1, train_time, test_time

# ===========================================
#  MAIN
# ===========================================
def main():
    """Benchmark the feature-selection methods on the Cadiz H1 train/test CSVs.

    Loads both files, scales the features, runs every selector on the
    training data, trains/evaluates the MLP on the selected columns and
    prints one comparison row per method.

    Raises:
        ValueError: if the test data lacks a feature column that is present
            in the training data.
    """
    train_fp = r"E:\\Abroad period research\\Phenology datasets\\PHENOLOGY_H1\\Cadiz_train.csv"
    test_fp  = r"E:\\Abroad period research\\Phenology datasets\\PHENOLOGY_H1\\Cadiz_test.csv"

    df_train, target_col = load_and_preprocess_data(train_fp)
    X_train, y_train, scaler = feature_scaling(df_train, target_col)

    df_test, _ = load_and_preprocess_data(test_fp)
    # Scale the test features with the scaler fitted on the training data.
    # Fitting a second scaler on the test set would put train and test on
    # different scales and leak test-set statistics into the evaluation.
    test_features = df_test.drop(columns=[target_col])
    missing = X_train.columns.difference(test_features.columns)
    if len(missing) > 0:
        raise ValueError(f"Test data is missing training feature columns: {list(missing)}")
    X_test = pd.DataFrame(
        scaler.transform(test_features[X_train.columns]),
        columns=X_train.columns,
    )
    y_test = df_test[target_col]

    fs_methods = {
        "Mutual Info": mutual_information_feature_selection,
        "F-Classif": f_classif_filter_selection,
        "RFE": rfe_feature_selection,
        "LogRegWrapper": logistic_regression_wrapper_selection,
        "Lasso": lasso_feature_selection,
        "Tree-Based": tree_based_feature_selection
    }

    results = []

    for name, method in fs_methods.items():
        print(f"\n🔍 Running Feature Selection: {name}")
        mask, fs_time = timed_feature_selection(method, X_train, y_train)
        selected_cols = X_train.columns[mask]
        n_features = len(selected_cols)
        reduction = 100 * (1 - n_features / X_train.shape[1])

        acc, prec, rec, f1, train_time, test_time = train_and_evaluate_model(
            X_train, X_test, y_train, y_test, selected_cols
        )

        results.append({
            "Method": name,
            "Accuracy": round(acc, 4),
            "Precision": round(prec, 4),
            "Recall": round(rec, 4),
            "F1 Score": round(f1, 4),
            "#Features": n_features,
            "Feature Reduction (%)": round(reduction, 2),
            "FS Time (s)": round(fs_time, 2),
            "Train Time (s)": round(train_time, 2),
            "Test Time (s)": round(test_time, 2)
        })

    print("\n Final Comparison Table:")
    df_results = pd.DataFrame(results)
    print(df_results.to_string(index=False))

# Run the comparison only when this code is executed as a script, not on import.
if __name__ == "__main__":
    main()


In [None]:
# ===========================================
#  IMPORTS
# ===========================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_selection import mutual_info_classif, RFE, VarianceThreshold, SelectKBest, f_classif
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.feature_selection import SelectFromModel
import time
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.base import clone
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import warnings
warnings.filterwarnings('ignore')

np.random.seed(42)
tf.random.set_seed(42)

# ===========================================
#  DATA LOADING + PREPROCESSING
# ===========================================
def load_and_preprocess_data(filepath, target_col='fenologia_h2'):
    """Load a phenology CSV and build the engineered feature table.

    The ``TIME`` column is parsed to datetimes and becomes the index (rows
    with an unparseable time are dropped), every other column is coerced to
    numeric, and columns that are entirely NaN are removed. The target is
    linearly interpolated and shifted by -1, then lag, rolling, calendar,
    EMA, rolling-correlation and FFT columns are appended. Remaining NaNs
    are filled with the column median.

    NOTE(review): the rolling, EMA, rolling-correlation and FFT features are
    all computed from the target column itself, and the rolling/EMA windows
    include the current row -- confirm this is intended, since these columns
    are later used as predictors of that same target.

    Args:
        filepath: path of the CSV file to read.
        target_col: name of the target column (defaults to the value that was
            previously hard-coded).

    Returns:
        Tuple ``(df, target_col)``.
    """
    df = pd.read_csv(filepath)
    df.columns = df.columns.str.replace('"', '').str.strip()
    df['TIME'] = pd.to_datetime(df['TIME'], errors='coerce')
    df = df.dropna(subset=['TIME']).set_index('TIME')
    df = df.apply(pd.to_numeric, errors='coerce')
    df = df.dropna(axis=1, how='all')

    # Assign the interpolated series back instead of calling
    # df[col].interpolate(inplace=True): the chained in-place form operates on
    # a temporary under pandas copy-on-write and silently leaves df unchanged.
    df[target_col] = df[target_col].interpolate(method='linear') - 1

    for i in range(1, 4):
        df[f'{target_col}_lag{i}'] = df[target_col].shift(i)

    for window in [3, 6]:
        rolling = df[target_col].rolling(window)
        df[f'{target_col}_roll_mean_{window}'] = rolling.mean()
        df[f'{target_col}_roll_std_{window}'] = rolling.std()

    # isocalendar() yields nullable UInt32 columns; cast to plain int so the
    # trigonometric and correlation features below are ordinary float64.
    iso = df.index.isocalendar()
    df['month'] = df.index.month
    df['weekofyear'] = iso['week'].to_numpy(dtype=int)
    df['year'] = iso['year'].to_numpy(dtype=int)
    df['sin_week'] = np.sin(2 * np.pi * df['weekofyear'] / 52)
    df['cos_week'] = np.cos(2 * np.pi * df['weekofyear'] / 52)

    df['EMA_3'] = df[target_col].ewm(span=3, adjust=False).mean()
    df['EMA_6'] = df[target_col].ewm(span=6, adjust=False).mean()

    df['correlation_target_month'] = df[target_col].rolling(window=6).corr(df['month'])
    df['correlation_target_week'] = df[target_col].rolling(window=6).corr(df['weekofyear'])

    # FFT over the non-NaN target values, stored positionally and NaN-padded
    # up to the frame length when values were dropped.
    spectrum = np.fft.fft(df[target_col].dropna().to_numpy())
    padding = np.full(len(df) - len(spectrum), np.nan)
    df['fft_real'] = np.concatenate([spectrum.real, padding])
    df['fft_imag'] = np.concatenate([spectrum.imag, padding])

    df.fillna(df.median(), inplace=True)
    return df, target_col

# ===========================================
#  SCALING
# ===========================================
def feature_scaling(df, target_col, scaler=None):
    """Split ``df`` into features and target and min-max scale the features.

    Args:
        df: DataFrame containing the feature columns and ``target_col``.
        target_col: name of the column returned as the target.
        scaler: optional already-fitted scaler. When given, it is only used
            to ``transform`` the features, so a test set can be scaled with
            the statistics learned on the training set. When omitted, a new
            ``MinMaxScaler`` is fitted on ``df`` (the original behaviour).

    Returns:
        Tuple ``(X_scaled, y, scaler)``: the scaled features as a DataFrame
        with the original column names and a fresh default index, the target
        column, and the scaler that was used.
    """
    features = df.drop(columns=[target_col])
    target = df[target_col]
    if scaler is None:
        scaler = MinMaxScaler()
        scaled = scaler.fit_transform(features)
    else:
        scaled = scaler.transform(features)
    return pd.DataFrame(scaled, columns=features.columns), target, scaler

# ===========================================
#  MODEL
# ===========================================
def build_mlp_model(input_dim, num_classes=4, num_units=64, dropout_rate=0.2, activation='relu'):
    """Build and compile the MLP classifier used for every comparison run.

    The network is Dense(num_units) -> Dropout -> Dense(num_units // 2) ->
    softmax over ``num_classes``, compiled with Adam and sparse categorical
    cross-entropy.
    """
    layers = [
        Dense(num_units, activation=activation, input_dim=input_dim),
        Dropout(dropout_rate),
        Dense(num_units // 2, activation=activation),
        Dense(num_classes, activation='softmax'),
    ]
    network = Sequential()
    for layer in layers:
        network.add(layer)
    network.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

# ===========================================
#  FEATURE SELECTION FUNCTIONS
# ===========================================
def mutual_information_feature_selection(X, y):
    """Return a boolean mask of features whose mutual information with ``y`` exceeds 0.01."""
    mi_scores = mutual_info_classif(X, y)
    return mi_scores > 0.01

def pearson_correlation_feature_selection(X, y):
    """Return a boolean mask of columns whose absolute Pearson r with ``y`` exceeds 0.1.

    Args:
        X: DataFrame of candidate features.
        y: target values, one per row of ``X``.

    Returns:
        NumPy boolean array with one entry per column of ``X``.
    """
    # This cell's import block never imports pearsonr, so bring it into scope
    # here; otherwise the function only works if an earlier cell imported it.
    from scipy.stats import pearsonr

    scores = np.array([abs(pearsonr(X[col], y)[0]) for col in X.columns])
    return scores > 0.1

def variance_threshold_feature_selection(X, threshold=0.01):
    """Return the support mask of a ``VarianceThreshold(threshold)`` fitted on ``X``."""
    selector = VarianceThreshold(threshold)
    selector.fit(X)
    return selector.get_support()

def rfe_feature_selection(X, y, n_features_to_select=10):
    """Return the RFE support mask obtained with a linear-kernel SVC estimator."""
    eliminator = RFE(SVC(kernel="linear"), n_features_to_select=n_features_to_select)
    eliminator.fit(X, y)
    return eliminator.support_

def logistic_regression_wrapper_selection(X, y):
    """Return the support mask of a 10-feature RFE wrapped around liblinear logistic regression."""
    estimator = LogisticRegression(solver='liblinear')
    eliminator = RFE(estimator, n_features_to_select=10)
    eliminator.fit(X, y)
    return eliminator.support_

def f_classif_filter_selection(X, y):
    """Return the support mask of ``SelectKBest`` with the ANOVA F-score and ``k=10``."""
    selector = SelectKBest(score_func=f_classif, k=10)
    selector.fit(X, y)
    return selector.get_support()

def lasso_feature_selection(X, y):
    """Return a mask of features whose ``Lasso(alpha=0.01)`` coefficient magnitude exceeds 0.001."""
    lasso = Lasso(alpha=0.01)
    lasso.fit(X, y)
    coef_magnitude = np.abs(lasso.coef_)
    return coef_magnitude > 0.001

def tree_based_feature_selection(X, y):
    """Return a mask of features whose random-forest importance is above the mean importance."""
    forest = RandomForestClassifier().fit(X, y)
    importances = forest.feature_importances_
    return importances > np.mean(importances)

def forward_selection(X, y, n_features=10):
    """Greedily add, one at a time, the feature that gives the best forest score.

    In every round a fresh ``RandomForestClassifier`` clone is fitted on the
    already selected columns plus one candidate, and the candidate with the
    highest score is kept.

    NOTE(review): each candidate is scored on the same ``(X, y)`` it was
    fitted on, i.e. on training accuracy -- confirm this is intended.

    Args:
        X: DataFrame of candidate features.
        y: target values.
        n_features: maximum number of features to select. If ``X`` has fewer
            columns, all of them are selected.

    Returns:
        Boolean array over ``X.columns`` marking the selected features.
    """
    model = RandomForestClassifier()
    selected, remaining = [], list(X.columns)
    # Also stop when the candidates run out: previously, asking for more
    # features than X has columns crashed in max() on an empty sequence.
    while len(selected) < n_features and remaining:
        scored = (
            (clone(model).fit(X[selected + [f]], y).score(X[selected + [f]], y), f)
            for f in remaining
        )
        best_feature = max(scored, key=lambda pair: pair[0])[1]
        selected.append(best_feature)
        remaining.remove(best_feature)
    return X.columns.isin(selected)

def linear_svc_l1_feature_selection(X, y):
    """Return the ``SelectFromModel`` support mask of an L1-penalised ``LinearSVC`` (C=0.01)."""
    l1_svc = LinearSVC(C=0.01, penalty="l1", dual=False)
    l1_svc.fit(X, y)
    selector = SelectFromModel(l1_svc, prefit=True)
    return selector.get_support()

# ===========================================
#  TRAINING AND EVALUATION
# ===========================================
def train_evaluate_record(X_train, X_test, y_train, y_test, fs_func, method_name):
    """Run one feature-selection method, train the MLP on the result, and score it.

    Args:
        X_train, X_test: feature DataFrames; both are indexed with the
            selector's output via ``.loc[:, ...]``.
        y_train, y_test: targets passed to ``model.fit`` and to the metric
            functions.
        fs_func: selector callable. It is called as ``fs_func(X_train, y_train)``
            when it declares a parameter named ``y`` and as ``fs_func(X_train)``
            otherwise.
        method_name: label stored under ``'Method'``.

    Returns:
        dict with the method name, timings in seconds, the number of selected
        features, the percentage of features removed and the weighted
        accuracy/precision/recall/F1 scores.
    """
    import inspect  # function-scope import: the cell's import block does not provide it

    record = {'Method': method_name}

    # Inspect the declared parameters rather than __code__.co_varnames, which
    # also lists local variables and does not exist on every callable
    # (e.g. functools.partial objects).
    takes_target = 'y' in inspect.signature(fs_func).parameters

    # perf_counter() is monotonic and high-resolution, unlike time.time().
    fs_start = time.perf_counter()
    selected_features = fs_func(X_train, y_train) if takes_target else fs_func(X_train)
    fs_time = time.perf_counter() - fs_start
    record['FS Time (s)'] = round(fs_time, 4)

    X_train_sel = X_train.loc[:, selected_features]
    X_test_sel = X_test.loc[:, selected_features]
    record['#Features'] = X_train_sel.shape[1]
    record['Feature Reduction (%)'] = round(100 * (1 - X_train_sel.shape[1] / X_train.shape[1]), 2)

    # The training timer deliberately includes model construction.
    train_start = time.perf_counter()
    model = build_mlp_model(X_train_sel.shape[1])
    model.fit(X_train_sel, y_train, epochs=30, batch_size=32, validation_split=0.2, verbose=0)
    train_time = time.perf_counter() - train_start
    record['Train Time (s)'] = round(train_time, 4)

    test_start = time.perf_counter()
    y_pred = model.predict(X_test_sel).argmax(axis=1)
    test_time = time.perf_counter() - test_start
    record['Test Time (s)'] = round(test_time, 4)

    # zero_division=0 keeps the averaged metrics well defined when a class
    # has no samples or no predictions.
    record['Accuracy'] = round(accuracy_score(y_test, y_pred), 4)
    record['Precision'] = round(precision_score(y_test, y_pred, average='weighted', zero_division=0), 4)
    record['Recall'] = round(recall_score(y_test, y_pred, average='weighted', zero_division=0), 4)
    record['F1 Score'] = round(f1_score(y_test, y_pred, average='weighted', zero_division=0), 4)

    return record

# ===========================================
#  MAIN
# ===========================================
def main():
    """Benchmark the feature-selection methods on the H2 phenology train/test CSVs.

    Loads both files, scales the features, runs every selector through
    ``train_evaluate_record`` and prints one comparison row per method.

    Raises:
        ValueError: if the test data lacks a feature column that is present
            in the training data.
    """
    train_fp = r"E:\Abroad period research\Phenology datasets\PHENOLOGY_H2_train.csv"
    test_fp  = r"E:\Abroad period research\Phenology datasets\PHENOLOGY_H2_test.csv"

    df_train, target_col = load_and_preprocess_data(train_fp)
    X_train, y_train, scaler = feature_scaling(df_train, target_col)

    df_test, _ = load_and_preprocess_data(test_fp)
    # Scale the test features with the scaler fitted on the training data.
    # Fitting a second scaler on the test set would put train and test on
    # different scales and leak test-set statistics into the evaluation.
    test_features = df_test.drop(columns=[target_col])
    missing = X_train.columns.difference(test_features.columns)
    if len(missing) > 0:
        raise ValueError(f"Test data is missing training feature columns: {list(missing)}")
    X_test = pd.DataFrame(
        scaler.transform(test_features[X_train.columns]),
        columns=X_train.columns,
    )
    y_test = df_test[target_col]

    fs_methods = [
        ("MI", mutual_information_feature_selection),
        ("F-Classif", f_classif_filter_selection),
        ("Variance", variance_threshold_feature_selection),
        ("RFE", rfe_feature_selection),
        ("LogRegWrapper", logistic_regression_wrapper_selection),
        ("ForwardSel", forward_selection),
        ("Lasso", lasso_feature_selection),
        ("Tree", tree_based_feature_selection),
        ("LinearSVC-L1", linear_svc_l1_feature_selection),
    ]

    all_results = []
    for name, func in fs_methods:
        print(f"\n Running: {name}...")
        result = train_evaluate_record(X_train, X_test, y_train, y_test, func, name)
        all_results.append(result)

    df_results = pd.DataFrame(all_results)
    print("\n Final Comparison Table:")
    print(df_results.to_string(index=False))

# Run the comparison only when this code is executed as a script, not on import.
if __name__ == "__main__":
    main()


In [None]:
import pandas as pd
import numpy as np
import time
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

import warnings
warnings.filterwarnings('ignore')
np.random.seed(42)
tf.random.set_seed(42)

def pso(fitness_func, lb, ub, swarmsize=10, maxiter=3):
    """Minimise ``fitness_func`` inside the box ``[lb, ub]`` with a basic particle swarm.

    This replaces the earlier placeholder, which ignored ``fitness_func`` and
    the bounds and simply returned a random vector.

    Args:
        fitness_func: callable taking a 1-D position array and returning the
            value to minimise.
        lb, ub: sequences with the lower and upper bound of every dimension.
        swarmsize: number of particles (must be at least 1).
        maxiter: number of update iterations after the initial evaluation.

    Returns:
        Tuple ``(best_position, best_fitness)`` with the best position found
        (a 1-D array inside the bounds) and its fitness as a float.

    Raises:
        ValueError: if ``swarmsize`` is smaller than 1.
    """
    if swarmsize < 1:
        raise ValueError("swarmsize must be at least 1")

    lb = np.asarray(lb, dtype=float)
    ub = np.asarray(ub, dtype=float)
    dim = len(lb)
    span = ub - lb
    inertia, cognitive, social = 0.5, 0.5, 0.5

    # Random start inside the box; velocities start within +/- one box width.
    positions = lb + np.random.rand(swarmsize, dim) * span
    velocities = (np.random.rand(swarmsize, dim) * 2 - 1) * span
    personal_best = positions.copy()
    personal_score = np.array([fitness_func(p) for p in positions], dtype=float)
    leader = int(np.argmin(personal_score))
    best = personal_best[leader].copy()
    best_score = float(personal_score[leader])

    for _ in range(maxiter):
        pull_own = np.random.rand(swarmsize, dim)
        pull_swarm = np.random.rand(swarmsize, dim)
        velocities = (
            inertia * velocities
            + cognitive * pull_own * (personal_best - positions)
            + social * pull_swarm * (best - positions)
        )
        # Clip so that particles never leave the search box.
        positions = np.clip(positions + velocities, lb, ub)

        scores = np.array([fitness_func(p) for p in positions], dtype=float)
        improved = scores < personal_score
        personal_best[improved] = positions[improved]
        personal_score[improved] = scores[improved]

        leader = int(np.argmin(personal_score))
        if personal_score[leader] < best_score:
            best = personal_best[leader].copy()
            best_score = float(personal_score[leader])

    return best, best_score

# ======================
# Load & Preprocess
# ======================
def load_and_preprocess_data(filepath, target_col='fenologia_h2'):
    """Load a phenology CSV and build the engineered feature table.

    The ``TIME`` column is parsed to datetimes and becomes the index (rows
    with an unparseable time are dropped), every other column is coerced to
    numeric, and columns that are entirely NaN are removed. The target is
    linearly interpolated and shifted by -1, then lag, rolling, calendar,
    EMA, rolling-correlation and FFT columns are appended. Remaining NaNs
    are filled with the column median.

    NOTE(review): the rolling, EMA, rolling-correlation and FFT features are
    all computed from the target column itself, and the rolling/EMA windows
    include the current row -- confirm this is intended, since these columns
    are later used as predictors of that same target.

    Args:
        filepath: path of the CSV file to read.
        target_col: name of the target column (defaults to the value that was
            previously hard-coded).

    Returns:
        Tuple ``(df, target_col)``.
    """
    df = pd.read_csv(filepath)
    df.columns = df.columns.str.replace('"', '').str.strip()
    df['TIME'] = pd.to_datetime(df['TIME'], errors='coerce')
    df = df.dropna(subset=['TIME']).set_index('TIME')
    df = df.apply(pd.to_numeric, errors='coerce')
    df = df.dropna(axis=1, how='all')

    # Assign the interpolated series back instead of calling
    # df[col].interpolate(inplace=True): the chained in-place form operates on
    # a temporary under pandas copy-on-write and silently leaves df unchanged.
    df[target_col] = df[target_col].interpolate(method='linear') - 1

    for i in range(1, 4):
        df[f'{target_col}_lag{i}'] = df[target_col].shift(i)

    for window in [3, 6]:
        rolling = df[target_col].rolling(window)
        df[f'{target_col}_roll_mean_{window}'] = rolling.mean()
        df[f'{target_col}_roll_std_{window}'] = rolling.std()

    # isocalendar() yields nullable UInt32 columns; cast to plain int so the
    # trigonometric and correlation features below are ordinary float64.
    iso = df.index.isocalendar()
    df['month'] = df.index.month
    df['weekofyear'] = iso['week'].to_numpy(dtype=int)
    df['year'] = iso['year'].to_numpy(dtype=int)
    df['sin_week'] = np.sin(2 * np.pi * df['weekofyear'] / 52)
    df['cos_week'] = np.cos(2 * np.pi * df['weekofyear'] / 52)

    df['EMA_3'] = df[target_col].ewm(span=3, adjust=False).mean()
    df['EMA_6'] = df[target_col].ewm(span=6, adjust=False).mean()
    df['correlation_target_month'] = df[target_col].rolling(window=6).corr(df['month'])
    df['correlation_target_week'] = df[target_col].rolling(window=6).corr(df['weekofyear'])

    # FFT over the non-NaN target values, stored positionally and NaN-padded
    # up to the frame length when values were dropped.
    spectrum = np.fft.fft(df[target_col].dropna().to_numpy())
    padding = np.full(len(df) - len(spectrum), np.nan)
    df['fft_real'] = np.concatenate([spectrum.real, padding])
    df['fft_imag'] = np.concatenate([spectrum.imag, padding])
    df.fillna(df.median(), inplace=True)
    return df, target_col

# ======================
# Scaling
# ======================
def feature_scaling(df, target_col, scaler=None):
    """Split ``df`` into features and target and min-max scale the features.

    Args:
        df: DataFrame containing the feature columns and ``target_col``.
        target_col: name of the column returned as the target.
        scaler: optional already-fitted scaler. When given, it is only used
            to ``transform`` the features, so a test set can be scaled with
            the statistics learned on the training set. When omitted, a new
            ``MinMaxScaler`` is fitted on ``df`` (the original behaviour).

    Returns:
        Tuple ``(X_scaled, y)``: the scaled features as a DataFrame with the
        original column names and a fresh default index, and the target column.
    """
    features = df.drop(columns=[target_col])
    target = df[target_col]
    if scaler is None:
        scaled = MinMaxScaler().fit_transform(features)
    else:
        scaled = scaler.transform(features)
    return pd.DataFrame(scaled, columns=features.columns), target

# ======================
# MLP Model
# ======================
def build_mlp_model(input_dim, num_classes=4, num_units=64, dropout_rate=0.2, activation='relu'):
    """Build and compile the MLP classifier used for every comparison run.

    The network is Dense(num_units) -> Dropout -> Dense(num_units // 2) ->
    softmax over ``num_classes``, compiled with Adam and sparse categorical
    cross-entropy.
    """
    layers = [
        Dense(num_units, activation=activation, input_dim=input_dim),
        Dropout(dropout_rate),
        Dense(num_units // 2, activation=activation),
        Dense(num_classes, activation='softmax'),
    ]
    network = Sequential()
    for layer in layers:
        network.add(layer)
    network.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

# ======================
# Feature Selection Methods
# ======================
def pso_feature_selection(X, y):
    """Select features with ``pso`` using a random-forest fitness.

    A position vector in ``[0, 1]`` per column is turned into a mask by
    thresholding at 0.5. The fitness of a mask is ``1 - score`` of a
    ``RandomForestClassifier`` fitted and scored on the same ``(X, y)``
    restricted to the masked columns; an empty mask gets fitness 1.0.

    Returns:
        Boolean mask from thresholding the best position at 0.5.
    """
    n_columns = X.shape[1]

    def fitness_function(features):
        chosen = features > 0.5
        if chosen.sum() == 0:
            return 1.0
        forest = RandomForestClassifier()
        forest.fit(X.iloc[:, chosen], y)
        return 1 - forest.score(X.iloc[:, chosen], y)

    lower_bounds = np.zeros(n_columns)
    upper_bounds = np.ones(n_columns)
    best_pos, _ = pso(fitness_function, lower_bounds, upper_bounds)
    return best_pos > 0.5

def pearson_feature_selection(X, y, threshold=0.1):
    """Return a boolean mask of columns whose absolute Pearson r with ``y`` exceeds ``threshold``."""
    abs_r = [abs(pearsonr(X[name], y)[0]) for name in X.columns]
    return np.array(abs_r) > threshold

# ======================
# Train & Evaluate
# ======================
def train_and_evaluate(X_train, X_test, y_train, y_test, selector_func, method_name):
    """Run one feature-selection method, train the MLP on the kept columns, and score it.

    Args:
        X_train, X_test: feature DataFrames; the selected columns are taken from both.
        y_train, y_test: targets passed to ``model.fit`` and to the metric functions.
        selector_func: callable invoked as ``selector_func(X_train, y_train)``
            that returns a boolean column mask for ``X_train``.
        method_name: label stored under ``"Method"`` in the result.

    Returns:
        dict with the weighted accuracy/precision/recall/F1 scores, the number
        of selected features, the percentage of features removed, and the
        feature-selection, training and prediction durations in seconds.
    """
    # perf_counter() is monotonic and high-resolution, so short durations are
    # not distorted by system clock adjustments the way time.time() can be.
    start_fs = time.perf_counter()
    mask = selector_func(X_train, y_train)
    fs_time = time.perf_counter() - start_fs

    selected_cols = X_train.columns[mask]
    X_train_sel = X_train[selected_cols]
    X_test_sel = X_test[selected_cols]

    # The training timer deliberately includes model construction.
    start_train = time.perf_counter()
    model = build_mlp_model(input_dim=X_train_sel.shape[1])
    model.fit(X_train_sel, y_train, epochs=30, batch_size=32, validation_split=0.2, verbose=0)
    train_time = time.perf_counter() - start_train

    start_test = time.perf_counter()
    y_pred = model.predict(X_test_sel).argmax(axis=1)
    test_time = time.perf_counter() - start_test

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    rec = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    # Count the columns actually selected; yields a plain int for the table.
    n_features = len(selected_cols)
    reduction_pct = 100 * (1 - n_features / X_train.shape[1])

    return {
        "Method": method_name,
        "Accuracy": round(acc, 4),
        "Precision": round(prec, 4),
        "Recall": round(rec, 4),
        "F1 Score": round(f1, 4),
        "#Features": n_features,
        "Feature Reduction (%)": round(reduction_pct, 2),
        "FS Time (s)": round(fs_time, 4),
        "Train Time (s)": round(train_time, 4),
        "Test Time (s)": round(test_time, 4)
    }

# ======================
# Main
# ======================
def main():
    """Benchmark the PSO and Pearson selectors on the H2 phenology train/test CSVs.

    Loads both files, scales the features, runs each selector through
    ``train_and_evaluate`` and prints one comparison row per method.

    Raises:
        ValueError: if the test data lacks a feature column that is present
            in the training data.
    """
    train_fp = r"E:\Abroad period research\Phenology datasets\PHENOLOGY_H2_train.csv"
    test_fp  = r"E:\Abroad period research\Phenology datasets\PHENOLOGY_H2_test.csv"

    df_train, target_col = load_and_preprocess_data(train_fp)
    X_train, y_train = feature_scaling(df_train, target_col)

    df_test, _ = load_and_preprocess_data(test_fp)
    # The test set must be scaled with statistics learned from the training
    # data, not from the test data itself. feature_scaling does not hand back
    # its fitted scaler, so fit an equivalent one on the training features
    # here and apply it to the test features.
    scaler = MinMaxScaler().fit(df_train.drop(columns=[target_col]))
    test_features = df_test.drop(columns=[target_col])
    missing = X_train.columns.difference(test_features.columns)
    if len(missing) > 0:
        raise ValueError(f"Test data is missing training feature columns: {list(missing)}")
    X_test = pd.DataFrame(
        scaler.transform(test_features[X_train.columns]),
        columns=X_train.columns,
    )
    y_test = df_test[target_col]

    methods = [
        ("PSO", pso_feature_selection),
        ("Pearson", pearson_feature_selection)
    ]

    results = []
    for name, func in methods:
        print(f"\n🔍 Running Feature Selection: {name}")
        result = train_and_evaluate(X_train, X_test, y_train, y_test, func, name)
        results.append(result)

    df_results = pd.DataFrame(results)
    print("\n FINAL COMPARISON TABLE:")
    print(df_results.to_string(index=False))

# Run the comparison only when this code is executed as a script, not on import.
if __name__ == "__main__":
    main()


Final feature-selection code, combining all feature-selection algorithms for comparison