<a href="https://colab.research.google.com/github/Chijiro31/-Infinite-Feature-Selection-a-Graph-based-Feature-/blob/main/Untitled4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Infinite Feature Selection: A Graph-based Feature Filtering Approach**

*  Giorgio Roffo, Simone Melzi, Member, IEEE, Umberto Castellani,
 Alessandro Vinciarelli, Member, IEEE and Marco Cristani, Member, IEEE


**Importar los Recursos**

In [1]:
# Requisitos de instalación
!pip install -q scikit-learn skfeature-chappers skrebate gdown PyIFS
!git clone https://github.com/jundongl/scikit-feature.git
!pip install numpy pandas scikit-learn matplotlib skrebate
!pip install skfeature-chappers
%cd scikit-feature
!pip install -e .
%cd ..

# Solución al problema de PyIFS: importar scipy.stats
!sed -i '1i import scipy.stats as stats' /usr/local/lib/python3.11/dist-packages/PyIFS/InfFS.py

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.3/66.3 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for skrebate (setup.py) ... [?25l[?25hdone
Cloning into 'scikit-feature'...
remote: Enumerating objects: 1089, done.[K
remote: Counting objects: 100% (132/132), done.[K
remote: Compressing objects: 100% (74/74), done.[K
remote: Total 1089 (delta 57), reused 132 (delta 57), pack-reused 957 (from 1)[K
Receiving objects: 100% (1089/1089), 194.81 MiB | 21.15 MiB/s, done.
Resolving deltas: 100% (643/643), done.
/content/scikit-feature
Obtaining file:///content/scikit-feature
  Preparing metadata (setup.py) ... [?25l[?25hdone
Installing collected packages: skfeature
  Running setup.py develop for skfeature
Successfully installed skfeature-1.0.0
/content


In [2]:
# Importaciones

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import scipy.stats as stats
from numpy.linalg import eigvals, inv
from numpy.linalg import norm

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest, f_classif, RFE, SelectFromModel
from sklearn.linear_model import LogisticRegression,  Lasso, LassoCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize
from sklearn.svm import SVC

from PyIFS import InfFS
from skfeature.function.similarity_based import lap_score
from skfeature.function.sparse_learning_based import MCFS
from skfeature.function.sparse_learning_based import UDFS
from skfeature.utility import construct_W
from skrebate import ReliefF
from skrebate import ReliefF as skReliefF


**Configuración de visualización**

In [3]:
# Apply matplotlib's 'ggplot' style to every figure created from here on and
# ask IPython to render figures inline in the notebook output.
plt.style.use('ggplot')
%matplotlib inline

**Cargar los archivos de datos (.txt)**

In [4]:
# Colab-only helper: opens the browser upload dialog and keeps the returned
# mapping in `uploaded`. The next cell reads X_train.txt, y_train.txt,
# X_test.txt and y_test.txt from the working directory.
from google.colab import files
uploaded = files.upload()

Saving X_train.txt to X_train.txt
Saving y_train.txt to y_train.txt
Saving X_test.txt to X_test.txt
Saving y_test.txt to y_test.txt


**Leer los Datos**

In [5]:
# Read the four whitespace-delimited text files (train/test features and labels).
X_train, y_train, X_test, y_test = (
    np.loadtxt(path)
    for path in ("X_train.txt", "y_train.txt", "X_test.txt", "y_test.txt")
)

# Stack both splits into a single feature matrix / label vector.
X = np.vstack([X_train, X_test])
y = np.hstack([y_train, y_test])

# Min-max scale the combined matrix.
# NOTE(review): X_scaled contains the test rows, and several rankings below are
# fitted on (X_scaled, y) and then scored on X_test -- confirm this leakage is
# intended, or fit the scaler and the rankings on the training split only.
scaler = MinMaxScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Shared registries filled in by later cells:
#   methods  -> method name : callable
#   rankings -> method name : feature index ranking
#   results  -> method name : accuracy value(s)
methods, rankings, results = {}, {}, {}



# **1.Modelos y funciones de evaluación**

In [6]:
# Baseline classifier used to score every feature subset.
base_model = RandomForestClassifier(n_estimators=100, random_state=42)


def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and return its accuracy on the test split.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``.
    X_train, y_train : training features and labels passed to ``fit``.
    X_test, y_test : held-out features and labels used for scoring.

    Returns
    -------
    The value of ``accuracy_score(y_test, predictions)``.
    """
    model.fit(X_train, y_train)
    return accuracy_score(y_test, model.predict(X_test))


# **2.Métodos de selección de características**

In [7]:
def apply_select_k_best(X_train, y_train, k=100):
    """Fit a univariate selector that keeps the ``k`` features with the best ANOVA F-score.

    Returns the fitted ``SelectKBest`` object.
    """
    k_best = SelectKBest(k=k, score_func=f_classif)
    k_best.fit(X_train, y_train)
    return k_best

def apply_rfe(X_train, y_train, k=100):
    """Fit recursive feature elimination driven by a logistic regression.

    ``step=0.1`` is forwarded to ``RFE`` unchanged; ``k`` is the number of
    features to keep. Returns the fitted ``RFE`` object.
    """
    rfe_selector = RFE(
        LogisticRegression(max_iter=1000),
        n_features_to_select=k,
        step=0.1,
    )
    rfe_selector.fit(X_train, y_train)
    return rfe_selector

def apply_lasso(X_train, y_train):
    """Fit a ``SelectFromModel`` wrapper around a Lasso regressor.

    The Lasso uses ``alpha=0.01`` and ``max_iter=5000``. Returns the fitted
    ``SelectFromModel`` object.
    """
    lasso_estimator = Lasso(alpha=0.01, max_iter=5000)
    wrapped = SelectFromModel(lasso_estimator)
    return wrapped.fit(X_train, y_train)

def apply_reliefF(X_train, y_train, k=100):
    """Fit a ReliefF selector that keeps ``k`` features.

    ``k`` defaults to 100, the value that used to be hard-coded, so existing
    two-argument calls behave as before; the parameter mirrors the ``k``
    argument of ``apply_select_k_best`` and ``apply_rfe``.

    Returns the fitted ``ReliefF`` object.
    """
    selector = ReliefF(n_features_to_select=k)
    selector.fit(X_train, y_train)
    return selector


**Aplicar los métodos y evaluar**

In [None]:
from sklearn.base import clone

# Selector factories under comparison, keyed by the label used in the report.
methods = dict(
    SelectKBest=apply_select_k_best,
    RFE=apply_rfe,
    Lasso=apply_lasso,
    ReliefF=apply_reliefF,
)

results = {}

for name, func in methods.items():
    print(f"Evaluando {name}...")
    selector = func(X_train, y_train)
    # Reduce both splits with the selector fitted on the training data.
    X_train_sel = selector.transform(X_train)
    X_test_sel = selector.transform(X_test)
    # Score a fresh copy of the baseline model so that runs do not share state.
    acc = evaluate_model(clone(base_model), X_train_sel, y_train, X_test_sel, y_test)
    results[name] = acc

# Report the methods from best to worst accuracy.
results = {
    method: results[method]
    for method in sorted(results, key=results.get, reverse=True)
}
for method, acc in results.items():
    print(f"{method}: {acc:.4f}")

# **3. Métodos de Filtrado**

**Laplacian Score (LS)**

In [None]:
# Laplacian Score on the affinity graph built with construct_W's defaults.
W = construct_W.construct_W(X_scaled)
scores = lap_score.lap_score(X_scaled, W=W)
# scikit-feature defines the Laplacian score so that a SMALLER value marks a
# more important feature. The library's own helper ranks in ascending order;
# the previous descending sort put the worst features first.
# NOTE(review): this assumes lap_score() returns raw scores, as in
# jundongl/scikit-feature -- confirm against the skfeature build that is
# actually imported.
lap_ranking = lap_score.feature_ranking(scores)
rankings['Laplacian Score'] = lap_ranking

print("Top 20 features (Laplacian Score):", lap_ranking[:20])

**Multi-Cluster Feature Selection (MCFS)**

In [None]:
# Multi-Cluster Feature Selection.
# NOTE(review): scikit-feature documents the return value as a weight matrix of
# shape (n_features, n_clusters) -- confirm for the installed version.
weight = MCFS.mcfs(X_scaled, n_selected_features=20, n_clusters=6)
# Collapse the per-cluster coefficients into one score per feature (largest
# absolute weight across clusters). Sorting the 2-D matrix directly, as was
# done before, produced a 2-D index array rather than a feature ranking.
mcfs_scores = np.abs(weight).max(axis=1)
mcfs_ranking = np.argsort(mcfs_scores)[::-1]
rankings['MCFS'] = mcfs_ranking

# Print the MCFS ranking (the previous code printed the Laplacian ranking).
print("Top 20 features (Multi-Cluster Feature Selection):", mcfs_ranking[:20])

**Relief-F**

In [None]:
def relief_f_ranking(X, y):
    """Rank all features by ReliefF importance, most important first.

    Returns
    -------
    (ranked, scores) : feature indices sorted by decreasing importance, and the
    ``feature_importances_`` array of the fitted ReliefF object.
    """
    relief = ReliefF(n_features_to_select=X.shape[1])
    relief.fit(X, y)
    importances = relief.feature_importances_
    return np.argsort(-importances), importances


# Rank on the scaled, combined data set and register the result.
r_relief = relief_f_ranking(X_scaled, y)[0]
rankings['ReliefF'] = r_relief
print("Top 20 features (ReliefF):", r_relief[:20])

**Mutual Information (MI)**

In [None]:
# Mutual information between every feature and the class label (seeded).
mi_scores = mutual_info_classif(X_scaled, y, random_state=42)
# Ascending argsort, reversed so the most informative feature comes first.
mi_ranking = np.flip(np.argsort(mi_scores))
rankings['Mutual Information'] = mi_ranking

print("Top 20 features (MI):", mi_ranking[:20])

# ***Infinite Feature Selection (unsupervised)***

In [None]:
def inf_fs(X, alpha=None):
    """Rank features with the unsupervised Infinite Feature Selection score.

    The feature graph is the absolute Pearson correlation matrix ``A`` with a
    zeroed diagonal. Each feature is scored by the row sum of
    ``(I - alpha * A)^-1 - I``.

    Parameters
    ----------
    X : array of shape (n_samples, n_features).
    alpha : float or None. When None it is set to ``0.9 / lambda_max(A)``
        (or 0 when ``lambda_max`` is not positive).

    Returns
    -------
    (ranked_features, feature_scores) : indices sorted by decreasing score, and
    the score of every feature in its original position.
    """
    # Constant columns have zero variance and make corrcoef emit NaN; treat
    # those undefined correlations as "no relation" instead of poisoning the
    # whole matrix.
    with np.errstate(divide='ignore', invalid='ignore'):
        corr = np.abs(np.atleast_2d(np.corrcoef(X.T)))
    corr = np.nan_to_num(corr, nan=0.0)
    np.fill_diagonal(corr, 0)

    n_features = corr.shape[0]
    # corr is symmetric, so the symmetric eigen-solver returns real eigenvalues.
    max_eig = float(np.max(np.linalg.eigvalsh(corr)))
    if alpha is None:
        alpha = 0.9 / max_eig if max_eig > 0 else 0.0

    # Row sums of (I - alpha*A)^-1 - I equal solve(I - alpha*A, 1) - 1, which
    # avoids building the explicit inverse.
    system = np.eye(n_features) - alpha * corr
    feature_scores = np.linalg.solve(system, np.ones(n_features)) - 1.0
    ranked_features = np.argsort(-feature_scores)
    return ranked_features, feature_scores

# Rank every feature with unsupervised Inf-FS using the training split only,
# then register the ranking under the 'Inf-FS(u)' key.
ranked_features_inf_fs, scores_inf_fs = inf_fs(X_train)
rankings['Inf-FS(u)'] = ranked_features_inf_fs

In [None]:
# Evaluate the unsupervised Inf-FS ranking with a linear SVM for a growing
# number of selected features (10, 20, ..., 200).
accuracies = []
features_range = list(range(10, 201, 10))

for k in features_range:
    # Use the ranking produced by inf_fs(X_train). The name `ranked_features`
    # that was referenced here is never defined in the notebook.
    selected = ranked_features_inf_fs[:k]
    clf = SVC(kernel='linear')
    clf.fit(X_train[:, selected], y_train)
    y_pred = clf.predict(X_test[:, selected])
    acc = accuracy_score(y_test, y_pred)
    accuracies.append(acc)

In [None]:
# Accuracy of the linear SVM as a function of the number of Inf-FS features.
fig, ax = plt.subplots()
ax.plot(features_range, accuracies, marker='o')
ax.set_title("Inf-FS completo (no supervisado) en HAR Dataset")
ax.set_xlabel("Número de características seleccionadas")
ax.set_ylabel("Precisión SVM")
ax.grid(True)
plt.show()

# ***Inf-FSs (Supervisado)***

In [None]:
def inf_fs_supervised(X, y, alpha=None, random_state=42):
    """Rank features with a supervised variant of Infinite Feature Selection.

    The feature graph is ``A[i, j] = |corr(i, j)| * sqrt(mi[i] * mi[j])`` with a
    zero diagonal, where ``mi`` is the mutual information between each feature
    and ``y``. Each feature is scored by the row sum of
    ``(I - alpha * A)^-1 - I``.

    Parameters
    ----------
    X : array of shape (n_samples, n_features).
    y : class labels, one per sample.
    alpha : float or None. When None it is set to ``0.9 / lambda_max(A)``
        (or 0 when ``lambda_max`` is not positive).
    random_state : seed forwarded to ``mutual_info_classif`` so the ranking is
        reproducible; 42 matches the seed of the notebook's stand-alone
        mutual-information ranking.

    Returns
    -------
    (ranked, scores) : indices sorted by decreasing score, and the score of
    every feature in its original position.
    """
    mi = mutual_info_classif(X, y, random_state=random_state)

    # Constant columns give NaN correlations; treat them as zero.
    with np.errstate(divide='ignore', invalid='ignore'):
        corr = np.abs(np.atleast_2d(np.corrcoef(X.T)))
    corr = np.nan_to_num(corr, nan=0.0)
    np.fill_diagonal(corr, 0)

    A = corr * np.sqrt(np.outer(mi, mi))
    n_features = A.shape[0]
    # A is symmetric, so the symmetric eigen-solver returns real eigenvalues.
    max_eig = float(np.max(np.linalg.eigvalsh(A)))
    if alpha is None:
        alpha = 0.9 / max_eig if max_eig > 0 else 0.0

    # Row sums of (I - alpha*A)^-1 - I without forming the inverse.
    scores = np.linalg.solve(np.eye(n_features) - alpha * A, np.ones(n_features)) - 1.0
    ranked = np.argsort(-scores)
    return ranked, scores

**RFE con SVM**

In [None]:
from skrebate import ReliefF

def rfe_ranking(X, y):
    """Rank all features with recursive feature elimination on a linear SVM.

    RFE is run down to a single feature with ``step=1``.

    Returns
    -------
    (ranked, scores) : feature indices ordered from best to worst, and a score
    per feature equal to ``1 / ranking_`` (higher is better).
    """
    estimator = SVC(kernel="linear", C=1)
    selector = RFE(estimator, n_features_to_select=1, step=1)
    selector.fit(X, y)
    # RFE assigns rank 1 to the best feature, so an ascending sort of
    # `ranking_` lists the best features first. Sorting `-ranking_`, as was
    # done before, returned the worst features first.
    ranked = np.argsort(selector.ranking_)
    scores = 1 / (selector.ranking_ + 1e-12)
    return ranked, scores

**Fisher Score**

In [None]:
def fisher_score(X, y):
    """Compute the Fisher score of every feature.

    For feature ``j`` the score is

        sum_c n_c * (mean_cj - mean_j)^2  /  sum_c n_c * var_cj

    that is, between-class scatter divided by pooled within-class scatter. The
    previous version summed a separate ratio per class, so a single class with
    (near) zero variance made the score explode regardless of how well the
    feature separates the classes.

    Parameters
    ----------
    X : array of shape (n_samples, n_features).
    y : class labels, one per sample.

    Returns
    -------
    Array of shape (n_features,); larger means more discriminative.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y)
    n_features = X.shape[1]
    mean_overall = X.mean(axis=0)

    between = np.zeros(n_features)
    within = np.zeros(n_features)
    for c in np.unique(y):
        X_c = X[y == c]
        n_c = X_c.shape[0]
        between += n_c * (X_c.mean(axis=0) - mean_overall) ** 2
        within += n_c * X_c.var(axis=0)

    # Small constant keeps features that are constant within every class finite.
    return between / (within + 1e-12)

# Fisher score on the scaled, combined data set; larger scores rank first.
# Note that `fs` is rebound here to the score array.
fs = fisher_score(X_scaled, y)
ranking_fs = np.argsort(fs)[::-1]
rankings['Fisher Score'] = ranking_fs
print("Top 20 features (Fisher Score):", ranking_fs[:20])

**LASSO (Logistic Regression con L1)**

In [None]:
def lasso_ranking(X, y):
    """Rank features by the mean absolute coefficient of an L1 logistic regression.

    Returns
    -------
    (ranked, scores) : feature indices by decreasing score, and the per-feature
    mean of ``|coef_|`` taken over the first axis of ``coef_``.
    """
    l1_logreg = LogisticRegression(
        penalty='l1',
        solver='liblinear',
        C=1.0,
        max_iter=500,
    )
    l1_logreg.fit(X, y)
    scores = np.mean(np.abs(l1_logreg.coef_), axis=0)
    return np.argsort(-scores), scores

**Comparación de métodos supervisados de filtrado**

In [None]:
# Supervised rankers under comparison. Every callable takes (X, y) and returns
# a pair whose first element is the feature ranking.
methods = {
    "Inf-FS(s)": inf_fs_supervised,
    "ReliefF": relief_f_ranking,
    "Fisher": lambda X, y: (np.argsort(-fisher_score(X, y)), None),
    "LASSO": lasso_ranking,
    "RFE": rfe_ranking
}

# Accuracy curve per method: linear SVM trained on the top-k features.
results = {}
for name, func in methods.items():
    ranked = func(X_train, y_train)[0]
    accs = []
    for k in features_range:
        top_k = ranked[:k]
        clf = SVC(kernel='linear')
        clf.fit(X_train[:, top_k], y_train)
        accs.append(accuracy_score(y_test, clf.predict(X_test[:, top_k])))
    results[name] = accs

# One curve per method on a shared figure.
fig, ax = plt.subplots(figsize=(10, 6))
for name, accs in results.items():
    ax.plot(features_range, accs, label=name)
ax.set_title("Comparación de métodos supervisados de selección de características")
ax.set_xlabel("Número de características seleccionadas")
ax.set_ylabel("Precisión SVM")
ax.legend()
ax.grid(True)
plt.show()

# **4. Métodos de Envoltura**

**No Supervisado**

1.   DGUFS
2.   FSASL
3. UFSOL

**Supervisado**

1.   RFE
2.   SVM


**DGUFS**

In [1]:
def dgufs(X, n_clusters, n_features, max_iter=50, alpha=1.0, beta=1.0, verbose=False,
          random_state=42):
    """Iteratively learn a feature-weight matrix and return the top features.

    Each iteration projects the data with the current weights ``W``, runs
    k-means on the projection, takes the distance-to-centroid matrix ``F``
    returned by ``fit_transform`` and applies a multiplicative update to ``W``
    followed by column normalisation.

    Parameters
    ----------
    X : array of shape (n_samples, n_total_features).
    n_clusters : number of k-means clusters, and number of columns of ``W``.
    n_features : how many feature indices to return.
    max_iter : number of update iterations.
    alpha, beta : weights of the ``W`` and ``sign(W)`` terms in the denominator.
    verbose : print the residual norm every 10 iterations.
    random_state : seed for the random initialisation of ``W`` and for k-means,
        so that repeated runs return the same ranking.

    Returns
    -------
    (selected, feature_scores) : the ``n_features`` indices with the largest
    row norm of ``W``, and the row norm of every feature.
    """
    d = X.shape[1]
    # Seeded initialisation (the global, unseeded RNG was used before).
    rng = np.random.RandomState(random_state)
    W = normalize(rng.rand(d, n_clusters), axis=0)

    # The k-means fit that used to run before the loop was dropped: its result
    # was overwritten on the first iteration and never read.
    for i in range(max_iter):
        A = X @ W
        kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=random_state)
        F = kmeans.fit_transform(A)
        numerator = X.T @ F
        denominator = W @ (F.T @ F) + alpha * W + beta * np.sign(W)
        # Multiplicative update; the small constant avoids division by zero.
        W = W * (numerator / (denominator + 1e-10))
        W = normalize(W, axis=0)
        if verbose and i % 10 == 0:
            print(f"Iter {i}: residual norm = {np.linalg.norm(X @ W - F):.4f}")

    feature_scores = np.linalg.norm(W, axis=1)
    selected = np.argsort(-feature_scores)[:n_features]
    return selected, feature_scores

# One cluster per class label; report the 20 best features.
n_clusters = len(np.unique(y))
n_features = 20
selected_dgufs, scores_dgufs = dgufs(X_scaled, n_clusters, n_features, verbose=True)
# Store the complete ordering derived from the scores. Keeping only the 20
# selected indices made every later "top-k" evaluation with k > 20 reuse the
# same 20 features, and made the global comparison drop DGUFS as incomplete.
rankings['DGUFS'] = np.argsort(-scores_dgufs)
print("Top 20 features (DGUFS):", selected_dgufs[:20])

NameError: name 'np' is not defined

**FSASL**

In [None]:
# NOTE(review): confirm that the installed skfeature package actually ships a
# `sparse_learning_based.fsasl` module; if it does not, this import raises
# ImportError and 'FSASL' never reaches `rankings`.
from skfeature.function.sparse_learning_based import fsasl

# Run FSASL on the scaled data, requesting 20 features.
w_fsasl = fsasl.fsasl(X_scaled, n_selected_features=20)
# Score = sum of absolute weights along axis 1
# (presumably one row per feature -- TODO confirm the returned shape).
fsasl_scores = np.sum(np.abs(w_fsasl), axis=1)
# Largest score first.
fsasl_ranking = np.argsort(fsasl_scores)[::-1]
rankings['FSASL'] = fsasl_ranking
print("Top 20 features (FSASL):", fsasl_ranking[:20])

**UFSOL**

In [None]:
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.preprocessing import normalize  # used below; was missing from this cell
from scipy.sparse import csgraph

def ufsol(X, n_features=20, sigma=1.0):
    """Rank features by their smoothness on an RBF similarity graph.

    An RBF kernel over the samples is row-normalised (L1), turned into a
    normalised graph Laplacian ``L``, and each feature column ``x`` is scored
    with ``x.T @ L @ x``. Smaller scores rank first.

    Parameters
    ----------
    X : array of shape (n_samples, n_features_total).
    n_features : accepted for interface compatibility; not used -- the full
        ranking is always returned.
    sigma : RBF bandwidth; the kernel uses ``gamma = 1 / (2 * sigma**2)``.

    Returns
    -------
    Array of feature indices, best (smallest score) first.
    """
    W = rbf_kernel(X, gamma=1 / (2 * sigma ** 2))
    W = normalize(W, norm='l1', axis=1)
    L = csgraph.laplacian(W, normed=True)
    # x_j.T @ L @ x_j for every column j at once, replacing a Python-level
    # loop over features.
    scores = np.einsum('ij,ij->j', X, L @ X)
    ranking = np.argsort(scores)  # smaller is better
    return ranking

ufsol_ranking = ufsol(X_scaled, n_features=20)
rankings['UFSOL'] = ufsol_ranking
print("Top 20 features (UFSOL):", ufsol_ranking[:20])


**Supervisado**

**RFE**

In [None]:
from sklearn.feature_selection import RFE
from sklearn.svm import SVC

# Recursive feature elimination with a linear SVM, keeping 20 features.
svc = SVC(kernel='linear')
rfe = RFE(svc, n_features_to_select=20)
rfe.fit(X_scaled, y)
# Ascending sort of `ranking_` is stored as the RFE ranking.
rankings['RFE'] = np.argsort(rfe.ranking_)



**SVM**

In [None]:
from sklearn.svm import LinearSVC

# L1-penalised linear SVM fitted on the scaled, combined data.
svm = LinearSVC(penalty="l1", C=0.01, dual=False, max_iter=5000)
svm.fit(X_scaled, y)

# Per-feature score: absolute coefficients summed over the first axis of coef_.
fsv_scores = np.sum(np.abs(svm.coef_), axis=0)
# Ascending argsort, reversed so the largest score comes first.
fsv_ranking = np.flip(np.argsort(fsv_scores))

# Register the ranking under the 'FSV' key.
rankings['FSV'] = fsv_ranking


# **3. Métodos Integrados**

**No Supervisado**

1.   UDFS

In [None]:
def udfs_ranking(X, n_selected_features=20, gamma=1.0):
    """Rank features with UDFS.

    ``gamma`` and ``n_selected_features`` are forwarded to ``UDFS.udfs`` as
    keyword arguments. Each feature is scored by the sum of the absolute
    values along axis 1 of the returned weight array.

    Returns
    -------
    (ranking, scores) : feature indices by decreasing score, and the scores.
    """
    udfs_weights = UDFS.udfs(X, gamma=gamma, n_selected_features=n_selected_features)
    scores = np.abs(udfs_weights).sum(axis=1)
    ranking = np.flip(np.argsort(scores))
    return ranking, scores


# Rank on the scaled, combined data and register the result.
udfs_ranked = udfs_ranking(X_scaled)[0]
rankings['UDFS'] = udfs_ranked
print("Top 20 features (UDFS):", udfs_ranked[:20])



**Supervisado**

1.   FSV
2.   LASSO
3. NHTP



In [None]:
# Supervised embedded method (LASSO): L1-penalised logistic regression.

from sklearn.linear_model import LogisticRegression

lasso = LogisticRegression(penalty='l1', solver='liblinear').fit(X_scaled, y)
# Score each feature by the absolute coefficients summed over the first axis
# of coef_; largest first.
lasso_scores = np.abs(lasso.coef_).sum(axis=0)
# Register the ranking: the evaluation cell reads rankings['LASSO'], which was
# never stored because the assignment was commented out. The array is no
# longer bound to the name `lasso_ranking`, which would shadow the function of
# the same name defined earlier in the notebook.
rankings['LASSO'] = np.argsort(lasso_scores)[::-1]



In [None]:
# Supervisador (NHTP)

def nhtp(X, y, k=20, max_iter=100, tol=1e-4, step=None):
    """Rank features with iterative hard thresholding on a least-squares loss.

    Each iteration takes a gradient step on ``||y - X w||^2`` and keeps only
    the ``k`` entries of largest magnitude. (Despite the function name, no
    Hessian/Newton step is used.)

    Parameters
    ----------
    X : array of shape (n_samples, n_features).
    y : target vector of length n_samples.
    k : number of non-zero coefficients kept after each step; clipped to the
        range ``[0, n_features]``.
    max_iter : maximum number of iterations.
    tol : stop when the change in ``w`` has an L2 norm below this value.
    step : gradient step size. When None it is set to ``1 / (2 * ||X||_2^2)``,
        the inverse Lipschitz constant of the gradient. The previously
        hard-coded 0.01 makes the iteration oscillate or diverge whenever
        ``2 * ||X||_2^2`` exceeds 100; pass ``step=0.01`` to reproduce it.

    Returns
    -------
    Array of feature indices sorted by decreasing ``|w|``.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)
    n, d = X.shape
    # Guard against k <= 0 (a `[-0:]` slice would select everything) and k > d.
    k = max(0, min(int(k), d))

    if step is None:
        lipschitz = 2.0 * np.linalg.norm(X, 2) ** 2
        step = 1.0 / lipschitz if lipschitz > 0 else 0.0

    w = np.zeros(d)

    for iteration in range(max_iter):
        # Gradient of the squared-error loss.
        gradient = -2 * X.T @ (y - X @ w)

        # Gradient-descent step.
        w_temp = w - step * gradient

        # Hard thresholding: keep only the k largest-magnitude entries.
        w_new = np.zeros(d)
        if k > 0:
            idx = np.argsort(np.abs(w_temp))[d - k:]
            w_new[idx] = w_temp[idx]

        # Convergence check.
        if np.linalg.norm(w_new - w) < tol:
            break

        w = w_new

    # Ranking: order by coefficient magnitude, largest first.
    ranking = np.argsort(np.abs(w))[::-1]
    return ranking

# Run the hard-thresholding ranker on the scaled, combined data with k=20 and
# register the resulting ranking under the 'NHTP' key.
nhtp_ranking = nhtp(X_scaled, y, k=20)
rankings['NHTP'] = nhtp_ranking
print("Top 20 features (NHTP):", nhtp_ranking[:20])


# **4. Evaluación Común y Comparativa (ENVOLTURA E INTEGRADOS)**

In [None]:
# Methods evaluated in this section, in plotting order.
wanted_methods = ["DGUFS", "FSASL", "UFSOL", "UDFS", "FSV", "LASSO"]

# Use only the rankings that were actually produced. Indexing `rankings`
# directly raised KeyError as soon as one method had not been stored.
missing_methods = [name for name in wanted_methods if name not in rankings]
if missing_methods:
    print("Métodos sin ranking disponible (omitidos):", missing_methods)
evaluated_methods = {name: rankings[name] for name in wanted_methods if name in rankings}

results_env_int = {}

for name, ranked in evaluated_methods.items():
    accs = []
    for k in features_range:
        # Train and score a linear SVM on the k best features of this ranking.
        top_k = ranked[:k]
        clf = SVC(kernel='linear')
        clf.fit(X_train[:, top_k], y_train)
        y_pred = clf.predict(X_test[:, top_k])
        accs.append(accuracy_score(y_test, y_pred))
    results_env_int[name] = accs


In [None]:
# Gráfico
plt.figure(figsize=(12, 7))
for name, accs in results_env_int.items():
    plt.plot(features_range, accs, marker='o', label=name)
plt.title("Comparación de métodos de envoltura e integrados")
plt.xlabel("Número de características seleccionadas")
plt.ylabel("Precisión SVM")
plt.legend()
plt.grid(True)
plt.show()

# **5. Consolidación y comparación global**

In [None]:
all_rankings = {}
all_rankings.update(rankings)  # already holds most methods
# Make sure both Inf-FS variants are present.
all_rankings["Inf-FS(s)"] = inf_fs_supervised(X_train, y_train)[0]
# Ranking returned by inf_fs(X_train). The name `ranked_features` that was
# used here is never defined in the notebook.
all_rankings["Inf-FS(u)"] = ranked_features_inf_fs

results_global = {}

for name, ranked in all_rankings.items():
    # Only complete rankings (one entry per feature) can be compared for every
    # k in features_range; report the ones that are left out.
    if len(ranked) != X_train.shape[1]:
        print(f"Omitiendo {name}: ranking incompleto ({len(ranked)} de {X_train.shape[1]})")
        continue
    accs = []
    for k in features_range:
        top_k = ranked[:k]
        clf = SVC(kernel='linear')
        clf.fit(X_train[:, top_k], y_train)
        y_pred = clf.predict(X_test[:, top_k])
        accs.append(accuracy_score(y_test, y_pred))
    results_global[name] = accs

In [None]:
# Gráfico global comparativo
plt.figure(figsize=(14, 8))
for name, accs in results_global.items():
    plt.plot(features_range, accs, label=name)
plt.title("Comparación global de métodos de selección de características")
plt.xlabel("Número de características seleccionadas")
plt.ylabel("Precisión SVM")
plt.legend(fontsize='small', loc='best', ncol=2)
plt.grid(True)
plt.show()

# **6. Tabla resumen**

In [None]:
# Descriptive metadata per method. The accuracy column is filled from
# `results_global`, the only result dictionary that contains both methods:
# `results` has no "Inf-FS(u)" entry, so the previous lookup raised KeyError,
# and the literal `...` placeholder inside the list was not a valid record.
# TODO: add one row per remaining method.
summary_rows = [
    {"Método": "Inf-FS(u)", "Tipo": "Filtro", "Supervisión": "No", "Complejidad": "Alta"},
    {"Método": "ReliefF", "Tipo": "Filtro", "Supervisión": "Sí", "Complejidad": "Media"},
]

comparison = pd.DataFrame(
    [
        {**row, "Precisión máx": max(results_global[row["Método"]])}
        for row in summary_rows
        if row["Método"] in results_global
    ],
    # Explicit columns keep sort_values working even if no method was evaluated.
    columns=["Método", "Tipo", "Supervisión", "Complejidad", "Precisión máx"],
)
print(comparison.sort_values(by="Precisión máx", ascending=False))
