# MGFS: A multi-label graph-based feature selection algorithm via PageRank centrality

## Imports

In [1]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist

## Helper functions

In [2]:
def covariance(X, Y, bias=False):
    """Return the covariance between two equally long 1-D samples.

    Parameters
    ----------
    X, Y : array-like
        Paired observations.
    bias : bool, default False
        When True the sum of products is divided by ``len(X)``
        (population covariance); otherwise by ``len(X) - 1``
        (sample covariance).

    Returns
    -------
    float
        Sum of the products of the mean-centered values divided by the
        chosen denominator.
    """
    denominator = len(X) if bias else len(X) - 1

    # Means are accumulated in float64 regardless of the input dtype.
    x_centered = X - np.mean(X, dtype=np.float64)
    y_centered = Y - np.mean(Y, dtype=np.float64)

    return np.sum(x_centered * y_centered) / denominator

In [3]:
def covariance_matrix(X, Y, bias=False):
    """Return the correlation-distance matrix between columns of X and Y.

    Despite its name, the function does not return raw covariances: entry
    ``[i, j]`` is ``1 - r`` where ``r`` is the Pearson correlation between
    column ``i`` of ``X`` and column ``j`` of ``Y``. Values therefore lie
    in ``[0, 2]`` (0 = perfectly correlated, 2 = perfectly anti-correlated).

    Parameters
    ----------
    X : array-like, shape (n_samples, n_x_columns)
    Y : array-like, shape (n_samples, n_y_columns)
    bias : bool, default False
        Normalise covariance and variances by ``n_samples`` (True) or
        ``n_samples - 1`` (False). The same normalisation is applied to the
        numerator and the denominator, so the result is mathematically
        independent of this flag; it is honoured for consistency.

    Returns
    -------
    numpy.ndarray, shape (n_x_columns, n_y_columns)

    Raises
    ------
    ValueError
        If the inputs are not 2-D or their row counts differ.

    Notes
    -----
    A constant column has zero variance, which makes the corresponding
    entries NaN (0/0), as in the element-wise formulation.
    """
    X = np.asarray(X, dtype=np.float64)
    Y = np.asarray(Y, dtype=np.float64)
    if X.ndim != 2 or Y.ndim != 2:
        raise ValueError("X and Y must be 2-D arrays (samples x columns)")
    if X.shape[0] != Y.shape[0]:
        raise ValueError("X and Y must have the same number of rows (samples)")

    ddof = 0 if bias else 1
    n = X.shape[0] - ddof

    # Cross-covariance of every X column with every Y column in one product.
    X_centered = X - X.mean(axis=0)
    Y_centered = Y - Y.mean(axis=0)
    cross_cov = X_centered.T @ Y_centered / n

    # Variances are computed once per column instead of once per pair.
    scale = np.sqrt(np.outer(X.var(axis=0, ddof=ddof), Y.var(axis=0, ddof=ddof)))

    return 1 - cross_cov / scale

In [5]:
def euclidean_distance_matrix(matrix):
    """Return the pairwise Euclidean distances between the rows of ``matrix``.

    The result is a square array whose entry ``[i, j]`` is the Euclidean
    distance between row ``i`` and row ``j`` of the input.
    """
    return cdist(matrix, matrix, metric='euclidean')

In [104]:
def weighted_pagerank(M, beta=0.85, epsilon=1e-8, max_iter=100000):
    """Compute PageRank scores for a weighted graph by power iteration.

    Parameters
    ----------
    M : array-like, shape (n, n)
        Edge-weight matrix; ``M[i, j]`` is the weight of the edge i -> j.
        Each row is normalised by its sum to form transition probabilities.
    beta : float, default 0.85
        Damping factor: probability of following an edge rather than
        jumping to a uniformly chosen node.
    epsilon : float, default 1e-8
        Convergence threshold on the L1 distance between two successive
        iterates.
    max_iter : int, default 100000
        Upper bound on the number of power-iteration steps.

    Returns
    -------
    numpy.ndarray, shape (n,)
        The rank vector (sums to 1). An empty array for an empty graph.

    Raises
    ------
    ValueError
        If ``M`` is not a square 2-D array or contains NaN/inf.
    RuntimeError
        If the iteration does not converge within ``max_iter`` steps.
    """
    M = np.asarray(M, dtype=np.float64)
    if M.ndim != 2 or M.shape[0] != M.shape[1]:
        raise ValueError("M must be a square 2-D matrix")
    if not np.all(np.isfinite(M)):
        raise ValueError("M must contain only finite values")

    n = M.shape[0]
    if n == 0:
        return np.zeros(0)

    d = np.sum(M, axis=1)

    # Rows that sum to zero (dangling nodes, e.g. a single isolated node)
    # would give 0/0 = NaN and the convergence test would never succeed.
    # Treat them as linking uniformly to every node.
    dangling = d == 0
    P = M / np.where(dangling, 1.0, d)[:, np.newaxis]
    P[dangling] = 1.0 / n

    G = beta * P + (1 - beta) / n * np.ones((n, n))

    # Initial probability vector
    pi = np.ones(n) / n

    for _ in range(max_iter):
        pi_next = np.dot(pi, G)

        # Converged: pi is a fixed point of G to within epsilon.
        if np.linalg.norm(pi_next - pi, 1) < epsilon:
            return pi

        pi = pi_next

    raise RuntimeError(
        "weighted_pagerank did not converge within %d iterations" % max_iter
    )

## Data

In [7]:
# Toy data set: 3 samples (rows) described by 5 features (columns).
# covariance_matrix iterates over the columns of its inputs, so each
# column here is one feature.
features = np.array([
    [0.0347, 0.0897, 0.0912, -73.3024, 6.2152],
    [0.0814, 0.2727, 0.0857, -62.5844, 3.1832],
    [0.1105, 0.2736, 0.0844, -65.2353, 2.7950]
])

# Label matrix for the same 3 samples (rows); each of the 3 columns is one
# label, with 0/1 entries.
labels = np.array([
    [0, 1, 1],
    [1, 0, 0],
    [0, 1, 0]
])

## Usage

In [10]:
# Feature-label matrix: entry [i, j] is 1 - Pearson correlation between
# feature column i and label column j (5 features x 3 labels for this data).
CDM = covariance_matrix(X=features, Y=labels, bias=False)
CDM

array([[0.86713359, 1.13286641, 1.92478039],
       [0.50368396, 1.49631604, 1.99999097],
       [1.33588179, 0.66411821, 0.01634619],
       [0.30867533, 1.69132467, 1.97140401],
       [1.40754479, 0.59245521, 0.00538601]])

In [68]:

# Feature-feature matrix: Euclidean distance between every pair of rows of
# CDM, i.e. between the label-correlation profiles of two features.
EDM = euclidean_distance_matrix(CDM)
EDM

array([[0.        , 0.51946887, 2.02028979, 0.79115424, 2.06595336],
       [0.51946887, 0.        , 2.30650236, 0.27726151, 2.36904575],
       [2.02028979, 2.30650236, 0.        , 2.43568417, 0.1019377 ],
       [0.79115424, 0.27726151, 2.43568417, 0.        , 2.50604369],
       [2.06595336, 2.36904575, 0.1019377 , 2.50604369, 0.        ]])

In [118]:
# PageRank score of each feature, using EDM as the edge-weight matrix of the
# feature graph (one score per feature; the scores sum to 1).
FLG = weighted_pagerank(EDM, beta=0.85, epsilon=1e-8)
FLG

array([0.17799211, 0.17996626, 0.22105117, 0.19496132, 0.22602913])