# Evaluating feature selection algorithms to apply in antimicrobial-resistant genes classification in Gram-negative bacterias.

Explore feature selection and evaluation algorithms to select the most important features of antimicrobial-resistant genes in Gram-negative bacterias.

## Imports

In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_validate
#from feature_selection.relieff_algorithm import Relieff
#from comparators.scores import get_mean_scores
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist, jaccard

In [4]:
df = pd.read_csv("data/Ac_Sa_Ca_Kl_Ec/bla_all.csv")

In [3]:
df = load_iris()
df = pd.DataFrame(data= np.c_[df['data'], df['target']],
                     columns= df['feature_names'] + ['target'])


# Database Handling

## Discretizing

In [6]:
def discretize(df):
    for i in df.columns:
        if i != "Feature":
            df[i] = pd.qcut(df[i], q=5,  labels=False, precision=0, duplicates='drop')
    return df

In [7]:
df = discretize(df)
df.describe()

Unnamed: 0,X.G1.1.1.1.,X.G1.1.1.2.,X.G1.1.1.3.,X.G1.1.1.4.,X.G1.1.1.5.,X.G1.1.1.6.,X.G1.1.1.7.,X.G1.1.1.8.,X.G1.1.1.9.,X.G1.1.1.10.,...,LC6.1,LC6.2,LC6.3,LC7.1,LC7.2,LC7.3,LC8.1,LC8.2,LC8.3,Output
count,107.0,107.0,107.0,107.0,107.0,107.0,107.0,107.0,107.0,107.0,...,107.0,107.0,107.0,107.0,107.0,107.0,107.0,107.0,107.0,107.0
mean,2.0,2.0,2.0,2.0,1.990654,1.990654,1.990654,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.401869
std,1.434086,1.434086,1.434086,1.434086,1.430763,1.424154,1.430763,1.434086,1.434086,1.434086,...,1.434086,1.434086,1.434086,1.434086,1.434086,1.434086,1.434086,1.434086,1.434086,0.492583
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
50%,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0
75%,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,1.0
max,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,1.0


In [6]:
y = df['target']
X = df.drop(['target'], axis=1)

## Separating the model target to its own variables.

Dropping the 'Output' as it will be representend as the y, and 'Feature' columns from the dataframe as it's not necessary for the analysis.

In [8]:
#df = pd.get_dummies(df, columns=["Feature"])
y = df["Output"]
X = df.drop(["Output","Feature"], axis=1) 

In [7]:
def get_mean_scores(scores: dict, name: str) -> dict:
    for i in scores.keys():
        scores[i] = sum(scores[i])/len(scores[i])
    scores["name"] = name
    return scores

## Applying Feature Selection Algorithms.

We create a copy of the dataframe to apply the feature selection algorithm.

Here we use the Relieff algorithm in the database of datasets to find the most relevant features to predict the target variable.

In [9]:
import numpy as np
import sklearn_relief as relief

"implements the Relieff feature selection algorithm"
def Relieff(X, y):
    return relief.RReliefF( n_features=3 ).fit_transform(X, y)

In [11]:
X_relieff = X.copy(deep=True)
X_relieff = Relieff(X_relieff.to_numpy(), y)

Here we are using Pearson correlation to find the similarity between the features, then filtering by its correlation.

In [10]:
#Using Pearson Correlation
plt.figure(figsize=(12,10))
cor = df.corr()
#Correlation with output variable
cor_target = abs(cor["target"])
#Selecting highly correlated features
relevant_features = cor_target[cor_target>0.5]
X_pearson = df.loc[:,relevant_features.index]


<Figure size 864x720 with 0 Axes>

In [11]:
X_GT = X.copy(deep=True)

In [12]:
#implements Banzhaf power index

def banzhaf(weight, quota):

    max_order = sum(weight)

    polynomial = [1] + max_order*[0]               # create a list to hold the polynomial coefficients

    current_order = 0                              # compute the polynomial coefficients
    aux_polynomial = polynomial[:]
    for i in range(len(weight)):
        current_order = current_order + weight[i]
        offset_polynomial = weight[i]*[0]+polynomial
        for j in range(current_order+1):
            aux_polynomial[j] = polynomial[j] + offset_polynomial[j]
        polynomial = aux_polynomial[:]

    banzhaf_power = len(weight)*[0]                                 # create a list to hold the Banzhaf Power for each voter
    swings = quota*[0]                                              # create a list to compute the swings for each voter

    for i in range(len(weight)):                                    # compute the Banzhaf Power
        for j in range(quota):                                      # fill the swings list
            if (j<weight[i]):
                swings[j] = polynomial[j]
            else:
                swings[j] = polynomial[j] - swings[j-weight[i]]
        for k in range(weight[i]):                                  # fill the Banzhaf Power vector
            banzhaf_power[i] = banzhaf_power[i] + swings[quota-1-k]

    # Normalize Index
    total_power = float(sum(banzhaf_power))
    banzhaf_index = map(lambda x: x / total_power, banzhaf_power)
    
    return banzhaf_index

In [None]:
def payoff_function(weight, quota):
    #TODO
    print("payoff")
    return 0

In [None]:
# Input: A training sample O with feature space F and the target C
# Output: Pv: Banzhaf power index vector of F.

def GT_feature_evaluation(O, C, F):
    pv = 0
    banzhaf_arr = [] 
    for i, value in enumerate(F):
        copy_set = O.copy(deep=True)
        i_data = copy_set[value]
        copy_set.drop(value, axis=1, inplace=True)
        payoff_matrix = []
        for j in copy_set.to_numpy():
            payoff_matrix.append(payoff_function(i_data, j, C))
        banzhaf_arr.append(banzhaf(payoff_matrix, C))
    return banzhaf_arr

In [15]:
banzhaf_power = []
for feature in X_GT.columns:
    banzhaf_power.append(banzhaf(X_GT[feature], 3))
for idx, value in enumerate(banzhaf_power):
    banzhaf_power[idx] = list(value)
print(len(banzhaf_power))


4


## Dividing train and test data

We are using the train_test_split function from sklearn.model_selection to split the data into training and testing sets.

In [13]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.30, random_state=4)
X_relieff_train, X_relieff_test, y_relieff_train, y_relieff_test = train_test_split( X_relieff, y, test_size=0.30, random_state=4)
X_pearson_train, X_pearson_test, y_pearson_train, y_pearson_test = train_test_split( X_pearson, y, test_size=0.30, random_state=4)

## Model Training

### Training SVM model in each training set.

In [14]:
clf = svm.SVC()
clf_relieff = svm.SVC()
clf_pearson = svm.SVC()

### Getting each model performance.

In [15]:
results = cross_validate(clf, X, y, cv=10, return_train_score=True)
relieff_results = cross_validate(clf_relieff, X_relieff, y, cv=10, return_train_score=True)
pearson_results = cross_validate(clf_pearson, X_pearson, y, cv=10, return_train_score=True)

In [16]:
print(get_mean_scores(results, "Control"))
print(get_mean_scores(relieff_results, "RRelieff"))
print(get_mean_scores(pearson_results, "Pearson"))

{'fit_time': 0.005098891258239746, 'score_time': 0.0031007051467895506, 'test_score': 0.990909090909091, 'train_score': 1.0, 'name': 'Control'}
{'fit_time': 0.0007000446319580078, 'score_time': 0.00020003318786621094, 'test_score': 0.7300000000000001, 'train_score': 0.8390678694158076, 'name': 'Relieff'}
{'fit_time': 0.0015004396438598633, 'score_time': 0.0007000923156738281, 'test_score': 0.9527272727272729, 'train_score': 1.0, 'name': 'Pearson'}
