# Optimal metabolite subset (post)

In [1]:
# Import libraries
import pandas as pd
import numpy as np

In [None]:
# Import utilities
import sys

# Make the parent directory importable so the project-local `helpers` package
# resolves; the guard keeps re-runs of this cell from stacking duplicate entries.
if '../' not in sys.path:
    sys.path.insert(0, '../')
from helpers import preprocessing
from helpers import sets

## Import datasets

In [2]:
# Feature table used for the whole analysis (features plus the `y` target column)
df = pd.read_csv(filepath_or_buffer="../../data_sets/ds3.csv")

# Metabolite names, looked up later by integer feature index
metabolites = pd.read_excel(io="../../data_sets/Supplementary_Dataset_S1.xlsx").loc[:, 'Metabolite']

## Encode target values

In [3]:
# Encode the target values with the project helper. The return value is
# discarded, so this presumably modifies `df` in place — TODO confirm in
# helpers.preprocessing.
preprocessing.encoding(df)
# Display the frame to inspect the result
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,527,528,529,530,531,532,533,534,535,y
0,15197.81152,15137.7959,13824.33203,3886.181641,44213.27344,614280.7,59331.63672,4037.03833,202598.0,21478.75586,...,538699.1875,91547.28125,26786.00195,12853.86816,4096.142578,4117.774902,50739.20703,1677.144287,5178.986328,1
1,12192.23828,21360.44141,9971.5,3381.012695,41295.73438,502297.2,32046.60156,5163.633789,93320.52,45638.0625,...,643461.8125,58183.94141,29228.64063,10554.83301,4145.901367,3771.996582,24112.17969,17484.77734,2493.593262,1
2,9794.058594,10355.11426,11419.6123,8838.53418,65490.66406,1383140.0,78672.14063,1794.260254,179458.8,20975.39453,...,557195.6875,111246.5469,19829.45313,5055.417969,848.069519,7285.378418,39987.89453,1590.35437,2588.685059,1
3,8328.241211,12858.30176,7068.033691,4751.85791,34073.45313,693953.4,58468.11328,1805.435913,434328.0,21998.61133,...,573969.125,100035.7734,24185.88281,6081.456543,1879.115723,3358.707764,43222.65234,11613.23438,1476.407959,1
4,9363.958008,17715.19727,10046.55371,3563.136475,43344.56641,346904.1,34618.79688,3753.905762,669534.4,79115.89063,...,914524.75,138855.0313,30385.55664,3615.894775,3641.17749,6749.343262,28545.83398,2022.740479,3390.47168,1
5,10436.95801,18300.25586,14812.60938,5018.631348,48729.83594,187605.1,31809.77539,2786.377197,1610184.0,18068.07422,...,599076.625,105117.0938,31420.39453,3993.579102,5037.51709,5423.530762,53416.86328,5904.73877,4261.810059,1
6,3591.131836,6639.097656,3956.794434,800.513794,13308.0791,239268.8,20475.42773,921.85968,89624.49,7806.132813,...,229509.1563,22639.59961,6644.923828,575.754578,418.703674,2657.068848,21776.98828,252.614639,528.917236,1
7,15442.97754,21300.11523,22227.33008,3203.901855,110988.5469,1985211.0,115973.3438,4001.095947,860021.0,21834.0918,...,793292.25,98324.24219,22737.08984,6077.202148,3207.498779,10259.93359,41542.54297,754.111084,2019.42041,1
8,7715.913574,19429.05078,7770.894043,3512.696533,29452.69531,328479.4,25904.33398,2677.121582,1273017.0,48289.70313,...,746354.5,76691.17969,20624.97852,5077.819336,1770.0802,4916.973633,12770.82715,2218.166504,3407.802979,1
9,9926.803711,11030.58203,6313.438477,2245.17627,16201.74121,202512.3,15109.17871,4253.385254,90949.49,46779.99609,...,651963.1875,53253.32422,23715.1543,3677.385254,4030.543701,4863.954102,18246.5957,3941.543213,2323.753174,1


## Data set split

In [4]:
from sklearn.model_selection import train_test_split

# Feature matrix: every column except the target
X = df.drop('y', axis=1)

# Target vector
y = df.loc[:, 'y']

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Scale

In [5]:
# Scale the features with the project's ParetoScaler (implementation lives in
# helpers.preprocessing). The scaler is fitted on the training split only and
# then applied to the test split with the same fitted state.
# NOTE: X_train / X_test are rebound here, so re-running this cell on its own
# would scale data that has already been scaled.
pareto = preprocessing.ParetoScaler()
X_train = pareto.fit_transform(X_train)
X_test = pareto.transform(X_test)

## Cideim

In [6]:
# Display the reference subset exposed by helpers.sets (a set of feature
# indices stored as strings, per the rendered output)
sets.cideim

{'152',
 '212',
 '218',
 '263',
 '296',
 '308',
 '312',
 '429',
 '431',
 '437',
 '486',
 '505',
 '508',
 '509',
 '67'}

## F Classif

In [7]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

In [8]:
from sklearn import feature_selection

# best performance obtained with k=3
# The scoring function is referenced through its module on purpose: a later
# cell rebinds the bare name `f_classif` to a set of selected features, so
# re-running this cell after that one would otherwise pass a set as score_func.
fs = SelectKBest(score_func=feature_selection.f_classif, k=3)
fs.fit(X_train, y_train)

# Metabolite names of the selected features (feature names are integer indices as strings)
opt_subset_f_classif = [metabolites[int(i)] for i in fs.get_feature_names_out()]

In [9]:
# Names of the features kept by the fitted SelectKBest, collected into a set.
# NOTE(review): this rebinds `f_classif`, which was imported from
# sklearn.feature_selection as the scoring function; from this cell on the
# name refers to the set below, not to the sklearn function.
f_classif = set(fs.get_feature_names_out())
f_classif

{'39', '40', '505'}

## RFE (recursive feature elimination)

In [12]:
from sklearn.svm import SVC
from sklearn.feature_selection import RFE

In [13]:
# best performance obtained with n_features_to_select=3
# Recursive feature elimination driven by a linear-kernel SVM
selector_rfe = RFE(
    estimator=SVC(kernel='linear'),
    n_features_to_select=3,
)
selector_rfe.fit(X_train, y_train)

RFE(estimator=SVC(kernel='linear'), n_features_to_select=3)

In [15]:
# Column names of the features retained by RFE (boolean mask over X's columns)
rfe = set(X.columns[selector_rfe.get_support()])
rfe

{'234', '363', '508'}

## RFECV (recursive feature elimination with cross validation)

In [16]:
from sklearn.feature_selection import RFECV

In [17]:
# Cross-validated recursive feature elimination with a linear-kernel SVM.
# TODO: try F1 as the scoring metric
selector_rfecv = RFECV(estimator=SVC(kernel='linear'))
selector_rfecv.fit(X_train, y_train)

RFECV(estimator=SVC(kernel='linear'))

In [18]:
# Column names of the features retained by RFECV (boolean mask over X's columns)
rfecv = set(X.columns[selector_rfecv.get_support()])
rfecv

{'112',
 '130',
 '140',
 '154',
 '185',
 '219',
 '234',
 '257',
 '308',
 '324',
 '363',
 '365',
 '386',
 '429',
 '450',
 '462',
 '474',
 '508'}

## Intersections

In [19]:
# The reference subset is only available as `sets.cideim`; bind it to a local
# name here so this cell does not rely on a variable left over from an
# earlier kernel session.
cideim = sets.cideim

# Pairwise overlaps between the feature subsets, as a lower-triangular table:
# each cell holds the features shared by the row subset and the column subset.
# NaN marks the pairs that are not computed (diagonal and upper triangle).
intersections = pd.DataFrame({
    'subset': ['cideim', 'f_classif', 'rfe', 'rfecv'],
    'cideim': [np.nan, cideim.intersection(f_classif), cideim.intersection(rfe), cideim.intersection(rfecv)],
    'f_classif': [np.nan, np.nan, f_classif.intersection(rfe), f_classif.intersection(rfecv)],
    'rfe': [np.nan, np.nan, np.nan, rfe.intersection(rfecv)],
})
intersections

Unnamed: 0,subset,cideim,anova,rfe
0,cideim,,,
1,anova,{505},,
2,rfe,{508},{},
3,rfecv,"{308, 429, 508}",{},"{234, 363, 508}"


## Auxiliary

In [20]:
# Translate features index to metabolites
def translation(features):
    """Map feature indices to metabolite names.

    Parameters
    ----------
    features : iterable
        Feature indices; each item must be convertible with ``int()``
        (e.g. the strings ``'39'``, ``'505'``).

    Returns
    -------
    list
        ``metabolites[int(f)]`` for every item of ``features``. Ordered
        inputs (lists, tuples, ...) keep their order. Sets and frozensets
        have no defined order, so they are sorted by numeric index to make
        the result reproducible across kernel sessions.
    """
    if isinstance(features, (set, frozenset)):
        # String hashing is randomised per process, so raw set iteration
        # order can change from one run to the next.
        features = sorted(features, key=int)
    return [metabolites[int(f)] for f in features]

In [21]:
# Metabolite names behind each selected subset, one line per method
print(
    f"F_CLASSIF:\t{translation(f_classif)}\n"
    f"RFE\t{translation(rfe)}\n"
    f"RFECV\t{translation(rfecv)}"
)

ANOVA:	['[FA (8:0)] octanoic acid', '[FA (7:0)] heptanoic acid', 'Stearoylcarnitine']
RFE	['Butanoic acid', 'Taurine', 'L-Glutamate']
RFECV	['Hypoxanthine', 'Butanoic acid', '[SP (16:0)] N-(hexadecanoyl)-sphing-4-enine-1-phosphocholine', '10-oxogeranial', 'N5-Ethyl-L-glutamine', 'PC(18:2(9Z_12Z)/18:1(9Z))', 'L-Proline', '4-Trimethylammoniobutanoate', 'Glu-Phe-Trp', 'Nicotinamide', '2-Octenoylcarnitine', 'allylcysteine', 'L-Glutamate', 'Octadecanoic acid', '1-Palmitoylglycerophosphocholine', 'L-Glutamine', 'Cys-Pro', 'Taurine']


In [22]:
# Metabolite names behind the non-empty overlaps:
#   1-3: cideim with f_classif / rfe / rfecv (rows 1-3 of the 'cideim' column)
#   4:   rfe with rfecv (row 3 of the 'rfe' column)
print(
    f"1:\t{translation(intersections.loc[1, 'cideim'])}\n"
    f"2:\t{translation(intersections.loc[2, 'cideim'])}\n"
    f"3:\t{translation(intersections.loc[3, 'cideim'])}\n"
    f"4:\t{translation(intersections.loc[3, 'rfe'])}"
)

1:	['Stearoylcarnitine']
2:	['Taurine']
3:	['Glu-Phe-Trp', 'N5-Ethyl-L-glutamine', 'Taurine']
4:	['Butanoic acid', 'L-Glutamate', 'Taurine']
