# Optimal metabolite subset

In [1]:
# Import libraries
import pandas as pd
import numpy as np

## Import datasets

In [2]:
# Columns are metabolites pre and pos, 35 patients
ds1 = pd.read_csv("../data_sets/ds1.csv")
# Columns are metabolites, 35 patients (pre)
ds2 = pd.read_csv("../data_sets/ds2.csv")
# Columns are metabolites, 70 patients (post)
ds3 = pd.read_csv("../data_sets/ds3.csv")
# List of metabolites names
metabolites = pd.read_excel("../data_sets/Supplementary_Dataset_S1.xlsx")['Metabolite']

In [3]:
df = ds3

## Define pareto scaler

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin

In [5]:
class ParetoScaler(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.factors = None
        
    def fit(self, X, y=None):
        self.factors = {col:X[col].std()**(1/2) for col in X}
        return self
    
    def transform(self, X, y=None):
        X_ = X.copy()
        for col in X_:
            X_[col] /= self.factors[col]
        return X_

## Encode target values

In [6]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['y'] = le.fit_transform(df['y'])
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,527,528,529,530,531,532,533,534,535,y
0,12781.16016,13023.61719,7637.846191,4312.572266,36301.68359,6.867976e+05,85113.781250,4290.965332,2.709745e+05,41511.82422,...,624578.1875,79408.01563,26254.31250,7415.603027,1575.383423,3858.673096,12726.140630,2731.017578,2873.173340,0
1,13227.25391,23967.87695,15493.653320,2922.394531,80539.66406,4.620728e+05,84456.882810,5276.038574,8.744241e+04,39890.93359,...,796816.9375,99684.13281,33561.46094,4947.357422,4676.434570,4950.031250,12599.176760,5660.225586,2608.797119,0
2,10122.60547,12945.88867,10413.718750,7928.317871,42504.90625,6.120502e+05,43188.925780,4376.491211,4.880815e+05,38950.03516,...,714536.3125,102415.13280,28715.66602,5563.428223,3275.547607,8176.732910,16635.501950,3074.649170,1528.793335,0
3,13343.74707,13922.31250,12080.819340,7304.219727,48926.32422,5.544005e+05,93529.210940,5212.500488,1.081802e+06,38103.93750,...,652670.3125,109246.76560,25427.25000,3065.382813,2606.009521,4635.400391,8650.053711,4626.319824,2046.959595,0
4,11886.78418,27445.01172,9575.403320,3838.025879,7354.43457,3.840624e+04,4832.499023,5291.332031,2.550203e+05,44519.20313,...,761062.0625,133911.21880,27870.75391,3764.118164,3745.776855,7395.415039,5185.240723,1666.588257,1930.175171,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,11616.59961,13477.42285,28147.134770,4870.213867,69137.07813,1.372612e+05,8845.105469,3209.911133,2.121939e+05,48808.90234,...,609592.9375,75933.73438,30589.31836,2659.402832,4198.503418,8442.726563,12513.836910,6563.433594,1447.444580,1
66,18506.74609,14005.15137,10854.259770,3625.831543,29586.16016,1.905000e+05,22534.166020,5194.289551,5.381889e+05,49784.36328,...,732354.8125,62860.12109,34528.49609,7579.631836,3083.323730,2305.873047,16258.588870,9714.000000,1560.546631,1
67,14581.37012,11003.54102,17422.916020,4132.967285,84643.10156,1.276168e+06,116420.703100,4267.375977,1.079372e+05,24779.61914,...,455068.5000,66967.70313,31927.18750,8054.373047,3687.551758,7440.108398,12482.334960,2542.672119,501.314331,1
68,11871.04297,38053.35156,12079.410160,871.309875,27565.69336,2.726918e+05,21009.755860,3048.367676,3.279277e+05,13730.99512,...,607863.0000,92825.35938,22965.94336,8883.898438,1661.758789,8712.309570,10968.892580,2186.223389,1426.884644,1


## Data set split

In [7]:
from sklearn.model_selection import train_test_split

# Input data
X = df.drop(columns=['y'])

# Target variable
y = df['y']

# Split dataset in training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Lasso model

In [8]:
# Import lasso regression from sklearn
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Lasso

In [9]:
num_vars = df.columns.tolist()[:-1]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', ParetoScaler(), num_vars)
    ]
)

In [10]:
lasso_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('lasso', Lasso(alpha=0.0001))
])

In [11]:
lasso_model.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', ParetoScaler(),
                                                  ['0', '1', '2', '3', '4', '5',
                                                   '6', '7', '8', '9', '10',
                                                   '11', '12', '13', '14', '15',
                                                   '16', '17', '18', '19', '20',
                                                   '21', '22', '23', '24', '25',
                                                   '26', '27', '28', '29', ...])])),
                ('lasso', Lasso(alpha=0.0001))])

### Optimize $\alpha$

In [12]:
#Import validation_curve from sklearn

from sklearn.model_selection import validation_curve

In [None]:
# Validation curve implementation

alpha = np.array([10**(-i) for i in range(5)])
score_train, score_test = validation_curve(
    lasso_model, X, y,
    param_name='lasso__alpha', param_range=alpha, cv=5, scoring='r2'
)

In [43]:
# Select the best alpha value

indx = score_test.mean(axis=1).argmax()
best_alpha = alpha[indx]
best_alpha

0.0001

### Extract heaviest features
This is done by intersecting the heaviest coeficients of the model over multiple iterations

In [20]:
subset = set(range(1072))
iterations = 50

for _ in range(iterations):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    lasso_model.fit(X_train, y_train)
    coef = list(enumerate(lasso_model.named_steps['lasso'].coef_))
    coef.sort(key=lambda x: x[1], reverse=True)
    best_meta = {m for m,_ in coef[:200]}
    subset = subset.intersection(best_meta)

In [21]:
subset = list(subset)[:13]
subset.sort()
subset[:13]

[0, 14, 21, 26, 36, 37, 40, 41, 42, 72, 80, 93, 401]

### Map features to metabolites

In [22]:
opt_subset_lasso = [metabolites[i] for i in subset]
opt_subset_lasso

['&gamma;-thiomethyl glutamate',
 '(S)-Dihydroorotate',
 '[FA (11:2)] 4_10-undecadiynal',
 '[FA (17:0)] heptadecanoic acid',
 '[FA (22:4)] 7Z_10Z_13Z_16Z-docosatetraenoic acid',
 '[FA (26:0/2:0)] Hexacosanedioic acid',
 '[FA (8:0)] octanoic acid',
 '[FA (8:1)] 2Z-octenoic acid',
 '[FA amino(8:0)] 3-amino-octanoic acid',
 '[GP (18:0)] 1-octadecanoyl-2-sn-glycero-3-phosphate',
 '[PC (16:0/18:2)] 1-hexadecanoyl-2-(9Z_12Z-octadecadienoyl)-sn-glycero-3-phosphocholine',
 '[PC (18:1/22:6)] 1-(1Z-octadecenyl)-2-(4Z_7Z_10Z_13Z_16Z_19Z-docosahexaenoyl)-sn-glycero-3-phosphocholine',
 'Lys-Ser-Tyr']

## ANOVA correlation coefficient

In [23]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

In [24]:
fs = SelectKBest(score_func=f_classif, k=13)
fs.fit(X, y)
opt_subset_anova = [metabolites[int(i)] for i in fs.get_feature_names_out()]

In [25]:
opt_subset_anova

['[FA amino(8:0)] 3-amino-octanoic acid',
 '[FA methyl_hydroxy_oxo(5:2/4:0)] methyl 4-[2-(2-formyl-vinyl)-3-hydroxy-5-oxo-cyclopentyl]-butanoate',
 '[FA] Methyl jasmonate',
 '[PE (18:0/20:2)] 1-octadecanoyl-2-(11Z_14Z-eicosadienoyl)-sn-glycero-3-phosphoethanolamine',
 '2-Hydroxypyridine',
 '2-Oxoglutarate',
 'Ala-Lys-Ser-Arg',
 'D-Erythrose',
 'L-Methionine S-oxide',
 'N5-Ethyl-L-glutamine',
 'N6-Methyl-L-lysine',
 'N-Acetylglutamine',
 'Taurine']

In [27]:
print(subset)
print(fs.get_feature_names_out())

[0, 14, 21, 26, 36, 37, 40, 41, 42, 72, 80, 93, 401]
['42' '55' '67' '100' '152' '157' '212' '263' '380' '429' '431' '437'
 '508']


In [31]:
lasso_set = set(subset)
anova_set = set(map(int,fs.get_feature_names_out()))
lasso_set.intersection(anova_set)

{42}