# Measuring the success of feature subsets

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

## Metabolite subsets

In [3]:
# Fatty acyls and Sterol lipids are absent: they were not found in
# 'Supplementary Dataset.xlsx'.
#
# Each list holds column labels (as strings) used further down to select
# columns of the train/test frames.
# NOTE(review): the list names presumably identify the selection method that
# produced each subset -- confirm against the feature-selection notebooks.
cideim = [
    '67', '152', '212', '218', '263', '296', '308', '312',
    '429', '431', '437', '486', '505', '508', '509',
]
anova = [
    '21', '42', '72', '85', '221',
    '263', '375', '431', '482', '508',
]
rfe = [
    '8', '80', '85', '154', '220', '227',
    '314', '384', '389', '421', '429', '508',
]
rfecv = ['85', '227', '508']
rfecv_complete = ['283', '314', '372', '468', '508']

## Define Pareto scaler

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin

In [5]:
class ParetoScaler(BaseEstimator, TransformerMixin):
    """Divide each DataFrame column by the square root of its standard deviation.

    The per-column factors are learned in ``fit`` and stored in ``factors``
    (a ``dict`` mapping column label -> divisor), then applied in
    ``transform``. Input is expected to be a pandas ``DataFrame`` because
    columns are accessed by label.

    NOTE(review): textbook Pareto scaling also mean-centres each column before
    dividing; this transformer only divides. Confirm whether centring is
    wanted before changing it, since it would alter downstream results.
    """

    def __init__(self):
        # Populated by fit(); None means "not fitted yet".
        self.factors = None

    def fit(self, X, y=None):
        """Learn one scaling factor per column of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Training data; every column must be numeric.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        factors = {}
        for col in X:
            factor = X[col].std() ** 0.5
            # A constant column has std 0 and a single-row frame has std NaN.
            # Dividing by either would fill the column with inf/NaN, so such
            # columns are left unscaled instead.
            if not factor > 0:
                factor = 1.0
            factors[col] = factor
        self.factors = factors
        return self

    def transform(self, X, y=None):
        """Return a scaled copy of ``X``; the input frame is not modified.

        Every column of ``X`` must have been present during ``fit``;
        otherwise the factor lookup raises ``KeyError``.
        """
        X_ = X.copy()
        for col in X_:
            X_[col] = X_[col] / self.factors[col]
        return X_

## Import dataset

In [17]:
# Load the dataset. The path is relative, so it is resolved against the
# current working directory. The frame must contain a 'y' column, which the
# following cells encode and use as the target.
df = pd.read_csv("../data_sets/ds2.csv")

## Encode target values

In [18]:
from sklearn.preprocessing import LabelEncoder
# Replace the class labels in 'y' with integer codes. The fitted encoder is
# kept in `le` so the codes can be mapped back with le.inverse_transform().
le = LabelEncoder()
df['y'] = le.fit_transform(df['y'])

## Test-Train split

In [40]:
from sklearn.model_selection import train_test_split

# Target vector: the encoded class column.
y = df['y']

# Feature matrix: every column except the target.
X = df.drop(columns=['y'])

# Hold out 20 % of the rows for testing; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.2
)

## SVM models

### cideim

In [10]:
# Take this subset's columns from the full feature matrix `X`, using the row
# labels of the existing split (y_train / y_test). Selecting from `X` rather
# than from the current X_train / X_test keeps this cell runnable no matter
# which columns those frames currently hold.
X_train, X_test = X.loc[y_train.index, cideim], X.loc[y_test.index, cideim]

In [11]:
# Pareto-scale the columns of the cideim subset. ColumnTransformer drops any
# column that is not listed (remainder='drop' is its default).
num_vars = cideim
preprocessor = ColumnTransformer([('num', ParetoScaler(), num_vars)])

In [12]:
# Scaling followed by a support-vector classifier with default settings.
cideim_model = Pipeline([('preprocessor', preprocessor), ('svm', SVC())])

In [13]:
# Fit the scaler and the SVC on the training rows only.
cideim_model.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', ParetoScaler(),
                                                  ['67', '152', '212', '218',
                                                   '263', '296', '308', '312',
                                                   '429', '431', '437', '486',
                                                   '505', '508', '509'])])),
                ('svm', SVC())])

In [14]:
# Predict labels for the held-out rows with the fitted pipeline.
cideim_predict = cideim_model.predict(X_test)

In [15]:
# Fraction of held-out samples whose predicted label equals the true label.
accuracy_score(y_test, cideim_predict)

0.5714285714285714

### anova

In [20]:
# Take this subset's columns from the full feature matrix `X`, using the row
# labels of the existing split (y_train / y_test). Indexing the current
# X_train / X_test would raise KeyError whenever they have already been
# narrowed to a different subset's columns.
X_train, X_test = X.loc[y_train.index, anova], X.loc[y_test.index, anova]

In [21]:
# Pareto-scale the columns of the anova subset. ColumnTransformer drops any
# column that is not listed (remainder='drop' is its default).
num_vars = anova
preprocessor = ColumnTransformer([('num', ParetoScaler(), num_vars)])

In [22]:
# Scaling followed by a support-vector classifier with default settings.
anova_model = Pipeline([('preprocessor', preprocessor), ('svm', SVC())])

In [23]:
# Fit the scaler and the SVC on the training rows only.
anova_model.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', ParetoScaler(),
                                                  ['21', '42', '72', '85',
                                                   '221', '263', '375', '431',
                                                   '482', '508'])])),
                ('svm', SVC())])

In [24]:
# Predict labels for the held-out rows with the fitted pipeline.
anova_predict = anova_model.predict(X_test)

In [25]:
# Fraction of held-out samples whose predicted label equals the true label.
accuracy_score(y_test, anova_predict)

0.5714285714285714

### rfe

In [27]:
# Take this subset's columns from the full feature matrix `X`, using the row
# labels of the existing split (y_train / y_test). Indexing the current
# X_train / X_test would raise KeyError whenever they have already been
# narrowed to a different subset's columns.
X_train, X_test = X.loc[y_train.index, rfe], X.loc[y_test.index, rfe]

In [28]:
# Pareto-scale the columns of the rfe subset. ColumnTransformer drops any
# column that is not listed (remainder='drop' is its default).
num_vars = rfe
preprocessor = ColumnTransformer([('num', ParetoScaler(), num_vars)])

In [29]:
# Scaling followed by a support-vector classifier with default settings.
rfe_model = Pipeline([('preprocessor', preprocessor), ('svm', SVC())])

In [30]:
# Fit the scaler and the SVC on the training rows only.
rfe_model.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', ParetoScaler(),
                                                  ['8', '80', '85', '154',
                                                   '220', '227', '314', '384',
                                                   '389', '421', '429',
                                                   '508'])])),
                ('svm', SVC())])

In [31]:
# Predict labels for the held-out rows with the fitted pipeline.
rfe_predict = rfe_model.predict(X_test)

In [32]:
# Fraction of held-out samples whose predicted label equals the true label.
accuracy_score(y_test, rfe_predict)

0.5714285714285714

### rfecv

In [34]:
# Take this subset's columns from the full feature matrix `X`, using the row
# labels of the existing split (y_train / y_test). Indexing the current
# X_train / X_test would raise KeyError whenever they have already been
# narrowed to a different subset's columns.
X_train, X_test = X.loc[y_train.index, rfecv], X.loc[y_test.index, rfecv]

In [35]:
# Pareto-scale the columns of the rfecv subset. ColumnTransformer drops any
# column that is not listed (remainder='drop' is its default).
num_vars = rfecv
preprocessor = ColumnTransformer([('num', ParetoScaler(), num_vars)])

In [36]:
# Scaling followed by a support-vector classifier with default settings.
rfecv_model = Pipeline([('preprocessor', preprocessor), ('svm', SVC())])

In [37]:
# Fit the scaler and the SVC on the training rows only.
rfecv_model.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', ParetoScaler(),
                                                  ['85', '227', '508'])])),
                ('svm', SVC())])

In [38]:
# Predict labels for the held-out rows with the fitted pipeline.
rfecv_predict = rfecv_model.predict(X_test)

In [39]:
# Fraction of held-out samples whose predicted label equals the true label.
accuracy_score(y_test, rfecv_predict)

0.7142857142857143

### rfecv_complete

In [41]:
# Take this subset's columns from the full feature matrix `X`, using the row
# labels of the existing split (y_train / y_test). Indexing the current
# X_train / X_test would raise KeyError whenever they have already been
# narrowed to a different subset's columns.
X_train, X_test = (
    X.loc[y_train.index, rfecv_complete],
    X.loc[y_test.index, rfecv_complete],
)

In [42]:
# Pareto-scale the columns of the rfecv_complete subset. ColumnTransformer
# drops any column that is not listed (remainder='drop' is its default).
num_vars = rfecv_complete
preprocessor = ColumnTransformer([('num', ParetoScaler(), num_vars)])

In [43]:
# Scaling followed by a support-vector classifier with default settings.
rfecv_complete_model = Pipeline(
    [('preprocessor', preprocessor), ('svm', SVC())]
)

In [44]:
# Fit the scaler and the SVC on the training rows only.
rfecv_complete_model.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', ParetoScaler(),
                                                  ['283', '314', '372', '468',
                                                   '508'])])),
                ('svm', SVC())])

In [45]:
# Predict labels for the held-out rows with the fitted pipeline.
rfecv_complete_predict = rfecv_complete_model.predict(X_test)

In [46]:
# Fraction of held-out samples whose predicted label equals the true label.
accuracy_score(y_test, rfecv_complete_predict)

0.5714285714285714