# Measuring subsets success (post)

In [2]:
import pandas as pd
import numpy as np

In [3]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## Metabolite subsets

In [43]:
# Fatty acyls and Sterol lipids are absent: they were not found in 'Supplementary Dataset.xlsx'.
# Each list holds the dataset column labels (as strings) that make up one subset.
cideim = ['67', '152', '212', '218', '263', '296', '308', '312', '429', '431', '437', '486', '505', '508', '509']
f_classif = ['39', '40', '505']
rfe = ['234', '363', '508']
rfecv = ['112', '130', '140', '154', '185', '219', '234', '257', '308', '324', '363', '365', '386', '429', '450', '462', '474', '508']
# Labels present in both cideim and rfecv, kept in cideim order. Derived rather
# than hard-coded so it cannot drift if either source list is edited.
inter = [label for label in cideim if label in rfecv]

## Define Pareto scaler

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin

In [6]:
class ParetoScaler(BaseEstimator, TransformerMixin):
    """Divide each DataFrame column by the square root of its standard deviation.

    The divisors are learned in ``fit`` and reused in ``transform``, so a scaler
    fitted on training data can be applied unchanged to held-out data. Columns
    are only divided; no mean-centering is performed.

    Attributes
    ----------
    factors : dict or None
        Maps each column label seen in ``fit`` to its divisor; ``None`` until
        ``fit`` has been called.
    """

    def __init__(self):
        self.factors = None

    def fit(self, X, y=None):
        """Compute one divisor per column of ``X``.

        Parameters
        ----------
        X : DataFrame-like
            Iterated by column label; each column must support ``.std()``.
        y : ignored
            Accepted so the scaler fits the scikit-learn transformer signature.

        Returns
        -------
        ParetoScaler
            This scaler, with ``factors`` populated.
        """
        factors = {}
        for col in X:
            factor = X[col].std() ** (1 / 2)
            # A constant column (std == 0) or a single-row frame (std is NaN)
            # would turn the whole column into inf/NaN on division; leave such
            # columns unscaled instead. `not factor > 0` catches both cases.
            if not factor > 0:
                factor = 1.0
            factors[col] = factor
        self.factors = factors
        return self

    def transform(self, X, y=None):
        """Return a scaled copy of ``X``; the input is left untouched.

        Parameters
        ----------
        X : DataFrame-like
            Every column label must have been present when ``fit`` was called.
        y : ignored
            Accepted so the scaler fits the scikit-learn transformer signature.

        Returns
        -------
        Same type as ``X.copy()``
            Copy of ``X`` with each column divided by its stored factor.

        Raises
        ------
        NotFittedError
            If ``fit`` has not been called yet.
        KeyError
            If ``X`` has a column that was not seen during ``fit``.
        """
        if self.factors is None:
            # Local import keeps this block self-contained.
            from sklearn.exceptions import NotFittedError

            raise NotFittedError(
                "This ParetoScaler instance is not fitted yet; call 'fit' before 'transform'."
            )
        X_ = X.copy()
        for col in X_:
            X_[col] /= self.factors[col]
        return X_

## Import dataset

In [8]:
# Load the dataset; the path is relative to the notebook's working directory.
dataset_path = "../data_sets/ds3.csv"
df = pd.read_csv(dataset_path)

## Encode target values

In [9]:
from sklearn.preprocessing import LabelEncoder

# Replace the class labels in column 'y' with integer codes. The fitted encoder
# is kept in `le` so the codes can be mapped back to the original labels.
le = LabelEncoder()
le.fit(df['y'])
df['y'] = le.transform(df['y'])

## Test-Train split

In [10]:
from sklearn.model_selection import train_test_split

# Separate the encoded target column from the remaining (feature) columns.
y = df['y']
X = df.drop(columns=['y'])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Scale

In [11]:
# Learn the scaling factors from the training rows only, then apply those same
# factors to both partitions.
# NOTE: this cell overwrites X_train / X_test, so running it a second time
# without re-running the split scales already-scaled data.
pareto = ParetoScaler()
pareto.fit(X_train)
X_train, X_test = pareto.transform(X_train), pareto.transform(X_test)

## SVM models

### cideim

In [12]:
# Default-parameter SVC trained on the cideim columns of the scaled training set.
cideim_model = SVC()
cideim_model.fit(X=X_train[cideim], y=y_train)

SVC()

In [13]:
# Predict the held-out labels using the same cideim columns the model was fitted on.
cideim_test_features = X_test[cideim]
cideim_predict = cideim_model.predict(cideim_test_features)

In [14]:
# Fraction of test samples whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=cideim_predict)

0.8571428571428571

### f_classif

In [15]:
# Default-parameter SVC trained on the f_classif columns of the scaled training set.
f_classif_model = SVC()
f_classif_model.fit(X=X_train[f_classif], y=y_train)

SVC()

In [16]:
# Predict the held-out labels using the same f_classif columns the model was fitted on.
f_classif_test_features = X_test[f_classif]
f_classif_predict = f_classif_model.predict(f_classif_test_features)

In [17]:
# Fraction of test samples whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=f_classif_predict)

0.7142857142857143

### rfe

In [18]:
# Default-parameter SVC trained on the rfe columns of the scaled training set.
rfe_model = SVC()
rfe_model.fit(X=X_train[rfe], y=y_train)

SVC()

In [19]:
# Predict the held-out labels using the same rfe columns the model was fitted on.
rfe_test_features = X_test[rfe]
rfe_predict = rfe_model.predict(rfe_test_features)

In [20]:
# Fraction of test samples whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=rfe_predict)

0.7142857142857143

### rfecv

In [21]:
# Default-parameter SVC trained on the rfecv columns of the scaled training set.
rfecv_model = SVC()
rfecv_model.fit(X=X_train[rfecv], y=y_train)

SVC()

In [22]:
# Predict the held-out labels using the same rfecv columns the model was fitted on.
rfecv_test_features = X_test[rfecv]
rfecv_predict = rfecv_model.predict(rfecv_test_features)

In [23]:
# Fraction of test samples whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=rfecv_predict)

0.5714285714285714

### intersections

In [44]:
# Default-parameter SVC trained on the inter columns of the scaled training set.
inter_model = SVC()
inter_model.fit(X=X_train[inter], y=y_train)

SVC()

In [45]:
# Predict the held-out labels using the same inter columns the model was fitted on.
inter_test_features = X_test[inter]
inter_predict = inter_model.predict(inter_test_features)

In [46]:
# Fraction of test samples whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=inter_predict)

0.8571428571428571