In [1]:
import pandas as pd
import numpy as np
import sys
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from tpot.config import classifier_config_dict

# Folder that holds the input table for this analysis.
path = 'TPOT_COV'

# Read the tab-separated table with its first column as the index, then
# transpose it so that the original columns become the rows.
tpot_data = pd.read_csv(
    path + '/annotDataKeggCov.txt',
    sep='\t',
    index_col=0,
).T

# Separate the 'diagnosis' target from the remaining feature columns.
Ydata = tpot_data['diagnosis']
Xdata = tpot_data.drop(columns='diagnosis')

# The combined table is no longer needed once it has been split.
del tpot_data

# Quick preview of the first rows of the feature matrix.
print(Xdata.head())

gene_id    ENSG00000000419.8  ENSG00000000938.8  ENSG00000000971.11  \
2014-2194          -1.165569           0.513416            0.395402   
2014-2195          -1.051649           0.604444            0.876143   
2014-2196          -2.156356          -0.153699            0.267843   
2014-2197          -2.212747           0.447874            0.627974   
2014-2198          -0.468046          -0.172765            0.920823   

gene_id    ENSG00000001036.9  ENSG00000001084.6  ENSG00000001167.10  \
2014-2194          -1.353226           1.637868            1.009720   
2014-2195          -0.355030           1.895917            1.022767   
2014-2196          -1.632926           1.379626            1.067571   
2014-2197          -1.136629           0.549276            0.267843   
2014-2198          -1.537285           1.093152            1.001117   

gene_id    ENSG00000001617.7  ENSG00000001626.10  ENSG00000001630.11  \
2014-2194           0.334377           -0.767900           -0.742876   
20

In [2]:
# List the feature column labels; the printed index shows Ensembl gene IDs
# followed by the 'CMC', 'LIBD_szControl', 'sex' and 'ethnicity' columns.
print(Xdata.columns)

Index(['ENSG00000000419.8', 'ENSG00000000938.8', 'ENSG00000000971.11',
       'ENSG00000001036.9', 'ENSG00000001084.6', 'ENSG00000001167.10',
       'ENSG00000001617.7', 'ENSG00000001626.10', 'ENSG00000001630.11',
       'ENSG00000002016.12',
       ...
       'ENSG00000270408.1', 'ENSG00000271625.1', 'ENSG00000271885.1',
       'ENSG00000272047.1', 'ENSG00000272658.1', 'ENSG00000273079.1', 'CMC',
       'LIBD_szControl', 'sex', 'ethnicity'],
      dtype='object', name='gene_id', length=4956)


In [3]:


# Hyper-parameter grid for the GradientBoostingRegressor wrapped by MetaEstimator.
_gbr_param_grid = {
    'n_estimators': [100],
    'loss': ["ls", "lad", "huber", "quantile"],
    'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.],
    'max_depth': range(1, 11),
    'min_samples_split': range(2, 21),
    'min_samples_leaf': range(1, 21),
    'subsample': np.arange(0.05, 1.01, 0.05),
    'max_features': np.arange(0.05, 1.01, 0.05),
    'alpha': [0.75, 0.8, 0.85, 0.9, 0.95, 0.99]
}

# Options for the gene-set selection step.
_feature_selector_space = {
    # File describing the candidate gene sets.
    'subset_list': [path + '/c2.cp.kegg.v7.0.ens.red.csv'],
    # Candidate gene-set indices 0..185; each pipeline uses one of them.
    'sel_subset': range(186),
    # Columns that should not be excluded by FeatureSetSelector.
    'res_cols': [['CMC', 'LIBD_szControl', 'sex', 'ethnicity']]
}

# Options for the estimator step. 'A' and 'C' each offer a single choice of
# column names that is handed to MetaEstimator.
# NOTE(review): the exact role of 'A' vs. 'C' is defined by MetaEstimator in
# this TPOT build -- confirm against its documentation.
_meta_estimator_space = {
    'estimator': {
        'sklearn.ensemble.GradientBoostingRegressor': _gbr_param_grid,
    },
    'A': [['CMC', 'ethnicity']],
    'C': [['LIBD_szControl', 'sex']]
}

# Custom TPOT search space: one selector operator and one estimator operator.
tpot_config = {
    'tpot.builtins.FeatureSetSelector': _feature_selector_space,
    'tpot.builtins.MetaEstimator': _meta_estimator_space,
}


# Run settings. `seed` is the single source of randomness for this cell: it
# drives the train/test split, the name of the saved label file, and the TPOT
# search, so changing it here changes all three consistently.
accuracy_ls = []
n_gen = 5    # number of TPOT generations
n_pop = 25   # TPOT population size
seed = 42

# Stratified 75% / 25% train/test split on the diagnosis labels.
X_train, X_test, y_train, y_test = train_test_split(
    Xdata,
    Ydata,
    random_state=int(seed),
    train_size=0.75,
    test_size=0.25,
    stratify=Ydata,
)

# Persist the training labels, tagged with the seed that produced the split.
y_train.to_csv(path + '/ytrain_' + str(seed) + '.txt', sep='\t', header=True)

# Configure the search over the custom operator set in `tpot_config`.
# random_state uses `seed` (previously a hard-coded 42) so that the search
# always matches the split and the saved file name.
tpot = TPOTClassifier(
    generations=n_gen,
    population_size=n_pop,
    verbosity=3,
    cv=5,
    config_dict=tpot_config,
    template="FeatureSetSelector-Classifier",
    scoring="balanced_accuracy",
    random_state=int(seed),
)
tpot.fit(X_train, y_train)

2 operators have been imported by TPOT.


A Jupyter Widget

Generation 1 - Current Pareto front scores:
-2	0.6155040838214669	MetaEstimator(FeatureSetSelector(input_matrix, FeatureSetSelector__res_cols=['CMC', 'LIBD_szControl', 'sex', 'ethnicity'], FeatureSetSelector__sel_subset=153, FeatureSetSelector__subset_list=TPOT_COV/c2.cp.kegg.v7.0.ens.red.csv), MetaEstimator__A=['CMC', 'ethnicity'], MetaEstimator__C=['LIBD_szControl', 'sex'], MetaEstimator__GradientBoostingRegressor__alpha=0.99, MetaEstimator__GradientBoostingRegressor__learning_rate=0.001, MetaEstimator__GradientBoostingRegressor__loss=ls, MetaEstimator__GradientBoostingRegressor__max_depth=7, MetaEstimator__GradientBoostingRegressor__max_features=0.05, MetaEstimator__GradientBoostingRegressor__min_samples_leaf=7, MetaEstimator__GradientBoostingRegressor__min_samples_split=3, MetaEstimator__GradientBoostingRegressor__n_estimators=100, MetaEstimator__GradientBoostingRegressor__subsample=0.1)

Generation 2 - Current Pareto front scores:
-2	0.65251835657124	MetaEstimator(FeatureSetSelect

TPOTClassifier(config_dict={'tpot.builtins.FeatureSetSelector': {'res_cols': [['CMC',
                                                                               'LIBD_szControl',
                                                                               'sex',
                                                                               'ethnicity']],
                                                                 'sel_subset': range(0, 186),
                                                                 'subset_list': ['TPOT_COV/c2.cp.kegg.v7.0.ens.red.csv']},
                            'tpot.builtins.MetaEstimator': {'A': [['CMC',
                                                                   'ethnicity']],
                                                            'C': [['LIBD_szControl',
                                                                   'sex']],
                                                            'estimator': {'sklearn.ensemble.GradientBoo

In [5]:
# Display the pipeline stored on the fitted TPOT object; its repr lists a
# 'featuresetselector' step followed by a 'metaestimator' step.
tpot.fitted_pipeline_

Pipeline(memory=None,
         steps=[('featuresetselector',
                 FeatureSetSelector(res_cols=['CMC', 'LIBD_szControl', 'sex',
                                              'ethnicity'],
                                    sel_subset=169,
                                    subset_list='TPOT_COV/c2.cp.kegg.v7.0.ens.red.csv')),
                ('metaestimator',
                 MetaEstimator(A=['CMC', 'ethnicity'],
                               C=['LIBD_szControl', 'sex'],
                               estimator=GradientBoostingRegressor(alpha=0.8,
                                                                   criterion='friedman_mse',
                                                                   init=None,...
                                                                   loss='lad',
                                                                   max_depth=5,
                                                                   max_features=0.55,
             

In [8]:
# steps[0] is the ('featuresetselector', <selector>) pair of the fitted
# pipeline, so [1] is the selector object itself; print the name of the gene
# set it ended up using.
print("Name of selected features set", tpot.fitted_pipeline_.steps[0][1].sel_subset_name)

Name of selected features set KEGG_SYSTEMIC_LUPUS_ERYTHEMATOSUS
