In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score

In [4]:
# Load the tab-separated input table and preview its first rows.
X = pd.read_csv(
    "testingData/fssTest/SchizoExpr_resAdjTpotInput.txt",
    sep="\t",
)
X.head()

Unnamed: 0,subject,ENSG00000000419.8,ENSG00000000938.8,ENSG00000000971.11,ENSG00000001036.9,ENSG00000001084.6,ENSG00000001167.10,ENSG00000001617.7,ENSG00000001626.10,ENSG00000001630.11,...,indicator_1,adjY_1,indicator_2,adjY_2,indicator_3,adjY_3,indicator_4,adjY_4,indicator_5,adjY_5
0,2014-2194,-1.165569,0.513416,0.395402,-1.353226,1.637868,1.00972,0.334377,-0.7679,-0.742876,...,0,-0.285673,0,-0.297187,0,-0.275449,0,-0.269905,0,-0.275531
1,2014-2195,-1.051649,0.604444,0.876143,-0.35503,1.895917,1.022767,-0.035038,-0.685936,-0.750591,...,0,-0.285673,1,-0.297187,0,-0.275449,0,-0.269905,0,-0.275531
2,2014-2196,-2.156356,-0.153699,0.267843,-1.632926,1.379626,1.067571,0.132726,-1.10502,-1.403541,...,0,-0.256852,0,-0.254031,1,-0.245011,0,-0.229888,1,-0.272893
3,2014-2197,-2.212747,0.447874,0.627974,-1.136629,0.549276,0.267843,-0.615382,-1.673632,-0.61225,...,0,-0.285673,0,-0.297187,0,-0.275449,0,-0.269905,0,-0.275531
4,2014-2198,-0.468046,-0.172765,0.920823,-1.537285,1.093152,1.001117,1.002188,-0.779275,-0.222382,...,0,-0.256852,1,-0.254031,0,-0.245011,0,-0.229888,1,-0.272893


In [5]:
# Inspect the column labels of the loaded frame before picking out the target
# and the columns that later cells treat specially.
X.columns

Index(['subject', 'ENSG00000000419.8', 'ENSG00000000938.8',
       'ENSG00000000971.11', 'ENSG00000001036.9', 'ENSG00000001084.6',
       'ENSG00000001167.10', 'ENSG00000001617.7', 'ENSG00000001626.10',
       'ENSG00000001630.11',
       ...
       'indicator_1', 'adjY_1', 'indicator_2', 'adjY_2', 'indicator_3',
       'adjY_3', 'indicator_4', 'adjY_4', 'indicator_5', 'adjY_5'],
      dtype='object', length=4966)

In [6]:
# Split the loaded table into the regression target and the feature matrix.
# `y` is taken as an explicit copy so it does not alias the frame, and the
# frame is rebound to a new object instead of being mutated in place
# (`inplace=True` brings no speed-up and silently changes the object that the
# load cell displayed).
y = X['adjY'].copy()

# 'subject' is dropped together with the target column, so neither is passed
# to the model as a feature.
X = X.drop(columns=['adjY', 'subject'])
X.head()

Unnamed: 0,ENSG00000000419.8,ENSG00000000938.8,ENSG00000000971.11,ENSG00000001036.9,ENSG00000001084.6,ENSG00000001167.10,ENSG00000001617.7,ENSG00000001626.10,ENSG00000001630.11,ENSG00000002016.12,...,indicator_1,adjY_1,indicator_2,adjY_2,indicator_3,adjY_3,indicator_4,adjY_4,indicator_5,adjY_5
0,-1.165569,0.513416,0.395402,-1.353226,1.637868,1.00972,0.334377,-0.7679,-0.742876,0.538746,...,0,-0.285673,0,-0.297187,0,-0.275449,0,-0.269905,0,-0.275531
1,-1.051649,0.604444,0.876143,-0.35503,1.895917,1.022767,-0.035038,-0.685936,-0.750591,0.231701,...,0,-0.285673,1,-0.297187,0,-0.275449,0,-0.269905,0,-0.275531
2,-2.156356,-0.153699,0.267843,-1.632926,1.379626,1.067571,0.132726,-1.10502,-1.403541,1.776993,...,0,-0.256852,0,-0.254031,1,-0.245011,0,-0.229888,1,-0.272893
3,-2.212747,0.447874,0.627974,-1.136629,0.549276,0.267843,-0.615382,-1.673632,-0.61225,0.301622,...,0,-0.285673,0,-0.297187,0,-0.275449,0,-0.269905,0,-0.275531
4,-0.468046,-0.172765,0.920823,-1.537285,1.093152,1.001117,1.002188,-0.779275,-0.222382,1.225208,...,0,-0.256852,1,-0.254031,0,-0.245011,0,-0.229888,1,-0.272893


In [7]:
import re
# Collect, in column order, every column whose name starts with 'indicator'
# or 'adjY'; the list is reused below as part of FeatureSetSelector's
# `res_cols` option.
indicators = [
    name
    for name in map(str, X.columns)
    if any(re.match(pattern, name) for pattern in (r'^indicator', r'^adjY'))
]
indicators

['indicator_1',
 'adjY_1',
 'indicator_2',
 'adjY_2',
 'indicator_3',
 'adjY_3',
 'indicator_4',
 'adjY_4',
 'indicator_5',
 'adjY_5']

In [8]:
# Check that the resAdj CV splitter and MSE scorer can be imported from tpot.builtins
from tpot.builtins import resAdjPredefinedSplits
from tpot.builtins import resAdjMseScorer


In [9]:
from tpot.config import resAdj_regressor_config_dict
from tpot import TPOTRegressor

# Search budget handed to TPOTRegressor as `generations` / `population_size`.
n_gen = 5
n_pop = 10

# columns that should not be excluded by featuresetselector
protected_cols = ['CMC', 'LIBD_szControl'] + indicators

# Register the two extra operators in the shared resAdj configuration
# (this mutates the dict imported from tpot.config).
resAdj_regressor_config_dict.update({
    'tpot.builtins.resAdjTransformer': {
        # NOTE(review): each list entry is a separate candidate value, so a
        # pipeline uses either 'CMC' or 'LIBD_szControl' for C, not both (the
        # run log shows `resAdjTransformer__C=CMC`) -- confirm this is intended.
        'C': ['CMC', 'LIBD_szControl'],
        'adj_list': ['testingData/fssTest/adj_list.csv'],
    },
    'tpot.builtins.FeatureSetSelector': {
        'subset_list': ['testingData/fssTest/c2.cp.kegg.v7.0.ens.red.csv'],
        # presumably one index per feature set listed in the subset file -- TODO confirm 186
        'sel_subset': range(186),
        # a single candidate: the full list of protected columns
        'res_cols': [protected_cols],
    },
})

# Constructor arguments gathered in one place; the template fixes the pipeline
# shape to FeatureSetSelector -> resAdjTransformer -> Transformer -> Regressor.
tpot_kwargs = dict(
    generations=n_gen,
    population_size=n_pop,
    verbosity=3,
    cv=resAdjPredefinedSplits(n_splits=5),
    config_dict=resAdj_regressor_config_dict,
    template="FeatureSetSelector-resAdjTransformer-Transformer-Regressor",
    scoring=resAdjMseScorer,
    random_state=42,
)
tpot = TPOTRegressor(**tpot_kwargs)
tpot.fit(X, y)

32 operators have been imported by TPOT.


HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=60.0, style=ProgressStyle(des…

_pre_test decorator: _random_mutation_operator: num_test=0 manhattan was provided as affinity. Ward can only work with euclidean distances..
Generation 1 - Current Pareto front scores:
-4	-0.2124740795656098	resAdjRidgeCV(resAdjNormalizer(resAdjTransformer(FeatureSetSelector(input_matrix, FeatureSetSelector__res_cols=['CMC', 'LIBD_szControl', 'indicator_1', 'adjY_1', 'indicator_2', 'adjY_2', 'indicator_3', 'adjY_3', 'indicator_4', 'adjY_4', 'indicator_5', 'adjY_5'], FeatureSetSelector__sel_subset=102, FeatureSetSelector__subset_list=testingData/fssTest/c2.cp.kegg.v7.0.ens.red.csv), resAdjTransformer__C=CMC, resAdjTransformer__adj_list=testingData/fssTest/adj_list.csv), resAdjNormalizer__norm=l2))

Generation 2 - Current Pareto front scores:
-4	-0.21062391640528405	resAdjElasticNetCV(resAdjStandardScaler(resAdjTransformer(FeatureSetSelector(input_matrix, FeatureSetSelector__res_cols=['CMC', 'LIBD_szControl', 'indicator_1', 'adjY_1', 'indicator_2', 'adjY_2', 'indicator_3', 'adjY_3', 'ind

TPOTRegressor(config_dict={'tpot.builtins.FeatureSetSelector': {'res_cols': [['CMC',
                                                                              'LIBD_szControl',
                                                                              'indicator_1',
                                                                              'adjY_1',
                                                                              'indicator_2',
                                                                              'adjY_2',
                                                                              'indicator_3',
                                                                              'adjY_3',
                                                                              'indicator_4',
                                                                              'adjY_4',
                                                                              'indicator_5',
  