In [1]:
import pandas as pd


## Aim
This notebook gives a brief overview of how to use the evolutionary-sampling-powered ensemble models developed as part of the EvoML research project. 

The notebook will be made more verbose if time permits. The priority is to showcase the flexible API of the new estimators, which encourages research and tinkering. 

## Contents
 - Subsampling
 - Subspacing

### 1. Subsampling - Sampling in the example space - rows will be mutated and evolved.


In [2]:
from evoml.subsampling import BasicSegmenter_FEMPO, BasicSegmenter_FEGT, BasicSegmenter_FEMPT

In [3]:
# Read the ozone dataset from a path relative to the working directory.
ozone_csv_path = 'datasets/ozone.csv'
df = pd.read_csv(ozone_csv_path)

In [4]:
# Show the first rows to check the column layout.
n_preview_rows = 2
df.head(n_preview_rows)

Unnamed: 0,temp,invHt,press,vis,milPress,hum,invTemp,wind,output
0,0.220588,0.528124,0.25,0.714286,0.619048,0.121622,0.313725,0.190476,3
1,0.294118,0.097975,0.255682,0.285714,0.603175,0.243243,0.428571,0.142857,5


In [5]:
# Features: every column except the last one. Target: the 'output' column.
X = df.iloc[:, :-1]
y = df['output']

In [6]:
# Print the class docstring to show the available parameters and attributes.
segmenter_doc = BasicSegmenter_FEGT.__doc__
print(segmenter_doc)


    Uses basic evolutionary algorithm to find the best subsets of X and trains
    Linear Regression on each subset. For given row of input, prediction
    is based on the model trained on segment closest to input.

    Same as the BasicSegmenter, but uses list of thrained models instead of DataFrames
    as each individual. Done to boost performance. 

    Parameters
    ----------
    n : Integer, optional, default, 10
        The number of segments you want in your dataset.
    
    base_estimator: estimator, default, LinearRegression
        The basic estimator for all segments.

    test_size : float, optional, default, 0.2
        Test size that the algorithm internally uses in its 
        fitness function.

    n_population : Integer, optional, default, 30
        The number of ensembles present in population.

    init_sample_percentage : float, optional, default, 0.2
    

    Attributes
    -----------
    best_enstimator_ : estimator 
    
    segments_ : list of DataFrame

In [7]:
from sklearn.tree import DecisionTreeRegressor
# Depth-limited decision tree, passed in as the base estimator of the segmenter
# in place of its documented default.
clf_dt = DecisionTreeRegressor(max_depth=3)
# NOTE(review): statistics=True presumably enables the per-generation log printed
# during fit -- confirm against BasicSegmenter_FEGT.
clf = BasicSegmenter_FEGT(
    base_estimator=clf_dt,
    statistics=True,
)

In [8]:
# Fit the segmenter on the full dataset; the value returned by fit() is displayed as the cell output.
clf.fit(X, y)

gen	nevals	avg   	std     	min    	max    
0  	30    	5.2781	0.589689	4.23039	6.80946
1  	22    	4.74899	0.469555	4.2272 	5.96946
2  	22    	4.57545	0.334731	4.09894	5.43991
3  	22    	4.44866	0.488645	4.03896	6.3982 
4  	19    	4.31265	0.222423	3.88692	4.74369
5  	27    	4.32705	0.449177	3.88692	6.41261
6  	25    	4.30957	0.465491	3.88692	5.8717 
7  	21    	4.26635	0.361028	3.88692	5.27358
8  	20    	4.251  	0.552977	3.89738	6.27109
9  	21    	4.15874	0.338961	3.79678	5.30784
10 	22    	4.10114	0.29433 	3.79678	5.05167
11 	25    	4.04121	0.253477	3.76978	4.70117
12 	20    	4.01039	0.363592	3.75632	5.65705
13 	23    	3.98229	0.272671	3.70865	4.60254
14 	24    	3.89309	0.226678	3.70865	4.78833
15 	20    	3.81467	0.219473	3.63065	4.8581 
16 	23    	3.8633 	0.270345	3.63065	4.59721
17 	25    	3.82304	0.197699	3.5993 	4.50707
18 	18    	3.81666	0.359854	3.55746	5.55283
19 	23    	3.80377	0.23816 	3.55746	4.75423
20 	22    	3.75274	0.146903	3.55746	4.16259
21 	24    	3.84858	0.300793	3.3717

BasicSegmenter_FEGT(base_estimator=DecisionTreeRegressor(criterion='mse', max_depth=3, max_features=None,
           max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best'),
          crossover_func=<function cxTwoPoint at 0x106C5B70>, cxpb=0.5,
          indpb=0.2, init_sample_percentage=0.2, mutpb=0.5, n=10,
          n_population=30, n_votes=1, ngen=50, statistics=True,
          test_size=0.2, tournsize=3)

In [9]:
# Score on the same X, y that were passed to fit(), so this is an in-sample figure,
# not a held-out estimate.
clf.score(X, y)

0.69093734986554811

In [10]:
# Keep a handle on the fitted segments; each entry exposes get_data() (used in the cells below).
EGs = clf.segments_

In [11]:
# Number of segments held by the fitted estimator.
len(EGs)

10

In [12]:
# Collect the data behind each segment.
sampled_datasets = []
for segment in EGs:
    sampled_datasets.append(segment.get_data())

In [13]:
# Shape of each sampled dataset, to compare segment sizes.
[dataset.shape for dataset in sampled_datasets]

[(27, 9),
 (66, 9),
 (40, 9),
 (118, 9),
 (66, 9),
 (53, 9),
 (53, 9),
 (53, 9),
 (66, 9),
 (66, 9)]

### 2. Subspacing - sampling in the domain of features - evolving and mutating columns

In [14]:
from evoml.subspacing import FeatureStackerFEGT, FeatureStackerFEMPO

In [15]:
# Print the class docstring to show the available parameters.
stacker_doc = FeatureStackerFEGT.__doc__
print(stacker_doc)


    Uses basic evolutionary algorithm to find the best subspaces of X and trains 
    a model on each subspace. For given row of input, prediction is based on the ensemble
    which has performed the best on the test set. The prediction is the average of all the 
    chromosome predictions.

    Same as the BasicSegmenter, but uses list of thrained models instead of DataFrames
    as each individual. Done to boost performance. 

    Parameters
    ----------
    test_size: float, default = 0.2
        Test size that the algorithm internally uses in its fitness
        function
    
    N_population: Integer, default : 30
        The population of the individuals that the evolutionary algorithm is going to use. 
    
    N_individual: Integer, default : 5
        Number of chromosomes in each individual of the population

    featMin: Integer, default : 1
        The minimum number of features for the sub space from the dataset
        Cannot be <= 0 else changes it to 1 instead.
    


In [16]:
# Build the feature-subspace ensemble, overriding only ngen.
# NOTE(review): ngen is presumably the number of generations to evolve -- confirm against FeatureStackerFEGT.
clf = FeatureStackerFEGT(ngen=30)

In [17]:
# Fit the feature stacker on the full dataset; the value returned by fit() is displayed as the cell output.
clf.fit(X, y)

gen	nevals	avg    	min    	max    
0  	30    	4.80779	4.30355	5.31144
1  	14    	4.55898	4.30355	4.96747
2  	24    	4.47572	4.30232	5.01653
3  	30    	4.39705	4.24509	4.5792 
4  	13    	4.3305 	4.22728	4.70083
5  	22    	4.27701	4.22728	4.38708
6  	22    	4.25929	4.22728	4.38545
7  	21    	4.23435	4.21544	4.24509
8  	17    	4.23617	4.21544	4.38545
9  	18    	4.22293	4.21544	4.22728
10 	21    	4.21741	4.21544	4.22728
11 	27    	4.21559	4.21544	4.22013
12 	20    	4.21544	4.21544	4.21544
13 	20    	4.21544	4.21544	4.21544
14 	28    	4.21544	4.21544	4.21544
15 	17    	4.21536	4.21307	4.21544
16 	22    	4.21522	4.21307	4.21833
17 	26    	4.21459	4.21307	4.21831
18 	21    	4.21346	4.21307	4.21544
19 	19    	4.21307	4.21307	4.21307
20 	20    	4.21307	4.21307	4.21307
21 	24    	4.21307	4.21307	4.21307
22 	23    	4.21307	4.21307	4.21307
23 	18    	4.21328	4.21307	4.21833
24 	21    	4.21307	4.21307	4.21307
25 	23    	4.21307	4.21307	4.21307
26 	23    	4.21307	4.21307	4.21307
27 	20    	4.2131 	4

FeatureStackerFEGT(N_individual=5, N_population=30,
          base_estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),
          crossover_func=<function cxTwoPoint at 0x106C5B70>, cxpb=0.5,
          featMax=7, featMin=1, indpb=0.05, mutpb=0.4, ngen=30,
          test_frac=0.3, test_frac_flag=False, test_size=0.2)

In [18]:
# Score on the same X, y that were passed to fit(), so this is an in-sample figure,
# not a held-out estimate.
clf.score(X, y)

0.65262771433009603

In [19]:
## Get the Hall of Fame individual: the first entry of `clf.segment`.
# NOTE(review): presumably the best-performing ensemble found by the search -- confirm
# against FeatureStackerFEGT. Its members expose get_data() (used in the next cell).
hof = clf.segment[0]

In [20]:
# Collect the data behind each member of the Hall of Fame individual.
sampled_datasets = []
for member in hof:
    sampled_datasets.append(member.get_data())

In [21]:
# Column names of each sampled dataset, i.e. the feature subset it was built from.
[dataset.columns.tolist() for dataset in sampled_datasets]

[['hum', 'milPress', 'temp', 'invTemp', 'vis', 'invHt', 'press', 'output'],
 ['invHt', 'milPress', 'hum', 'temp', 'invTemp', 'vis', 'output'],
 ['invHt', 'output'],
 ['invHt', 'hum', 'vis', 'output'],
 ['hum', 'press', 'vis', 'milPress', 'invTemp', 'output']]

In [22]:
## Original X columns, to compare with the column subsets of the sampled datasets
X.columns

Index([u'temp', u'invHt', u'press', u'vis', u'milPress', u'hum', u'invTemp',
       u'wind'],
      dtype='object')

In [None]:
# Explore the dataset with standard scikit-learn benchmark classifiers.
# NOTE(review): X_train, X_test, y_train and y_test are not created anywhere in the
# visible part of this notebook -- define the train/test split before running this cell.
# The classifier classes are imported here because no earlier cell imports them.
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(random_state=34092)
clf.fit(X_train, y_train)
pred_DTC = clf.predict(X_test)
print('Base DecisionTreeClassifier accuracy: {}'.format(clf.score(X_test, y_test)))

clf = RandomForestClassifier(random_state=34092)
# Fit on the same training matrix as the other baselines. This was `X_train_tot`,
# which is inconsistent with predicting and scoring on X_test.
clf.fit(X_train, y_train)
pred_RFC = clf.predict(X_test)
print('Base RandomForestClassifier accuracy: {}'.format(clf.score(X_test, y_test)))

clf = GradientBoostingClassifier(random_state=34092)
clf.fit(X_train, y_train)
pred_GBC = clf.predict(X_test)
print('Base GradientBoostingClassifier accuracy: {}'.format(clf.score(X_test, y_test)))

print('')