# AutoML Generalized Framework

In [1]:
import numpy as np
import pandas as pd
from sklearn import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from tpot import TPOTClassifier, TPOTRegressor
from datacleaner import autoclean
from collections import defaultdict

In [2]:
class AutoMLEstimator(object):
    
    def __init__(self, **kwargs):
        
        self.task = kwargs['task']
        self.speed = kwargs['speed']
        self.test_size = kwargs['test_size']
        if self.task == 'classification':
            self.tpot_model = TPOTClassifier(generations=self.speed, population_size=self.speed*10, verbosity=2, n_jobs=-1)
        else:
            self.tpot_model = TPOTRegressor(generations=self.speed, population_size=self.speed*10, verbosity=2, n_jobs=-1)
        
    def preprocess_data(self, data, target_column):
        
        clean_data = autoclean(data)
        X = clean_data.drop(target_column, axis=1)
        y = clean_data[target_column]
        
        return X, y
    
    def split_data(self, X, y):
        
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=self.test_size) 
    
    def fit_model(self):
        
        self.tpot_model.fit(self.X_train, self.y_train)
        
    def run_automl(self, data, target_column):
        
        self.X, self.y = self.preprocess_data(data, target_column)
        self.split_data(self.X, self.y)
        self.fit_model()
    
    def evaluate_model(self):
        
        metrics = defaultdict()
        pred = self.tpot_model.fitted_pipeline_.predict(self.X_test)
        
        if self.task == 'classification':
            
            metrics['accuracy'] = accuracy_score(pred, self.y_test)
            metrics['precision'] = precision_score(pred, self.y_test)
            metrics['recall'] = recall_score(pred, self.y_test)
            
        else:
            
            metrics['r2_score'] = r2_score(pred, self.y_test)
            metrics['mean_absolute_error'] = mean_absolute_error(pred, self.y_test)
            metrics['mean_squared_error'] = mean_squared_error(pred, self.y_test)
        
        return metrics

In [3]:
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()

In [4]:
data.keys()

dict_keys(['feature_names', 'DESCR', 'data', 'target', 'target_names', 'filename'])

In [5]:
data['data'].shape

(569, 30)

In [6]:
data['feature_names'].shape

(30,)

In [7]:
data['target'].shape

(569,)

In [8]:
data['target_names']

array(['malignant', 'benign'], dtype='<U9')

In [9]:
df = pd.DataFrame(columns=data['feature_names'], data=data['data'])
df['target'] = data['target']

In [10]:
auto_ml_model = AutoMLEstimator(task='classification', speed=2, test_size=0.3)
auto_ml_model.run_automl(df, target_column='target')

Generation 1 - Current best internal CV score: 0.9673717948717948
Generation 2 - Current best internal CV score: 0.9723717948717949

Best pipeline: GradientBoostingClassifier(input_matrix, learning_rate=0.5, max_depth=6, max_features=0.8500000000000001, min_samples_leaf=11, min_samples_split=14, n_estimators=100, subsample=0.6000000000000001)


In [11]:
list(auto_ml_model.tpot_model.evaluated_individuals_.values())[0]

{'crossover_count': 0,
 'generation': 'INVALID',
 'internal_cv_score': 0.9397435897435897,
 'mutation_count': 2,
 'operator_count': 2,
 'predecessor': ('ExtraTreesClassifier(input_matrix, ExtraTreesClassifier__bootstrap=True, ExtraTreesClassifier__criterion=entropy, ExtraTreesClassifier__max_features=0.7500000000000001, ExtraTreesClassifier__min_samples_leaf=8, ExtraTreesClassifier__min_samples_split=3, ExtraTreesClassifier__n_estimators=100)',)}

In [12]:
from sklearn.datasets import load_boston
data = load_boston()
df = pd.DataFrame(columns=data['feature_names'], data=data['data'])
df['target'] = data['target']

In [13]:
auto_ml_model = AutoMLEstimator(task='regression', speed=5, test_size=0.3)
auto_ml_model.run_automl(df, target_column='target')

Generation 1 - Current best internal CV score: -15.445139196926988
Generation 2 - Current best internal CV score: -13.871244694915386
Generation 3 - Current best internal CV score: -13.871244694915386
Generation 4 - Current best internal CV score: -13.518966973980335
Generation 5 - Current best internal CV score: -13.088285629211185

Best pipeline: AdaBoostRegressor(AdaBoostRegressor(ZeroCount(input_matrix), learning_rate=0.5, loss=linear, n_estimators=100), learning_rate=0.5, loss=linear, n_estimators=100)


In [14]:
auto_ml_model.evaluate_model()

defaultdict(None,
            {'mean_absolute_error': 2.2718073638268916,
             'mean_squared_error': 10.45529163096584,
             'r2_score': 0.788553478479608})