# AutoML Generalized Framework

In [1]:
import numpy as np
import pandas as pd
from sklearn import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from tpot import TPOTClassifier, TPOTRegressor
from datacleaner import autoclean
from collections import defaultdict

In [2]:
class AutoMLEstimator(object):
    
    def __init__(self, **kwargs):
        
        self.task = kwargs['task']
        self.speed = kwargs['speed']
        self.test_size = kwargs['test_size']
        if self.task == 'classification':
            self.tpot_model = TPOTClassifier(generations=self.speed, population_size=self.speed*10, verbosity=2, n_jobs=-1)
        else:
            self.tpot_model = TPOTRegressor(generations=self.speed, population_size=self.speed*10, verbosity=2, n_jobs=-1)
        
    def preprocess_data(self, data, target_column):
        
        clean_data = autoclean(data)
        X = clean_data.drop(target_column, axis=1)
        y = clean_data[target_column]
        
        return X, y
    
    def split_data(self, X, y):
        
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=self.test_size) 
    
    def fit_model(self):
        
        self.tpot_model.fit(self.X_train, self.y_train)
        
    def run_automl(self, data, target_column):
        
        self.X, self.y = self.preprocess_data(data, target_column)
        self.split_data(self.X, self.y)
        self.fit_model()
    
    def evaluate_model(self):
        
        metrics = defaultdict()
        pred = self.tpot_model.fitted_pipeline_.predict(self.X_test)
        
        if self.task == 'classification':
            
            metrics['accuracy'] = accuracy_score(pred, self.y_test)
            metrics['precision'] = precision_score(pred, self.y_test)
            metrics['recall'] = recall_score(pred, self.y_test)
            
        else:
            
            metrics['r2_score'] = r2_score(pred, self.y_test)
            metrics['mean_absolute_error'] = mean_absolute_error(pred, self.y_test)
            metrics['mean_squared_error'] = mean_squared_error(pred, self.y_test)
        
        return metrics

In [3]:
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()

In [4]:
data.keys()

dict_keys(['data', 'target_names', 'target', 'feature_names', 'filename', 'DESCR'])

In [5]:
data['data'].shape

(569, 30)

In [6]:
data['feature_names'].shape

(30,)

In [7]:
data['target'].shape

(569,)

In [8]:
data['target_names']

array(['malignant', 'benign'], dtype='<U9')

In [9]:
df = pd.DataFrame(columns=data['feature_names'], data=data['data'])
df['target'] = data['target']

In [10]:
auto_ml_model = AutoMLEstimator(task='classification', speed=10, test_size=0.2)
auto_ml_model.run_automl(df, target_column='target')

Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"


Generation 1 - Current best internal CV score: 0.9757992249296598
Generation 2 - Current best internal CV score: 0.9757992249296598
Generation 3 - Current best internal CV score: 0.9757992249296598
Generation 4 - Current best internal CV score: 0.9757992249296598
Generation 5 - Current best internal CV score: 0.9758719541328237
Generation 6 - Current best internal CV score: 0.9758719541328237
Generation 7 - Current best internal CV score: 0.9780453363062058
Generation 8 - Current best internal CV score: 0.9780453363062058
Generation 9 - Current best internal CV score: 0.9780453363062058
Generation 10 - Current best internal CV score: 0.9780453363062058

Best pipeline: GradientBoostingClassifier(PolynomialFeatures(input_matrix, degree=2, include_bias=False, interaction_only=False), learning_rate=0.1, max_depth=5, max_features=0.15000000000000002, min_samples_leaf=10, min_samples_split=14, n_estimators=100, subsample=0.2)


In [11]:
list(auto_ml_model.tpot_model.evaluated_individuals_.values())[0]

{'crossover_count': 2,
 'generation': 'INVALID',
 'internal_cv_score': 0.9320592451027234,
 'mutation_count': 5,
 'operator_count': 2,
 'predecessor': ('GradientBoostingClassifier(input_matrix, GradientBoostingClassifier__learning_rate=0.1, GradientBoostingClassifier__max_depth=7, GradientBoostingClassifier__max_features=0.55, GradientBoostingClassifier__min_samples_leaf=14, GradientBoostingClassifier__min_samples_split=10, GradientBoostingClassifier__n_estimators=100, GradientBoostingClassifier__subsample=0.25)',)}

In [12]:
from sklearn.datasets import load_boston
data = load_boston()
df = pd.DataFrame(columns=data['feature_names'], data=data['data'])
df['target'] = data['target']

In [13]:
auto_ml_model = AutoMLEstimator(task='regression', speed=10, test_size=0.3)
auto_ml_model.run_automl(df, target_column='target')

Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"


Generation 1 - Current best internal CV score: -12.67954508477961
Generation 2 - Current best internal CV score: -11.848068774735953
Generation 3 - Current best internal CV score: -11.848068774735953
Generation 4 - Current best internal CV score: -11.848068774735953
Generation 5 - Current best internal CV score: -11.848068774735953


TPOT closed during evaluation in one generation.


TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: GradientBoostingRegressor(input_matrix, alpha=0.95, learning_rate=0.1, loss=huber, max_depth=4, max_features=0.7500000000000001, min_samples_leaf=2, min_samples_split=7, n_estimators=100, subsample=0.8)


In [14]:
auto_ml_model.evaluate_model()

defaultdict(None,
            {'mean_absolute_error': 2.0780182432962433,
             'mean_squared_error': 7.262445839297437,
             'r2_score': 0.9106636135331789})

In [36]:
type(auto_ml_model.tpot_model.fitted_pipeline_[0]).__name__

'GradientBoostingRegressor'