# Install skrobot

In [1]:
!pip install skrobot



# Import Modules

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression

from skrobot.core import Experiment
from skrobot.tasks import TrainTask
from skrobot.tasks import PredictionTask
from skrobot.tasks import EvaluationCrossValidationTask
from skrobot.tasks import FeatureSelectionCrossValidationTask
from skrobot.tasks import HyperParametersSearchCrossValidationTask
from skrobot.feature_selection import ColumnSelector

# Settings

In [3]:
# Locations of the training and test data sets (shortened URLs).
train_data_set_file_path, test_data_set_file_path = (
    'https://bit.ly/kaggle-train-data',
    'https://bit.ly/kaggle-test-data',
)

# Column holding the row identifier and column holding the target label.
id_column, label_column = 'PassengerId', 'Survived'

# Metric name passed to the feature selection, search and evaluation tasks.
evaluation_metric = 'accuracy'

# Seed shared by the classifier and every task, and the number of CV folds.
random_seed, total_cv_folds = 42, 10

# Define Column Transformer (Feature Preprocessing)

In [4]:
# Columns routed to the numeric preprocessing pipeline.
numerical_features = [
    'Age',
    'Fare',
    'SibSp',
    'Parch',
]

# Columns routed to the categorical (one-hot) preprocessing pipeline.
categorical_features = [
    'Embarked',
    'Sex',
    'Pclass',
]

# NOTE: the step names used below ('numerical_transformer', 'imputer', ...) are
# addressed by the hyperparameter search keys, so they must not be renamed.

# Numeric columns: impute missing values, then standardize.
numeric_steps = [
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: impute with the most frequent value, then one-hot encode.
# Categories unseen during fitting are ignored instead of raising an error.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Apply each sub-pipeline to its own group of columns.
column_transformers = [
    ('numerical_transformer', numeric_transformer, numerical_features),
    ('categorical_transformer', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

# Define Hyperparameters Search Space & Binary Classifier

In [5]:
# Grid of candidate values; each key is a '<step>__<param>' path into the pipeline.
search_params = {
    # Regularization values tried for the classifier.
    'classifier__C': [0.1, 1.0, 10.0],
    # Penalty types tried for the classifier.
    'classifier__penalty': ['l1', 'l2'],
    # Imputation strategies tried for the numeric columns.
    'preprocessor__numerical_transformer__imputer__strategy': ['mean', 'median'],
}

# Binary classifier used by every task; seeded with the shared random seed.
classifier = LogisticRegression(
    solver='liblinear',
    random_state=random_seed,
)

# Build Experiment

In [6]:
# Configure the experiment step by step, then build it.
experiment = (
    Experiment('experiments-output')
    .set_experimenter('echatzikyriakidis')
    .build()
)

# Run Feature Selection Task

In [7]:
# Describe the feature selection task and its stratified, shuffled CV scheme.
feature_selection_task = FeatureSelectionCrossValidationTask(
    estimator=classifier,
    train_data_set_file_path=train_data_set_file_path,
    preprocessor=preprocessor,
    scoring=evaluation_metric,
    id_column=id_column,
    label_column=label_column,
    random_seed=random_seed,
).stratified_folds(total_folds=total_cv_folds, shuffle=True)

# Run it; the result is later handed to ColumnSelector(cols=...).
features_columns = experiment.run(feature_selection_task)

Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
Fitting estimator with 3 features.
Fitting estimator with 2 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
Fitting estimator with 3 features.
Fitting estimator with 2 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting est

# Build Complete Pipeline Including Feature Selection

In [8]:
# Full pipeline: preprocess -> keep the selected columns -> classify.
# The step names are addressed by the keys of search_params.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('selector', ColumnSelector(cols=features_columns)),
    ('classifier', classifier),
]
pipe = Pipeline(steps=pipeline_steps)

# Run Hyperparameters Search Task

In [9]:
# Describe a grid search over search_params with stratified, shuffled CV folds.
hyperparameters_search_task = (
    HyperParametersSearchCrossValidationTask(
        estimator=pipe,
        search_params=search_params,
        train_data_set_file_path=train_data_set_file_path,
        id_column=id_column,
        label_column=label_column,
        objective_score=evaluation_metric,
        random_seed=random_seed,
    )
    .grid_search()
    .stratified_folds(total_folds=total_cv_folds, shuffle=True)
)

# Run it; later cells read the 'best_params' entry (among others) of the result.
hyperparameters_search_results = experiment.run(hyperparameters_search_task)

Fitting 10 folds for each of 12 candidates, totalling 120 fits
[CV] classifier__C=0.1, classifier__penalty=l1, preprocessor__numerical_transformer__imputer__strategy=mean 
[CV]  classifier__C=0.1, classifier__penalty=l1, preprocessor__numerical_transformer__imputer__strategy=mean, accuracy=(train=0.795, test=0.833), average_precision=(train=0.819, test=0.910), f1=(train=0.715, test=0.789), precision=(train=0.766, test=0.778), recall=(train=0.671, test=0.800), roc_auc=(train=0.850, test=0.917), total=   0.0s
[CV] classifier__C=0.1, classifier__penalty=l1, preprocessor__numerical_transformer__imputer__strategy=mean 
[CV]  classifier__C=0.1, classifier__penalty=l1, preprocessor__numerical_transformer__imputer__strategy=mean, accuracy=(train=0.803, test=0.730), average_precision=(train=0.837, test=0.738), f1=(train=0.731, test=0.625), precision=(train=0.768, test=0.667), recall=(train=0.698, test=0.588), roc_auc=(train=0.858, test=0.839), total=   0.0s
[CV] classifier__C=0.1, classifier__p

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s


[CV]  classifier__C=0.1, classifier__penalty=l1, preprocessor__numerical_transformer__imputer__strategy=mean, accuracy=(train=0.793, test=0.820), average_precision=(train=0.824, test=0.863), f1=(train=0.720, test=0.733), precision=(train=0.750, test=0.846), recall=(train=0.692, test=0.647), roc_auc=(train=0.851, test=0.900), total=   0.0s
[CV] classifier__C=0.1, classifier__penalty=l1, preprocessor__numerical_transformer__imputer__strategy=mean 
[CV]  classifier__C=0.1, classifier__penalty=l1, preprocessor__numerical_transformer__imputer__strategy=mean, accuracy=(train=0.800, test=0.775), average_precision=(train=0.831, test=0.762), f1=(train=0.729, test=0.643), precision=(train=0.762, test=0.818), recall=(train=0.698, test=0.529), roc_auc=(train=0.863, test=0.783), total=   0.0s
[CV] classifier__C=0.1, classifier__penalty=l1, preprocessor__numerical_transformer__imputer__strategy=mean 
[CV]  classifier__C=0.1, classifier__penalty=l1, preprocessor__numerical_transformer__imputer__strat

[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:    7.6s finished


# Run Evaluation Task

In [10]:
# Evaluate the pipeline with the best hyperparameters found by the search,
# using the same stratified, shuffled CV scheme. Every export option is enabled.
evaluation_task = EvaluationCrossValidationTask(
    estimator=pipe,
    estimator_params=hyperparameters_search_results['best_params'],
    train_data_set_file_path=train_data_set_file_path,
    id_column=id_column,
    label_column=label_column,
    threshold_selection_by=evaluation_metric,
    random_seed=random_seed,
    export_classification_reports=True,
    export_confusion_matrixes=True,
    export_pr_curves=True,
    export_roc_curves=True,
    export_false_positives_reports=True,
    export_false_negatives_reports=True,
    export_also_for_train_folds=True,
).stratified_folds(total_folds=total_cv_folds, shuffle=True)

# Run it; the 'threshold' entry of the result is reused by the prediction task.
evaluation_results = experiment.run(evaluation_task)


invalid value encountered in long_scalars


invalid value encountered in long_scalars


invalid value encountered in long_scalars


invalid value encountered in long_scalars


invalid value encountered in long_scalars


invalid value encountered in long_scalars


invalid value encountered in long_scalars


invalid value encountered in long_scalars


invalid value encountered in long_scalars


invalid value encountered in long_scalars


invalid value encountered in long_scalars


invalid value encountered in long_scalars


invalid value encountered in long_scalars


invalid value encountered in long_scalars


invalid value encountered in long_scalars


invalid value encountered in long_scalars


invalid value encountered in long_scalars


invalid value encountered in long_scalars


invalid value encountered in long_scalars


invalid value encountered in long_scalars


invalid value encountered in long_scalars


invalid value encountered in long_scalars


invalid value encountered in lo

# Run Train Task

In [11]:
# Train the pipeline on the training data with the best hyperparameters.
train_task = TrainTask(
    estimator=pipe,
    estimator_params=hyperparameters_search_results['best_params'],
    train_data_set_file_path=train_data_set_file_path,
    id_column=id_column,
    label_column=label_column,
    random_seed=random_seed,
)

# Run it; the 'estimator' entry of the result is used for prediction.
train_results = experiment.run(train_task)

# Run Prediction Task

In [12]:
# Predict on the test data with the trained estimator, applying the threshold
# obtained from the evaluation task.
prediction_task = PredictionTask(
    estimator=train_results['estimator'],
    data_set_file_path=test_data_set_file_path,
    id_column=id_column,
    prediction_column=label_column,
    threshold=evaluation_results['threshold'],
)

predictions = experiment.run(prediction_task)

# Print Results

In [13]:
# Feature selection outcome.
print(features_columns)

# Hyperparameter search outcome, one entry per print, in a fixed order.
for result_key in ('best_params', 'best_index', 'best_estimator', 'best_score', 'search_results'):
    print(hyperparameters_search_results[result_key])

# Evaluation outcome: selected threshold followed by the CV metric entries.
for result_key in ('threshold',
                   'cv_threshold_metrics',
                   'cv_splits_threshold_metrics',
                   'cv_splits_threshold_metrics_summary'):
    print(evaluation_results[result_key])

# Trained estimator and the predictions on the test data.
print(train_results['estimator'])
print(predictions)

[0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11]
{'classifier__C': 1.0, 'classifier__penalty': 'l2', 'preprocessor__numerical_transformer__imputer__strategy': 'mean'}
6
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numerical_transformer',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['Age', 'Fare', 'SibSp',
                                                   'Parch']),
                                                 ('categorical_transformer',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
       