# Import Libraries

In [1]:
!pip install evalml



In [2]:
import pandas as pd
import numpy as np
import evalml
from evalml.objectives import get_core_objectives
from evalml.problem_types import ProblemTypes
from evalml.data_checks import DefaultDataChecks
from evalml.automl import AutoMLSearch
from sklearn.utils import shuffle
import pickle
import warnings
warnings.filterwarnings('ignore')

# Read Dataset

In [3]:
# Load the processed data file from the working directory and preview
# the first rows to confirm the columns came through as expected.
DATA_PATH = 'processed_data_isic.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Family_History,Patient_History,Patient_Sex,Patient_Age,Melanoma
0,False,False,female,70.0,True
1,False,True,female,40.0,False
2,False,False,male,45.0,False
3,False,False,female,50.0,False
4,False,True,female,30.0,False


In [4]:
# Shuffle the rows once and renumber the index from 0.
# A single shuffle already yields a uniformly random permutation, so repeating
# it 100 times only cost time. The fixed seed makes the row order reproducible
# under "Restart Kernel -> Run All".
SHUFFLE_SEED = 0
df = shuffle(df, random_state=SHUFFLE_SEED).reset_index(drop=True)

In [5]:
# Separate inputs from the label: the last column of `df` is the target,
# every column before it is treated as a feature.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Fraction of rows handed to split_data as the hold-out share.
# NOTE(review): 1/n*100 works out to roughly 100 rows and exceeds 1.0 when the
# frame has fewer than 100 rows -- confirm this is the intended hold-out size.
holdout_fraction = 1/df.shape[0]*100

# Only the training partition is kept; the held-out partition is discarded.
X_train, _, y_train, _ = evalml.preprocessing.split_data(
    X,
    y,
    problem_type='binary',
    test_size=holdout_fraction,
)

# Training

In [6]:
# Print the name of each core objective evalml offers for binary problems,
# one per line, so a valid objective name can be picked for the search below.
binary_objectives = get_core_objectives(ProblemTypes.BINARY)
for core_objective in binary_objectives:
    print(core_objective.name)

MCC Binary
Log Loss Binary
Gini
AUC
Precision
F1
Balanced Accuracy Binary
Accuracy Binary


In [7]:
# Run evalml's default data checks on the training split, configured for the
# same problem type and objective that the AutoML search uses.
data_checks = DefaultDataChecks(
    problem_type="binary",
    objective="Balanced Accuracy Binary",
)
data_checks.validate(X_train, y_train)



In [8]:
# Settings for the AutoML search, gathered in one place so they are easy to
# read and tune. The primary objective drives the optimisation; the additional
# objectives are passed alongside it.
search_settings = dict(
    X_train=X_train,
    y_train=y_train,
    problem_type='binary',
    objective="Balanced Accuracy Binary",
    additional_objectives=['Log Loss Binary', 'F1', 'AUC'],
    optimize_thresholds=True,
    ensembling=True,
    max_batches=10,
    verbose=True,
)

# Build the search object and run it; progress is logged because verbose=True.
automl = AutoMLSearch(**search_settings)
automl.search()

Generating pipelines to search over...
8 pipelines ready for search.
Ensembling will run every 9 batches.

*****************************
* Beginning pipeline search *
*****************************

Optimizing for Balanced Accuracy Binary. 
Greater score is better.

Using SequentialEngine to train and score pipelines.
Searching up to 10 batches for a total of 50 pipelines. 
Allowed model families: random_forest, extra_trees, catboost, decision_tree, linear_model, xgboost, lightgbm



FigureWidget({
    'data': [{'mode': 'lines+markers',
              'name': 'Best Score',
              'type'…

Evaluating Baseline Pipeline: Mode Baseline Binary Classification Pipeline
Mode Baseline Binary Classification Pipeline:
	Starting cross validation
	Finished cross validation - mean Balanced Accuracy Binary: 0.500

*****************************
* Evaluating Batch Number 1 *
*****************************

Elastic Net Classifier w/ Imputer + One Hot Encoder + Standard Scaler:
	Starting cross validation
	Finished cross validation - mean Balanced Accuracy Binary: 0.572
Decision Tree Classifier w/ Imputer + One Hot Encoder:
	Starting cross validation
	Finished cross validation - mean Balanced Accuracy Binary: 0.407
Random Forest Classifier w/ Imputer + One Hot Encoder:
	Starting cross validation
	Finished cross validation - mean Balanced Accuracy Binary: 0.370
LightGBM Classifier w/ Imputer + One Hot Encoder:
	Starting cross validation
	Finished cross validation - mean Balanced Accuracy Binary: 0.497
Logistic Regression Classifier w/ Imputer + One Hot Encoder + Standard Scaler:
	Starting cr

In [9]:
# Take the pipeline id from the first row of the rankings table (presumably
# the best-scoring pipeline -- confirm the ordering) and print its description.
top_ranked_id = automl.rankings.iloc[0]["id"]
automl.describe_pipeline(top_ranked_id)


*************************************************************************
* Elastic Net Classifier w/ Imputer + One Hot Encoder + Standard Scaler *
*************************************************************************

Problem Type: binary
Model Family: Linear

Pipeline Steps
1. Imputer
	 * categorical_impute_strategy : most_frequent
	 * numeric_impute_strategy : mean
	 * categorical_fill_value : None
	 * numeric_fill_value : None
2. One Hot Encoder
	 * top_n : 10
	 * features_to_encode : None
	 * categories : None
	 * drop : if_binary
	 * handle_unknown : ignore
	 * handle_missing : error
3. Standard Scaler
4. Elastic Net Classifier
	 * penalty : elasticnet
	 * C : 1.0
	 * l1_ratio : 0.15
	 * n_jobs : -1
	 * multi_class : auto
	 * solver : saga

Training
Training for binary problems.
Objective to optimize binary classification pipeline thresholds for: <evalml.objectives.standard_metrics.BalancedAccuracyBinary object at 0x7f00028b3250>
Total training time (including CV): 1.1 secon

In [10]:
# Plot the feature importances of the pipeline the search exposes as its best.
best_pipeline = automl.best_pipeline
best_pipeline.graph_feature_importance()