# Import Libraries

In [7]:
!pip install evalml



In [8]:
import pandas as pd
import numpy as np
import evalml
from evalml.objectives import get_core_objectives
from evalml.problem_types import ProblemTypes
from evalml.data_checks import DefaultDataChecks
from evalml.automl import AutoMLSearch
from sklearn.utils import shuffle
import pickle
import warnings
warnings.filterwarnings('ignore')

# Read Datasets

In [9]:
df = pd.read_csv('processed_data_pad.csv')
df.head()

Unnamed: 0,smoke,drink,background_father,background_mother,age,pesticide,gender,skin_cancer_history,cancer_history,has_piped_water,has_sewage_system,Melanoma
0,False,False,POMERANIA,POMERANIA,55,False,FEMALE,True,True,True,True,False
1,False,True,POMERANIA,POMERANIA,79,False,MALE,True,False,False,False,False
2,False,True,GERMANY,ITALY,52,False,FEMALE,False,True,True,True,False
3,False,False,POMERANIA,POMERANIA,74,True,FEMALE,False,False,False,False,False
4,False,True,GERMANY,GERMANY,58,True,FEMALE,True,True,True,True,False


In [10]:
for i in range(100):
    df = shuffle(df)

df=df.reset_index(drop=True)

In [11]:
# Dataframe to DataTable conversion to treat columns with the same physical data type differently
X = df.iloc[:,:-1]
y = df.iloc[:,-1]
X_train,_, y_train,_ = evalml.preprocessing.split_data(X, y, 
                                                      problem_type='binary',
                                                      test_size=1/df.shape[0]*100)

# Training

In [12]:
for objective in get_core_objectives(ProblemTypes.BINARY):
    print(objective.name)

MCC Binary
Log Loss Binary
Gini
AUC
Precision
F1
Balanced Accuracy Binary
Accuracy Binary


In [13]:
data_checks = DefaultDataChecks("binary", "Balanced Accuracy Binary")
data_checks.validate(X_train, y_train)

{'actions': [],
 'errors': [],
   'data_check_name': 'ClassImbalanceDataCheck',
   'details': {'target_values': [True]},
   'message': 'The following labels fall below 10% of the target: [True]'},
  {'code': 'CLASS_IMBALANCE_SEVERE',
   'data_check_name': 'ClassImbalanceDataCheck',
   'details': {'target_values': [True]},
   'message': 'The following labels in the target have severe class imbalance because they fall under 10% of the target and have less than 100 samples: [True]'}]}

In [14]:
automl = AutoMLSearch(X_train = X_train, 
                      y_train = y_train, 
                      problem_type = 'binary',
                      objective = "Balanced Accuracy Binary",
                      additional_objectives=['Log Loss Binary','F1','AUC'],
                      optimize_thresholds = True,
                      ensembling = True,
                      max_batches = 10,
                      verbose=True)
automl.search()

Generating pipelines to search over...
8 pipelines ready for search.
Ensembling will run every 9 batches.

*****************************
* Beginning pipeline search *
*****************************

Optimizing for Balanced Accuracy Binary. 
Greater score is better.

Using SequentialEngine to train and score pipelines.
Searching up to 10 batches for a total of 50 pipelines. 
Allowed model families: decision_tree, random_forest, linear_model, xgboost, extra_trees, catboost, lightgbm



FigureWidget({
    'data': [{'mode': 'lines+markers',
              'name': 'Best Score',
              'type'…

Evaluating Baseline Pipeline: Mode Baseline Binary Classification Pipeline
Mode Baseline Binary Classification Pipeline:
	Starting cross validation
	Finished cross validation - mean Balanced Accuracy Binary: 0.500

*****************************
* Evaluating Batch Number 1 *
*****************************

Elastic Net Classifier w/ Imputer + One Hot Encoder + Oversampler + Standard Scaler:
	Starting cross validation
	Finished cross validation - mean Balanced Accuracy Binary: 0.497
Decision Tree Classifier w/ Imputer + One Hot Encoder + Oversampler:
	Starting cross validation
	Finished cross validation - mean Balanced Accuracy Binary: 0.506
Random Forest Classifier w/ Imputer + One Hot Encoder + Oversampler:
	Starting cross validation
	Finished cross validation - mean Balanced Accuracy Binary: 0.497
LightGBM Classifier w/ Imputer + One Hot Encoder + Oversampler:
	Starting cross validation
	Finished cross validation - mean Balanced Accuracy Binary: 0.497
Logistic Regression Classifier w/ I

In [15]:
automl.describe_pipeline(automl.rankings.iloc[0]["id"])


***********************************************************************
* Decision Tree Classifier w/ Imputer + One Hot Encoder + Oversampler *
***********************************************************************

Problem Type: binary
Model Family: Decision Tree

Pipeline Steps
1. Imputer
	 * categorical_impute_strategy : most_frequent
	 * numeric_impute_strategy : median
	 * categorical_fill_value : None
	 * numeric_fill_value : None
2. One Hot Encoder
	 * top_n : 10
	 * features_to_encode : None
	 * categories : None
	 * drop : if_binary
	 * handle_unknown : ignore
	 * handle_missing : error
3. Oversampler
	 * sampling_ratio : 0.25
	 * k_neighbors_default : 5
	 * n_jobs : -1
	 * sampling_ratio_dict : None
	 * k_neighbors : 5
4. Decision Tree Classifier
	 * criterion : entropy
	 * max_features : log2
	 * max_depth : 6
	 * min_samples_split : 2
	 * min_weight_fraction_leaf : 0.0

Training
Training for binary problems.
Objective to optimize binary classification pipeline thresholds 

In [17]:
automl.best_pipeline.graph_feature_importance()