In [1]:
import evalml

In [2]:
from evalml import AutoMLSearch

In [3]:
from evalml.objectives import AccuracyBinary, F1

### Loading Dataset

In [4]:
X, y = evalml.demos.load_breast_cancer()

         Number of Features
Numeric                  30

Number of training examples: 569
Targets
benign       62.74%
malignant    37.26%
Name: target, dtype: object


In [5]:
X.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [6]:
y.unique()

['malignant', 'benign']
Categories (2, object): ['benign', 'malignant']

### Splitting the Data

In [7]:
X_train, X_test, y_train, y_test = evalml.preprocessing.split_data(X, y, problem_type='binary')

In [8]:
print("X_Train size: ",X_train.size,"\nY_Train size: ",y_train.size)

X_Train size:  13650 
Y_Train size:  455


In [9]:
print("X_Test size: ",X_test.size,"\nY_Test size: ",y_test.size)

X_Test size:  3420 
Y_Test size:  114


### AutoML is used to train the model. Here we describe the problem type

In [10]:
automl = AutoMLSearch(X_train=X_train, y_train=y_train, problem_type='binary')

### Automl.search()
* This line starts the search for the best pipeline. The search() method explores a wide range of pipelines and hyperparameters to find the best model for the given problem.

In [11]:
automl.search()

	High coefficient of variation (cv >= 0.5) within cross validation scores.
	Decision Tree Classifier w/ Label Encoder + Imputer + Select Columns Transformer may not perform as estimated on unseen data.


{1: {'Random Forest Classifier w/ Label Encoder + Imputer': 2.0375490188598633,
  'Total time of batch': 2.1714775562286377},
 2: {'Random Forest Classifier w/ Label Encoder + Imputer + RF Classifier Select From Model': 2.315767288208008,
  'Total time of batch': 2.4418201446533203},
 3: {'Decision Tree Classifier w/ Label Encoder + Imputer + Select Columns Transformer': 0.7734072208404541,
  'LightGBM Classifier w/ Label Encoder + Imputer + Select Columns Transformer': 1.6315433979034424,
  'Extra Trees Classifier w/ Label Encoder + Imputer + Select Columns Transformer': 1.7256405353546143,
  'Elastic Net Classifier w/ Label Encoder + Imputer + Standard Scaler + Select Columns Transformer': 1.3364245891571045,
  'CatBoost Classifier w/ Label Encoder + Imputer + Select Columns Transformer': 1.5333106517791748,
  'XGBoost Classifier w/ Label Encoder + Imputer + Select Columns Transformer': 3.1642491817474365,
  'Logistic Regression Classifier w/ Label Encoder + Imputer + Standard Scaler

### Best Pipeline

* ` This line retrieves the best pipeline found by AutoML. The best_pipeline attribute of the AutoMLSearch object stores the pipeline with the highest evaluation score.

In [12]:
best_pipeline = automl.best_pipeline

In [13]:
objectives = [F1(), AccuracyBinary()]

In [14]:
scores = best_pipeline.score(X_test, y_test, objectives=objectives)

In [18]:
print("F1: ",scores['F1']*100)
print("Accuracy: ",scores['Accuracy Binary']*100)

F1:  90.69767441860465
Accuracy:  92.98245614035088


### Predictions

In [19]:
predictions = best_pipeline.predict(X_test)
print(predictions)

477       benign
558       benign
537       benign
322       benign
474       benign
         ...    
364       benign
518       benign
354       benign
23     malignant
548       benign
Name: malignant, Length: 114, dtype: category
Categories (2, object): ['benign', 'malignant']
