In [None]:
# Only execute if you haven't already. Make sure to restart the kernel if these libraries have not been previously installed.
!pip install xgboost==0.82 --user
!pip install scikit-learn==0.20.4 --user

# Build a First Model

Based on https://aihub.cloud.google.com/p/products%2F526771c4-9b36-4022-b9c9-63629e9e3289

In [9]:
import datetime
import pickle
import os

import pandas as pd
import xgboost as xgb
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion, make_pipeline

import custom_transforms

import warnings
# Suppress DeprecationWarning messages for the rest of the session.
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

In [2]:
# Location of the CSV file holding the training split.
csv_path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'

# Names to assign to the CSV columns, in file order. The last one is the label.
# (For a CSV whose first line already is a header row, set this to None.)
COLUMNS = (
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income-level',
)

# Load the CSV into a DataFrame; skipinitialspace=True drops spaces that
# directly follow a delimiter. The cell's last expression previews the data.
raw_training_data = pd.read_csv(
    csv_path,
    names=COLUMNS,
    skipinitialspace=True,
)
raw_training_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income-level
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [17]:
# Feature matrix: every column except the label, as a NumPy array.
raw_features = raw_training_data.drop(columns=['income-level']).values
# Binary training labels: 1 where income-level equals '>50K', otherwise 0.
is_over_50k = raw_training_data['income-level'] == '>50K'
train_labels = is_over_50k.values.astype(int)

In [18]:
# Column positions refer to raw_features, i.e. COLUMNS without the trailing label.
# Numerical features: age and hours-per-week
# Categorical features: workclass, education, marital-status, and relationship
numerical_indices = [0, 12]  # age, and hours-per-week
categorical_indices = [1, 3, 5, 7]  # workclass, education, marital-status, and relationship

# Categorical branch: PositionalSelector -> StripString -> SimpleOneHotEncoder
# (all from custom_transforms; their implementations are not shown in this notebook).
p1 = make_pipeline(custom_transforms.PositionalSelector(categorical_indices),
                   custom_transforms.StripString(),
                   custom_transforms.SimpleOneHotEncoder())
# Numerical branch: pick the numeric columns and standardize them.
p2 = make_pipeline(custom_transforms.PositionalSelector(numerical_indices),
                   StandardScaler())

# Concatenate the outputs of both branches. The branch order (p1, then p2) is
# kept as before so the resulting feature layout is unchanged; only the labels
# are corrected so that each name describes the branch it is attached to.
p3 = FeatureUnion([
    ('categoricals', p1),
    ('numericals', p2),
])

In [19]:
# Final estimator: an XGBoost classifier with max_depth=4.
classifier = xgb.sklearn.XGBClassifier(max_depth=4)
# Chain the preprocessing union (p3) and the classifier into one estimator.
pipeline = make_pipeline(p3, classifier)

In [20]:
# Fit the whole pipeline (preprocessing steps and classifier) on the training data.
pipeline.fit(raw_features, train_labels)



Pipeline(memory=None,
     steps=[('featureunion', FeatureUnion(n_jobs=None,
       transformer_list=[('numericals', Pipeline(memory=None,
     steps=[('positionalselector', PositionalSelector(positions=[1, 3, 5, 7])), ('stripstring', StripString()), ('simpleonehotencoder', SimpleOneHotEncoder())])), ('categoricals', Pipeline...
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))])

# Save Trained Model to AI Platform

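Following the instructions at https://cloud.google.com/ai-platform/prediction/docs/exporting-for-prediction.
Because the pipeline uses the `custom_transforms` module, that module must also be packaged as a source distribution (which requires a `setup.py` file) and uploaded alongside the pickled model.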

In [21]:
# Serialize the fitted pipeline to model.pkl in the current working directory.
with open('model.pkl', mode='wb') as pickle_out:
    pickle.dump(pipeline, pickle_out)

In [None]:
%%bash
# Stop at the first failing command so that a failed build or upload is not
# masked by a later command succeeding.
set -euo pipefail

# Replace the [QWIKLABS-PROJECT_ID] placeholder with your bucket name.
# The URI is quoted wherever it is used because square brackets are glob
# metacharacters to the shell.
BUCKET="gs://[QWIKLABS-PROJECT_ID]"

# Build the gzipped source distribution (dist/custom_transforms-0.1.tar.gz).
python setup.py sdist --formats=gztar

# Upload the pickled pipeline and the custom-code package to the bucket.
gsutil cp model.pkl "${BUCKET}/"
gsutil cp dist/custom_transforms-0.1.tar.gz "${BUCKET}/"

In [24]:
# Create the AI Platform model resource that the versions below are attached to.
!gcloud ai-platform models create income_classifier --regions us-central1

Created ml engine model [projects/nytaxi-query-test/models/income_classifier].


In [47]:
%%bash
# Create version v1 of the income_classifier model from the uploaded artifacts.
# Replace the [QWIKLABS-PROJECT_ID] placeholder with your bucket name first.
MODEL_DIR="gs://[QWIKLABS-PROJECT_ID]/"
CUSTOM_CODE_PATH="gs://[QWIKLABS-PROJECT_ID]/custom_transforms-0.1.tar.gz"
VERSION_NAME="v1"
MODEL_NAME="income_classifier"
FRAMEWORK="scikit-learn"

# Every expansion is quoted so the shell applies neither word splitting nor
# pathname expansion to the values (square brackets are glob metacharacters).
gcloud beta ai-platform versions create "$VERSION_NAME" \
  --model "$MODEL_NAME" \
  --origin "$MODEL_DIR" \
  --runtime-version=1.15 \
  --framework "$FRAMEWORK" \
  --python-version=3.7 \
  --package-uris="$CUSTOM_CODE_PATH"

ERROR: (gcloud.beta.ai-platform.versions.create) FAILED_PRECONDITION: Field: version.deployment_uri Error: The provided GCS prefix [gs://[QWIKLABS-PROJECT_ID]/] cannot be read by service account service-581515899388@cloud-ml.google.com.iam.gserviceaccount.com.
- '@type': type.googleapis.com/google.rpc.BadRequest
  fieldViolations:
  - description: The provided GCS prefix [gs://[QWIKLABS-PROJECT_ID]/] cannot be read
      by service account service-581515899388@cloud-ml.google.com.iam.gserviceaccount.com.
    field: version.deployment_uri


CalledProcessError: Command 'b'MODEL_DIR="gs://[QWIKLABS-PROJECT_ID]/"\nCUSTOM_CODE_PATH="gs://[QWIKLABS-PROJECT_ID]/custom_transforms-0.1.tar.gz"\nVERSION_NAME="v1"\nMODEL_NAME="income_classifier"\nFRAMEWORK="SCIKIT_LEARN"\n\ngcloud beta ai-platform versions create $VERSION_NAME \\\n  --model $MODEL_NAME \\\n  --origin $MODEL_DIR \\\n  --runtime-version=1.15 \\\n  --framework $FRAMEWORK \\\n  --python-version=3.7 \\\n  --package-uris=$CUSTOM_CODE_PATH\n'' returned non-zero exit status 1

In [48]:
%%bash
# Create version v2 of the income_classifier model from the artifacts stored
# in the gs://bahumbug bucket.
# NOTE(review): the bucket name is hard-coded; substitute your own bucket when
# running this outside the original project.
MODEL_DIR="gs://bahumbug/"
CUSTOM_CODE_PATH="gs://bahumbug/custom_transforms-0.1.tar.gz"
VERSION_NAME="v2"
MODEL_NAME="income_classifier"
FRAMEWORK="SCIKIT_LEARN"

# Every expansion is quoted so the shell applies neither word splitting nor
# pathname expansion to the values.
gcloud beta ai-platform versions create "$VERSION_NAME" \
  --model "$MODEL_NAME" \
  --origin "$MODEL_DIR" \
  --runtime-version=1.15 \
  --framework "$FRAMEWORK" \
  --python-version=3.7 \
  --package-uris="$CUSTOM_CODE_PATH"

Creating version (this might take a few minutes)......
.................................................................................................................................................................................................................................................done.


## What-If Tool

In [27]:
from witwidget.notebook.visualization import WitWidget, WitConfigBuilder

In [43]:
# Number of test rows to hand to the What-If Tool.
num_datapoints = 2000

# Read the test split with the same column names as the training data;
# skiprows=1 discards the first line of the file.
test_csv_path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test'
test_df = pd.read_csv(
    test_csv_path,
    names=COLUMNS,
    skiprows=1,
    skipinitialspace=True,
)
# Replace the label column with 0/1 integers; this cell compares against
# '>50K.' with a trailing period, unlike the '>50K' used for the training labels.
is_high_income = test_df['income-level'] == '>50K.'
test_df['income-level'] = is_high_income.values.astype(int)

In [49]:
# The first num_datapoints test rows, converted to nested Python lists.
wit_examples = test_df.iloc[:num_datapoints].to_numpy().tolist()

# NOTE(review): the project ID and version are hard-coded. The recorded output
# of the "v1" deployment cell shows a failure while "v2" succeeded -- confirm
# which project and version this should query.
config_builder = (
    WitConfigBuilder(wit_examples, COLUMNS)
    .set_ai_platform_model('nytaxi-query-test', "income_classifier", "v1")
    .set_target_feature("income-level")
    .set_label_vocab([0, 1])
)
# Render the widget as the cell output.
WitWidget(config_builder, height=800)

WitWidget(config={'model_type': 'classification', 'uses_json_list': True, 'use_aip': True, 'aip_batch_size': 5…

# Narrative for identifying bias...

Aha! We found a bias

# Make changes to the model to address the bias

**TODO:** Redeploy the updated model to AI Platform. Open question: should the model also be trained on AI Platform?

# What-If Tool to show new model is less biased