In [1]:
# Import packages, start spark session, and load data
from source.functions import SparkMethods, DataLoader, SparkMLBinaryClassifierRandomSearch
# Location of the raw input file, named once so it is easy to repoint.
# The path is relative, so it is presumably resolved against the notebook's
# working directory -- confirm against DataLoader.load_data.
DATA_PATH = "data/adult.data"

# Obtain the Spark session through the project helper before any data is read.
spark = SparkMethods.get_spark_session()

# Load the raw dataset; `df` is the input to the vectorizer cell below.
df = DataLoader.load_data(DATA_PATH)

In [2]:
# Feature columns, grouped by the preprocessing they are routed to when passed
# to SparkMethods.vectorizer (as CategoricalCols / MinMaxCols respectively).

# Columns handled as categorical features (one-hot encoded).
# NOTE(review): 'education-num' looks like a numeric encoding of 'education';
# encoding both as categoricals may duplicate information -- confirm intent.
categorical_cols = [
    'workclass',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]

# Numeric columns that are min-max scaled.
scaling_cols = [
    'age',
    'fnlwgt',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]

In [3]:
# Fit the feature/label vectorizers and produce the transformed DataFrame.
# The label spec maps each label column to the name of the encoder to apply.
label_encoders = {'income': 'OneHotEncoderEstimator'}

vectorizer_outputs = SparkMethods.vectorizer(
    df,
    labels_to_vectorize=label_encoders,
    CategoricalCols=categorical_cols,
    MinMaxCols=scaling_cols,
)

# The helper returns three values: the feature vectorizer, the label
# vectorizer, and the transformed DataFrame used by the following cells.
vectorizer, label_vectorizer, transformed_df = vectorizer_outputs

# Preview the result.
# NOTE(review): the transformed frame is very wide, so the default show()
# output is hard to read; consider previewing a subset of columns or rows.
transformed_df.show()

20191204
created new MLFlow Experiment
+---+----------------+------+------------+-------------+--------------------+-----------------+-------------+------------------+------+------------+------------+--------------+--------------+------+---------------+---------------+-------------------+--------------------+----------------+------------------+----------+---------+--------------------+--------------------+--------------------+----------------+-------------+---------------+------------------+-----------------+-------------+---------------+-------------+------------------+--------------------+-----+
|age|       workclass|fnlwgt|   education|education-num|      marital-status|       occupation| relationship|              race|   sex|capital-gain|capital-loss|hours-per-week|native-country|income|workclass_index|education_index|education-num_index|marital-status_index|occupation_index|relationship_index|race_index|sex_index|native-country_index|    scaling_features|     scaled_features|rela

In [4]:
# Stratified train/test split: 70% of rows go to training and 30% are held
# out for testing. Strata are the combinations of the columns listed below,
# and show_summary=True prints the per-stratum train/test counts.
strata_cols = ['income', 'sex']

split_result = SparkMethods.train_test_split(
    transformed_df,
    strata_cols,
    trainRatio=0.7,
    show_summary=True,
)

# Unpack in (train, test) order.
trainingData, testData = split_result

Train/test strata:
+------------+-----------+----------+------+
|      strata|train_count|test_count|test_%|
+------------+-----------+----------+------+
| >50K,Female|        825|       354|   0.3|
|  <=50K,Male|      10589|      4539|   0.3|
|<=50K,Female|       6714|      2878|   0.3|
|   >50K,Male|       4663|      1999|   0.3|
+------------+-----------+----------+------+



In [5]:
# Hyper-parameter search space: one grid of candidate values per classifier.
# Each grid is a separate dict so a single model's space can be read or edited
# without scrolling through the whole call.

gbt_grid = {
    'maxDepth': [3, 5, 7, 9],
    'maxBins': [8, 16, 32, 48, 64],
    'maxIter': [25, 50, 75],
    'stepSize': [0.01, 0.05, 0.1, 0.15, 0.2, 0.25],
}

linear_svc_grid = {
    'standardization': [True, False],
    'aggregationDepth': [2, 5, 7, 10],
    'regParam': [0.001, 0.01, 0.1, 1.0, 10.0],
    'maxIter': [25, 50, 75],
    'tol': [1e-06, 0.0001, 0.01],
}

# 'num_hidden_layers' and 'first_hidden_layer_size' are given as ranges:
# range(1, 5) yields 1..4 and range(2, 21, 4) yields 2, 6, 10, 14, 18.
# NOTE(review): in Spark ML the MLP 'stepSize' appears to apply only to the
# 'gd' solver; if the wrapper keeps the default solver this entry may have no
# effect -- confirm against the wrapper implementation.
mlp_grid = {
    'num_hidden_layers': range(1, 5),
    'first_hidden_layer_size': range(2, 21, 4),
    'blockSize': [2, 5, 10],
    'stepSize': [0.001, 0.01, 0.1],
    'maxIter': [25, 50, 75],
    'tol': [1e-06, 0.0001, 0.01],
}

logistic_regression_grid = {
    'standardization': [True, False],
    'aggregationDepth': [2, 5, 7, 10],
    'regParam': [0.001, 0.01, 0.1, 1.0, 10.0],
    'maxIter': [25, 50, 75],
    'threshold': [0.4, 0.5, 0.6],
    'elasticNetParam': [0.0, 0.25, 0.5, 0.75, 1.0],
    'tol': [1e-06, 0.0001, 0.01],
}

random_forest_grid = {
    'maxDepth': [3, 5, 7, 9],
    'maxBins': [8, 16, 32, 48, 64],
    'minInfoGain': [0.0, 0.05, 0.1],
    'impurity': ['gini', 'entropy'],
}

# Assemble the per-model grids, keyed by classifier name. The key order here
# matches the order in which the models are listed in the training log.
search_space = {
    'GBTClassifier': gbt_grid,
    'LinearSVC': linear_svc_grid,
    'MultilayerPerceptronClassifier': mlp_grid,
    'LogisticRegression': logistic_regression_grid,
    'RandomForestClassifier': random_forest_grid,
}

# Run the random search over every classifier in the search space.
# random_grid_size=8: the training log reports 8 models per classifier.
# kfolds=3: presumably the number of cross-validation folds -- confirm.
models = SparkMLBinaryClassifierRandomSearch(
    trainingData,
    testData,
    random_grid_size=8,
    kfolds=3,
    grid_params=search_space,
)

Training 8 GBTClassifier models
Using existing MLFlow Experiment
Training 8 LinearSVC models
Using existing MLFlow Experiment
Training 8 MultilayerPerceptronClassifier models
Using existing MLFlow Experiment
Training 8 LogisticRegression models
Using existing MLFlow Experiment
Training 8 RandomForestClassifier models
Using existing MLFlow Experiment


In [17]:
# Inspect the hyper-parameters of the best GBTClassifier found by the search.
# stages[0] is taken to be the classifier stage of the best model --
# presumably a fitted pipeline; verify if the pipeline layout changes.
best_gbt = models.models['GBTClassifier'].bestModel
param_map = best_gbt.stages[0].extractParamMap()

# Print one "<name> <value>" line per parameter.
for param, value in param_map.items():
    print(param.name, value)

cacheNodeIds False
checkpointInterval 10
featureSubsetStrategy all
featuresCol features
labelCol label
lossType logistic
maxBins 48
maxDepth 5
maxIter 75
maxMemoryInMB 256
minInfoGain 0.0
minInstancesPerNode 1
predictionCol predicted_label
seed 4698835472529513640
stepSize 0.2
subsamplingRate 1.0
