In [ ]:
# Import packages, start spark session, and load data
from source.functions import SparkMethods, DataLoader, SparkMLBinaryClassifierRandomSearch
# Obtain the Spark session through the project helper. `spark` is not
# referenced again in the visible cells, so the call presumably matters for
# starting/attaching the session -- TODO confirm.
spark = SparkMethods.get_spark_session()

# Load the dataset from the relative path "data/adult.data" via the project loader.
df = DataLoader.load_data("data/adult.data")

In [ ]:
# Feature columns grouped by the preprocessing they are meant to receive.

# Columns intended for one-hot encoding (OneHotEncoder).
# NOTE(review): 'education-num' looks like a numeric encoding of 'education';
# confirm it is meant to be one-hot encoded rather than scaled.
categorical_cols = [
    'workclass',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]

# Columns intended for min-max scaling (MinMaxScaler).
scaling_cols = [
    'age',
    'fnlwgt',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]

In [ ]:
# Build the vectorizer models and transform df in a single helper call.
# The helper's three return values are unpacked as the feature vectorizer,
# the label vectorizer, and the transformed DataFrame.
# NOTE(review): the label column 'income' is mapped to 'OneHotEncoderEstimator';
# presumably this names the encoder applied to the label -- confirm against
# SparkMethods.vectorizer.
vectorizer, label_vectorizer, transformed_df = SparkMethods.vectorizer(
    df,
    labels_to_vectorize={'income': 'OneHotEncoderEstimator'},
    CategoricalCols=categorical_cols,
    MinMaxCols=scaling_cols)
# Print a preview of the transformed data as a sanity check
# (assumes transformed_df is a Spark DataFrame).
transformed_df.show()

In [ ]:
# Split the data into training and test sets (30% held out for testing).
# 'income' and 'sex' are passed as the strata columns, presumably so both
# sets keep similar proportions of each combination -- confirm against
# SparkMethods.train_test_split.
strata_cols = ['income', 'sex']
trainingData, testData = SparkMethods.train_test_split(transformed_df,
                                                       strata_cols,
                                                       trainRatio=0.7,
                                                       show_summary=True)

In [ ]:
# Candidate values for each multilayer-perceptron hyper-parameter.
mlp_search_space = {
    'num_hidden_layers': range(1, 5),
    'first_hidden_layer_size': range(2, 21, 4),
    'blockSize': [2, 5, 10],
    'stepSize': [0.001, 0.01, 0.1],
    'maxIter': [25, 50, 75],
    'tol': [1e-06, 0.0001, 0.01],
}

# Run the random search on the train/test split.
# Presumably random_grid_size is the number of sampled parameter combinations
# and kfolds the number of cross-validation folds -- confirm against
# SparkMLBinaryClassifierRandomSearch.
models = SparkMLBinaryClassifierRandomSearch(
    trainingData,
    testData,
    random_grid_size=8,
    kfolds=3,
    grid_params={'MultilayerPerceptronClassifier': mlp_search_space},
)

In [ ]:
# Get the best parameters found for a tested classification model.
# Only 'MultilayerPerceptronClassifier' is configured in grid_params, so that
# is the model inspected here; change the key if other classifiers were searched.
model_name = 'MultilayerPerceptronClassifier'

# `models` is the search object created by SparkMLBinaryClassifierRandomSearch;
# its `.models` mapping is indexed by classifier name.
best_model = models.models[model_name].bestModel
param_map = best_model.stages[0].extractParamMap()

# extractParamMap() returns a dict keyed by Param objects (not name strings),
# so read each value through the Param key itself; looking up by `p.name`
# would always yield None.
for param, value in param_map.items():
    print(param.name, value)