In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
import plotly.express as px
plt.style.use(['science','ieee', 'no-latex']) # set the style of the plots

from util import *

In [12]:
# Load the formatted dataset and drop the 'fnlwgt' and 'education-num' columns up front.
adult = pd.read_parquet('data/formatted_data.parquet').drop(columns=['fnlwgt', 'education-num'])

# The 'set' column tags each row as 'train' or 'test': split on it, then discard it.
isTrain = adult['set'] == 'train'
isTest = adult['set'] == 'test'
train = adult.loc[isTrain].drop(columns=['set'])
test = adult.loc[isTest].drop(columns=['set'])

# Quick sanity check of both splits.
print(f'Train shape: {train.shape}')
display(train.head())
print(f'Test shape: {test.shape}')
display(test.head())

Train shape: (32561, 13)


Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target
0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


Test shape: (16281, 13)


Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target
0,25,Private,11th,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,HS-grad,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,Assoc-acdm,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,Some-college,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,,Some-college,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K


In [15]:
def runMLPClassifier(XTrainTrans, YTrain, max_iter):
    """Run a 4-fold cross-validated grid search over MLPClassifier settings.

    Parameters
    ----------
    XTrainTrans
        Training feature matrix handed straight to ``GridSearchCV.fit``
        (callers in this notebook pass the preprocessor-transformed data).
    YTrain
        Training labels. NOTE(review): the 'precision', 'recall', 'f1' and
        'roc_auc' scorers presumably require a binary target -- confirm
        against the label encoding done upstream.
    max_iter
        Forwarded to ``MLPClassifier(max_iter=...)``.

    Returns
    -------
    The value returned by ``GridSearchCV.fit`` -- the search object, refit on
    the candidate with the best cross-validated 'roc_auc'.
    """
    baseModel = MLPClassifier(max_iter=max_iter, random_state=0)

    # 2 architectures x 2 alpha values = 4 candidates; the other entries are fixed.
    # NOTE: per the scikit-learn docs `learning_rate` only takes effect with
    # solver='sgd'; it is left in place to keep the grid unchanged.
    searchSpace = {
        'hidden_layer_sizes': [(64, 32, 16), (100,)],
        'activation': ['relu'],
        'solver': ['adam'],
        'alpha': [0.0001, 0.05],
        'learning_rate': ['adaptive'],
    }

    # All metrics are recorded in cv_results_; 'roc_auc' selects the refit model.
    metricNames = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

    gridSearch = GridSearchCV(
        estimator=baseModel,
        param_grid=searchSpace,
        scoring=metricNames,
        refit='roc_auc',
        cv=4,
        n_jobs=-1,
    )
    return gridSearch.fit(XTrainTrans, YTrain)

In [16]:
# Collect data: for each cap on the number of values kept per categorical
# feature, run the cross-validated MLP grid search (runMLPClassifier) and
# record the test metrics next to the cross-validation results.

categoryLimits = [3, 5, 10, 20, 42]
resultColumns = ['numCat', 'nDims', 'testMetrics', 'cvResults', 'searchObj']
results = []

for numCat in categoryLimits:
    print(f'Starting {numCat} categories')

    # Limit the values per category on the training frame, split into X/Y,
    # then downsample the training labels.
    train_ = dfCategoryLimit(train, numCat)
    XTrain, YTrain, XTest, YTest = XYSplit(train_, test)
    XTrain, YTrain = donwSampleLabels(XTrain, YTrain)

    # The preprocessor is fit on the training features only and then applied
    # to both splits.
    preprocessor = getPreprocessor().fit(XTrain)
    XTrainTrans = preprocessor.transform(XTrain)
    XTestTrans = preprocessor.transform(XTest)
    nDims = XTrainTrans.shape[1]  # number of feature columns after preprocessing

    print(f'Starting Search for {numCat} categories')
    search = runMLPClassifier(XTrainTrans, YTrain, max_iter=300)  # MLP grid search
    testMetrics = getMetrics(search, XTestTrans, YTest)  # metrics on the test split
    cvResults = search.cv_results_  # per-candidate cross-validation results
    results.append((numCat, nDims, testMetrics, cvResults, search))

# Persist everything (including the fitted search objects) in one pickle.
resultsFrame = pd.DataFrame(results, columns=resultColumns)
resultsFrame.to_pickle('data/MLP_results.pkl')

Starting 3 categories
Starting Search for 3 categories
Starting 5 categories
Starting Search for 5 categories
Starting 10 categories
Starting Search for 10 categories
Starting 20 categories
Starting Search for 20 categories
Starting 42 categories
Starting Search for 42 categories
