In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, f1_score, r2_score, explained_variance_score, roc_auc_score
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelBinarizer
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.linear_model import Lasso

import torch
from torch import nn
import torch.nn.functional as F

torch.manual_seed(0)
np.random.seed(0)

In [2]:
names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'salary']
train = pd.read_csv('adult.data', names=names)
test = pd.read_csv('adult.test', names=names)

df = pd.concat([train, test])

print('raw original',df.shape)

datatypes_all = [
    ('age', 'positive int'),
    ('fnlwgt', 'positive int'),
    ('workclass', 'categorical'),
    ('education', 'categorical'),
    ('education-num', 'categorical'),
    ('marital-status', 'categorical'),
    ('occupation', 'categorical'),
    ('relationship', 'categorical'),
    ('race', 'categorical'),
    ('sex', 'categorical binary'),
    ('capital-gain', 'positive float'),
    ('capital-loss', 'positive float'),
    ('hours-per-week', 'positive int'),
    ('native-country', 'categorical'),
    ('salary', 'categorical binary'),
]

raw original (48842, 15)


In [3]:
class Processor:
    def __init__(self, datatypes):
        self.datatypes = datatypes
        
    def fit(self, matrix):
        preprocessors, cutoffs = [], []
        for i, (column, datatype) in enumerate(self.datatypes):
            preprocessed_col = matrix[:,i].reshape(-1, 1)

            if 'categorical' in datatype:
                preprocessor = LabelBinarizer()
            else:
                preprocessor = MinMaxScaler()

            preprocessed_col = preprocessor.fit_transform(preprocessed_col)
            cutoffs.append(preprocessed_col.shape[1])
            preprocessors.append(preprocessor)
        
        self.cutoffs = cutoffs
        self.preprocessors = preprocessors
    
    def transform(self, matrix):
        preprocessed_cols = []
        
        for i, (column, datatype) in enumerate(self.datatypes):
            preprocessed_col = matrix[:,i].reshape(-1, 1)
            preprocessed_col = self.preprocessors[i].transform(preprocessed_col)
            preprocessed_cols.append(preprocessed_col)

        return np.concatenate(preprocessed_cols, axis=1)

        
    def fit_transform(self, matrix):
        self.fit(matrix)
        return self.transform(matrix)
            
    def inverse_transform(self, matrix):
        postprocessed_cols = []

        j = 0
        for i, (column, datatype) in enumerate(self.datatypes):
            postprocessed_col = self.preprocessors[i].inverse_transform(matrix[:,j:j+self.cutoffs[i]])

            if 'categorical' in datatype:
                postprocessed_col = postprocessed_col.reshape(-1, 1)
            else:
                if 'positive' in datatype:
                    postprocessed_col = postprocessed_col.clip(min=0)

                if 'int' in datatype:
                    postprocessed_col = postprocessed_col.round()

            postprocessed_cols.append(postprocessed_col)
            
            j += self.cutoffs[i]
        
        return np.concatenate(postprocessed_cols, axis=1)

In [4]:
datatypes_ori = [
    ('age', 'positive int'),
    ('workclass', 'categorical'),
    ('education-num', 'categorical'),
    ('marital-status', 'categorical'),
    ('occupation', 'categorical'),
    ('relationship', 'categorical'),
    ('race', 'categorical'),
    ('sex', 'categorical binary'),
    ('capital-gain', 'positive float'),
    ('capital-loss', 'positive float'),
    ('hours-per-week', 'positive int'),
    ('native-country', 'categorical'),
    ('salary', 'categorical binary'),
]
ori_df = df.drop(columns=['education', 'fnlwgt'])
print('after dropping',ori_df.shape)
map_back_ori = {}
map_forward_ori = {}
for column, datatype in datatypes_ori:
    if 'categorical' in datatype:
        series = ori_df[column].astype('category')
        ori_df[column] = series.cat.codes
        map_back_ori[column] = dict(enumerate(series.cat.categories)) # e.g. 9 --> 'USA'
        map_forward_ori[column] = dict(map(reversed, map_back_ori[column].items())) #e.g. 'USA' --> 9
ori_df=(ori_df.reset_index()).drop(columns='index')
map_forward_ori

after dropping (48842, 13)


{'workclass': {' ?': 0,
  ' Federal-gov': 1,
  ' Local-gov': 2,
  ' Never-worked': 3,
  ' Private': 4,
  ' Self-emp-inc': 5,
  ' Self-emp-not-inc': 6,
  ' State-gov': 7,
  ' Without-pay': 8},
 'education-num': {1: 0,
  2: 1,
  3: 2,
  4: 3,
  5: 4,
  6: 5,
  7: 6,
  8: 7,
  9: 8,
  10: 9,
  11: 10,
  12: 11,
  13: 12,
  14: 13,
  15: 14,
  16: 15},
 'marital-status': {' Divorced': 0,
  ' Married-AF-spouse': 1,
  ' Married-civ-spouse': 2,
  ' Married-spouse-absent': 3,
  ' Never-married': 4,
  ' Separated': 5,
  ' Widowed': 6},
 'occupation': {' ?': 0,
  ' Adm-clerical': 1,
  ' Armed-Forces': 2,
  ' Craft-repair': 3,
  ' Exec-managerial': 4,
  ' Farming-fishing': 5,
  ' Handlers-cleaners': 6,
  ' Machine-op-inspct': 7,
  ' Other-service': 8,
  ' Priv-house-serv': 9,
  ' Prof-specialty': 10,
  ' Protective-serv': 11,
  ' Sales': 12,
  ' Tech-support': 13,
  ' Transport-moving': 14},
 'relationship': {' Husband': 0,
  ' Not-in-family': 1,
  ' Other-relative': 2,
  ' Own-child': 3,
  ' Unm

In [5]:
#TESTING MY HYPOTHESIS ABOUT Anderson's random forest
SELECTEDFEATURES = ['workclass','education','marital-status','occupation',
                    'relationship','race','sex','native-country','salary']  # ['workclass','education','marital-status','occupation','relationship','race','sex','native-country']

SIZE_DATASET = 4000 #also tried with 8000 and 48842. The score comes out similar, actually
ori_df_cut = df[SELECTEDFEATURES][:SIZE_DATASET]
for column, datatype in datatypes_all:
    if (column in ori_df_cut.columns) and ('categorical' in datatype):
        ori_df_cut[column] = ori_df_cut[column].astype('category').cat.codes
datatypes_anderson = [(col,datatype) for (col,datatype) in datatypes_all if col in ori_df_cut.columns]
ori_df_cut
#ori_scores,real_scores = get_scores(ori_df_cut,ori_df_cut,datatypes_anderson,return_original=True,anderson=True)

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,salary
0,6,9,4,1,1,4,1,37,0
1,5,9,2,4,0,4,1,37,0
2,3,11,0,6,1,4,1,37,0
3,3,1,2,6,0,2,1,37,0
4,3,9,2,10,5,2,0,5,0
5,3,12,2,4,5,4,0,37,0
6,3,6,3,8,1,2,0,21,0
7,5,11,2,4,0,4,1,37,1
8,3,12,4,10,1,4,0,37,1
9,3,9,2,4,0,4,1,37,1


In [63]:
#size 4000 was found in the original implementation of the work by Anderson, so we tried it as well. 
#With 4000 instead of 32561, it increases the accuracy (overfit?), but we report the lower accuracy score from 32561. 
def get_random_forest_score(df_input,df_test,SIZE_DATASET=4000,compute_f1=False,shuffle=True):
    if shuffle: 
        df = df_input.sample(frac=1).reset_index(drop=True)
    else:
        df = df_input
        
    X_train_encoded = df[:SIZE_DATASET].values
    X_test_encoded = df_test[:SIZE_DATASET].values

    clf = RandomForestClassifier(n_estimators=800, random_state=0)
    clf.fit(X_train_encoded[:,:-1], X_train_encoded[:,-1])
    prediction = clf.predict(X_test_encoded[:,:-1])
    score = accuracy_score(X_test_encoded[:,-1], prediction)
    print('accuracy score by random forest method is:',score)
    if compute_f1: 
        print('f_1 score is:',f1_score(X_test_encoded[:,-1], prediction))
        
    return score
    
get_random_forest_score(ori_df[:32561],ori_df[-16281:])
get_random_forest_score(ori_df_cut[:32561],ori_df_cut[-16281:])

accuracy score by random forest method is: 0.84525
accuracy score by random forest method is: 0.88975


0.88975

## Random forest score by Anderson work, just on discrete columns 

In [68]:
p = Processor(datatypes_ori)
p.fit(ori_df.values)

for path in ['synthetic_ae_0_2_gan_0_3.csv','synthetic_ae_0_4_gan_0_4.csv','synthetic_ae_0_7_gan_0_7.csv']:
    X_synthetic_encoded = pd.read_csv(path, index_col='Unnamed: 0')
    X_synthetic_real = p.inverse_transform(X_synthetic_encoded.to_numpy())
    synthetic_data = pd.DataFrame(X_synthetic_real, columns=ori_df.columns) #13 columns
    our_columns_for_random_forest = [col if col != 'education' else 'education-num' for col in ori_df_cut.columns]
    synthetic_data = synthetic_data[our_columns_for_random_forest]#.drop(columns='education-num')
    SIZE_DATASET=4000
    print('with training size',SIZE_DATASET,get_random_forest_score(synthetic_data, ori_df_cut[-16281:],SIZE_DATASET=SIZE_DATASET))
    SIZE_DATASET=32561
    print('with training size',SIZE_DATASET,get_random_forest_score(synthetic_data, ori_df_cut[-16281:],SIZE_DATASET=SIZE_DATASET))



accuracy score by random forest method is: 0.72575
with training size 4000 0.72575
accuracy score by random forest method is: 0.71375
with training size 32561 0.71375
accuracy score by random forest method is: 0.716
with training size 4000 0.716
accuracy score by random forest method is: 0.7065
with training size 32561 0.7065
accuracy score by random forest method is: 0.75225
with training size 4000 0.75225
accuracy score by random forest method is: 0.73975
with training size 32561 0.73975


## Random forest score by Anderson work, on all 13 columns of our preprocessing

In [65]:
p = Processor(datatypes_ori)
p.fit(ori_df.values)

for path in ['synthetic_ae_0_2_gan_0_3.csv','synthetic_ae_0_4_gan_0_4.csv','synthetic_ae_0_7_gan_0_7.csv']:
    X_synthetic_encoded = pd.read_csv(path, index_col='Unnamed: 0')
    X_synthetic_real = p.inverse_transform(X_synthetic_encoded.to_numpy())
    synthetic_data = pd.DataFrame(X_synthetic_real, columns=ori_df.columns) #13 columns
    SIZE_DATASET=4000
    print('with training size',SIZE_DATASET,get_random_forest_score(synthetic_data, ori_df[-16281:],SIZE_DATASET=SIZE_DATASET))
    SIZE_DATASET=32561
    print('with training size',SIZE_DATASET,get_random_forest_score(synthetic_data, ori_df[-16281:],SIZE_DATASET=SIZE_DATASET))



accuracy score by random forest method is: 0.7295
with training size 4000 0.7295
accuracy score by random forest method is: 0.7466371844481298
with training size 32561 0.7466371844481298
accuracy score by random forest method is: 0.78175
with training size 4000 0.78175
accuracy score by random forest method is: 0.7868067072047171
with training size 32561 0.7868067072047171
accuracy score by random forest method is: 0.79
with training size 4000 0.79
accuracy score by random forest method is: 0.7919660954486825
with training size 32561 0.7919660954486825
