In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split

In [2]:
# download dataset from uci
url_data = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
url_test = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test'
nan_token = ' ?'

# Fetch and parse each file exactly once; features and target are sliced from the same frame.
adult_train_raw = pd.read_csv(url_data, header=None, na_values=nan_token)
# skiprows=1: the first line of adult.test is not a data record
adult_test_raw = pd.read_csv(url_test, header=None, skiprows=1, na_values=nan_token)

# we must specify:
#     'task':'classification' or 'regression'
#     'cat_features':list of the categorical features,
#     'cont_features':list of the continuous features,
#     'data':data (features x, not including target),
#     'target':target (y),
#     'test_data':if there is a default data for test, None otherwise,
#     'test_target':if there is a default data for test, None otherwise
adult = {
    'task': 'classification',
    'cat_features': [1, 3, 5, 6, 7, 8, 9, 13],
    'cont_features': [0, 2, 4, 10, 11, 12],
    # .copy() gives independent frames, so later column assignments do not act on a slice
    'data': adult_train_raw.iloc[:, :-1].copy(),
    'target': adult_train_raw.iloc[:, -1].copy(),
    'test_data': adult_test_raw.iloc[:, :-1].copy(),
    # Labels in adult.test end with a '.' (' <=50K.' / ' >50K.'), unlike adult.data;
    # strip it so both targets share the same classes and a LabelEncoder fitted on
    # 'target' can transform 'test_target'.
    'test_target': adult_test_raw.iloc[:, -1].str.rstrip('.'),
}

In [3]:
def _positional_labels(frame, positions):
    """Return the column labels of `frame` located at the given integer positions."""
    return [frame.columns[pos] for pos in positions]


def _make_one_hot_encoder():
    """Build a dense-output OneHotEncoder on both newer and older scikit-learn releases."""
    try:
        # newer scikit-learn releases name the dense-output switch `sparse_output`
        return OneHotEncoder(drop='if_binary', sparse_output=False)
    except TypeError:
        # older releases only accept the original `sparse` keyword
        return OneHotEncoder(drop='if_binary', sparse=False)


def preprocess_dataset(dataset, categorical_encoding='label', categorical_type='category',
                       test_size=0.25, random_state=None):
    """Split (if needed), encode and type-cast a dataset description dict.

    The dict is updated in place and also returned. The feature frames stored
    under 'data' and 'test_data' are replaced by new frames; the frames that
    were passed in are not modified.

    Parameters
    ----------
    dataset : dict
        Must contain 'task' ('classification' or anything else, treated as
        regression), 'cat_features' and 'cont_features' (lists of integer column
        positions), 'data', 'target', 'test_data' and 'test_target'.
        When 'test_data' is None, a train/test split of 'data'/'target' is made.
    categorical_encoding : str
        'label' -> OrdinalEncoder (missing values encoded as -1), column order kept.
        'one_hot' -> OneHotEncoder; the result holds the continuous columns first,
        followed by the one-hot columns, with columns relabelled 0..n-1, and
        'cont_features' / 'cat_features' are rewritten to match.
        Any other value leaves the categorical columns unencoded.
    categorical_type : str or dtype
        dtype the categorical columns are cast to.
    test_size : float or int
        Forwarded to train_test_split when a split is required.
    random_state : int or None
        Forwarded to train_test_split; pass an int for a reproducible split.

    Returns
    -------
    dict
        The same `dataset` object, with 'label_encoder', 'target_processed',
        'test_target_processed' (and 'encoder' when an encoding was applied)
        added, and 'data' / 'test_data' replaced by the processed frames.
    """
    # if there is no default test data
    if dataset['test_data'] is None:
        X_train, X_test, y_train, y_test = train_test_split(
            dataset['data'], dataset['target'], test_size=test_size, random_state=random_state)
        dataset['data'] = X_train
        dataset['target'] = y_train
        dataset['test_data'] = X_test
        dataset['test_target'] = y_test

    # label encoding for classification task, float for regression
    if dataset['task'] == 'classification':
        label_encoder = LabelEncoder()
        dataset['label_encoder'] = label_encoder
        dataset['target_processed'] = label_encoder.fit_transform(dataset['target'])
        dataset['test_target_processed'] = label_encoder.transform(dataset['test_target'])
    else:
        dataset['label_encoder'] = None
        dataset['target_processed'] = dataset['target'].astype('float')
        dataset['test_target_processed'] = dataset['test_target'].astype('float')

    # Work on copies so the frames handed in by the caller stay untouched.
    train = dataset['data'].copy()
    test = dataset['test_data'].copy()
    cat_positions = list(dataset['cat_features'])
    cont_positions = list(dataset['cont_features'])

    # encodes categorical data
    if categorical_encoding == 'label':
        # order of columns is preserved
        encoder = OrdinalEncoder(encoded_missing_value=-1)
        dataset['encoder'] = encoder
        # Assigning by label replaces the columns, so the encoded values keep their numeric dtype.
        train[_positional_labels(train, cat_positions)] = encoder.fit_transform(
            train.iloc[:, cat_positions])
        test[_positional_labels(test, cat_positions)] = encoder.transform(
            test.iloc[:, cat_positions])
    elif categorical_encoding == 'one_hot':
        # in the case of the one hot encoding we will first have the continuous columns
        # and after the categorical (one hot encoded)
        encoder = _make_one_hot_encoder()  # NaN is treated as another category (extra column)
        dataset['encoder'] = encoder
        # Reuse the source index so rows stay aligned even after a shuffled split.
        train_hot = pd.DataFrame(encoder.fit_transform(train.iloc[:, cat_positions]),
                                 index=train.index)
        test_hot = pd.DataFrame(encoder.transform(test.iloc[:, cat_positions]),
                                index=test.index)
        train = pd.concat([train.iloc[:, cont_positions], train_hot], axis=1)
        test = pd.concat([test.iloc[:, cont_positions], test_hot], axis=1)
        # Relabel 0..n-1: the integer labels of both parts would otherwise collide,
        # and the feature lists below address columns by position.
        train.columns = range(train.shape[1])
        test.columns = range(test.shape[1])
        cont_positions = list(range(len(cont_positions)))
        cat_positions = list(range(len(cont_positions), train.shape[1]))
        dataset['cont_features'] = cont_positions
        dataset['cat_features'] = cat_positions

    # continuous features as float and categorical features as category.
    # astype() returns a new frame with the requested dtypes; writing the cast columns
    # back through .iloc[:, cols] = ... would keep the existing column dtypes.
    for key, frame in (('data', train), ('test_data', test)):
        dtype_by_label = {label: categorical_type
                          for label in _positional_labels(frame, cat_positions)}
        dtype_by_label.update({label: 'float'
                               for label in _positional_labels(frame, cont_positions)})
        dataset[key] = frame.astype(dtype_by_label)
    return dataset

In [4]:
# Run the preprocessing with label (ordinal) encoding of the categorical columns
adult = preprocess_dataset(adult, categorical_encoding='label')

In [10]:
# `dataset` is a second name for the same dict object as `adult` (no copy is made),
# so every write to `dataset` is also visible through `adult`.
# Encoding choice is one of: one_hot or label
dataset, categorical_encoding = adult, 'label'

In [11]:
# No default test set supplied: hold out 25% of the data as the test set
if dataset['test_data'] is None:
    X_train, X_test, y_train, y_test = train_test_split(
        dataset['data'], dataset['target'], test_size=0.25)
    dataset.update({
        'data': X_train,
        'target': y_train,
        'test_data': X_test,
        'test_target': y_test,
    })

In [12]:
# Target handling: regression targets become floats, classification targets
# are mapped to integer class ids by a LabelEncoder kept in the dict
if dataset['task'] != 'classification':
    dataset['label_encoder'] = None
    dataset['target_processed'] = dataset['target'].astype('float')
else:
    label_encoder = LabelEncoder()
    dataset['label_encoder'] = label_encoder
    encoded_target = label_encoder.fit_transform(dataset['target'])
    dataset['target_processed'] = encoded_target

In [13]:
# encodes categorical data
if categorical_encoding == 'label':
    # order of columns is preserved
    encoder = OrdinalEncoder(encoded_missing_value=-1)
    dataset['encoder'] = encoder
    dataset['data'].iloc[:,dataset['cat_features']] = encoder.fit_transform(
        dataset['data'].iloc[:,dataset['cat_features']])
    dataset['test_data'].iloc[:,dataset['cat_features']] = encoder.transform(
        dataset['test_data'].iloc[:,dataset['cat_features']])
elif categorical_encoding == 'one_hot':
    # in the case of the one hot encoding we will first have the continuous columns
    # and after the categorical (one hot encoded)
    try:
        # newer scikit-learn releases name the dense-output switch `sparse_output`
        encoder = OneHotEncoder(drop='if_binary', sparse_output=False)
    except TypeError:
        # older releases only accept the original `sparse` keyword
        encoder = OneHotEncoder(drop='if_binary', sparse=False)
    # NaN is treated as another category (extra column)
    dataset['encoder'] = encoder
    n_cont = len(dataset['cont_features'])
    # Build the one-hot frames on the source index so rows stay aligned with the
    # continuous columns even when the index is not 0..n-1 (e.g. after a shuffled split).
    train_hot = pd.DataFrame(
        encoder.fit_transform(dataset['data'].iloc[:,dataset['cat_features']]),
        index=dataset['data'].index)
    test_hot = pd.DataFrame(
        encoder.transform(dataset['test_data'].iloc[:,dataset['cat_features']]),
        index=dataset['test_data'].index)
    dataset['data'] = pd.concat(
        [dataset['data'].iloc[:,dataset['cont_features']], train_hot], axis=1)
    dataset['test_data'] = pd.concat(
        [dataset['test_data'].iloc[:,dataset['cont_features']], test_hot], axis=1)
    # Relabel columns 0..n-1: the integer labels of the two parts would otherwise
    # collide, and the feature lists below address columns by position.
    dataset['data'].columns = range(dataset['data'].shape[1])
    dataset['test_data'].columns = range(dataset['test_data'].shape[1])
    dataset['cont_features'] = list(range(n_cont))
    dataset['cat_features'] = list(range(n_cont, dataset['data'].shape[1]))

In [14]:
# continuous features as float and categorical features as category.
# astype() returns a new frame that really carries the requested dtypes; writing
# the cast columns back through .iloc[:, cols] = ... would keep the existing dtypes.
for frame_key in ('data', 'test_data'):
    frame = dataset[frame_key]
    # feature lists hold integer positions, astype() wants column labels
    dtype_by_label = {frame.columns[pos]: 'category' for pos in dataset['cat_features']}
    dtype_by_label.update({frame.columns[pos]: 'float' for pos in dataset['cont_features']})
    dataset[frame_key] = frame.astype(dtype_by_label)