In [1]:
import pandas as pd
import json

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report

# Data

In [7]:
! curl https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data --output train.csv
! curl https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test --output test.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 3881k  100 3881k    0     0  1473k      0  0:00:02  0:00:02 --:--:-- 1473k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1956k  100 1956k    0     0  1073k      0  0:00:01  0:00:01 --:--:-- 1072k


In [3]:
# Column names applied to both CSV files, in file order; the final entry is the label.
COLUMNS = (
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'income-level',
)

# Subset of COLUMNS that the preprocessing step treats as categorical.
CATEGORICAL_COLUMNS = (
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
)

# The training file is read without a header row; names come from COLUMNS.
with open('./train.csv', 'r') as train_file:
    train_data = pd.read_csv(train_file, header=None, names=COLUMNS)

# The first line of the test file is skipped before parsing.
with open('./test.csv', 'r') as test_file:
    test_data = pd.read_csv(test_file, names=COLUMNS, skiprows=1)

# Features: every column except the label. Labels: True for the >50K class.
# The label strings keep their leading space, and the test file's label
# additionally ends with a period.
x_train = train_data.drop(columns='income-level').to_numpy()
y_train = train_data['income-level'].eq(' >50K').to_numpy()

x_test = test_data.drop(columns='income-level').to_numpy()
y_test = test_data['income-level'].eq(' >50K.').to_numpy()

# Sanity check: report the shape of each array.
for tag, array in (('x_train:', x_train), ('y_train:', y_train),
                   ('x_test:', x_test), ('y_test:', y_test)):
    print(tag, array.shape)

x_train: (32561, 14)
y_train: (32561,)
x_test: (16281, 14)
y_test: (16281,)


# Preprocessing

In [4]:
# Feature columns are everything in COLUMNS except the trailing label column.
FEATURE_COLUMNS = COLUMNS[:-1]

# Named (name, transformer) entries that FeatureUnion will concatenate column-wise.
categorical_pipelines = []

for i, col in enumerate(FEATURE_COLUMNS):
    if col not in CATEGORICAL_COLUMNS:
        continue

    # SelectKBest keeps the k highest-scoring columns. Assigning a one-hot
    # `scores_` by hand (instead of calling fit) turns it into a selector for
    # exactly the column at position i.
    # Example, for i == 1 ('workclass'):
    #   row    = [39, 'State-gov', 77516, 'Bachelors', 13, ...]
    #   scores = [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    #   skb.transform([row]) -> [['State-gov']]
    # NOTE(review): this relies on SelectKBest.transform accepting a manually
    # set `scores_` without fit -- confirm against the installed scikit-learn.
    scores = [0] * len(FEATURE_COLUMNS)
    scores[i] = 1
    skb = SelectKBest(k=1)
    skb.scores_ = scores

    # Fit the binarizer on the extracted training column so it learns the
    # category values it will later encode.
    lbn = LabelBinarizer()
    lbn.fit(skb.transform(x_train))

    # Per-column pipeline: select the column, then binarize it.
    categorical_pipelines.append(
        ('categorical-{}'.format(i), Pipeline([
            ('SKB-{}'.format(i), skb),
            ('LBN-{}'.format(i), lbn)])))

# Selector for the numerical features: score 1 for every feature column that
# is not categorical. Deriving the mask (and k) from CATEGORICAL_COLUMNS keeps
# it in sync with the loop above instead of hard-coding positions.
numerical_mask = [0 if col in CATEGORICAL_COLUMNS else 1 for col in FEATURE_COLUMNS]
skb = SelectKBest(k=sum(numerical_mask))
skb.scores_ = numerical_mask
categorical_pipelines.append(('numerical', skb))

# Combine all per-column outputs into a single feature matrix.
preprocess = FeatureUnion(categorical_pipelines)

# Model & Train

In [5]:
# Fix the seed so that Restart & Run All rebuilds the same forest and
# therefore reports the same metrics on every run.
clf = RandomForestClassifier(random_state=42)
# Train on the preprocessed training features; fit() returns the estimator,
# which the notebook displays as the cell output.
clf.fit(preprocess.transform(x_train), y_train)

RandomForestClassifier()

## Prediction

In [6]:
# Chain the preprocessing union and the trained classifier so that raw rows
# can be passed straight to predict().
pipeline = Pipeline(steps=[
    ('union', preprocess),
    ('classifier', clf),
])

# Predict on the held-out test rows and print per-class metrics.
y_pred = pipeline.predict(x_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

       False       0.89      0.93      0.91     12435
        True       0.72      0.61      0.66      3846

    accuracy                           0.85     16281
   macro avg       0.80      0.77      0.78     16281
weighted avg       0.85      0.85      0.85     16281

