In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split

In [2]:
adult = pd.read_csv('adult/adult.csv', delimiter=', ')

  """Entry point for launching an IPython kernel.


In [3]:
data = adult
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
data.isnull().values.any()

False

In [5]:
print(data.columns)

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'salary'],
      dtype='object')


In [6]:
data['salary'].unique()

array(['<=50K', '>50K'], dtype=object)

In [7]:
data['salary'] = data['salary'].map({ "<=50K": 0, ">50K": 1 })
y = data['salary']
data = data.drop('salary', axis=1)

In [8]:
data.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [9]:
data = data.drop("capital-gain", axis=1)
data = data.drop("capital-loss", axis=1)

In [10]:
data

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba
...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,38,United-States
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,40,United-States
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,40,United-States
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,20,United-States


In [11]:
data = pd.get_dummies(data, columns=[
    "workclass", "education", "marital-status", "occupation", "relationship",
    "race", "sex", "native-country",
])

In [12]:
data

Unnamed: 0,age,fnlwgt,education-num,hours-per-week,workclass_?,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,39,77516,13,40,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,13,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,215646,9,40,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
3,53,234721,7,40,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,28,338409,13,40,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,257302,12,38,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
32557,40,154374,9,40,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
32558,58,151910,9,40,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
32559,22,201490,9,20,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    data, y, test_size=0.25, stratify=y)
print(f'Sizes:\n\tX_train: {X_train.shape}\n\tX_test: {X_test.shape}\
                \n\ty_train: {y_train.shape}\n\ty_test: {y_test.shape}')

Sizes:
	X_train: (24420, 106)
	X_test: (8141, 106)                
	y_train: (24420,)
	y_test: (8141,)


In [14]:
pd.value_counts(pd.Series(y))

0    24720
1     7841
Name: salary, dtype: int64

# Decision Tree

In [15]:
from sklearn.ensemble import RandomForestClassifier

no_trees = np.linspace(start=1, stop=400, num=20, dtype=np.int64)
depths = list(np.linspace(start=1, stop=50, num=10, dtype=np.int64))
depths.append(None)
max_no_descriptors = range(2,len(adult.columns))
param_grid = {
    'n_estimators': no_trees,
    'max_depth': depths,
    'max_features': max_no_descriptors
}

In [16]:
print(param_grid)

{'n_estimators': array([  1,  22,  43,  64,  85, 106, 127, 148, 169, 190, 211, 232, 253,
       274, 295, 316, 337, 358, 379, 400]), 'max_depth': [1, 6, 11, 17, 22, 28, 33, 39, 44, 50, None], 'max_features': range(2, 15)}


In [18]:
rf = RandomForestClassifier()

best_rf = GridSearchCV(rf, param_grid, n_jobs=-1, cv=5, verbose=10)
best_rf.fit(X_train,y_train)

print("best params: ", best_rf.best_params_)
print("best acc: ", best_rf.best_score_)

Fitting 5 folds for each of 2860 candidates, totalling 14300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:   11.7s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   14.7s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   15.7s
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed:   17.2s


KeyboardInterrupt: 

# Neural Network