In [114]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [115]:
raw_census_data = pd.read_csv("census_data.csv")

In [116]:
raw_census_data.head()

Unnamed: 0,age,workclass,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [117]:
raw_census_data["income_bracket"].unique()

array([' <=50K', ' >50K'], dtype=object)

In [118]:
def getBooleanIncomeBracket(income_bracket_string):    
    if income_bracket_string == " <=50K":
        return 0
    elif income_bracket_string == " >50K":
        return 1

In [119]:
raw_census_data["income_bracket"] = raw_census_data["income_bracket"].apply(getBooleanIncomeBracket)

In [120]:
raw_census_data.head()

Unnamed: 0,age,workclass,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [121]:
from sklearn.model_selection import train_test_split

In [122]:
x_data = raw_census_data.drop("income_bracket", axis=1)
y = raw_census_data["income_bracket"]

In [123]:
X_train, X_test, y_train, y_test = train_test_split(x_data, y, test_size=0.30, random_state=610)

In [124]:
y_train.size

22792

In [125]:
x_data.columns

Index(['age', 'workclass', 'education', 'education_num', 'marital_status',
       'occupation', 'relationship', 'race', 'gender', 'capital_gain',
       'capital_loss', 'hours_per_week', 'native_country'],
      dtype='object')

In [126]:
workclass = tf.feature_column.categorical_column_with_hash_bucket("workclass",hash_bucket_size=1000)
education = tf.feature_column.categorical_column_with_hash_bucket("education",hash_bucket_size=1000)
marital_status = tf.feature_column.categorical_column_with_hash_bucket("marital_status",hash_bucket_size=1000)
occupation = tf.feature_column.categorical_column_with_hash_bucket("occupation",hash_bucket_size=1000)
relationship = tf.feature_column.categorical_column_with_hash_bucket("relationship",hash_bucket_size=1000)
race = tf.feature_column.categorical_column_with_hash_bucket("race",hash_bucket_size=1000)
gender = tf.feature_column.categorical_column_with_hash_bucket("gender",hash_bucket_size=1000)
native_country = tf.feature_column.categorical_column_with_hash_bucket("native_country",hash_bucket_size=1000)

In [127]:
age = tf.feature_column.numeric_column("age")
education_num = tf.feature_column.numeric_column("education_num")
capital_gain = tf.feature_column.numeric_column("capital_gain")
capital_loss = tf.feature_column.numeric_column("capital_loss")
hours_per_week = tf.feature_column.numeric_column("hours_per_week")

In [146]:
feature_cols = [workclass, education, marital_status, occupation, relationship, race, gender, native_country, age, education_num, capital_gain, capital_loss, hours_per_week]

In [147]:
train_input_func = tf.estimator.inputs.pandas_input_fn(x=X_train, y=y_train, batch_size=500, num_epochs=None, shuffle= True)

In [148]:
# in case , we want to use DNN classifier, we have to wrap categorical column with embedding_column
model = tf.estimator.LinearClassifier(feature_columns= feature_cols )

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/9c/883h376s53500dcqng4b4nj00000gn/T/tmps3870smc', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1a3d887c18>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [149]:
model.train(input_fn=train_input_func, steps= 20000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into /var/folders/9c/883h376s53500dcqng4b4nj00000gn/T/tmps3870smc/model.ckpt.
INFO:tensorflow:loss = 346.57355, step = 1
INFO:tensorflow:global_step/sec: 63.05
INFO:tensorflow:loss = 810.0387, step = 101 (1.588 sec)
INFO:tensorflow:global_step/sec: 90.9798
INFO:tensorflow:loss = 1879.9188, step = 201 (1.099 sec)
INFO:tensorflow:global_step/sec: 91.4063
INFO:tensorflow:loss = 1341.5927, step = 301 (1.094 sec)
INFO:tensorflow:global_step/sec: 91.7042
INFO:tensorflow:loss = 793.6274, step = 401 (1.091 sec)
INFO:tensorflow:global_step/sec: 91.5363
INFO:tensorflow:loss = 446.98218, step = 501 (1.092 sec)
INFO:tensorflow:global_step/sec: 90.0369
INFO:tensorflow:loss = 634.56055, step = 601 (1.110 sec)
INFO:tensorflow:g

INFO:tensorflow:loss = 210.11717, step = 8001 (1.092 sec)
INFO:tensorflow:global_step/sec: 92.1272
INFO:tensorflow:loss = 166.87717, step = 8101 (1.085 sec)
INFO:tensorflow:global_step/sec: 91.7595
INFO:tensorflow:loss = 180.16916, step = 8201 (1.090 sec)
INFO:tensorflow:global_step/sec: 92.011
INFO:tensorflow:loss = 150.52338, step = 8301 (1.087 sec)
INFO:tensorflow:global_step/sec: 91.9974
INFO:tensorflow:loss = 417.7991, step = 8401 (1.087 sec)
INFO:tensorflow:global_step/sec: 92.1088
INFO:tensorflow:loss = 160.31532, step = 8501 (1.086 sec)
INFO:tensorflow:global_step/sec: 91.841
INFO:tensorflow:loss = 168.25864, step = 8601 (1.089 sec)
INFO:tensorflow:global_step/sec: 92.3126
INFO:tensorflow:loss = 191.85493, step = 8701 (1.083 sec)
INFO:tensorflow:global_step/sec: 92.1583
INFO:tensorflow:loss = 147.07396, step = 8801 (1.085 sec)
INFO:tensorflow:global_step/sec: 91.7051
INFO:tensorflow:loss = 171.32115, step = 8901 (1.090 sec)
INFO:tensorflow:global_step/sec: 91.9544
INFO:tensorfl

INFO:tensorflow:global_step/sec: 88.9307
INFO:tensorflow:loss = 163.0347, step = 16301 (1.125 sec)
INFO:tensorflow:global_step/sec: 89.309
INFO:tensorflow:loss = 275.13705, step = 16401 (1.119 sec)
INFO:tensorflow:global_step/sec: 90.189
INFO:tensorflow:loss = 205.23326, step = 16501 (1.109 sec)
INFO:tensorflow:global_step/sec: 89.3508
INFO:tensorflow:loss = 167.28441, step = 16601 (1.119 sec)
INFO:tensorflow:global_step/sec: 88.9218
INFO:tensorflow:loss = 188.67715, step = 16701 (1.124 sec)
INFO:tensorflow:global_step/sec: 88.0408
INFO:tensorflow:loss = 158.72926, step = 16801 (1.136 sec)
INFO:tensorflow:global_step/sec: 87.9867
INFO:tensorflow:loss = 142.08714, step = 16901 (1.136 sec)
INFO:tensorflow:global_step/sec: 87.9525
INFO:tensorflow:loss = 136.62346, step = 17001 (1.137 sec)
INFO:tensorflow:global_step/sec: 89.6522
INFO:tensorflow:loss = 165.80186, step = 17101 (1.117 sec)
INFO:tensorflow:global_step/sec: 89.254
INFO:tensorflow:loss = 194.49411, step = 17201 (1.119 sec)
INFO

<tensorflow_estimator.python.estimator.canned.linear.LinearClassifier at 0x1a3d887358>

In [150]:
test_input_func = tf.estimator.inputs.pandas_input_fn(x=X_test, batch_size= len(X_test), num_epochs=1, shuffle= False)

In [151]:
pred_gen = model.predict(input_fn= test_input_func)

In [152]:
predictions = list(pred_gen)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/9c/883h376s53500dcqng4b4nj00000gn/T/tmps3870smc/model.ckpt-20000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [153]:
predictions

[{'logits': array([-1.9298089], dtype=float32),
  'logistic': array([0.12677172], dtype=float32),
  'probabilities': array([0.87322825, 0.12677173], dtype=float32),
  'class_ids': array([0]),
  'classes': array([b'0'], dtype=object)},
 {'logits': array([-5.218253], dtype=float32),
  'logistic': array([0.00538763], dtype=float32),
  'probabilities': array([0.99461246, 0.0053876 ], dtype=float32),
  'class_ids': array([0]),
  'classes': array([b'0'], dtype=object)},
 {'logits': array([-3.4444623], dtype=float32),
  'logistic': array([0.03093445], dtype=float32),
  'probabilities': array([0.9690656 , 0.03093444], dtype=float32),
  'class_ids': array([0]),
  'classes': array([b'0'], dtype=object)},
 {'logits': array([-4.6045866], dtype=float32),
  'logistic': array([0.00990674], dtype=float32),
  'probabilities': array([0.9900933 , 0.00990671], dtype=float32),
  'class_ids': array([0]),
  'classes': array([b'0'], dtype=object)},
 {'logits': array([-2.4374483], dtype=float32),
  'logistic':

In [154]:
final_predictions = [pred["class_ids"][0] for pred in predictions ]

In [155]:
final_predictions

[0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [156]:
from sklearn.metrics import classification_report

In [157]:
# without print() , it would be ugly
print(classification_report(y_test, final_predictions))

              precision    recall  f1-score   support

           0       0.88      0.92      0.90      7388
           1       0.72      0.62      0.67      2381

   micro avg       0.85      0.85      0.85      9769
   macro avg       0.80      0.77      0.78      9769
weighted avg       0.84      0.85      0.84      9769

