In [29]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [30]:
census_data = pd.read_csv('adult.csv')

In [31]:
census_data.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education.num',
       'marital.status', 'occupation', 'relationship', 'race', 'sex',
       'capital.gain', 'capital.loss', 'hours.per.week', 'native.country',
       'income'],
      dtype='object')

In [32]:
census_data.income.unique()

array(['<=50K', '>50K'], dtype=object)

In [33]:
def change_values(input):
    if input == '<=50K':
        return 0
    else:
        return 1

In [34]:
census_data['income'] = census_data['income'].apply(change_values)

In [10]:
census_data.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,0
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,0
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,0
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,0
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,0


In [11]:
for i in census_data:
    print(pd.isnull(i))

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


In [12]:
census_data.apply(lambda x: sum(x.isnull()))

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

In [13]:
census_data['age'].dtype

dtype('int64')

In [14]:
X_data = census_data.drop("income",axis = 1)
y_data = census_data['income']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X_data,y_data,test_size = 0.3, random_state = 24)

In [16]:
sex = tf.feature_column.categorical_column_with_vocabulary_list("sex",['Female','Male'])
race = tf.feature_column.categorical_column_with_hash_bucket("race",hash_bucket_size=1000)
occupation = tf.feature_column.categorical_column_with_hash_bucket("occupation",hash_bucket_size=1000)
relationship = tf.feature_column.categorical_column_with_hash_bucket("relationship",hash_bucket_size=1000)
education = tf.feature_column.categorical_column_with_hash_bucket("education",hash_bucket_size=1000)
marital_status = tf.feature_column.categorical_column_with_hash_bucket("marital.status",hash_bucket_size=1000)
workclass = tf.feature_column.categorical_column_with_hash_bucket("workclass",hash_bucket_size=1000)
native_country = tf.feature_column.categorical_column_with_hash_bucket("native.country",hash_bucket_size=1000)

In [17]:
age = tf.feature_column.numeric_column("age")
education_num = tf.feature_column.numeric_column("education.num")
capital_gain = tf.feature_column.numeric_column("capital.gain")
capital_loss= tf.feature_column.numeric_column("capital.loss")
hours_per_week = tf.feature_column.numeric_column("hours.per.week")

In [18]:
feat_cols = [age, workclass, education, education_num,
       marital_status, occupation,relationship, race, sex,
       capital_gain, capital_loss, hours_per_week, native_country]

In [19]:
input_func = tf.estimator.inputs.pandas_input_fn(x = X_train,y=y_train, batch_size=100,num_epochs=None,shuffle=True)

## Create Linear Classifier

In [20]:
model = tf.estimator.LinearClassifier(feature_columns=feat_cols) 

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\ABHINA~1.BOH\\AppData\\Local\\Temp\\tmp50nfc0nc', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x0000021779EC6278>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [21]:
model.train(input_fn=input_func,steps = 5000)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Calling model_fn.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use tf.cast instead.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Saving checkpoints for 0 into C:\Users\ABHINA~1.BOH\AppData\Local\Temp\tmp50nfc0nc\model.ckpt.
INFO:tensorflow:loss = 69.31474, step = 1
INFO:tensorflow:global_step/sec: 121.831
INFO:tensorflow:loss = 366.69495, step = 101 (0.823 sec)
INFO:tensorflow:global_step/sec: 367.281
INFO:tensorflow:loss = 49.578983, step = 201 (0.272 se

<tensorflow_estimator.python.estimator.canned.linear.LinearClassifier at 0x21779eb97b8>

In [22]:
pred_fn = tf.estimator.inputs.pandas_input_fn(x = X_test, batch_size=len(X_test), shuffle=True)

In [23]:
predictions = list(model.predict(input_fn=pred_fn))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from C:\Users\ABHINA~1.BOH\AppData\Local\Temp\tmp50nfc0nc\model.ckpt-5000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [24]:
predictions[0]

{'logits': array([-6.018335], dtype=float32),
 'logistic': array([0.00242788], dtype=float32),
 'probabilities': array([0.9975721 , 0.00242781], dtype=float32),
 'class_ids': array([0], dtype=int64),
 'classes': array([b'0'], dtype=object)}

In [25]:
finalpreds = []
for preds in predictions:
    finalpreds.append(preds['class_ids'][0])

In [26]:
finalpreds[:10]

[0, 1, 0, 0, 1, 0, 0, 0, 0, 0]

In [27]:
from sklearn import metrics

In [28]:
print(metrics.classification_report(y_test,finalpreds))

             precision    recall  f1-score   support

          0       0.77      0.82      0.79      7456
          1       0.25      0.20      0.22      2313

avg / total       0.64      0.67      0.66      9769

