In [1]:
import tensorflow as tf
import pandas as pd
from imblearn.over_sampling import SMOTE

In [2]:
PATH = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
PATH_test = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"
COLUMNS = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital', 'occupation', 'relationship', 'race', 'sex', 'capital_gain', 
           'capital_loss', 'hours_week', 'native_country', 'label']

In [3]:
df_train = pd.read_csv(PATH, skipinitialspace=True, names=COLUMNS, index_col=False)
df_test = pd.read_csv(PATH_test, skipinitialspace=True, names=COLUMNS, index_col=False, skiprows=1)

In [None]:
# Apply SMOTE to the training set
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

In [4]:
df_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital,occupation,relationship,race,sex,capital_gain,capital_loss,hours_week,native_country,label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
df_test.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital,occupation,relationship,race,sex,capital_gain,capital_loss,hours_week,native_country,label
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.


In [6]:
print(df_train.shape, df_test.shape)

(32561, 15) (16281, 15)


In [7]:
print(df_train.dtypes)

age                int64
workclass         object
fnlwgt             int64
education         object
education_num      int64
marital           object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours_week         int64
native_country    object
label             object
dtype: object


In [8]:
label = {'<=50K' : 0, '>50K' : 1}
df_train.label = [label[item] for item in df_train.label]
label = {'<=50K.' : 0, '>50K.' : 1}
df_test.label = [label[item] for item in df_test.label]

In [9]:
print(df_train["label"].value_counts())

label
0    24720
1     7841
Name: count, dtype: int64


In [10]:
print(df_test["label"].value_counts())

label
0    12435
1     3846
Name: count, dtype: int64


In [11]:
print(df_train.dtypes)

age                int64
workclass         object
fnlwgt             int64
education         object
education_num      int64
marital           object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours_week         int64
native_country    object
label              int64
dtype: object


In [12]:
##Adding features to the bucket
##Define the continous list
CONTI_FEATURES = ['age', 'fnlwgt', 'capital_gain', 'education_num', 'capital_loss', 'hours_week']

In [13]:
#Defining the categorical list
CATE_FEATURES = ['workclass', 'education', 'marital', 'occupation', 'relationship', 'race', 'sex', 'native_country']

In [14]:
continous_features = [tf.feature_column.numeric_column(k) for k in CONTI_FEATURES]

In [15]:
relationship = tf.feature_column.categorical_column_with_vocabulary_list('relationship' ,['Husband', 'Not-in-family', 'Wife', 'Own-child', 'Unmarried', 'Other-relative'])

In [16]:
categorical_features = [tf.feature_column.categorical_column_with_hash_bucket(k, hash_bucket_size=1000) for k in CATE_FEATURES]

In [17]:
model = tf.estimator.LinearClassifier(n_classes=2, feature_columns=categorical_features+continous_features)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\User\\AppData\\Local\\Temp\\tmpuqnu56t5', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [18]:
FEATURES = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital', 'occupation', 'relationship', 'race', 'sex', 'capital_gain', 
            'capital_loss', 'hours_week', 'native_country']

In [19]:
LABEL = 'label'

In [20]:
def get_input_fn(data_set, num_epochs=None, n_batch=128, shuffle=True):
    return tf.compat.v1.estimator.inputs.pandas_input_fn(
        x = pd.DataFrame({k:data_set[k].values for k in FEATURES}),
        y = pd.Series(data_set[LABEL].values),
        batch_size = n_batch,
        num_epochs = num_epochs,
        shuffle=shuffle
    )

In [21]:
model.train(input_fn=get_input_fn(df_train, num_epochs=None, n_batch=128, shuffle=False), steps=1000)



Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Calling model_fn.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 0...
INFO:tensorflow:Saving checkpoints for 0 into C:\Users\User\AppData\Local\Temp\tmpuqnu56t5\model.ckpt.
INFO:tensorflow:Calling checkpoint listeners after saving checkpo

<tensorflow_estimator.python.estimator.canned.linear.LinearClassifierV2 at 0x1d1c220a940>

In [22]:
model.evaluate(input_fn=get_input_fn(df_test, num_epochs=1, n_batch=128, shuffle=False), steps=1000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2024-05-24T14:11:46
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\User\AppData\Local\Temp\tmpuqnu56t5\model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [100/1000]
INFO:tensorflow:Inference Time : 2.15169s
INFO:tensorflow:Finished evaluation at 2024-05-24-14:11:48
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.7955286, accuracy_baseline = 0.76377374, auc = 0.5896633, auc_precision_recall = 0.3941741, average_loss = 192.34283, global_step = 1000, label/mean = 0.23622629, loss = 191.74767, precision = 0.7561943, prediction/mean = 0.06195408, recall = 0.19838792
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 1000: C:\Users\User\AppData\Local\Temp\tmpuqnu56t5\model.ckpt-1000


{'accuracy': 0.7955286,
 'accuracy_baseline': 0.76377374,
 'auc': 0.5896633,
 'auc_precision_recall': 0.3941741,
 'average_loss': 192.34283,
 'label/mean': 0.23622629,
 'loss': 191.74767,
 'precision': 0.7561943,
 'prediction/mean': 0.06195408,
 'recall': 0.19838792,
 'global_step': 1000}

In [23]:
#Salary is 0 at a very one age and then keeps increasing. Close to retirement it again decreases. Hence we square age.
def square_var(df_t, df_te, var_name='age'):
    df_t['new'] = df_t[var_name].pow(2)
    df_te['new'] = df_te[var_name].pow(2)
    return df_t, df_te

In [24]:
df_train_new, df_test_new = square_var(df_train, df_test, var_name='age')

In [25]:
print(df_train_new.shape, df_test_new.shape)

(32561, 16) (16281, 16)


In [26]:
CONTI_FEATURES_NEW = ['age', 'fnlwgt', 'capital_gain', 'education_num', 'capital_loss', 'hours_week', 'new']
continous_features_new = [tf.feature_column.numeric_column(k) for k in CONTI_FEATURES_NEW]

In [27]:
model_1 = tf.estimator.LinearClassifier(feature_columns=categorical_features+continous_features_new)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\User\\AppData\\Local\\Temp\\tmpitrcukic', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [28]:
FEATURES_NEW = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital', 'occupation', 'relationship', 'race', 'sex', 'capital_gain', 
            'capital_loss', 'hours_week', 'native_country', 'new']
def get_input_fn(data_set, num_epochs=None, n_batch=128, shuffle=True):
    return tf.compat.v1.estimator.inputs.pandas_input_fn(
        x = pd.DataFrame({k:data_set[k].values for k in FEATURES_NEW}),
        y = pd.Series(data_set[LABEL].values),
        batch_size = n_batch,
        num_epochs = num_epochs,
        shuffle=shuffle
    )

In [29]:
model_1.train(input_fn=get_input_fn(df_train, num_epochs=None, n_batch=128, shuffle=False), steps=1000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 0...
INFO:tensorflow:Saving checkpoints for 0 into C:\Users\User\AppData\Local\Temp\tmpitrcukic\model.ckpt.
INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 0...
INFO:tensorflow:loss = 0.6931472, step = 0
INFO:tensorflow:global_step/sec: 94.6468
INFO:tensorflow:loss = 646.6163, step = 100 (1.057 sec)
INFO:tensorflow:global_step/sec: 122.23
INFO:tensorflow:loss = 182.25908, step = 200 (0.818 sec)
INFO:tensorflow:global_step/sec: 121.01
INFO:tensorflow:loss = 827.76324, step = 300 (0.826 sec)
INFO:tensorflow:global_step/sec: 120.99
INFO:tensorflow:loss = 86.20713, step = 400 (0.827 sec)
INFO:tensorflow:global_step/sec: 117.806
INFO:tensorflow:loss = 133.08643, step =

<tensorflow_estimator.python.estimator.canned.linear.LinearClassifierV2 at 0x1d1873cd190>

In [30]:
model_1.evaluate(input_fn=get_input_fn(df_test_new, num_epochs=1, n_batch=128, shuffle=False), steps=1000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2024-05-24T14:12:01
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\User\AppData\Local\Temp\tmpitrcukic\model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [100/1000]
INFO:tensorflow:Inference Time : 2.07896s
INFO:tensorflow:Finished evaluation at 2024-05-24-14:12:03
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.79319453, accuracy_baseline = 0.76377374, auc = 0.6049647, auc_precision_recall = 0.396361, average_loss = 141.40201, global_step = 1000, label/mean = 0.23622629, loss = 141.08281, precision = 0.6687808, prediction/mean = 0.08702526, recall = 0.24674986
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 1000: C:\Users\User\AppData\Local\Temp\tmpitrcukic\model.ckpt-1000


{'accuracy': 0.79319453,
 'accuracy_baseline': 0.76377374,
 'auc': 0.6049647,
 'auc_precision_recall': 0.396361,
 'average_loss': 141.40201,
 'label/mean': 0.23622629,
 'loss': 141.08281,
 'precision': 0.6687808,
 'prediction/mean': 0.08702526,
 'recall': 0.24674986,
 'global_step': 1000}

In [32]:
predictions = list(model_1.predict(input_fn=get_input_fn(df_test_new, num_epochs=1, n_batch=128, shuffle=True)))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\User\AppData\Local\Temp\tmpitrcukic\model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [33]:
print(df_test_new.iloc[0])

age                              25
workclass                   Private
fnlwgt                       226802
education                      11th
education_num                     7
marital               Never-married
occupation        Machine-op-inspct
relationship              Own-child
race                          Black
sex                            Male
capital_gain                      0
capital_loss                      0
hours_week                       40
native_country        United-States
label                             0
new                             625
Name: 0, dtype: object


In [34]:
print(predictions[0])

{'logits': array([-130.25336], dtype=float32), 'logistic': array([0.], dtype=float32), 'probabilities': array([1., 0.], dtype=float32), 'class_ids': array([0], dtype=int64), 'classes': array([b'0'], dtype=object), 'all_class_ids': array([0, 1]), 'all_classes': array([b'0', b'1'], dtype=object)}


In [35]:
print(df_test_new.iloc[3])
print(predictions[3])

age                               44
workclass                    Private
fnlwgt                        160323
education               Some-college
education_num                     10
marital           Married-civ-spouse
occupation         Machine-op-inspct
relationship                 Husband
race                           Black
sex                             Male
capital_gain                    7688
capital_loss                       0
hours_week                        40
native_country         United-States
label                              1
new                             1936
Name: 3, dtype: object
{'logits': array([-413.1365], dtype=float32), 'logistic': array([0.], dtype=float32), 'probabilities': array([1., 0.], dtype=float32), 'class_ids': array([0], dtype=int64), 'classes': array([b'0'], dtype=object), 'all_class_ids': array([0, 1]), 'all_classes': array([b'0', b'1'], dtype=object)}
