# Salary Binary Classification
This problem uses the census income dataset from the UCI (University of California, Irvine) Machine Learning Repository. The dataset is extracted from the U.S. Census Bureau database. Using this dataset, we will classify whether a person has a salary greater than $50k. It is strictly a binary classification problem.

In [114]:
# kaggle/python Docker image: https://github.com/kaggle/docker-python
import numpy as np 
import pandas as pd 

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import os
# Walk the Kaggle input directory and echo the full path of every file found,
# so the dataset location is visible in the notebook output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, filename) for filename in filenames):
        print(full_path)

/kaggle/input/adult-census-income/adult.csv


In [115]:
# Column names applied to the CSV, in file order. 'CG', 'CL' and 'HPW' are
# presumably capital gain, capital loss and hours per week -- TODO confirm
# against the dataset header.
cols = [
    'Age',
    'Workclass',
    'fnlwgt',
    'Education',
    'Education-num',
    'Marital-status',
    'Occupation',
    'Relationship',
    'Race',
    'Sex',
    'CG',
    'CL',
    'HPW',
    'Country',
    'Salary',
]

# Columns discarded right after loading; they are not used as model features.
unwanted_cols = ['fnlwgt', 'Education', 'Relationship', 'CG', 'CL', 'Country']

Read the data, drop the unwanted columns, replace the '?' placeholders with NaN, and drop any rows that still contain missing values.

In [116]:
# Load the CSV with our own column names in place of the file's header row.
# Unused columns are dropped *before* the missing-value filter so that a '?'
# in one of them cannot cause an otherwise complete row to be discarded.
salary_data = (
    pd.read_csv(
        '../input/adult-census-income/adult.csv',
        names=cols,
        header=0,
    )
    .drop(columns=unwanted_cols)
    .replace('?', np.nan)
    .dropna()
)

# Everything except the label column is a model input.
salary_features = salary_data.drop(columns='Salary')

> Convert the target column (Salary) to numeric form (0 or 1)

In [117]:
# Encode the 'Salary' label column as integer class ids. The double brackets
# select a single-column DataFrame, so the result stays a DataFrame too.
# 'Sex' is not label-encoded here; it is one-hot encoded together with the
# other categorical feature columns.
label_encoder = preprocessing.LabelEncoder()
salary_target = salary_data[['Salary']].apply(
    lambda column: label_encoder.fit_transform(column)
)

Apply one-hot-encoding

In [118]:
# Categorical feature columns to expand into one indicator column per category.
ohe_cols = ['Workclass', 'Marital-status', 'Occupation', 'Race', 'Sex']

# Request float indicator columns explicitly. Newer pandas releases default to
# bool dummies; mixed with the float feature columns that makes the frame's
# `.values` an object array, which `torch.from_numpy` cannot convert.
salary_features = pd.get_dummies(salary_features, columns=ohe_cols, dtype=float)

Normalize numerical data

In [119]:
# Standardise the numeric columns (zero mean, unit variance per column).
# NOTE(review): this runs on the full dataset before the train/test split, so
# the held-out rows contribute to the scaling statistics.
scalable_cols = ['Age', 'Education-num', 'HPW']
numeric_block = salary_features[scalable_cols]
salary_features[scalable_cols] = preprocessing.scale(numeric_block)

Split into train and test sets

In [120]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
split_options = {'test_size': 0.20, 'random_state': 0}
X_train, x_test, Y_train, y_test = train_test_split(
    salary_features, salary_target, **split_options
)

Convert the training and test data into PyTorch tensors

In [121]:
# Convert the feature frames to float32 tensors (the default parameter dtype
# of the network's linear layers).
Xtrain_ = torch.from_numpy(X_train.to_numpy()).to(torch.float32)
Xtest_ = torch.from_numpy(x_test.to_numpy()).to(torch.float32)
# Displayed as the cell output: (number of training rows, number of features).
Xtrain_.shape

torch.Size([24574, 38])

Reshape the labels into the 1-D integer (long) tensor format that our loss function requires

In [122]:
# The label frames hold a single column; flatten them into 1-D int64 tensors,
# which is the target layout the classification loss expects.
Ytrain_ = torch.from_numpy(Y_train.values).flatten().long()
Ytest_ = torch.from_numpy(y_test.values).flatten().long()
# Displayed as the cell output to confirm the label dtype.
Ytrain_.type()

'torch.LongTensor'

Create NN

In [123]:
class Net(nn.Module):
    """Feed-forward classifier with two ReLU hidden layers.

    The layer sizes are constructor arguments so the model does not silently
    break when the number of encoded feature columns changes. The defaults
    reproduce the original fixed architecture (38 -> 32 -> 32 -> 2).

    Args:
        in_features: Number of input features per sample.
        hidden_size: Width of each of the two hidden layers.
        num_classes: Number of output classes.
    """

    def __init__(self, in_features: int = 38, hidden_size: int = 32,
                 num_classes: int = 2) -> None:
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return per-class log-probabilities for the batch ``x``.

        Args:
            x: Float tensor whose last dimension has ``in_features`` entries.

        Returns:
            Tensor with ``num_classes`` entries in its last dimension,
            normalised with log-softmax over that dimension.
        """
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        logits = self.fc3(hidden)

        # Log-probabilities, not raw logits: exp() of each row sums to 1.
        return F.log_softmax(logits, dim=-1)

In [124]:
model = Net()

# Adam with its default hyper-parameters.
optimizer = optim.Adam(model.parameters())

# Net.forward already returns log-probabilities (log_softmax), so use the
# negative log-likelihood loss directly. CrossEntropyLoss would apply
# log_softmax a second time: the value is the same, but the extra pass is
# redundant and obscures what the network actually outputs.
loss_fn = nn.NLLLoss()

In [125]:
# Per-epoch history rows: [epoch, train loss, test loss, test accuracy].
epoch_data = []
epochs = 1000

for epoch in range(1, epochs + 1):
    # --- Training step on the whole training set (full batch) ---
    optimizer.zero_grad()

    Ypred = model(Xtrain_)

    loss = loss_fn(Ypred, Ytrain_)
    loss.backward()

    optimizer.step()

    # --- Evaluation on the held-out set ---
    # No gradients are needed here; no_grad avoids building an autograd graph
    # for the test forward pass on every epoch.
    with torch.no_grad():
        Ypred_test = model(Xtest_)
        loss_test = loss_fn(Ypred_test, Ytest_)

        # Predicted class = index of the largest score in each row.
        _, pred = Ypred_test.max(1)

        accuracy = pred.eq(Ytest_).sum().item() / Ytest_.numel()

    train_loss = loss.item()
    test_loss = loss_test.item()
    epoch_data.append([epoch, train_loss, test_loss, accuracy])

    if epoch % 100 == 0:
        # Derive the percentage from `epochs` so it stays correct if the
        # epoch count is changed.
        progress = 100 * epoch / epochs
        print(f'epoch - {epoch} ({progress}%) train loss - {train_loss:.4f} test loss - {test_loss:.4f} accuracy - {accuracy:.4f}')

epoch - 100 (10.0%) train loss - 0.5295 test loss - 0.5260 accuracy - 0.7523
epoch - 200 (20.0%) train loss - 0.3700 test loss - 0.3651 accuracy - 0.8322
epoch - 300 (30.0%) train loss - 0.3582 test loss - 0.3550 accuracy - 0.8366
epoch - 400 (40.0%) train loss - 0.3534 test loss - 0.3521 accuracy - 0.8364
epoch - 500 (50.0%) train loss - 0.3499 test loss - 0.3492 accuracy - 0.8387
epoch - 600 (60.0%) train loss - 0.3473 test loss - 0.3486 accuracy - 0.8410
epoch - 700 (70.0%) train loss - 0.3459 test loss - 0.3484 accuracy - 0.8392
epoch - 800 (80.0%) train loss - 0.3449 test loss - 0.3486 accuracy - 0.8392
epoch - 900 (90.0%) train loss - 0.3438 test loss - 0.3481 accuracy - 0.8403
epoch - 1000 (100.0%) train loss - 0.3431 test loss - 0.3483 accuracy - 0.8390
