In [1]:
''' ClassificationModel - This problem uses the Census Income ("Adult") data set from the University of California, Irvine (UCI) Machine Learning Repository. The data set was extracted from the U.S. Census Bureau database.
Using this data set we will classify whether a person has a salary greater than $50k. It is a binary classification problem.'''
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F #contains logsoftmax function. log_softmax is used for classification problems eg >50k or <50k salary
import torch.optim as optim

In [2]:
# Column names, in file order, applied to the CSV when it is read.
cols = [
    'Age',
    'Workclass',
    'fnlwgt',
    'Education',
    'Education-num',
    'Marital-status',
    'Occupation',
    'Relationship',
    'Race',
    'Sex',
    'CG',
    'CL',
    'HPW',
    'Country',
    'Salary',
]

# Columns judged less useful for predicting a >50k salary; dropped right after loading.
unwanted_cols = [
    'fnlwgt',
    'Education',
    'Relationship',
    'CG',
    'CL',
    'Country',
]

In [3]:
# Read the CSV with explicit column names, discard the columns we do not model
# on, then turn the ' ?' placeholder into NaN and drop every row that has one.
salary_data = (
    pd.read_csv('datasets/adult_salary.csv', names=cols)
    .drop(columns=unwanted_cols)
    .replace(' ?', np.nan)
    .dropna()
)

In [4]:
# Every column except the 'Salary' label is a model input.
salary_features = salary_data.drop(columns=['Salary'])
salary_features.columns

Index(['Age', 'Workclass', 'Education-num', 'Marital-status', 'Occupation',
       'Race', 'Sex', 'HPW'],
      dtype='object')

In [5]:
# Inspect the distinct values of the Sex column: they are strings, and the
# network needs numeric inputs, so the column is encoded in the next cell.
salary_features.Sex.unique()

array([' Male', ' Female'], dtype=object)

In [6]:
# LabelEncoder maps each distinct string to an integer code; Sex has two
# distinct values here, so the column becomes 0/1.
le = preprocessing.LabelEncoder()
salary_features['Sex'] = le.fit_transform(salary_features['Sex'])

In [7]:
# Remaining string columns with more than two distinct values: expand each one
# into one indicator column per category (one-hot encoding).
ohe_cols = ['Workclass', 'Marital-status', 'Occupation', 'Race']
salary_features = pd.get_dummies(data=salary_features, columns=ohe_cols)

In [8]:
# Numeric columns to standardise (preprocessing.scale centres and scales each column).
# NOTE(review): this runs on the full frame before the train/test split below,
# so the scaling statistics include the rows that later become the test set.
scalable_cols = ['Age', 'Education-num', 'HPW']
standardized_values = preprocessing.scale(salary_features[scalable_cols])
salary_features[scalable_cols] = standardized_values

In [9]:
# Target (Y): keep Salary as a one-column frame and label-encode it to integer
# class ids. fit_transform re-fits the shared encoder on this column.
salary_target = salary_data[['Salary']].apply(le.fit_transform)

In [10]:
# The data set is large, so hold out 20% of the rows for testing and train on
# the other 80%; random_state pins the shuffle so the split is repeatable.
X_train, x_test, Y_train, y_test = train_test_split(
    salary_features, salary_target, test_size=0.2, random_state=0
)

In [11]:
# Put the training and test features into float32 PyTorch tensors.
# The cast is done inside to_numpy so the array handed to torch is always
# numeric: a frame that mixes bool indicator columns with float columns yields
# an object-dtype array from `.values`, which torch.from_numpy rejects.
Xtrain_ = torch.from_numpy(X_train.to_numpy(dtype=np.float32))
Xtest_ = torch.from_numpy(x_test.to_numpy(dtype=np.float32))

In [12]:
# Dimensions of the training feature tensor.
Xtrain_.size()

torch.Size([24574, 37])

In [13]:
# The NLL loss used below takes the labels as a flat tensor of int64 class ids,
# so flatten each one-column label frame to 1-D and cast it to long.
Ytrain_ = torch.from_numpy(Y_train.values).view(-1).long()
Ytest_ = torch.from_numpy(y_test.values).view(-1).long()

In [14]:
# Confirm the label tensor ended up as a LongTensor after the cast above.
Ytrain_.type()

'torch.LongTensor'

In [15]:
# Network dimensions.
# input_size is read from the training tensor instead of being hard-coded, so it
# stays correct if the one-hot encoding produces a different number of columns
# (it evaluates to 37 for the data shown above).
input_size = Xtrain_.shape[1]
output_size = 2 # >50k or <50k
hidden_size = 10 # figure this out as you run several models

In [16]:
class Net(nn.Module): # build custom NN modules by subclassing nn.Module class
    """Three fully-connected layers with ReLU activations and a log-softmax output.

    Args:
        n_inputs: width of the input feature vector. Defaults to the
            notebook-level ``input_size``.
        n_hidden: width of both hidden layers. Defaults to the notebook-level
            ``hidden_size``.
        n_outputs: number of output classes. Defaults to the notebook-level
            ``output_size``.

    Calling ``Net()`` with no arguments builds the same network as before.
    """

    def __init__(self, n_inputs=None, n_hidden=None, n_outputs=None):
        super().__init__() # initialise nn.Module before registering the layers
        # Fall back to the notebook-level sizes only when a size is not given,
        # so the class can also be built with explicit dimensions.
        n_inputs = input_size if n_inputs is None else n_inputs
        n_hidden = hidden_size if n_hidden is None else n_hidden
        n_outputs = output_size if n_outputs is None else n_outputs

        # three linear fully-connected layers
        self.fc1 = nn.Linear(n_inputs, n_hidden)
        self.fc2 = nn.Linear(n_hidden, n_hidden)
        self.fc3 = nn.Linear(n_hidden, n_outputs)

    def forward(self, x):
        """Return log-probabilities over the classes for the input batch ``x``."""
        x = F.relu(self.fc1(x)) # ReLU activation after the first layer
        x = F.relu(self.fc2(x)) # ReLU activation after the second layer
        x = self.fc3(x) # last layer is a linear layer with no activation

        # log_softmax over the last dimension; pairs with nn.NLLLoss for classification
        return F.log_softmax(x, dim=-1)

In [17]:
# Seed torch's RNG before the layers are created so the random weight
# initialisation, and therefore the training run, is repeatable across kernel
# restarts (same seed value as the train/test split's random_state).
torch.manual_seed(0)
model = Net() # instantiate model of the network by calling Net() class

In [18]:
# Negative log-likelihood loss; it consumes the log-probabilities the network emits.
loss_fn = nn.NLLLoss()

# Adam: an adaptive-learning-rate optimiser, used here with its default settings.
optimizer = optim.Adam(params=model.parameters())

In [19]:
# Full-batch training: every epoch does one optimiser step on the whole training
# set, then evaluates on the test set. epoch_data collects one row per epoch:
# [epoch, train loss, test loss, test accuracy].
epoch_data = []
epochs = 1000

for epoch in range(1, epochs + 1):
    optimizer.zero_grad() # clear gradients left over from the previous step
    Ypred = model(Xtrain_)

    loss = loss_fn(Ypred, Ytrain_) # calc loss on the prediction
    loss.backward() # perform backward pass to calc gradients

    optimizer.step() # update the params by applying grads

    # Evaluation does not need gradients; no_grad avoids building an autograd
    # graph for the test-set forward pass on every epoch.
    with torch.no_grad():
        Ypred_test = model(Xtest_)
        loss_test = loss_fn(Ypred_test, Ytest_)

        _, pred = Ypred_test.max(1) # index of the larger log-probability = predicted class
        accuracy = pred.eq(Ytest_).sum().item() / Ytest_.numel()

    epoch_data.append([epoch, loss.item(), loss_test.item(), accuracy])

    if epoch % 100 == 0:
        # Progress is computed from `epochs`, so the percentage stays correct
        # if the epoch count is changed.
        print('epoch - %d (%d%%) train loss - %.2f test loss - %.2f accuracy - %.4f'
              % (epoch, 100 * epoch // epochs, loss.item(), loss_test.item(), accuracy))

epoch - 100 (10%) train loss - 0.47 test loss - 0.48 accuracy - 0.7477
epoch - 200 (20%) train loss - 0.39 test loss - 0.39 accuracy - 0.8005
epoch - 300 (30%) train loss - 0.37 test loss - 0.38 accuracy - 0.8293
epoch - 400 (40%) train loss - 0.37 test loss - 0.37 accuracy - 0.8320
epoch - 500 (50%) train loss - 0.37 test loss - 0.37 accuracy - 0.8340
epoch - 600 (60%) train loss - 0.36 test loss - 0.37 accuracy - 0.8358
epoch - 700 (70%) train loss - 0.36 test loss - 0.36 accuracy - 0.8371
epoch - 800 (80%) train loss - 0.35 test loss - 0.36 accuracy - 0.8387
epoch - 900 (90%) train loss - 0.35 test loss - 0.36 accuracy - 0.8385
epoch - 1000 (100%) train loss - 0.35 test loss - 0.36 accuracy - 0.8390
