In [32]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd

## load dataset for training and testing

In [33]:
train = pd.read_csv('./../train_balanced.csv')
# train.drop(['Unnamed: 0'], axis=1, inplace=True)

test = pd.read_csv('./../test_balanced.csv')
# test.drop(['Unnamed: 0'], axis=1, inplace=True)

In [35]:
X_train = train.iloc[:, 0:13].values
y_train = train.iloc[:, 13].values
# print(y_train)
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32).reshape(-1, 1)

In [36]:
X_test = test.iloc[:, 0:13].values
y_test = test.iloc[:, 13].values

X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32).reshape(-1, 1)

In [38]:
train.head(20)

Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,58,4,169611,9,2,1,5,4,0,0,0,40,39,0
1,76,5,99328,13,2,5,0,4,1,6514,0,40,39,1
2,53,4,172962,15,2,10,0,4,1,0,0,40,39,1
3,30,4,205950,10,2,7,0,4,1,0,0,40,39,1
4,50,2,68898,16,2,4,0,4,1,0,0,55,39,1
5,68,4,193666,16,2,10,0,4,1,20051,0,55,39,1
6,55,6,308746,15,6,10,1,4,1,0,0,55,39,1
7,53,4,270655,8,2,13,0,4,1,0,0,45,39,1
8,41,7,190910,9,2,5,2,4,1,0,0,40,39,0
9,49,2,298445,15,2,10,5,4,0,0,1977,60,39,1


In [39]:
from sklearn.preprocessing import StandardScaler
from pickle import dump , load

columns_to_standardize = [0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12]
scaler = StandardScaler()

X_train_np = X_train.numpy()
X_test_np = X_test.numpy()
print(X_test_np.shape)

# Fit the scaler on the training data columns
scaler.fit(X_train_np[:, columns_to_standardize])

# Standardize the selected columns in both X_train and X_test
X_train_np[:, columns_to_standardize] = scaler.transform(X_train_np[:, columns_to_standardize])
X_test_np[:, columns_to_standardize] = scaler.transform(X_test_np[:, columns_to_standardize])

# Convert back to PyTorch tensors
X_train = torch.from_numpy(X_train_np).type(torch.float) 
X_test = torch.from_numpy(X_test_np).type(torch.float)


# save standard scaler model.
# save model
dump(scaler, open('StandardScaler.pkl', 'wb'))

# load model
scaler = load(open('StandardScaler.pkl', 'rb'))

(4617, 13)


In [42]:
class AdultClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden1 = nn.Linear(13, 64)
        self.act1 = nn.ReLU()
        self.hidden2 = nn.Linear(64, 128)
        self.act2 = nn.ReLU()
        self.hidden3 = nn.Linear(128, 32)
        self.act3 = nn.ReLU()
        self.hidden4 = nn.Linear(32, 16)
        self.act4 = nn.ReLU()
        self.output = nn.Linear(16, 1)
        self.act_output = nn.Sigmoid()
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        x = self.dropout(self.act1(self.hidden1(x)))
        x = self.dropout(self.act2(self.hidden2(x)))
        x = self.dropout(self.act3(self.hidden3(x)))
        x = self.dropout(self.act4(self.hidden4(x)))
        # x = self.output(x)
        x = self.act_output(self.output(x))
        return x

model = AdultClassifier()
print(model)

# loss function and optimizer
loss_fn = nn.BCELoss() # binary cross entropy
optimizer = optim.Adam(model.parameters(), lr=0.001)

AdultClassifier(
  (hidden1): Linear(in_features=13, out_features=64, bias=True)
  (act1): ReLU()
  (hidden2): Linear(in_features=64, out_features=128, bias=True)
  (act2): ReLU()
  (hidden3): Linear(in_features=128, out_features=32, bias=True)
  (act3): ReLU()
  (hidden4): Linear(in_features=32, out_features=16, bias=True)
  (act4): ReLU()
  (output): Linear(in_features=16, out_features=1, bias=True)
  (act_output): Sigmoid()
  (dropout): Dropout(p=0.1, inplace=False)
)


In [43]:
def evaluate(model, X_test, y_test):
    model.eval()
    with torch.no_grad():
        y_pred = model(X_test)
        # print("y_pred :",  y_pred)
        # print("y_round  :",  y_pred.round())
    accuracy = (y_pred.round() == y_test).float().mean()
    # print(y_pred, y_test)
    return accuracy

In [44]:
n_epochs = 100
batch_size = 128


accuracy = evaluate(model, X_test, y_test)
print('accuracy before training {}'.format(accuracy))
for epoch in range(n_epochs):
    optimizer.zero_grad()
    for i in range(0, len(X_train), batch_size):
        Xbatch = X_train[i:i+batch_size]
        y_pred = model(Xbatch)
        ybatch = y_train[i:i+batch_size]
        loss = loss_fn(y_pred, ybatch)
        # print(loss)
        loss.backward()
        optimizer.step()
    # print(loss.tolist())
    # evaluate model on each epoch
    accuracy = evaluate(model, X_test, y_test)
    print(f'Finished epoch {epoch},  accuracy {accuracy}')

accuracy before training 0.4964262545108795
Finished epoch 0,  accuracy 0.4964262545108795
Finished epoch 1,  accuracy 0.78774094581604
Finished epoch 2,  accuracy 0.7808100581169128
Finished epoch 3,  accuracy 0.7840589284896851
Finished epoch 4,  accuracy 0.7658652663230896
Finished epoch 5,  accuracy 0.78774094581604
Finished epoch 6,  accuracy 0.8009529709815979
Finished epoch 7,  accuracy 0.8059346079826355
Finished epoch 8,  accuracy 0.7987871170043945
Finished epoch 9,  accuracy 0.7574182152748108
Finished epoch 10,  accuracy 0.7918561697006226
Finished epoch 11,  accuracy 0.6289798617362976
Finished epoch 12,  accuracy 0.8013861775398254
Finished epoch 13,  accuracy 0.6991552710533142
Finished epoch 14,  accuracy 0.8122157454490662
Finished epoch 15,  accuracy 0.8098332285881042
Finished epoch 16,  accuracy 0.8165475130081177
Finished epoch 17,  accuracy 0.8135152459144592
Finished epoch 18,  accuracy 0.8078839182853699
Finished epoch 19,  accuracy 0.8048516511917114
Finished e

In [45]:
accuracy = evaluate(model, X_test, y_test)
print(accuracy)

tensor(0.8003)


## Save/load model and evaluate 

In [46]:
torch.save(model, 'adult_credit__model')

In [47]:
model = torch.load('adult_credit__model')
model.eval()
accuracy = evaluate(model, X_test, y_test)
print(accuracy)

tensor(0.8003)


## create util functions for evolution

In [48]:
def scale_input_data(scaler, df, columns_to_standardize = [0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12]):
    numpy_array = torch.tensor(df.iloc[:, 0:13].values, dtype=torch.float32)
    numpy_array = numpy_array.numpy()
    numpy_array[:, columns_to_standardize] = scaler.transform(numpy_array[:, columns_to_standardize])
    return numpy_array
    

In [49]:
scale_input_data(scaler, test, columns_to_standardize = [0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12])

array([[-0.9728292 , -1.437228  , -0.0283375 , ..., -0.25733045,
         0.47698572,  0.2733499 ],
       [-0.66309863,  0.01541938,  3.079678  , ..., -0.25733045,
         0.6420446 , -4.82198   ],
       [ 1.427583  ,  0.01541938, -0.11982964, ..., -0.25733045,
        -2.906721  ,  0.2733499 ],
       ...,
       [ 0.49839118, -2.1635518 , -0.00976831, ..., -0.25733045,
         0.47698572,  0.2733499 ],
       [-0.8953966 ,  0.01541938,  0.6255687 , ..., -0.25733045,
        -0.18324976,  0.2733499 ],
       [ 1.1952851 ,  0.01541938, -0.01067085, ..., -0.25733045,
         0.47698572,  0.2733499 ]], dtype=float32)

In [50]:
X_test_np.shape

(4617, 13)

In [51]:
import numpy as np
# columns_to_standardize = [0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13]
# 
# def eval_model(model, input):
#         # print(input)
#         model.eval()
#         with torch.no_grad():
#             prob = model(input)
#             # print(prob)
#         return 1 - prob.tolist()[0][0]
# 
# for i in range(20):
#     x_in = test.iloc[i, 0:14].values.reshape(-1, 14)
#     # print(x_in)
#     x_in = torch.tensor(x_in, dtype=torch.float32)
#     x_in = x_in.numpy()
#     x_in[:, columns_to_standardize] = scaler.transform(x_in[:, columns_to_standardize])
#     
#     # Convert back to PyTorch tensors
#     x_in = torch.from_numpy(x_in).type(torch.float) 
#     # print(x_in)
#     
#     print( eval_model(model, x_in))