# TensorlayerX for Tabular Data

This notebook contains me learning TensorlayerX for tabular data. The case is using Default Credit Classifier based-on my previous ANN project. 

In [5]:
import tensorflow;
import pandas;
from sklearn.model_selection import train_test_split;
from sklearn.preprocessing import StandardScaler;

In [6]:
tensorflow.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [7]:
# Loading data from pre-cleaned csv file
current_folder = "/mnt/d/Code/College/Machine Learning/Team Assignment/Default Credit Scoring/";
# current_folder = "";
dataframe = pandas.read_csv(current_folder + "credit_card_clients.csv");

dataframe = dataframe.drop(columns = ["ID"]);

dataframe.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,LABEL
0,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
1,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [8]:
print("Dropping everything that other than mentioned above");

most_important_features = [
    "PAY_0",
    "PAY_2",
    "PAY_AMT2",
    "LIMIT_BAL",
    "PAY_AMT3",
    "BILL_AMT1",
    "PAY_3",
    "PAY_4",
    "PAY_6",
    "PAY_5",
    "LABEL" # Don't :)
];

columns_to_be_dropped = [];
for i in dataframe.columns:
    if i not in most_important_features:
        columns_to_be_dropped.append(i);
        print(f"Dropping feature {i} since its not \"important\"");

dataframe = dataframe.drop(columns = columns_to_be_dropped);

Dropping everything that other than mentioned above
Dropping feature SEX since its not "important"
Dropping feature EDUCATION since its not "important"
Dropping feature MARRIAGE since its not "important"
Dropping feature AGE since its not "important"
Dropping feature BILL_AMT2 since its not "important"
Dropping feature BILL_AMT3 since its not "important"
Dropping feature BILL_AMT4 since its not "important"
Dropping feature BILL_AMT5 since its not "important"
Dropping feature BILL_AMT6 since its not "important"
Dropping feature PAY_AMT1 since its not "important"
Dropping feature PAY_AMT4 since its not "important"
Dropping feature PAY_AMT5 since its not "important"
Dropping feature PAY_AMT6 since its not "important"


In [9]:
# Make one hot encoding for PAY_0 to 6 since the data is an ordinal data
hot_encoded_pay_0 = pandas.get_dummies(dataframe['PAY_0'], prefix = "pay_0");
hot_encoded_pay_2 = pandas.get_dummies(dataframe['PAY_2'], prefix = "pay_2");
hot_encoded_pay_3 = pandas.get_dummies(dataframe['PAY_3'], prefix = "pay_3");
hot_encoded_pay_4 = pandas.get_dummies(dataframe['PAY_4'], prefix = "pay_4");
hot_encoded_pay_5 = pandas.get_dummies(dataframe['PAY_5'], prefix = "pay_5");
hot_encoded_pay_6 = pandas.get_dummies(dataframe['PAY_6'], prefix = "pay_6");

# Merge the hot_encoded with the main dataframe
for i in [hot_encoded_pay_0, hot_encoded_pay_2, hot_encoded_pay_3, hot_encoded_pay_4, hot_encoded_pay_5, hot_encoded_pay_6]:
    dataframe = pandas.concat([dataframe, i], axis = 1);

print("Data after cleaning: ", dataframe.shape);
print(dataframe.columns.tolist());

Data after cleaning:  (30000, 75)
['LIMIT_BAL', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'LABEL', 'pay_0_-2', 'pay_0_-1', 'pay_0_0', 'pay_0_1', 'pay_0_2', 'pay_0_3', 'pay_0_4', 'pay_0_5', 'pay_0_6', 'pay_0_7', 'pay_0_8', 'pay_2_-2', 'pay_2_-1', 'pay_2_0', 'pay_2_1', 'pay_2_2', 'pay_2_3', 'pay_2_4', 'pay_2_5', 'pay_2_6', 'pay_2_7', 'pay_2_8', 'pay_3_-2', 'pay_3_-1', 'pay_3_0', 'pay_3_1', 'pay_3_2', 'pay_3_3', 'pay_3_4', 'pay_3_5', 'pay_3_6', 'pay_3_7', 'pay_3_8', 'pay_4_-2', 'pay_4_-1', 'pay_4_0', 'pay_4_1', 'pay_4_2', 'pay_4_3', 'pay_4_4', 'pay_4_5', 'pay_4_6', 'pay_4_7', 'pay_4_8', 'pay_5_-2', 'pay_5_-1', 'pay_5_0', 'pay_5_2', 'pay_5_3', 'pay_5_4', 'pay_5_5', 'pay_5_6', 'pay_5_7', 'pay_5_8', 'pay_6_-2', 'pay_6_-1', 'pay_6_0', 'pay_6_2', 'pay_6_3', 'pay_6_4', 'pay_6_5', 'pay_6_6', 'pay_6_7', 'pay_6_8']


In [10]:
# Define label data
label = dataframe['LABEL'];

# Drop ID, SEX, EDUCATION, MARRIAGE, and LABEL from dataframe for features
features = dataframe.drop(columns=['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', "LABEL"]);

# Typecasting all values within to int64 because ANN need to be standardized

for i in dataframe.columns:
    dataframe[i] = dataframe[i].astype(int);

# Split the data into training, validation, and testing sets
feature_train, feature_test, label_train, label_test = train_test_split(
    features, 
    label, 
    train_size = 0.7, 
    test_size = 0.3, 
    random_state = 42
);

feature_validation, feature_test, label_validation, label_test = train_test_split(
    feature_test, 
    label_test, 
    test_size=0.5, 
    random_state = 42
);

# Scale the features
scaler = StandardScaler();
feature_train_scaled = scaler.fit_transform(feature_train);
feature_validation_scaled = scaler.fit_transform(feature_validation);
feature_test_scaled = scaler.transform(feature_test);

print("Features shape: ", features.shape);

Features shape:  (30000, 68)


In [26]:
import tensorlayerx;
import os;
from tensorlayerx.nn import Module, Sequential, Input, Linear, Sigmoid;
from tensorlayerx.model import Model;
from tensorlayerx.optimizers import Adam;
from tensorlayerx.losses import binary_cross_entropy;
from tensorlayerx.metrics import acc;
from tensorlayerx.dataflow import IterableDataset;
os.environ["TL_BACKEND"] = "tensorflow";

In [42]:
class DatasetLoader(IterableDataset):
    def __init__(self, feature, label):
        self.data = feature;
        self.label = label;
    
    def __getitem__(self, index):
        data = self.data[index].astype("float32");
        label = self.label[index].astype("float32");

        return data, label;

    def __len__(self):
        return len(self.data);

    def __iter__(self):
        for i in range(len(self.data)):
            yield self.data[i].astype("float32"), self.label[i].astype("float32")

In [86]:
class ANNTlx (Module):
    def __init__(self):
        super (ANNTlx, self).__init__();

        self.input = Input(shape = [64,68]);

        self.dense1 = Linear(in_features = 68, out_features = 64, act = Sigmoid);
        self.dense2 = Linear(in_features = 64, out_features = 32, act = Sigmoid);

        self.output = Linear(in_features = 32, out_features = 1, act = Sigmoid);
    
    def forward(self, x):
        x = self.dense1(x);
        x = self.dense2(x);
        out = self.output(x);

        return out;

    def construct(self, x):
        x = self.input(x);
        x = self.dense1(x);
        x = self.dense2(x);
        out = self.output(x);

        return out;
        
network = ANNTlx();
loss_function = binary_cross_entropy;
metric_function = acc;
optimizer = Adam(lr = 1e-3);

model = Model(network, loss_fn = loss_function, optimizer = optimizer, metric_function = metric_function);
train_dataset = DatasetLoader(feature_train_scaled, label_train);
val_dataset = DatasetLoader(feature_validation_scaled, label_validation);
test_dataset = DatasetLoader(feature_test_scaled, label_test);


[TLX] Input  _inputlayer_2: [68]
[TLX] Linear  linear_104: 64 Sigmoid
[TLX] Linear  linear_105: 32 Sigmoid
[TLX] Linear  linear_106: 1 Sigmoid


In [87]:
model.train(n_epoch = 10, train_dataset = train_dataset, test_dataset = val_dataset, print_freq = 1);

Output()

InvalidArgumentError: {{function_node __wrapped__MatMul_device_/job:localhost/replica:0/task:0/device:GPU:0}} In[0] and In[1] has different ndims: [68] vs. [68,64] [Op:MatMul] name: 