# Classifier: 

## 1.a. Import: Libraries

In [2]:
#data organizing
import os
from os import getcwd

import numpy as np #data-type conversion
import pandas #storage

#preprocessing - data splitting and feature scaling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#outlier removal to achieve better distribution
from sklearn.ensemble import IsolationForest
import seaborn #test distribution

#classifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

from tensorflow import feature_column
import tensorflow as tf

#classification result - statistical
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

https://www.tensorflow.org/tutorials/estimator/premade

https://www.tensorflow.org/tutorials/structured_data/feature_columns

https://www.tensorflow.org/api_docs/python/tf/estimator/DNNClassifier

https://stackoverflow.com/questions/56612386/defining-the-input-function-for-tensorflow-pre-made-estimator

## 1.b. Import: Dataset

In [None]:
#dtype changed from int64 to int32 to save space and speed up computation, no data was lost
def cvDefPay(prediction):
    """Translate a 0/1 flag into False/True.

    Any value other than 0 or 1 yields None.
    """
    flagToBool = dict(((0, False), (1, True)))
    try:
        return flagToBool[prediction]
    except KeyError:
        # Unknown flags are reported as None rather than raising.
        return None

# Build the workbook path with os.path.join so the notebook is not tied to the
# Windows-only '\\' separator.
url = os.path.join(getcwd(), 'default of credit card clients.xls')

# Columns that are read as int32 instead of pandas' default int64.
int32Columns = (['LIMIT_BAL', 'AGE']
                + ['BILL_AMT%d' % month for month in range(1, 7)]
                + ['PAY_AMT%d' % month for month in range(1, 7)])

# header = 1 takes the column names from the second spreadsheet row;
# index_col = 0 uses the first column as the row index.
ccd = pandas.read_excel(io = url,
                        sheet_name = 'Data', header = 1, index_col = 0,
                        dtype = {column: np.int32 for column in int32Columns})
                        #,converters = {'default payment next month': cvDefPay})

In [None]:
# Rename PAY_0 to PAY_1 (so it lines up with PAY_2..PAY_6) and replace the
# spaces in the target column name with underscores, in a single call.
ccd.rename(
    columns = {
        'PAY_0': 'PAY_1',
        'default payment next month': 'default_payment_next_month',
    },
    inplace = True,
)

## 3.a. Removing Outliers

Since the data is highly skewed, with a very sparse upper end that consists mostly of outliers,

it may be better to remove those outliers so that the rest of the dataset has a better distribution for prediction.
The outlier datapoints could then be handled by a separate classifier model.

This should be done before the data split to ensure that the distributions of the train, dev and test sets do not differ from each other.

In [None]:
# Isolation forest used to flag anomalous rows. random_state is fixed so the
# labelling is reproducible; n_jobs = -1 uses all available cores.
isolationForest = IsolationForest(
    random_state = 39,
    n_jobs = -1,
    contamination = 0.01,
    max_samples = 0.2,
    n_estimators = 100,
)

In [None]:
# Fit on the full frame and label every row in one chained expression
# (fit returns the estimator itself). Labels are -1 for outliers, 1 for inliers.
IsOutlierLabels = isolationForest.fit(ccd).predict(ccd)

In [None]:
def cvIsOutlier(prediction):
    """Translate an outlier label into a boolean: -1 -> True, 1 -> False.

    Any other value yields None.
    """
    labelToFlag = dict(((-1, True), (1, False)))
    try:
        return labelToFlag[prediction]
    except KeyError:
        # Unknown labels are reported as None rather than raising.
        return None

# Independent (deep) copy of the data; the outlier flag is added to this copy.
ccdOutliers = ccd.copy()

In [None]:
# Convert each -1/1 label into a True/False outlier flag, one per row.
ccdOutliers['IsOutlier'] = [cvIsOutlier(label) for label in IsOutlierLabels]

In [None]:
#inliers conditions have been selected from EDA observations

# The inlier mask is built from IsOutlierLabels (1 = inlier) and ccd itself,
# not from ccdOutliers['IsOutlier']: ccdOutliers is reassigned below and loses
# that column, so reading it here would make the cell fail on a second run.
isInlierMask = (ccd['LIMIT_BAL'] <= 525000) & (ccd['AGE'] <= 60) & (IsOutlierLabels == 1)

ccdInliers = ccd[isInlierMask]
# Everything that is not an inlier counts as an outlier.
ccdOutliers = ccd[~ccd.index.isin(ccdInliers.index)]

In [None]:
# Joint KDE of LIMIT_BAL against AGE for the outlier rows.
seaborn.jointplot(
    kind = 'kde',
    x = ccdOutliers['LIMIT_BAL'],
    y = ccdOutliers['AGE'],
)

In [None]:
# LIMIT_BAL distribution of the outlier rows, with one bin per distinct value.
# NOTE(review): distplot is deprecated in seaborn >= 0.11 — confirm the installed version.
seaborn.distplot(
    a = ccdOutliers['LIMIT_BAL'],
    bins = ccdOutliers['LIMIT_BAL'].nunique(),
    hist = True,
    kde = True,
    rug = True,
)

In [None]:
# Joint KDE of LIMIT_BAL against AGE for the inlier rows.
seaborn.jointplot(
    kind = 'kde',
    x = ccdInliers['LIMIT_BAL'],
    y = ccdInliers['AGE'],
)

In [None]:
# LIMIT_BAL distribution of the inlier rows, with one bin per distinct value.
# NOTE(review): distplot is deprecated in seaborn >= 0.11 — confirm the installed version.
seaborn.distplot(
    a = ccdInliers['LIMIT_BAL'],
    bins = ccdInliers['LIMIT_BAL'].nunique(),
    hist = True,
    kde = True,
    rug = True,
)

TODO: further validate the correctness of the inlier/outlier split via pay_amt_mean and bill_amt_mean.

## 3.b. Feature Engineering

### 3.b.1. PAY {PAY_1 to PAY_6}

1. Using the mode to aggregate. An entry may have multiple mode values (same frequency); to resolve this, the severest class is used.

2. Why severest value? To ensure fiscally fit population of credit users.

In [5]:
# Second read of the same sheet without the dtype overrides; PAY_0 is renamed
# to PAY_1 as part of the same expression.
ccdr = pandas.read_excel(
    io = url, sheet_name = 'Data', header = 1, index_col = 0,
).rename(columns = {'PAY_0': 'PAY_1'})

In [6]:
# The six PAY_1..PAY_6 columns.
ccdrHistory = ccdr[['PAY_%d' % month for month in range(1, 7)]]

In [7]:
# Most frequent PAY_n value per row. When several values tie, mode() returns
# them in additional columns and pads the unused cells with NaN.
ccdrHistoryMode = ccdrHistory.mode(axis = 'columns')

# Row-wise maximum selects the severest of the tied modes. DataFrame.max skips
# the NaN padding explicitly, instead of relying on how the builtin max happens
# to compare NaN values.
ccdrHistorySeverest = ccdrHistoryMode.max(axis = 'columns')

# NOTE(review): cvPayHistory is not defined in the visible part of this
# notebook — confirm it is defined before this cell runs.
# The result is materialised as a list (not a one-shot map iterator) so that
# cells consuming it can be re-run without getting an empty sequence.
ccdPayHistoryMode = list(map(cvPayHistory, ccdrHistorySeverest))

In [8]:
# Store the converted severest-mode value of each row as a new feature column.
ccd['PAY_MODE_SEVEREST'] = [value for value in ccdPayHistoryMode]

### 3.b.2. BILL_AMT {BILL_AMT1 to BILL_AMT6}

Using the mean of BILL_AMT1 to BILL_AMT6 as a summary of the credit used

In [9]:
# The six BILL_AMT1..BILL_AMT6 columns.
ccdSpent = ccd[['BILL_AMT%d' % month for month in range(1, 7)]]

In [10]:
# Row-wise mean of the bill columns, rounded and stored as an int32 array.
ccd['BILL_AMT_MEAN'] = ccdSpent.mean(axis = 'columns').round().values.astype(np.int32)

### 3.b.3. PAY_AMT {PAY_AMT1 to PAY_AMT6}

Using the mean of PAY_AMT1 to PAY_AMT6 as a summary of the credit settled

In [11]:
# The six PAY_AMT1..PAY_AMT6 columns.
ccdSettled = ccd[['PAY_AMT%d' % month for month in range(1, 7)]]

In [12]:
# Row-wise mean of the payment columns, rounded and stored as an int32 array.
ccd['PAY_AMT_MEAN'] = ccdSettled.mean(axis = 'columns').round().values.astype(np.int32)

### 3.b.4. PAY_DELAY

## 3.c. Normalization

Scaling: Only to reduce the effect of very large continuous variables (in distance-based estimators).

Normalization: Also reduce the effect of skewness in variables.

In [8]:
# Continuous columns to standardise: the raw amounts plus the two engineered means.
varsToScale = (
    ['LIMIT_BAL', 'AGE']
    + ['PAY_AMT%d' % month for month in range(1, 7)]
    + ['BILL_AMT%d' % month for month in range(1, 7)]
    + ['BILL_AMT_MEAN', 'PAY_AMT_MEAN']
)
# copy = False asks the scaler to work in place where it can.
scaler = StandardScaler(copy = False)

In [9]:
# Standardise each column on its own: the scaler is re-fitted per column, and
# it expects a 2-D input, hence the reshape to a single-column matrix.
# NOTE(review): this is fitted on the full dataset before the train/test split —
# confirm that is intended.
for var in varsToScale:
    columnAsMatrix = ccd[var].values.reshape(-1, 1)
    ccd[var] = scaler.fit_transform(columnAsMatrix)


## 3.d. Data Splitting

Data is split before oversampling to avoid synthetic datapoints in test dataset.

Test dataset is separated even though GridSearchCV uses Stratified K-Fold cross-validation so that model's accuracy can be tested independently.

In [None]:
# Target as a one-column frame; every remaining column is a feature.
ccdY = ccd['default_payment_next_month'].to_frame()
ccdX = ccd.drop(columns = ['default_payment_next_month'])

In [None]:
# Hold out 25% of all rows as the test set, stratified on the target.
trainX, testX, trainY, testY = train_test_split(
    ccdX, ccdY,
    test_size = 0.25, random_state = 44, stratify = ccdY,
)

# Split the remaining rows again (25% of them) to obtain the validation set.
trainX, validationX, trainY, validationY = train_test_split(
    trainX, trainY,
    test_size = 0.25, random_state = 44, stratify = trainY,
)

https://www.tensorflow.org/api_docs/python/tf/data/Dataset#from_tensors

train_set = tf.data.Dataset.from_tensors((trainX.values, trainY.values))

validation_set/dev_set = tf.data.Dataset.from_tensors((validationX.values, validationY.values))

test_set = tf.data.Dataset.from_tensors((testX.values, testY.values))

In [None]:
def input_fnx(features, labels, validation=True, batch_size=256):
    """Build the tf.data input pipeline for training or evaluating an Estimator.

    The dataset is built from the ``features`` and ``labels`` that are passed
    in. The previous implementation ignored both arguments and always read the
    global train/validation frames.

    Parameters
    ----------
    features : pandas.DataFrame or mapping
        Feature values keyed by column name; the keys must match the names
        used by the feature columns.
    labels : array-like
        Target values, one per row of ``features``.
    validation : bool, default True
        True gives a single, ordered pass over the data (evaluation).
        False gives training mode: the data is shuffled and repeated.
    batch_size : int, default 256
        Number of examples per batch.

    Returns
    -------
    tf.data.Dataset
        Batches of ``(feature dict, labels)``.
    """
    # from_tensor_slices yields one example per row (from_tensors would wrap
    # the whole table in a single element); the dict keeps column names so the
    # feature columns can look their inputs up by name.
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), np.asarray(labels)))

    # Shuffle and repeat only in training mode.
    if not validation:
        dataset = dataset.shuffle(buffer_size = max(len(labels), 1)).repeat()

    return dataset.batch(batch_size)

## 4. Classifier

In [None]:
# LIMIT_BAL is a continuous variable, so it is a numeric column.
# embedding_column needs a categorical column as input and cannot be built
# from a bare column name.
limit_bal = feature_column.numeric_column('LIMIT_BAL')

# Categorical variables: vocabulary lookup, then one-hot (indicator) encoding.
sex1 = feature_column.categorical_column_with_vocabulary_list('SEX', [1, 2])
sex = feature_column.indicator_column(sex1)

education1 = feature_column.categorical_column_with_vocabulary_list('EDUCATION', [0, 1, 2, 3, 4, 5, 6])
education = feature_column.indicator_column(education1)

marriage1 = feature_column.categorical_column_with_vocabulary_list('MARRIAGE', [0, 1, 2, 3])
marriage = feature_column.indicator_column(marriage1)

age = feature_column.numeric_column('AGE')

# All six PAY_n columns share the same vocabulary.
payStatusVocabulary = [-2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

pay_11 = feature_column.categorical_column_with_vocabulary_list('PAY_1', payStatusVocabulary)
pay_1 = feature_column.indicator_column(pay_11)

pay_21 = feature_column.categorical_column_with_vocabulary_list('PAY_2', payStatusVocabulary)
pay_2 = feature_column.indicator_column(pay_21)

pay_31 = feature_column.categorical_column_with_vocabulary_list('PAY_3', payStatusVocabulary)
pay_3 = feature_column.indicator_column(pay_31)

pay_41 = feature_column.categorical_column_with_vocabulary_list('PAY_4', payStatusVocabulary)
pay_4 = feature_column.indicator_column(pay_41)

pay_51 = feature_column.categorical_column_with_vocabulary_list('PAY_5', payStatusVocabulary)
pay_5 = feature_column.indicator_column(pay_51)

pay_61 = feature_column.categorical_column_with_vocabulary_list('PAY_6', payStatusVocabulary)
pay_6 = feature_column.indicator_column(pay_61)

bill_amt1 = feature_column.numeric_column('BILL_AMT1')
bill_amt2 = feature_column.numeric_column('BILL_AMT2')
bill_amt3 = feature_column.numeric_column('BILL_AMT3')
bill_amt4 = feature_column.numeric_column('BILL_AMT4')
bill_amt5 = feature_column.numeric_column('BILL_AMT5')
bill_amt6 = feature_column.numeric_column('BILL_AMT6')

pay_amt1 = feature_column.numeric_column('PAY_AMT1')
pay_amt2 = feature_column.numeric_column('PAY_AMT2')
pay_amt3 = feature_column.numeric_column('PAY_AMT3')
pay_amt4 = feature_column.numeric_column('PAY_AMT4')
pay_amt5 = feature_column.numeric_column('PAY_AMT5')
pay_amt6 = feature_column.numeric_column('PAY_AMT6')

# pay_6 is now included; the list previously contained pay_5 twice.
ccd_feature_columns = [limit_bal, sex, education, marriage, age,
                   pay_1, pay_2, pay_3, pay_4, pay_5, pay_6,
                   bill_amt1, bill_amt2, bill_amt3, bill_amt4, bill_amt5, bill_amt6,
                   pay_amt1, pay_amt2, pay_amt3, pay_amt4, pay_amt5, pay_amt6]

#initial_feature_count = 23
#dimension_reduced_count = 5

In [None]:
# Two-class DNN estimator with hidden layers of 23 and 5 units.
classifier = tf.estimator.DNNClassifier(
    n_classes = 2,
    hidden_units = [23, 5],
    feature_columns = ccd_feature_columns,
)

In [None]:
# Run 25 training steps; the lambda defers building the input pipeline until
# the estimator calls it.
classifier.train(
    steps = 25,
    input_fn = lambda: input_fnx(trainX, trainY, validation = False),
)

In [3]:
# Report how many GPU devices TensorFlow can see.
print(
    "Num GPUs Available: ",
    len(tf.config.experimental.list_physical_devices(device_type = 'GPU')),
)

Num GPUs Available:  1
