# Classifier: Final attempt (after internship).

## 1.a. Import: Libraries

In [1]:
#data organizing
import pandas #storage
import numpy as np #data-type conversion
from os import getcwd

#preprocessing - data splitting
from sklearn.model_selection import train_test_split

#outlier removal to achieve better distribution
from sklearn.ensemble import IsolationForest

#scaling
from sklearn.preprocessing import StandardScaler

#classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier

#classification result - statistical
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

#model persistence
from joblib import dump

## 1.b. Import: Dataset

In [2]:
#dtype changed from int64 to int32 to save space and speed up computation, no data was lost

url = getcwd() + '\\default of credit card clients.xls'
ccd = pandas.read_excel(io = url, \
                        sheet_name='Data', header = 1, index_col = 0, \
                        dtype = {'LIMIT_BAL': np.int32, 'AGE': np.int32, 'BILL_AMT1': np.int32, 'BILL_AMT2': np.int32, \
                                 'BILL_AMT3': np.int32, 'BILL_AMT4': np.int32, 'BILL_AMT5': np.int32, 'BILL_AMT6': np.int32, \
                                 'PAY_AMT1': np.int32, 'PAY_AMT2': np.int32, 'PAY_AMT3': np.int32, 'PAY_AMT4': np.int32, \
                                 'PAY_AMT5': np.int32, 'PAY_AMT6': np.int32})

In [3]:
ccd.rename(columns = {'PAY_0': 'PAY_1'}, inplace = True)
ccd.rename(columns = {'default payment next month': 'default_payment_next_month'}, inplace = True)

## 3.b. Feature Engineering

### 3.b.1. PAY {PAY_1 to PAY_6}

1. Using mode to aggregate. An entry may have mutiple mode values (same frequency), to resolve, using severest class.

2. Why severest value? To ensure fiscally fit population of credit users.

In [4]:
ccdr = pandas.read_excel(io = url, 
                        sheet_name='Data', header = 1, index_col = 0)
ccdr.rename(columns = {'PAY_0': 'PAY_1'}, inplace = True)

In [5]:
ccdrHistory = ccdr[['PAY_1', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']]

In [6]:
ccdrHistoryMode = ccdrHistory.mode(axis = 'columns')
ccdPayHistoryMode = ccdrHistoryMode.apply(func = max, axis = 'columns')

In [7]:
ccd['PAY_MODE_SEVEREST'] = list(ccdPayHistoryMode)

### 3.b.2. BILL_AMT {BILL_AMT1 to BILL_AMT6}

Using mean for total credit used

In [8]:
ccdSpent = ccd[['BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6']]

In [9]:
ccd['BILL_AMT_MEAN'] = np.int32(ccdSpent.mean(axis = 'columns').round())

### 3.b.3. PAY_AMT {PAY_AMT1 to PAY_AMT6}

Using mean for total credit settled

In [10]:
ccdSettled = ccd[['PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']]

In [11]:
ccd['PAY_AMT_MEAN'] = np.int32(ccdSettled.mean(axis = 'columns').round())

### 3.b.4. BALANCE_AMT {PAY_AMT - BILL_AMT}

In [12]:
ccd['BALANCE_AMT1'] = np.int32(ccd['PAY_AMT1'] - ccd['BILL_AMT1'])
ccd['BALANCE_AMT2'] = np.int32(ccd['PAY_AMT2'] - ccd['BILL_AMT2'])
ccd['BALANCE_AMT3'] = np.int32(ccd['PAY_AMT3'] - ccd['BILL_AMT3'])
ccd['BALANCE_AMT4'] = np.int32(ccd['PAY_AMT4'] - ccd['BILL_AMT4'])
ccd['BALANCE_AMT5'] = np.int32(ccd['PAY_AMT5'] - ccd['BILL_AMT5'])
ccd['BALANCE_AMT6'] = np.int32(ccd['PAY_AMT6'] - ccd['BILL_AMT6'])

ccd['BALANCE_AMT_MEAN'] = np.int32(ccd['PAY_AMT_MEAN'] - ccd['BILL_AMT_MEAN'])

## 3.c. Normalization

Scaling: Only to reduce the effect of very large continuous variables (in distance based esimators).

Normalization: Also reduce the effect of skewness in variables.

In [13]:
varsToScale = ['LIMIT_BAL', 'AGE', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 
               'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'BILL_AMT_MEAN', 'PAY_AMT_MEAN']
scaler = StandardScaler(copy = False)

In [14]:
for var in varsToScale:
    ccd[var] = scaler.fit_transform(ccd[var].values.reshape(-1, 1))


## 3.a. Removing Outliers

Since data is highly skewed with the higher end being very sparse, having mostly outliers,

It may be better to remove those outliers so rest of the dataset has better distribution for better prediction
And outlier datapoints could be have a separate classifier model

Sould be done before data split to ensure distribution of train, dev and test sets are not different from each other.

In [15]:
isolationForest = IsolationForest(n_estimators = 100, max_samples = 0.2, contamination = 'auto',
                       n_jobs = -1, random_state = 39)

In [16]:
isolationForest.fit(ccd)
IsOutlierLabels = isolationForest.predict(ccd)

In [17]:
def cvIsOutlier(prediction):
    mapper = {-1: True, 1: False}
    return mapper.get(prediction)

ccdOutliers = ccd.copy(deep = True)

In [18]:
ccdOutliers['IsOutlier'] = list(map(cvIsOutlier, IsOutlierLabels))

In [19]:
#inliers conditions have been selected from EDA observations

ccdInliers = ccd[(ccdOutliers['IsOutlier'] == False) & (ccdOutliers['LIMIT_BAL'] <= 525000) & (ccdOutliers['AGE'] <= 60)]
ccdOutliers = ccd[~ccd.index.isin(ccdInliers.index)]

## 3.d. Data Splitting

Data is split before oversampling to avoid synthetic datapoints in test dataset.

Test dataset is separated even though GridSearchCV uses Stratified K-Fold cross-validation so that model's accuracy can be tested independently.

In [20]:
ccdY = pandas.DataFrame(ccdInliers['default_payment_next_month'])
ccdX = ccdInliers.drop(['default_payment_next_month'], axis = 'columns')

In [21]:
trainX, testX, trainY, testY = train_test_split(ccdX, ccdY, test_size = 0.25, stratify = ccdY, random_state = 44)

#devX, testX, devY, testY = train_test_split(testX, testY, test_size = 0.25, stratify = testY, random_state = 44)

## 4. Classifier

In [22]:
clfRF = RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight='balanced',
                               criterion='entropy', max_depth=10, max_features='auto',
                               max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0,
                               min_impurity_split=None, min_samples_leaf=np.int32(trainX.shape[0]/1000),
                               min_samples_split=np.int32(trainX.shape[0]/500),
                               min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=-1,
                               oob_score=False, random_state=39, verbose=0, warm_start=False)

clfLG = LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=1000, multi_class='auto', n_jobs=-1, penalty='none',
                   random_state=44, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)

clfTrue = GaussianNB(priors=None, var_smoothing=1e-09)

clfFalse = SVC(cache_size = 1000, max_iter = -1, random_state = 44, kernel = 'rbf', C = 1.0,
               class_weight = 'balanced')

classifier = StackingClassifier(estimators = [('clfRF', clfRF), 
                                              ('clfTrue', clfTrue),
                                              ('clfFalse', clfFalse)],
                              final_estimator = clfLG,
                              n_jobs = -1, passthrough = True, cv = 5, verbose = 0)

In [23]:
classifier.fit(trainX, trainY.values.ravel())

StackingClassifier(cv=5,
                   estimators=[('clfRF',
                                RandomForestClassifier(bootstrap=False,
                                                       class_weight='balanced',
                                                       criterion='entropy',
                                                       max_depth=10,
                                                       min_samples_leaf=22,
                                                       min_samples_split=44,
                                                       n_estimators=250,
                                                       n_jobs=-1,
                                                       random_state=39)),
                               ('clfTrue', GaussianNB()),
                               ('clfFalse',
                                SVC(cache_size=1000, class_weight='balanced',
                                    random_state=44))],
                   final_estimator=

In [24]:
dump(classifier, 'best_classifier.joblib')

['best_classifier.joblib']

In [25]:
trainPredictY = classifier.predict(trainX)

In [26]:
testPredictY = classifier.predict(testX)

In [27]:
print(classification_report(trainY, trainPredictY))

              precision    recall  f1-score   support

           0       0.91      0.81      0.85     17249
           1       0.50      0.71      0.59      4820

    accuracy                           0.78     22069
   macro avg       0.71      0.76      0.72     22069
weighted avg       0.82      0.78      0.80     22069



In [28]:
pandas.DataFrame(
    confusion_matrix(trainY, trainPredictY),
    columns=['Predicted | Not Defaulter', 'Defaulter'],
    index=['Correct | Not Defaulter', 'Defaulter'])

Unnamed: 0,Predicted | Not Defaulter,Defaulter
Correct | Not Defaulter,13910,3339
Defaulter,1415,3405


In [29]:
print(classification_report(testY, testPredictY))

              precision    recall  f1-score   support

           0       0.89      0.78      0.83      5750
           1       0.45      0.64      0.53      1607

    accuracy                           0.75      7357
   macro avg       0.67      0.71      0.68      7357
weighted avg       0.79      0.75      0.76      7357



In [30]:
pandas.DataFrame(
    confusion_matrix(testY, testPredictY),
    columns=['Predicted | Not Defaulter', 'Defaulter'],
    index=['Correct | Not Defaulter', 'Defaulter'])

Unnamed: 0,Predicted | Not Defaulter,Defaulter
Correct | Not Defaulter,4499,1251
Defaulter,583,1024


## Experiment: Prediction of outliers using model trained on inliers.

In [31]:
outliersY = pandas.DataFrame(ccdOutliers['default_payment_next_month'])
outliersX = ccdOutliers.drop(['default_payment_next_month'], axis = 'columns')

In [32]:
outliersPredictY = classifier.predict(outliersX)

In [33]:
print(classification_report(outliersY, outliersPredictY))

              precision    recall  f1-score   support

           0       0.81      0.83      0.82       365
           1       0.69      0.67      0.68       209

    accuracy                           0.77       574
   macro avg       0.75      0.75      0.75       574
weighted avg       0.77      0.77      0.77       574



In [34]:
pandas.DataFrame(
    confusion_matrix(outliersY, outliersPredictY),
    columns=['Predicted | Not Defaulter', 'Defaulter'],
    index=['Correct | Not Defaulter', 'Defaulter'])

Unnamed: 0,Predicted | Not Defaulter,Defaulter
Correct | Not Defaulter,302,63
Defaulter,70,139


## Final notes:
Success, sort of. This model as the 'marco average recall' of 0.71 for test dataset, 0.75 for that of outlier dataset.
Neural network classifier created post-internship had 'macro average recall' of 0.59 for test dataset.
Final model at the conclusion of the internship had 'macro average recall' of 0.70 for test dataset.