# Feature Engineering

#### GridSearch: scoring Metrics importance [first to last] = ['f1', 'recall', 'accuracy', 'precision', 'roc_auc']
#### Dimentionality Reduction: class importance [first to last] = [True, False]

## 1. Import: Libraries

In [1]:
#data organizing
import pandas #storage
import numpy as np #data-type conversion
from os import getcwd

#preprocessing - data splitting
from sklearn.model_selection import train_test_split

#classifier
from sklearn.tree import DecisionTreeClassifier

#classification result - statistical
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

#hyper-parameter optimization
from sklearn.model_selection import GridSearchCV

#dimentionality reduction/feature selection
from sklearn.feature_selection import SelectKBest #count of k best features chi2
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectFromModel #model.feature_importances_

#model validation
#https://towardsdatascience.com/accuracy-precision-recall-or-f1-331fb37c5cb9
#https://medium.com/analytics-vidhya/confusion-matrix-accuracy-precision-recall-f1-score-ade299cf63cd


## 2. Import: Dataset

In [2]:
#dtype changed from int64 to int32 to save space and speed up computation, no data was lost
def cvDefPay(prediction):
    mapper = {0: False, 1: True}
    return mapper.get(prediction)

url = getcwd() + '\\default of credit card clients.xls'
ccd = pandas.read_excel(io = url, \
                        sheet_name='Data', header = 1, index_col = 0, \
                        dtype = {'LIMIT_BAL': np.int32, 'AGE': np.int32, 'BILL_AMT1': np.int32, 'BILL_AMT2': np.int32, 'BILL_AMT3': np.int32, 'BILL_AMT4': np.int32, 'BILL_AMT5': np.int32, 'BILL_AMT6': np.int32, 'PAY_AMT1': np.int32, 'PAY_AMT2': np.int32, 'PAY_AMT3': np.int32, 'PAY_AMT4': np.int32, 'PAY_AMT5': np.int32, 'PAY_AMT6': np.int32}, \
                       converters = {'default payment next month': cvDefPay})

In [3]:
ccd.rename(columns = {'PAY_0': 'PAY_1'}, inplace = True)
ccd.rename(columns = {'default payment next month': 'default_payment_next_month'}, inplace = True)

## 3. Feature Engineering

#### 1. PAY_1 to PAY_6

In [4]:
ccdr = pandas.read_excel(io = url, 
                        sheet_name='Data', header = 1, index_col = 0)
ccdr.rename(columns = {'PAY_0': 'PAY_1'}, inplace = True)

In [5]:
ccdrHistory = ccdr[['PAY_1', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']]
ccdrHistoryMode = ccdrHistory.mode(axis = 'columns')
ccdrHistorySeverest = ccdrHistoryMode.apply(func = max, axis = 'columns')
ccd['PAY_MODE_SEVEREST'] = ccdrHistorySeverest

#### 2. BILL_AMT1 to BILL_AMT6

In [6]:
ccdSpent = ccd[['BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6']]
ccd['BILL_AMT_MEAN'] = np.int32(ccdSpent.mean(axis = 'columns').round())

#### 3. PAY_AMT1 to PAY_AMT6

In [7]:
ccdSettled = ccd[['PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']]
ccd['PAY_AMT_MEAN'] = np.int32(ccdSettled.mean(axis = 'columns').round())

## 4. Data splitting

In [8]:
ccdY = pandas.DataFrame(ccd['default_payment_next_month'])
ccdX = ccd.drop(['default_payment_next_month'], axis = 'columns')

In [9]:
trainX, testX, trainY, testY = train_test_split(ccdX, ccdY, test_size = 0.25, random_state = 44)

## 5. GridSearch: Pre-processing

In [10]:
featureCount = len(ccdX.columns)

parameterGrid = {'max_depth': [featureCount * 0.125, featureCount * 0.25, featureCount * 0.5, featureCount * 0.75, featureCount],
                 'criterion': ['gini', 'entropy'],
                 'splitter': ['best', 'random']}

#scoringMetrics = ['f1', 'recall', 'accuracy', 'precision', 'roc_auc']

scoringMetrics = 'roc_auc'

gridSearch = GridSearchCV(
    estimator = DecisionTreeClassifier(random_state = 39),
    param_grid = parameterGrid,
    scoring = scoringMetrics,
    n_jobs = -1, #https://scikit-learn.org/stable/glossary.html#term-n-jobs
    cv = 5, #5 fold stratified corss-validator
    refit = 'roc_auc',
    return_train_score = True)

## 6. Feature Selection: False

In [11]:
gridSearch.fit(trainX, trainY.values.ravel())

GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=39,
                                              splitter='best'),
             iid='deprecated', n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                       

In [12]:
gridSearch.best_estimator_

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=6.5, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=39, splitter='best')

In [13]:
predictY = gridSearch.best_estimator_.predict(testX)

In [14]:
#http://benalexkeen.com/decision-tree-classifier-in-python-using-scikit-learn/
pandas.DataFrame(
    confusion_matrix(testY, predictY),
    columns=['Predicted | Not Defaulter', 'Defaulter'],
    index=['Correct | Not Defaulter', 'Defaulter'])

Unnamed: 0,Predicted | Not Defaulter,Defaulter
Correct | Not Defaulter,5566,285
Defaulter,1041,608


In [15]:
print(classification_report(testY, predictY))

              precision    recall  f1-score   support

       False       0.84      0.95      0.89      5851
        True       0.68      0.37      0.48      1649

    accuracy                           0.82      7500
   macro avg       0.76      0.66      0.69      7500
weighted avg       0.81      0.82      0.80      7500



## 7. Feature Selection: True
https://scikit-learn.org/stable/modules/feature_selection.html#feature-selection

1. SelectKBest(]f_classif, mutual_info_classif])
2. SelectFromModel(gridSearch.best_estimator_)

### 1. SelectKBest(f_classif)

In [16]:
dimensionReducer = SelectKBest(score_func = f_classif, k = np.int32(len(ccdX.columns) * 0.75))
dimensionReducer.fit(X = ccdX, y = ccdY.values.ravel())
reducedColumnsIndices = dimensionReducer.get_support(indices = True)

ccdXdr = ccdX.iloc[:, reducedColumnsIndices]

In [17]:
trainX, testX, trainY, testY = train_test_split(ccdXdr, ccdY, test_size = 0.25, random_state = 44)

In [18]:
gridSearch.best_estimator_.fit(trainX, trainY.values.ravel())
predictY = gridSearch.best_estimator_.predict(testX)

In [19]:
#http://benalexkeen.com/decision-tree-classifier-in-python-using-scikit-learn/
pandas.DataFrame(
    confusion_matrix(testY, predictY),
    columns=['Predicted | Not Defaulter', 'Defaulter'],
    index=['Correct | Not Defaulter', 'Defaulter'])

Unnamed: 0,Predicted | Not Defaulter,Defaulter
Correct | Not Defaulter,5548,303
Defaulter,1026,623


In [20]:
print(classification_report(testY, predictY))

              precision    recall  f1-score   support

       False       0.84      0.95      0.89      5851
        True       0.67      0.38      0.48      1649

    accuracy                           0.82      7500
   macro avg       0.76      0.66      0.69      7500
weighted avg       0.81      0.82      0.80      7500



### 2. SelectKBest(mutual_info_classif)

In [21]:
dimensionReducer = SelectKBest(score_func = mutual_info_classif, k = np.int32(len(ccdX.columns) * 0.75))
dimensionReducer.fit(X = ccdX, y = ccdY.values.ravel())
reducedColumnsIndices = dimensionReducer.get_support(indices = True)

ccdXdr = ccdX.iloc[:, reducedColumnsIndices]

In [22]:
trainX, testX, trainY, testY = train_test_split(ccdXdr, ccdY, test_size = 0.25, random_state = 44)

In [23]:
gridSearch.best_estimator_.fit(trainX, trainY.values.ravel())
predictY = gridSearch.best_estimator_.predict(testX)

In [24]:
#http://benalexkeen.com/decision-tree-classifier-in-python-using-scikit-learn/
pandas.DataFrame(
    confusion_matrix(testY, predictY),
    columns=['Predicted | Not Defaulter', 'Defaulter'],
    index=['Correct | Not Defaulter', 'Defaulter'])

Unnamed: 0,Predicted | Not Defaulter,Defaulter
Correct | Not Defaulter,5561,290
Defaulter,1043,606


In [25]:
print(classification_report(testY, predictY))

              precision    recall  f1-score   support

       False       0.84      0.95      0.89      5851
        True       0.68      0.37      0.48      1649

    accuracy                           0.82      7500
   macro avg       0.76      0.66      0.68      7500
weighted avg       0.81      0.82      0.80      7500



### 3. SelectFromModel(gridSearch.best_estimator_) 

In [26]:
trainX, testX, trainY, testY = train_test_split(ccdX, ccdY, test_size = 0.25, random_state = 44)

In [27]:
dimensionReducer = SelectFromModel(estimator = gridSearch.best_estimator_, threshold = 0.01, prefit = True)
reducedColumnsIndices = dimensionReducer.get_support(indices = True)

ccdXdr = ccdX.iloc[:, reducedColumnsIndices]

In [28]:
trainX, testX, trainY, testY = train_test_split(ccdXdr, ccdY, test_size = 0.25, random_state = 44)

In [29]:
gridSearch.best_estimator_.fit(trainX, trainY.values.ravel())
predictY = gridSearch.best_estimator_.predict(testX)

In [30]:
#http://benalexkeen.com/decision-tree-classifier-in-python-using-scikit-learn/
pandas.DataFrame(
    confusion_matrix(testY, predictY),
    columns=['Predicted | Not Defaulter', 'Defaulter'],
    index=['Correct | Not Defaulter', 'Defaulter'])

Unnamed: 0,Predicted | Not Defaulter,Defaulter
Correct | Not Defaulter,5550,301
Defaulter,1233,416


In [31]:
print(classification_report(testY, predictY))

              precision    recall  f1-score   support

       False       0.82      0.95      0.88      5851
        True       0.58      0.25      0.35      1649

    accuracy                           0.80      7500
   macro avg       0.70      0.60      0.62      7500
weighted avg       0.77      0.80      0.76      7500



### Results:

#### No Feature Selection

100%                     Predicted | Not Defaulter  Defaulter
Correct | Not Defaulter                       5608        243
Defaulter                                     1092        557

#### SelecKBest(f_classif)

25%                      Predicted | Not Defaulter  Defaulter
Correct | Not Defaulter                       5570        281
Defaulter                                     1046        603

50%                      Predicted | Not Defaulter  Defaulter
Correct | Not Defaulter                       5517        334
Defaulter                                     1024        625

75%                      Predicted | Not Defaulter  Defaulter
Correct | Not Defaulter                       5569        282
Defaulter                                     1042        607
#### SelectKBest(mutual_info_classif)

25%                      Predicted | Not Defaulter  Defaulter
Correct | Not Defaulter                       5570        281
Defaulter                                     1046        603

50%                      Predicted | Not Defaulter  Defaulter
Correct | Not Defaulter                       5517        334
Defaulter                                     1024        625

75%                      Predicted | Not Defaulter  Defaulter
Correct | Not Defaulter                       5542        309
Defaulter                                     1025        624

#### SelectFromModel(gridSearch.best_estimator_)

threshold = 0.001        Predicted | Not Defaulter  Defaulter
Correct | Not Defaulter                       5598        253
Defaulter                                     1070        579

threshold = 0.01         Predicted | Not Defaulter  Defaulter
Correct | Not Defaulter                       5542        309
Defaulter                                     1025        624

#### gridSearch.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=6.5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=39, splitter='best')

## 8. Conclusion:

1. SelectFromModel(gridSearch.best_estimator_, threshold = 0.001): Best for NonDefaulter class accuracy (-10) because least accuracy degradation, Defaulter class has improvement (+22)

2. SelectFromModel(gridSearch.best_estimator_, threshold = 0.01) OR SelectKBest(mutual_info_classif, k = 75%): Best for Defaulter class accuracy (+67), NonDefaulter class has degraded (-64)