# Sampling and Validation Techniques

## 1.a. Import: Libraries

In [1]:
#data organizing
import pandas #storage
import numpy as np #data-type conversion
from os import getcwd

#scaling and encoding
from sklearn.preprocessing import StandardScaler

#dimentionality reduction/feature selection
from sklearn.feature_selection import SelectKBest #count of k best features chi2
from sklearn.feature_selection import mutual_info_classif

#validation testing
from sklearn.model_selection import cross_val_score

#classifier
from sklearn.linear_model import LogisticRegression

#classification result - statistical
from sklearn.metrics import balanced_accuracy_score

In [2]:
#validation techniques

from sklearn.model_selection import train_test_split    #holdout

#Group based splitters are more complex permutations of holdout
#from sklearn.model_selection import GroupKFold    
#from sklearn.model_selection import GroupShuffleSplit

from sklearn.model_selection import KFold

#Leave based splitters are too computationally intensive for large datasets
#from sklearn.model_selection import LeaveOneGroupOut
#from sklearn.model_selection import LeavePGroupsOut
#from sklearn.model_selection import LeaveOneOut
#from sklearn.model_selection import LeavePOut

#from sklearn.model_selection import PredefinedSplit    same as Holdout

from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import RepeatedStratifiedKFold

from sklearn.model_selection import ShuffleSplit

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit

#though time-series does not apply to this dataset
from sklearn.model_selection import TimeSeriesSplit

## 1.b. Import: Dataset

In [3]:
#dtype changed from int64 to int32 to save space and speed up computation, no data was lost
def cvDefPay(prediction):
    mapper = {0: False, 1: True}
    return mapper.get(prediction)

url = getcwd() + '\\default of credit card clients.xls'
ccd = pandas.read_excel(io = url, \
                        sheet_name='Data', header = 1, index_col = 0, \
                        dtype = {'LIMIT_BAL': np.int32, 'AGE': np.int32, 'BILL_AMT1': np.int32, 'BILL_AMT2': np.int32, 'BILL_AMT3': np.int32, 'BILL_AMT4': np.int32, 'BILL_AMT5': np.int32, 'BILL_AMT6': np.int32, 'PAY_AMT1': np.int32, 'PAY_AMT2': np.int32, 'PAY_AMT3': np.int32, 'PAY_AMT4': np.int32, 'PAY_AMT5': np.int32, 'PAY_AMT6': np.int32}, \
                       converters = {'default payment next month': cvDefPay})

In [4]:
ccd.rename(columns = {'PAY_0': 'PAY_1'}, inplace = True)
ccd.rename(columns = {'default payment next month': 'default_payment_next_month'}, inplace = True)

## 2.a Feature Engineering

#### 1. PAY_1 to PAY_6

In [5]:
ccdr = pandas.read_excel(io = url, 
                        sheet_name='Data', header = 1, index_col = 0)
ccdr.rename(columns = {'PAY_0': 'PAY_1'}, inplace = True)

In [6]:
ccdrHistory = ccdr[['PAY_1', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']]
ccdrHistoryMode = ccdrHistory.mode(axis = 'columns')
ccdrHistorySeverest = ccdrHistoryMode.apply(func = max, axis = 'columns')
ccd['PAY_MODE_SEVEREST'] = ccdrHistorySeverest

#### 2. BILL_AMT1 to BILL_AMT6

In [7]:
ccdSpent = ccd[['BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6']]
ccd['BILL_AMT_MEAN'] = np.int32(ccdSpent.mean(axis = 'columns').round())

#### 3. PAY_AMT1 to PAY_AMT6

In [8]:
ccdSettled = ccd[['PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']]
ccd['PAY_AMT_MEAN'] = np.int32(ccdSettled.mean(axis = 'columns').round())

## 2.b. Normalization

Scaling: Only to reduce the effect of very large continuous variables (in distance based esimators).

Normalization: Also reduce the effect of skewness in variables.

In [9]:
varsToScale = ccd.drop(['default_payment_next_month'], axis = 'columns').columns
#['LIMIT_BAL', 'AGE', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 
#               'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'BILL_AMT_MEAN', 'PAY_AMT_MEAN']
#feature_range = (0, 1), 
scaler = StandardScaler(copy = True)

In [10]:
for var in varsToScale:
    ccd[var] = scaler.fit_transform(ccd[var].values.reshape(-1, 1))


## 2.c. Feature Selection

In [11]:
ccdY = pandas.DataFrame(ccd['default_payment_next_month'])
ccdX = ccd.drop(['default_payment_next_month'], axis = 'columns')

In [12]:
featureFilter = SelectKBest(score_func = mutual_info_classif, k = np.int32(len(ccdX.columns) * 0.75))
featureFilter.fit(X = ccdX, y = ccdY.values.ravel())
filteredColumnsIndices = featureFilter.get_support(indices = True)

ccdXdr = ccdX.iloc[:, filteredColumnsIndices]

## 3.a. Data Splitting

In [13]:
trainX, testX, trainY, testY = train_test_split(ccdXdr, ccdY, test_size = 0.25, stratify = ccdY, random_state = 44)

## 4. LogisticRegression

In [14]:
classifier = LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=1000, multi_class='auto', n_jobs=-1, penalty='none',
                   random_state=44, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)

### 4.a. Holdout

In [15]:
classifier.fit(trainX, trainY.values.ravel())

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=1000, multi_class='auto', n_jobs=-1, penalty='none',
                   random_state=44, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)

In [16]:
testPredictedY = classifier.predict(testX)

In [17]:
score = balanced_accuracy_score(testY, testPredictedY, adjusted = True)
print('Holdout : {:2.2%}'.format(score))

Holdout : 37.08%


### 4.b. KFold

In [18]:
validator = KFold(n_splits = 5, shuffle = True, random_state = 72)

In [19]:
score = cross_val_score(classifier, ccdXdr, ccdY, scoring = 'balanced_accuracy', cv = validator, n_jobs = -1)

In [20]:
print('KFold : {:2.2%}'.format(np.mean(score)))

KFold : 67.10%


### 4.c. RepeatedKFold

In [21]:
validator = RepeatedKFold(n_splits = 5, n_repeats = 4, random_state=72)

In [22]:
score = cross_val_score(classifier, ccdXdr, ccdY, scoring = 'balanced_accuracy', cv = validator, n_jobs = -1)

In [23]:
print('RepeatedKFold : {:2.2%}'.format(np.mean(score)))

RepeatedKFold : 67.15%


### 4.d. RepeatedStratifiedKFold

In [24]:
validator = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 4, random_state=72)

In [25]:
score = cross_val_score(classifier, ccdXdr, ccdY, scoring = 'balanced_accuracy', cv = validator, n_jobs = -1)

In [26]:
print('RepeatedStratifiedKFold : {:2.2%}'.format(np.mean(score)))

RepeatedStratifiedKFold : 67.14%


### 4.e. ShuffleSplit

In [27]:
validator = ShuffleSplit(n_splits = 5, test_size = 0.25, random_state=72)

In [28]:
score = cross_val_score(classifier, ccdXdr, ccdY, scoring = 'balanced_accuracy', cv = validator, n_jobs = -1)

In [29]:
print('ShuffleSplit : {:2.2%}'.format(np.mean(score)))

ShuffleSplit : 67.41%


### 4.f. StratifiedKFold

In [30]:
validator = StratifiedKFold(n_splits = 5, shuffle = True, random_state=72)

In [31]:
score = cross_val_score(classifier, ccdXdr, ccdY, scoring = 'balanced_accuracy', cv = validator, n_jobs = -1)

In [32]:
print('StratifiedKFold : {:2.2%}'.format(np.mean(score)))

StratifiedKFold : 67.04%


### 4.g. StratifiedShuffleSplit

In [33]:
validator = StratifiedShuffleSplit(n_splits = 5, test_size = 0.25, random_state=72)

In [34]:
score = cross_val_score(classifier, ccdXdr, ccdY, scoring = 'balanced_accuracy', cv = validator, n_jobs = -1)

In [35]:
print('StratifiedShuffleSplit : {:2.2%}'.format(np.mean(score)))

StratifiedShuffleSplit : 67.09%


### 4.h. TimeSeriesSplit

In [36]:
validator = TimeSeriesSplit(n_splits = 5)

In [37]:
score = cross_val_score(classifier, ccdXdr, ccdY, scoring = 'balanced_accuracy', cv = validator, n_jobs = -1)

In [38]:
print('TimeSeriesSplit : {:2.2%}'.format(np.mean(score)))

TimeSeriesSplit : 67.52%


## Conclusion
TimeSeriesSplit has the best score with 67.52%.

ShuffleSplit trails with 67.41%.

Benchmark Holdout has the score of 37.08%