In [1]:
import pandas as pd
import utils as utils
from sklearn.model_selection import train_test_split

Load Config File

In [2]:
# Load the project configuration once; the functions in this notebook read
# their file paths, column names, seed and split size from this dict.
CONFIG_DATA = utils.config_load()
CONFIG_DATA

{'raw_dataset_path': 'E:\\PACMAN\\Credit Score\\Data\\Raw\\data.csv',
 'data_set_path': 'E:\\PACMAN\\Credit Score\\Data\\Output\\data.pkl',
 'input_set_path': 'E:\\PACMAN\\Credit Score\\Data\\Output\\input.pkl',
 'output_set_path': 'E:\\PACMAN\\Credit Score\\Data\\Output\\output.pkl',
 'input_columns_path': 'E:\\PACMAN\\Credit Score\\Data\\Output\\input_columns.pkl',
 'train_set_path': ['E:\\PACMAN\\Credit Score\\Data\\Output\\X_train.pkl',
  'E:\\PACMAN\\Credit Score\\Data\\Output\\y_train.pkl'],
 'valid_set_path': ['E:\\PACMAN\\Credit Score\\Data\\Output\\X_valid.pkl',
  'E:\\PACMAN\\Credit Score\\Data\\Output\\y_valid.pkl'],
 'test_set_path': ['E:\\PACMAN\\Credit Score\\Data\\Output\\X_test.pkl',
  'E:\\PACMAN\\Credit Score\\Data\\Output\\y_test.pkl'],
 'index_column': 'Unnamed: 0',
 'output_column': 'SeriousDlqin2yrs',
 'seed': 42,
 'test_size': 0.2}

In [3]:
def read_data(return_file=True):
    """Read the raw CSV, print its shape, and dump it for later steps.

    The CSV path, the index column name and the dump destination are all
    looked up in the module-level ``CONFIG_DATA`` dict
    (``raw_dataset_path``, ``index_column``, ``data_set_path``).

    Parameters
    ----------
    return_file : bool, default True
        If true, return the loaded DataFrame; otherwise return ``None``
        after the dump has been written.

    Returns
    -------
    pandas.DataFrame or None
    """
    raw_path = CONFIG_DATA['raw_dataset_path']
    index_name = CONFIG_DATA['index_column']

    # Comma-separated file; the configured column becomes the row index
    data = pd.read_csv(raw_path, sep=',', index_col=index_name)

    print('data shape   :', data.shape)

    # Hand the frame to the project's dump helper so that
    # split_input_output() can reload it from 'data_set_path'
    utils.pickle_dump(data, CONFIG_DATA['data_set_path'])

    return data if return_file else None

In [4]:
# Run the load step, then preview the first rows transposed so that each
# column of the dataset is shown as a row.
data = read_data()
data.head().T

data shape   : (150000, 11)


Unnamed: 0,1,2,3,4,5
SeriousDlqin2yrs,1.0,0.0,0.0,0.0,0.0
RevolvingUtilizationOfUnsecuredLines,0.766127,0.957151,0.65818,0.23381,0.907239
age,45.0,40.0,38.0,30.0,49.0
NumberOfTime30-59DaysPastDueNotWorse,2.0,0.0,1.0,0.0,1.0
DebtRatio,0.802982,0.121876,0.085113,0.03605,0.024926
MonthlyIncome,9120.0,2600.0,3042.0,3300.0,63588.0
NumberOfOpenCreditLinesAndLoans,13.0,4.0,2.0,5.0,7.0
NumberOfTimes90DaysLate,0.0,0.0,1.0,0.0,0.0
NumberRealEstateLoansOrLines,6.0,0.0,0.0,0.0,1.0
NumberOfTime60-89DaysPastDueNotWorse,0.0,0.0,0.0,0.0,0.0


Splitting Data

Split Input & Output

In [5]:
def split_input_output(return_file=True):
    """Separate the stored dataset into input features and the output column.

    Reloads the dataset from ``CONFIG_DATA['data_set_path']``, takes the
    column named by ``CONFIG_DATA['output_column']`` as ``y`` and every
    other column as ``X``, prints a short summary, and dumps ``X``, ``y``
    and the input column labels to their configured paths.

    Parameters
    ----------
    return_file : bool, default True
        If true, return ``(X, y)``; otherwise return ``None`` after the
        dumps have been written.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series) or None
    """
    dataset = utils.pickle_load(CONFIG_DATA['data_set_path'])
    target_name = CONFIG_DATA['output_column']

    # Output is the single target column; input is everything else
    y = dataset[target_name]
    X = dataset.drop(columns=[target_name])

    # Summary: shapes, missing values per input column, class proportions
    print('Input shape  :', X.shape)
    print('Output shape :', y.shape)
    print('Input NAN    :')
    print(X.isna().sum())
    print('Benchmark    :')
    print(y.value_counts(normalize=True))

    # Dump each artifact to its configured destination
    # (the column labels are stored separately from the input frame)
    artifacts = (
        (X, 'input_set_path'),
        (y, 'output_set_path'),
        (X.columns, 'input_columns_path'),
    )
    for obj, path_key in artifacts:
        utils.pickle_dump(obj, CONFIG_DATA[path_key])

    if not return_file:
        return None
    return X, y

In [6]:
# Separate features from the target; the call also prints shapes,
# per-column NaN counts and the class proportions of the target.
X, y = split_input_output()

Input shape  : (150000, 10)
Output shape : (150000,)
Input NAN    :
RevolvingUtilizationOfUnsecuredLines        0
age                                         0
NumberOfTime30-59DaysPastDueNotWorse        0
DebtRatio                                   0
MonthlyIncome                           29731
NumberOfOpenCreditLinesAndLoans             0
NumberOfTimes90DaysLate                     0
NumberRealEstateLoansOrLines                0
NumberOfTime60-89DaysPastDueNotWorse        0
NumberOfDependents                       3924
dtype: int64
Benchmark    :
0    0.93316
1    0.06684
Name: SeriousDlqin2yrs, dtype: float64


In [7]:
def split_train_test(return_file=True, stratify=True):
    """Split the stored input/output into train, validation and test sets.

    Reloads ``X`` and ``y`` from ``CONFIG_DATA['input_set_path']`` and
    ``CONFIG_DATA['output_set_path']`` and splits twice with
    ``train_test_split``:

    1. full data  -> (train + valid) / test
    2. train + valid -> train / valid

    Both splits use ``CONFIG_DATA['test_size']`` and ``CONFIG_DATA['seed']``,
    so the validation share of the full data is
    ``test_size * (1 - test_size)`` (0.16 when ``test_size`` is 0.2), not
    ``test_size`` itself.

    Parameters
    ----------
    return_file : bool, default True
        If true, return the six partitions; otherwise return ``None``
        after the dumps have been written.
    stratify : bool, default True
        If true, both splits are stratified on the output so that every
        partition keeps the same class proportions as the full data. This
        matters here because the positive class is rare (about 6.7% in this
        dataset); an unstratified split lets the positive rate drift between
        train, validation and test. Pass ``False`` to get a plain random
        split (e.g. for a non-categorical output).

    Returns
    -------
    tuple or None
        ``(X_train, X_valid, X_test, y_train, y_valid, y_test)``.
    """
    # Load data
    X = utils.pickle_load(CONFIG_DATA['input_set_path'])
    y = utils.pickle_load(CONFIG_DATA['output_set_path'])

    test_size = CONFIG_DATA['test_size']
    seed = CONFIG_DATA['seed']

    # Split test & rest (train & valid)
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=seed,
        stratify=y if stratify else None,
    )

    # Split train & valid; stratify on the remaining labels, not the full y,
    # because the stratify array must align with the data being split
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_train,
        y_train,
        test_size=test_size,
        random_state=seed,
        stratify=y_train if stratify else None,
    )

    # Print splitting
    print('X_train shape :', X_train.shape)
    print('y_train shape :', y_train.shape)
    print('X_valid shape  :', X_valid.shape)
    print('y_valid shape  :', y_valid.shape)
    print('X_test shape  :', X_test.shape)
    print('y_test shape  :', y_test.shape)

    # Dump file: each config entry is an [X_path, y_path] pair
    utils.pickle_dump(X_train, CONFIG_DATA['train_set_path'][0])
    utils.pickle_dump(y_train, CONFIG_DATA['train_set_path'][1])
    utils.pickle_dump(X_valid, CONFIG_DATA['valid_set_path'][0])
    utils.pickle_dump(y_valid, CONFIG_DATA['valid_set_path'][1])
    utils.pickle_dump(X_test, CONFIG_DATA['test_set_path'][0])
    utils.pickle_dump(y_test, CONFIG_DATA['test_set_path'][1])

    if return_file:
        return X_train, X_valid, X_test, y_train, y_valid, y_test

In [8]:
# Produce the train / validation / test partitions; the call also prints
# each partition's shape and dumps each one to its configured path.
X_train, X_valid, X_test, y_train, y_valid, y_test = split_train_test()

X_train shape : (96000, 10)
y_train shape : (96000,)
X_valid shape  : (24000, 10)
y_valid shape  : (24000,)
X_test shape  : (30000, 10)
y_test shape  : (30000,)
