In [1]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Garbage collector
import gc

!pip3 install unidecode
!pip3 install ipython-autotime
!pip3 install scorecardpy
%load_ext autotime

Collecting ipython-autotime
  Downloading ipython-autotime-0.1.tar.bz2 (1.2 kB)
Building wheels for collected packages: ipython-autotime
  Building wheel for ipython-autotime (setup.py) ... [?25ldone
[?25h  Created wheel for ipython-autotime: filename=ipython_autotime-0.1-py3-none-any.whl size=1830 sha256=c9666f371998367cd5caa7f12dd48ca1eb0c6d9fe30ea408235fe3be7ca5050f
  Stored in directory: /home/jupyter/.cache/pip/wheels/65/56/4a/4b967e4b9b62bd9d8d7ca789bba648c702d705487f28845bb2
Successfully built ipython-autotime
Installing collected packages: ipython-autotime
Successfully installed ipython-autotime-0.1


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import math
import gc
gc.enable()

def pca(train, test):
    pca = PCA(n_components=0.97, svd_solver='full')
    train_pca = pd.DataFrame(pca.fit_transform(train), columns = ['pca_' + str(i) for i in range(pca.n_components_)])
    test_pca = pd.DataFrame(pca.transform(test), columns = ['pca_' + str(i) for i in range(pca.n_components_)])
    return (train_pca, test_pca)

def one_hot_encoding(X_train, X_test):
    train = X_train.copy()
    test = X_test.copy()
    enc = OneHotEncoder(handle_unknown='ignore')

    train_ohe = pd.DataFrame()
    test_ohe = pd.DataFrame()

    for fea in train:
        train[fea] = train[fea].replace(to_replace=[np.nan], value='none')
        test[fea] = test[fea].replace(to_replace=[np.nan], value='none')

        temp_train = enc.fit_transform(train[fea].values.reshape(-1,1)).toarray()
        temp_test = enc.transform(test[fea].values.reshape(-1,1)).toarray()

        train_ohe = pd.concat([train_ohe, pd.DataFrame(temp_train, columns=[fea + '_ohe_' + str(enc.categories_[0][i]) for i in range(len(enc.categories_[0]))])], axis=1)
        test_ohe = pd.concat([test_ohe, pd.DataFrame(temp_test, columns=[fea + '_ohe_' + str(enc.categories_[0][i]) for i in range(len(enc.categories_[0]))])], axis=1)
    del train, test
    gc.collect()
    return (train_ohe, test_ohe)

def label_encoding(X_train, X_test):
    train = X_train.copy()
    test = X_test.copy()
    
    train_label = pd.DataFrame()
    test_label = pd.DataFrame()

    for fea in train:
        train[fea] = train[fea].replace(to_replace=[np.nan], value='none')
        test[fea] = test[fea].replace(to_replace=[np.nan], value='none')

        factorised = pd.factorize(train[fea])[1]
        labels = pd.Series(range(len(factorised)), index=factorised)

        temp_train = train[fea].map(labels)
        temp_test = test[fea].map(labels)

        train_label[fea + '_labeled'] = temp_train
        test_label[fea + '_labeled'] = temp_test

    train_label.fillna(-1, inplace=True)
    test_label.fillna(-1, inplace=True)
    del train, test
    gc.collect()
    return (train_label, test_label)

def freq_encoding(X_train, X_test):
    train = X_train.copy()
    test = X_test.copy()
    
    encoded_train_cols = dict()
    encoded_test_cols = dict()
    for col in train:
        train[col] = train[col].replace(to_replace=[np.nan], value='none')
        test[col] = test[col].replace(to_replace=[np.nan], value='none')

        freq_cats = train.groupby([col])[col].count()/train.shape[0]
        encoded_train_cols[str(col) + '_freq'] = train[col].map(freq_cats)
        encoded_test_cols[str(col) + '_freq'] = test[col].map(freq_cats)

    encoded_train_cols = pd.DataFrame(encoded_train_cols)
    encoded_train_cols.fillna(0, inplace=True)
    encoded_test_cols = pd.DataFrame(encoded_test_cols)
    encoded_test_cols.fillna(0, inplace=True)
    del train, test
    gc.collect()
    return (encoded_train_cols, encoded_test_cols)


def mean_encoding(X_train, X_test, target, alpha=0, folds=5, random=True, random_state=913100):
    
    train = pd.concat([X_train, target], axis=1)
    test = X_test.copy()
    encoded_train_cols = dict()
    encoded_test_cols = dict()
    target_mean_gobal = train[target.name].mean()
    
    for col in X_train:
      train[col] = train[col].replace(to_replace=[np.nan], value='none')
      test[col] = test[col].replace(to_replace=[np.nan], value='none')

      # Getting mean for test data
      groups = train.groupby([col])
      nrows_cat = groups[target.name].count()
      target_mean_cats = groups[target.name].mean()
      target_mean_cats_adj = (target_mean_cats*nrows_cat + target_mean_gobal*alpha) / (nrows_cat + alpha) 
      # Mapping mean to test data
      encoded_test_cols[str(col) + '_mean'] = test[col].map(target_mean_cats_adj)

      if folds is None:
        encoded_train_cols[str(col) + '_mean'] = train[col].map(target_mean_cats_adj)
      else:
        kfold = StratifiedKFold(n_splits=folds, shuffle=random, random_state=random_state)  
        parts = []
        # Kfold for train data
        for tr_idx, dev_idx in kfold.split(train.drop(columns=target.name), train[target.name]):
            # Divide data
            base_df, estimate_df = train.iloc[tr_idx], train.iloc[dev_idx]

            # Gettting mean of base_df for estimation
            groups = base_df.groupby([col])
            nrows_cat = groups[target.name].count()
            target_mean_cats = groups[target.name].mean()
            target_mean_cats_adj = (target_mean_cats*nrows_cat + target_mean_gobal*alpha) / (nrows_cat + alpha) 
            # Mapping mran for estimate_df
            parts.extend(estimate_df[col].map(target_mean_cats_adj))

        encoded_train_cols[str(col)+ '_mean'] = parts
  
    encoded_train_cols = pd.DataFrame(encoded_train_cols)
    encoded_train_cols.fillna(target_mean_gobal, inplace=True)
    
    encoded_test_cols = pd.DataFrame(encoded_test_cols)
    encoded_test_cols.fillna(target_mean_gobal, inplace=True)
    del train, test
    gc.collect() 
    return (encoded_train_cols, encoded_test_cols)

def scoring_ngboost_clf(X_train, y_train, X_dev, y_dev, random_state=913100, verbose=False):
    iterations = []
    train_scores = []
    dev_scores = []
  
  
    log_iters = list(set((np.logspace(math.log(1, 8), math.log(500, 8), 
                                        num=50, endpoint=True, base=8, 
                                        dtype=np.int))))
    for estimators in sorted(log_iters):
        model = MyNGBClassifier(n_estimators=estimators, random_state=random_state)
        model.fit(X_train, y_train)
        y_train_pred_scores = model.predict_proba(X_train)
        y_dev_pred_scores = model.predict_proba(X_dev)

        train_scores.append(roc_auc_score(y_train, y_train_pred_scores[:, 1]))
        dev_scores.append(roc_auc_score(y_dev, y_dev_pred_scores[:, 1]))
        iterations.append(estimators)
        if verbose:
            print(f'{iterations[-1]}/{len(log_iters)}', train_scores[-1], dev_scores[-1])
  
    best_score = max(dev_scores)
    best_iter = iterations[dev_scores.index(best_score)]
    if verbose:
        print(f'Best score: {best_score}. Best iter: {best_iter}')
    return (train_scores, dev_scores, iterations, model)

def test_all_encodings(train, dev, target_name):
    # Format: encoding function, encoding params, encoding name, encoding color
    encoding_settings = [
                      [one_hot_encoding, {}, 'One hot encoding', '#E7E005'],
                      [label_encoding, {}, 'Label encoding', '#960000'],
                      [freq_encoding, {}, 'Frequency encoding', '#FF2F02'],
                      [mean_encoding, {'alpha':0, 'folds':None, 'target':train['label']}, 'Mean encoding, alpha=0', '#A4C400'],
                      [mean_encoding, {'alpha':2, 'folds':None, 'target':train['label']}, 'Mean encoding, alpha=2', '#73B100'],
                      [mean_encoding, {'alpha':5, 'folds':None, 'target':train['label']}, 'Mean encoding, alpha=5', '#2B8E00'],
                      [mean_encoding, {'alpha':5, 'folds':3, 'target':train['label']}, 'Mean encoding, alpha=5, 3 folds', '#00F5F2'],
                      [mean_encoding, {'alpha':5, 'folds':5, 'target':train['label']}, 'Mean encoding, alpha=5, 5 folds', '#00BAD3'],
    ]
    scoring_func = scoring_ngboost_clf
    plt.figure(figsize=(10,7))

    review_rows = []

    for encoding_func, encoding_params, str_name, color in encoding_settings:
        print(str_name)
        X_train, X_dev = encoding_func(train.drop(columns=target_name), dev.drop(columns=target_name), **encoding_params)

        # X_train_pca, X_dev_pca = pca(X_train, X_dev)

        scores = scoring_func(X_train, train[target_name], X_dev, dev[target_name])

        train_scores, dev_scores, iters, _ = scores
        plt.plot(iters,  dev_scores,  label='Test, ' + str_name, linewidth=1.5, color=color)

        best_score_dev = max(dev_scores)
        best_iter_dev = iters[dev_scores.index(best_score_dev)]
        best_score_train = max(train_scores[:best_iter_dev])

        print(f'Best score for {str_name} is {best_score_dev}, on estimators {best_iter_dev}')
        review_rows.append([str_name, best_score_train, best_score_dev, best_iter_dev])
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    columns = ['Encoding', 'Train AUC score on best iteration', 'Best AUC score (test)', 'Best iteration (test)']
    return pd.DataFrame(review_rows, columns=columns)


time: 229 ms


In [3]:
train = pd.read_csv('./input/train_input_imputed.csv')

time: 399 ms


In [5]:
kfold = []
for i in range(0,5):
  X_train = pd.read_csv("./cv_input/X_train_{}.csv".format(i + 1))
  y_train = pd.read_csv("./cv_input/y_train_{}.csv".format(i + 1))
  X_dev = pd.read_csv("./cv_input/X_dev_{}.csv".format(i + 1))
  y_dev = pd.read_csv("./cv_input/y_dev_{}.csv".format(i + 1))
  kfold.append({
      "X_train": X_train,
      "y_train": y_train['label'],
      "X_dev": X_dev,
      "y_dev": y_dev['label'],
  })

time: 1.28 s


In [6]:
for i in range(0, 5):
  print(kfold[i]['X_train'].shape)
  print(kfold[i]['X_dev'].shape)

(24000, 63)
(6000, 63)
(24000, 63)
(6000, 63)
(24000, 63)
(6000, 63)
(24000, 63)
(6000, 63)
(24000, 63)
(6000, 63)
time: 1.29 ms


# Categorical features

In [7]:
cat_features = [col for col in train if train[col].dtype == 'object']
cat_features

['province',
 'district',
 'maCv',
 'FIELD_8',
 'FIELD_9',
 'FIELD_10',
 'FIELD_13',
 'FIELD_35',
 'FIELD_39',
 'FIELD_41',
 'FIELD_42',
 'FIELD_44']

time: 6.29 ms


## KFOLD

In [8]:
# # Transform FIELD_10 FIELD_13 FIELD_39
from unidecode import unidecode
from scipy.stats import chi2_contingency
train_cat_fea_engineer_combines = []
test_cat_fea_engineer_combines = []
for i in range(len(kfold)):
  for feature in ['province', 'FIELD_8', 'FIELD_9','FIELD_10','FIELD_13','FIELD_35', 'FIELD_39', 'FIELD_41', 'FIELD_42', 'FIELD_44']:
    kfold[i]['X_train'][feature].replace(to_replace=[np.nan], value='none', inplace=True)
    kfold[i]['X_train'][feature] = kfold[i]['X_train'][feature].apply(unidecode).apply(str.lower)

    kfold[i]['X_dev'][feature].replace(to_replace=[np.nan], value='none', inplace=True)
    kfold[i]['X_dev'][feature] = kfold[i]['X_dev'][feature].apply(unidecode).apply(str.lower)

  train_cat_ohe, test_cat_ohe = one_hot_encoding(kfold[i]['X_train'][['province', 'FIELD_8', 'FIELD_9','FIELD_10','FIELD_13','FIELD_35', 'FIELD_39', 'FIELD_41', 'FIELD_42', 'FIELD_44']], 
                                                 kfold[i]['X_dev'][['province', 'FIELD_8', 'FIELD_9','FIELD_10','FIELD_13','FIELD_35', 'FIELD_39', 'FIELD_41', 'FIELD_42', 'FIELD_44']])


  train_cat_label, test_cat_label = label_encoding(kfold[i]['X_train'][['province', 'FIELD_8', 'FIELD_9','FIELD_10','FIELD_13','FIELD_35', 'FIELD_39', 'FIELD_41', 'FIELD_42', 'FIELD_44']], 
                                                  kfold[i]['X_dev'][['province', 'FIELD_8', 'FIELD_9','FIELD_10','FIELD_13','FIELD_35', 'FIELD_39', 'FIELD_41', 'FIELD_42', 'FIELD_44']])

  train_cat_feq, test_cat_feq = freq_encoding(kfold[i]['X_train'][['province', 'FIELD_8', 'FIELD_9','FIELD_10','FIELD_13','FIELD_35', 'FIELD_39', 'FIELD_41', 'FIELD_42', 'FIELD_44']], 
                                              kfold[i]['X_dev'][['province', 'FIELD_8', 'FIELD_9','FIELD_10','FIELD_13','FIELD_35', 'FIELD_39', 'FIELD_41', 'FIELD_42', 'FIELD_44']])

  encoding_params =  {'alpha':5, 'folds':3, 'target':kfold[i]['y_train']}
  train_cat_mean, test_cat_mean = mean_encoding(kfold[i]['X_train'][['province', 'FIELD_8', 'FIELD_9', 'FIELD_10', 'FIELD_13', 'FIELD_35', 'FIELD_41', 'FIELD_42', 'FIELD_44']], 
                                                kfold[i]['X_dev'][['province', 'FIELD_8', 'FIELD_9', 'FIELD_10', 'FIELD_13', 'FIELD_35', 'FIELD_41', 'FIELD_42', 'FIELD_44']], 
                                                **encoding_params) 


  train_cat_fea_engineer_combine = pd.concat([train_cat_ohe, train_cat_label, train_cat_feq, train_cat_mean], axis=1)
  test_cat_fea_engineer_combine = pd.concat([test_cat_ohe, test_cat_label, test_cat_feq, test_cat_mean], axis=1)

  

  del train_cat_ohe, train_cat_label, train_cat_feq, train_cat_mean, test_cat_ohe, test_cat_label, test_cat_feq, test_cat_mean
  gc.collect()

  cat_fea_engineer_combine_selected_columns = []
  for fea in train_cat_fea_engineer_combine:
    props = pd.crosstab(train_cat_fea_engineer_combine[fea], kfold[i]['y_train'])
    c = chi2_contingency(props, lambda_='log-likelihood')
    if (c[1] <= 0.05):
      cat_fea_engineer_combine_selected_columns.append(fea)
    # print(f'{props} \np-value={c[1]}\n')
  train_cat_fea_engineer_combine = train_cat_fea_engineer_combine[cat_fea_engineer_combine_selected_columns]
  test_cat_fea_engineer_combine = test_cat_fea_engineer_combine[cat_fea_engineer_combine_selected_columns] 

  train_cat_fea_engineer_combines.append(train_cat_fea_engineer_combine)
  test_cat_fea_engineer_combines.append(test_cat_fea_engineer_combine)
  # kfold[i]['X_train'] = pd.concat([kfold[i]['X_train'], train_cat_fea_engineer_combines[cat_fea_engineer_combine_selected_columns]], axis=1)
  # kfold[i]['X_dev'] = pd.concat([kfold[i]['X_dev'], train_cat_fea_engineer_combines[cat_fea_engineer_combine_selected_columns]], axis=1)

time: 34.5 s


# Add Subtract Divide Multiply

## KFOLD


In [9]:
from itertools import combinations

auto_columns = 'FIELD_1 FIELD_2 FIELD_3 FIELD_4 FIELD_5 FIELD_6 FIELD_14 FIELD_15 FIELD_16 FIELD_21 FIELD_22 FIELD_32 FIELD_33 FIELD_34 FIELD_46 FIELD_50 \
FIELD_51 FIELD_52 FIELD_53 FIELD_54 FIELD_55 FIELD_56 FIELD_57'.split()
train_num_fea_engineer_combines = []
test_num_fea_engineer_combines = []
for i in range(len(kfold)):
  train_num_fea_engineer_combine = pd.DataFrame()
  test_num_fea_engineer_combine = pd.DataFrame()
  for l, r in combinations(auto_columns, 2):  
    for func in 'add subtract divide multiply'.split():
      train_num_fea_engineer_combine[f'auto_{func}_{l}_{r}'] = getattr(np, func)(kfold[i]['X_train'][l], kfold[i]['X_train'][r])
      test_num_fea_engineer_combine[f'auto_{func}_{l}_{r}'] = getattr(np, func)(kfold[i]['X_dev'][l], kfold[i]['X_dev'][r])
  
  train_num_fea_engineer_combine.replace([np.inf, -np.inf], np.nan, inplace=True)
  train_num_fea_engineer_combine.fillna(-999, inplace=True)

  test_num_fea_engineer_combine.replace([np.inf, -np.inf], np.nan, inplace=True)
  test_num_fea_engineer_combine.fillna(-999, inplace=True)
  

  # num_fea_engineer_combine_selected_columns = []
  # for fea in train_num_fea_engineer_combine:
  #   props = pd.crosstab(train_num_fea_engineer_combine[fea], kfold[i]['y_train'])
  #   c = chi2_contingency(props, lambda_='log-likelihood')
  #   if (c[1] <= 0.05):
  #     num_fea_engineer_combine_selected_columns.append(fea)
  #   # print(f'{props} \np-value={c[1]}\n')

  # train_num_fea_engineer_combine = train_num_fea_engineer_combine[num_fea_engineer_combine_selected_columns]
  # test_num_fea_engineer_combine = test_num_fea_engineer_combine[num_fea_engineer_combine_selected_columns] 
  
  train_num_fea_engineer_combines.append(train_num_fea_engineer_combine)
  test_num_fea_engineer_combines.append(test_num_fea_engineer_combine)


time: 17 s


# Mean Median Max Min

## KFOLD

In [10]:
numeric_features = list(set(train.columns) - set(cat_features))
numeric_features.remove('label')
train_cat_fea_engineer_stats_combines = []
test_cat_fea_engineer_stats_combines = []
for i in range(5):
  train_cat_fea_engineer_stats_combine = pd.DataFrame()
  test_cat_fea_engineer_stats_combine = pd.DataFrame()
  for cat in cat_features:
    for num in numeric_features:
      # mean
      train_cat_fea_engineer_stats_combine[f'stats_mean_{cat}_{num}'] = kfold[i]['X_train'][cat].map(
          kfold[i]['X_train'].groupby(cat)[num].mean()
      )
      test_cat_fea_engineer_stats_combine[f'stats_mean_{cat}_{num}'] = kfold[i]['X_dev'][cat].map(
          kfold[i]['X_dev'].groupby(cat)[num].mean()
      )

      # median
      train_cat_fea_engineer_stats_combine[f'stats_mean_{cat}_{num}'] = kfold[i]['X_train'][cat].map(
          kfold[i]['X_train'].groupby(cat)[num].median()
      )
      test_cat_fea_engineer_stats_combine[f'stats_mean_{cat}_{num}'] = kfold[i]['X_dev'][cat].map(
          kfold[i]['X_dev'].groupby(cat)[num].median()
      )

      # min
      train_cat_fea_engineer_stats_combine[f'stats_mean_{cat}_{num}'] = kfold[i]['X_train'][cat].map(
          kfold[i]['X_train'].groupby(cat)[num].min()
      )
      test_cat_fea_engineer_stats_combine[f'stats_mean_{cat}_{num}'] = kfold[i]['X_dev'][cat].map(
          kfold[i]['X_dev'].groupby(cat)[num].min()
      )

      # max
      train_cat_fea_engineer_stats_combine[f'stats_mean_{cat}_{num}'] = kfold[i]['X_train'][cat].map(
          kfold[i]['X_train'].groupby(cat)[num].max()
      )
      test_cat_fea_engineer_stats_combine[f'stats_mean_{cat}_{num}'] = kfold[i]['X_dev'][cat].map(
          kfold[i]['X_dev'].groupby(cat)[num].max()
      )
  
  num_fea_engineer_stats_combine_selected_columns = []
  for fea in train_cat_fea_engineer_stats_combine:
    props = pd.crosstab(train_cat_fea_engineer_stats_combine[fea], kfold[i]['y_train'])
    c = chi2_contingency(props, lambda_='log-likelihood')
    if (c[1] <= 0.05):
      num_fea_engineer_stats_combine_selected_columns.append(fea)
    # print(f'{props} \np-value={c[1]}\n')

  train_cat_fea_engineer_stats_combine = train_cat_fea_engineer_stats_combine[num_fea_engineer_stats_combine_selected_columns]
  test_cat_fea_engineer_stats_combine = test_cat_fea_engineer_stats_combine[num_fea_engineer_stats_combine_selected_columns] 

  train_cat_fea_engineer_stats_combines.append(train_cat_fea_engineer_stats_combine)
  test_cat_fea_engineer_stats_combines.append(test_cat_fea_engineer_stats_combine)
      

time: 2min 4s


# Weight of Evidence and information value


## KFold

In [11]:
import scorecardpy as sc
from pylab import rcParams
rcParams['figure.figsize'] = 20, 10
from scipy.stats import chi2_contingency
import copy

for i in range(len(kfold)):
  print(f'{i+1}/{len(kfold)}')
  # filter variable via missing rate
  X_train = pd.concat([
                                    kfold[i]['X_train'],               
                                    train_cat_fea_engineer_combines[i],
                                    train_num_fea_engineer_combines[i],
                                    train_cat_fea_engineer_stats_combines[i]
  ], axis=1)
  print(X_train.shape)
  X_dev = pd.concat([
                                    kfold[i]['X_dev'],               
                                    test_cat_fea_engineer_combines[i],
                                    test_num_fea_engineer_combines[i],
                                    test_cat_fea_engineer_stats_combines[i]
  ], axis=1)
  print(X_dev.shape)

  dt_s = sc.var_filter(pd.concat([X_train, kfold[i]['y_train']], axis=1), y='label')
  bins = sc.woebin(dt_s, y='label', bin_num_limit=20, positive="label|1", method='tree')

  train_woe = sc.woebin_ply(X_train, bins)


  test_woe = sc.woebin_ply(X_dev, bins)
  test_woe.reset_index(drop=True, inplace=True)
  temp = train_woe.sample(test_woe.shape[0], replace=True)
  temp.reset_index(drop=True, inplace=True)
  test_woe.fillna(temp, inplace=True)

  woe_selected_columns = []
  for col in bins:
    if bins[col]['total_iv'][0] > 0.1:
      woe_selected_columns.append(col + '_woe')
  # for fea in train_woe:
  #   props = pd.crosstab(train_woe[fea], kfold[i]['y_train'])
  #   c = chi2_contingency(props, lambda_='log-likelihood')
  #   if (c[1] <= 0.05):
  #     woe_selected_columns.append(fea)
  #     # print(f'{props} \np-value={c[1]}\n')
  train_woe = train_woe[woe_selected_columns]
  test_woe = test_woe[woe_selected_columns]
  

  # train_count = pd.DataFrame()
  # test_count = pd.DataFrame()
  # train_woe['label'] = kfold[i]['y_train']
  # for fea in train_woe.drop(columns=['label']):
  #   groups = train_woe.groupby(fea, as_index=True)

  #   bad = groups.sum()['label']
  #   good = groups.count()['label'] - groups.sum()['label']
  #   train_count[fea + '_N_bad'] = train_woe[fea].map(bad)
  #   train_count[fea + '_N_good'] = train_woe[fea].map(good)

  #   test_count[fea + '_N_bad'] = test_woe[fea].map(bad)
  #   test_count[fea + '_N_good'] = test_woe[fea].map(good)
  # train_woe.drop(columns=['label'], inplace=True)

  # count_selected_columns = []
  # for fea in train_count:
  #   props = pd.crosstab(train_count[fea], train['label'])
  #   c = chi2_contingency(props, lambda_='log-likelihood')
  #   if (c[1] <= 0.05):
  #     count_selected_columns.append(fea)
  #     # print(f'{props} \np-value={c[1]}\n')

  # train_count = train_count[count_selected_columns]
  # test_count = test_count[count_selected_columns]
  
  train_dup_cols = set(X_train.columns).intersection(set(train_woe.columns))
  test_dup_cols = copy.deepcopy(train_dup_cols)

  X_train = X_train.drop(columns=cat_features)
  X_dev = X_dev.drop(columns=cat_features)
  
  kfold[i]['X_train'] = pd.concat([X_train, train_woe.drop(columns=train_dup_cols)], axis=1)
  kfold[i]['X_dev'] = pd.concat([X_dev, test_woe.drop(columns=test_dup_cols)], axis=1)
  
  del temp, dt_s, bins, train_woe, test_woe, X_train, X_dev
  gc.collect()

1/5
(24000, 1370)
(6000, 1370)
[INFO] filtering variables ...
Variable filtering on 24000 rows and 1371 columns in 00:12:24 
97 variables are removed
[INFO] creating woe binning ...
>>> There are 4 variables have too many unique non-numberic values, which might cause the binning process slow. Please double check the following variables: 
province, FIELD_13, maCv, district
>>> Continue the binning process?
1: yes 
2: no


Selection:  1


Binning on 24000 rows and 1274 columns in 00:07:49
[INFO] converting into woe values ...
Woe transformating on 24000 rows and 1273 columns in 00:01:37
[INFO] converting into woe values ...
Woe transformating on 6000 rows and 1273 columns in 00:00:28
2/5
(24000, 1378)
(6000, 1378)
[INFO] filtering variables ...
Variable filtering on 24000 rows and 1379 columns in 00:12:26 
111 variables are removed
[INFO] creating woe binning ...
>>> There are 4 variables have too many unique non-numberic values, which might cause the binning process slow. Please double check the following variables: 
province, FIELD_13, maCv, district
>>> Continue the binning process?
1: yes 
2: no


Selection:  1


Binning on 24000 rows and 1268 columns in 00:07:02
[INFO] converting into woe values ...
Woe transformating on 24000 rows and 1267 columns in 00:01:38
[INFO] converting into woe values ...
Woe transformating on 6000 rows and 1267 columns in 00:00:28
3/5
(24000, 1400)
(6000, 1400)
[INFO] filtering variables ...
Variable filtering on 24000 rows and 1401 columns in 00:12:52 
97 variables are removed
[INFO] creating woe binning ...
>>> There are 4 variables have too many unique non-numberic values, which might cause the binning process slow. Please double check the following variables: 
province, FIELD_13, maCv, district
>>> Continue the binning process?
1: yes 
2: no


Selection:  1


Binning on 24000 rows and 1304 columns in 00:04:44
[INFO] converting into woe values ...
Woe transformating on 24000 rows and 1303 columns in 00:01:40
[INFO] converting into woe values ...
Woe transformating on 6000 rows and 1303 columns in 00:00:29
4/5
(24000, 1383)
(6000, 1383)
[INFO] filtering variables ...
Variable filtering on 24000 rows and 1384 columns in 00:12:43 
97 variables are removed
[INFO] creating woe binning ...
>>> There are 4 variables have too many unique non-numberic values, which might cause the binning process slow. Please double check the following variables: 
province, FIELD_13, maCv, district
>>> Continue the binning process?
1: yes 
2: no


Selection:  1


Binning on 24000 rows and 1287 columns in 00:06:34
[INFO] converting into woe values ...
Woe transformating on 24000 rows and 1286 columns in 00:01:38
[INFO] converting into woe values ...
Woe transformating on 6000 rows and 1286 columns in 00:00:29
5/5
(24000, 1368)
(6000, 1368)
[INFO] filtering variables ...
Variable filtering on 24000 rows and 1369 columns in 00:12:35 
104 variables are removed
[INFO] creating woe binning ...
>>> There are 4 variables have too many unique non-numberic values, which might cause the binning process slow. Please double check the following variables: 
province, FIELD_13, maCv, district
>>> Continue the binning process?
1: yes 
2: no


Selection:  1


Binning on 24000 rows and 1265 columns in 00:22:47
[INFO] converting into woe values ...
Woe transformating on 24000 rows and 1264 columns in 00:01:39
[INFO] converting into woe values ...
Woe transformating on 6000 rows and 1264 columns in 00:00:29
time: 2h 2min 50s


In [12]:
for i in range(0, len(kfold)):
  # kfold[i]["X_train"].drop(columns=['label'], inplace=True)
  kfold[i]["X_train"].to_csv("./cv_input/X_train_preprocessed_{}.csv".format(i + 1), index=False)
  kfold[i]["X_dev"].to_csv("./cv_input/X_dev_preprocessed_{}.csv".format(i + 1), index=False)


time: 4min 20s


In [13]:
for i in range(0, 5):
  print(kfold[i]['X_train'].shape)
  print(kfold[i]['X_dev'].shape)

(24000, 1392)
(6000, 1392)
(24000, 1381)
(6000, 1381)
(24000, 1444)
(6000, 1444)
(24000, 1409)
(6000, 1409)
(24000, 1403)
(6000, 1403)
time: 1.64 ms
