In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import seaborn as sns
import matplotlib.pyplot as plt
import time
import warnings
warnings.filterwarnings('ignore', category = FutureWarning)
pd.set_option('display.max_columns', None)

In [None]:
# Load the train and test CSV files from the ../input directory.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

In [None]:
# Seed for the hold-out split so the same rows land in validation on every run.
random_state = 4342

# Separate the label from the features; 'ID_code' is dropped together with the label.
y = train['target']
X = train.drop(columns=['ID_code', 'target'])

# 80% of the rows for training, 20% held out for validation.
train_X, val_X, train_y, val_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=random_state,
)

# Feature engineering: unique value counts

In [None]:
# Count-encode every training feature: for each cell, the number of rows in
# train_X that hold exactly the same value in the same column.
#
# value_counts() gives the count per distinct value and Series.map() looks each
# row's value up in it, so the whole column is encoded in one vectorised pass
# instead of scanning the list of unique values once per row.
# The frame is built from a dict in one go (rather than inserting ~200 columns
# one at a time) and keeps a 0..n-1 RangeIndex and float counts.
df_var = pd.DataFrame(
    {
        var + '_cnt': train_X[var].map(train_X[var].value_counts()).to_numpy(dtype=float)
        for var in train_X.columns
    },
    index=pd.RangeIndex(len(train_X)),
)

df_var.head()

In [None]:
# Keep an independent snapshot of the count features: a plain assignment would
# only create a second name for the same object, so any later in-place change
# to df_var would silently change the "backup" as well.
df_var_backup = df_var.copy()

In [None]:
# reset_index() moves the original row labels into an 'index' column and leaves
# a 0..n-1 RangeIndex, which is the same index df_var carries.
train_X_no_ix = train_X.reset_index()

# Both frames are row-aligned by position, so a column-wise concat is all that
# is needed (no synthetic join key to create and drop afterwards).
assert len(train_X_no_ix) == len(df_var), 'count features must have one row per training row'
train_X_cnt = pd.concat([train_X_no_ix, df_var], axis=1)
train_X_cnt.head()

In [None]:
# Rebuild the training frame with count features, this time from df_var_backup.
# Rows are paired by position: both frames are joined on their 0..n-1 index.
train_X_no_ix = train_X.reset_index()
merged_with_key = pd.merge(
    train_X_no_ix,
    df_var_backup,
    left_on=train_X_no_ix.index,
    right_on=df_var_backup.index,
)
# pd.merge materialises the array join key as a 'key_0' column; discard it.
train_X_cnt = merged_with_key.drop(columns=['key_0'])
train_X_cnt.head()

In [None]:
# create new features: unique values counts
def uniq_cnt_features(df_in):
    """Append a count-encoded copy of every column of ``df_in``.

    For each column ``var`` a new float column ``var + '_cnt'`` is added that
    holds, for every row, how many rows of ``df_in`` share that row's value in
    ``var``. Counts are computed from ``df_in`` itself.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Input features. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with a 0..n-1 index whose columns are, in order: the
        former index of ``df_in`` (as produced by ``reset_index()``, named
        ``'index'`` for an unnamed index), the original columns, and one
        ``'<var>_cnt'`` column per original column.

    Notes
    -----
    Missing values are not counted by ``value_counts()``, so a NaN cell gets a
    NaN count.
    """
    count_cols = {}
    for var in df_in.columns:
        column = df_in[var]
        # value_counts() -> count per distinct value; map() looks every row's
        # value up in it. One vectorised pass instead of a scan per row.
        count_cols[var + '_cnt'] = column.map(column.value_counts()).to_numpy(dtype=float)

    # Build all count columns at once on a positional index so they line up
    # with the reset-index copy of the input below.
    df_var = pd.DataFrame(count_cols, index=pd.RangeIndex(len(df_in)))

    df_in_no_ix = df_in.reset_index()
    return pd.concat([df_in_no_ix, df_var], axis=1)

In [None]:
# Build the count features for the validation rows.
# NOTE(review): the counts are computed from val_X alone, so they are on a
# different scale than counts computed from train_X - confirm this is intended.
val_X_cnt = uniq_cnt_features(val_X)

In [None]:
# Preview the first rows of the validation frame with its count features.
val_X_cnt.head()

In [None]:
def set_to_zero(df_in, count_columns=None, threshold=5):
    """Binarise the count features: 1 where the count exceeds ``threshold``, else 0.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Frame holding the count columns. It is not modified.
    count_columns : list of str, optional
        Columns to binarise. Defaults to every column of ``df_in`` whose name
        ends in ``'_cnt'``, so the function does not depend on a global that
        may not exist yet when the cell is run.
    threshold : int or float, default 5
        Counts strictly greater than this become 1, all others become 0.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df_in`` with the selected columns replaced by 0.0 / 1.0
        flags. Missing counts stay missing.
    """
    if count_columns is None:
        count_columns = [col for col in df_in.columns if str(col).endswith('_cnt')]

    # Work on a real copy; plain assignment would alias and mutate the caller's frame.
    df_out = df_in.copy()
    for var_cnt in count_columns:
        counts = df_in[var_cnt]
        # Decide from the ORIGINAL counts in a single step. Writing the 1s first
        # and then zeroing everything <= threshold would also zero those 1s.
        df_out[var_cnt] = (counts > threshold).astype(float).where(counts.notna())
    return df_out
#train_X_cnt_copy = train_X_cnt
#train_X_cnt_copy.loc[train_X_cnt.var_199_cnt > 1, 'var_199_cnt'] = 0
#train_X_cnt_copy.head()

In [None]:
# Names of the count features, one '<var>_cnt' per raw feature. Defined here,
# before the first call that needs it, so the notebook survives
# Restart Kernel -> Run All.
count_columns = [var + '_cnt' for var in train_X.columns]

# Hand copies to set_to_zero so train_X_cnt / val_X_cnt keep their raw counts
# for the cells further down that still read them.
train_X_cnt_zero = set_to_zero(train_X_cnt.copy())
val_X_cnt_zero = set_to_zero(val_X_cnt.copy())

# LightGBM model

In [None]:
# Names of the derived count features: one '<var>_cnt' per raw feature column.
count_columns = [var + '_cnt' for var in train_X.columns]

# Parameters passed to lgb.train.
param = {
    'bagging_freq': 5,
    'bagging_fraction': 0.335,
    'boost_from_average': 'false',
    'boost': 'gbdt',
    'feature_fraction': 0.041,
    'learning_rate': 0.0083,
    'max_depth': -1,
    'metric': 'auc',
    'min_data_in_leaf': 80,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 13,
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'binary',
    'verbosity': -1,
    # Left disabled: treating the count columns as categorical features.
    # 'feature_name': count_columns,
    # 'categorical_feature': count_columns
}

In [None]:
# Sanity check: number of rows in the training frame with count features.
len(train_X_cnt)

In [None]:
# Filled in by the record_evaluation callback with the metric history per data set.
evals_result = {}

# The 'index' column only carries the original row labels added by
# reset_index(); it is not a feature. Labels are passed as-is: the rows of the
# *_cnt_zero frames are still in the same order as train_y / val_y.
train_data = lgb.Dataset(train_X_cnt_zero.drop(columns=['index']), train_y)
val_data = lgb.Dataset(val_X_cnt_zero.drop(columns=['index']), val_y)

# LightGBM 4.0 removed the early_stopping_rounds / verbose_eval / evals_result
# keyword arguments of lgb.train in favour of callbacks. log_evaluation was
# called print_evaluation before LightGBM 3.3, hence the fallback.
log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

# Keep the trained booster instead of discarding the return value.
model = lgb.train(
    param,
    train_data,
    num_boost_round=30000,
    valid_sets=[train_data, val_data],
    callbacks=[
        lgb.early_stopping(stopping_rounds=2000),
        log_evaluation(period=500),
        lgb.record_evaluation(evals_result),
    ],
)

In [None]:
# Attach the target to the training frame.
# train_X_cnt has a fresh 0..n-1 index, while train_y is still indexed by the
# original row labels of `train`. Matching those two indexes against each other
# pairs rows with the wrong targets and drops the rows that find no partner.
# The original labels live in train_X_cnt's 'index' column, so join on that.
train_X_cnt_y = train_X_cnt.join(train_y, on='index')
assert train_X_cnt_y['target'].notna().all(), 'every training row must find its target'
train_X_cnt_y.head()

In [None]:
# Distribution of the var_81 count feature, split by target class.
# Draw on an explicit Axes and add the legend ourselves: the `label=` text is
# only shown once a legend is requested.
fig, ax = plt.subplots()
sns.kdeplot(train_X_cnt_y.loc[train_X_cnt_y['target'] == 1, 'var_81_cnt'], label = 'var_81: 1', ax = ax)
sns.kdeplot(train_X_cnt_y.loc[train_X_cnt_y['target'] == 0, 'var_81_cnt'], label = 'var_81: 0', ax = ax)
ax.set(title = 'var_81_cnt by target', xlabel = 'var_81_cnt')
ax.legend();

In [None]:
# Distribution of the var_12 count feature, split by target class.
# Draw on an explicit Axes and add the legend ourselves: the `label=` text is
# only shown once a legend is requested.
fig, ax = plt.subplots()
sns.kdeplot(train_X_cnt_y.loc[train_X_cnt_y['target'] == 1, 'var_12_cnt'], label = 'var_12: 1', ax = ax)
sns.kdeplot(train_X_cnt_y.loc[train_X_cnt_y['target'] == 0, 'var_12_cnt'], label = 'var_12: 0', ax = ax)
ax.set(title = 'var_12_cnt by target', xlabel = 'var_12_cnt')
ax.legend();