In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import copy

In [3]:
from sklearn.preprocessing import LabelEncoder
import warnings 
# Suppress every warning for the rest of the session. NOTE(review): this also
# hides pandas/LightGBM deprecation and chained-assignment warnings emitted by
# the cells below, so real problems can go unnoticed.
warnings.simplefilter('ignore')

In [4]:
# Read the three input CSVs from the working directory, in the same order as
# the names on the left-hand side.
train_, test_, submission_ = (
    pd.read_csv(csv_name)
    for csv_name in ('Train.csv', 'Test.csv', 'SampleSubmission.csv')
)

In [5]:
from sklearn.model_selection import KFold

def get_train_test_names(train_, test_, submission_):
  """Yield one (train, test, submission, output_name) tuple per run.

  Five runs come from an unshuffled 5-fold split of ``train_``; a sixth and
  final run uses the full ``train_`` together with the real ``test_``.

  For each fold the held-out rows are turned into a synthetic test frame:
  every row is emitted once per product flag equal to 1, with that one flag
  reset to 0. The ``ID`` column of the synthetic frame is replaced by
  ``"<fold>_<row number>"``.

  Parameters
  ----------
  train_ : pandas.DataFrame
      Training frame. Its first 8 columns are treated as customer
      information and the remaining ones as product flags; the synthetic
      test frame is labelled with the fixed 29-name column list below.
  test_ : pandas.DataFrame
      Test frame, passed through unchanged in the final run.
  submission_ : pandas.DataFrame
      Sample submission, passed through unchanged in every run.

  Yields
  ------
  tuple
      ``(train_frame, test_frame, submission_, csv_name)`` where ``csv_name``
      is ``'1_fold<r>.csv'`` for fold ``r`` and ``'1_main.csv'`` for the
      final run.
  """
  base_columns = ['ID', 'join_date', 'sex', 'marital_status', 'birth_year', 'branch_code',
          'occupation_code', 'occupation_category_code', 'P5DA', 'RIBP', '8NN1',
          '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ', 'LJR9', 'N2MW', 'AHXO',
          'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D', 'J9JW', 'GHYX', 'ECY3']
  n_info = 8  # number of leading non-product columns

  kf = KFold(n_splits=5, shuffle=False)
  for r, (train_index, test_index) in enumerate(kf.split(train_)):
    held_out = train_.iloc[test_index]

    rows = []
    for v in held_out.values:
      info = list(v[:n_info])
      binary = list(v[n_info:])
      # One synthetic row per flag that is set, with exactly that flag cleared.
      for i, flag in enumerate(binary):
        if flag == 1:
          masked = binary.copy()
          masked[i] = 0
          rows.append(info + masked)

    # Passing the names to the constructor keeps an empty fold usable
    # (assigning 29 names to a column-less empty frame raises).
    X_test = pd.DataFrame(rows, columns=base_columns)
    X_test['ID'] = [str(r) + '_' + str(i) for i in range(X_test.shape[0])]

    yield train_.iloc[train_index], X_test, submission_, '1_fold' + str(r) + '.csv'
  yield train_, test_, submission_, '1_main.csv'

In [6]:
def process(df, reference_year=2020):
  """Add derived feature columns to ``df`` in place and return it.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain ``date1``, ``date2``, ``date3``, ``branch_code``,
      ``occupation_category_code``, ``product_pred``, ``marital_status`` and
      ``birth_year``.
  reference_year : int, default 2020
      Year that ``date3`` and ``birth_year`` are subtracted from.

  Returns
  -------
  pandas.DataFrame
      The same object that was passed in, with these columns written:

      * ``date1``/``date2``/``date3`` - missing values replaced by 0, cast to int
      * ``eclapse`` - ``reference_year - date3`` (column name kept as-is so
        existing feature names do not change)
      * ``IS_748L``, ``IS_T4MS``, ``IS_RVSZ``, ``IS_K6QO``, ``IS_M`` - 0/1
        indicators for the corresponding code in ``branch_code``,
        ``occupation_category_code``, ``product_pred`` (twice) and
        ``marital_status``
      * ``current_age`` - ``abs(reference_year - birth_year)``
  """
  for part_col in ('date1', 'date2', 'date3'):
    df[part_col] = df[part_col].fillna(0).astype('int')

  df['eclapse'] = reference_year - df['date3']

  # Equality against a missing value is False, so missing codes map to 0.
  df['IS_748L'] = (df['branch_code'] == '748L').astype('int')
  df['IS_T4MS'] = (df['occupation_category_code'] == 'T4MS').astype('int')
  df['IS_RVSZ'] = (df['product_pred'] == 'RVSZ').astype('int')
  df['IS_K6QO'] = (df['product_pred'] == 'K6QO').astype('int')
  df['IS_M'] = (df['marital_status'] == 'M').astype('int')

  df['current_age'] = np.abs(reference_year - df['birth_year'])

  return df

### Get folds

In [7]:
# Parameter mapping handed to lgb.train in the training loop.
# Keys are kept in their original order and values are unchanged
# (note that boost_from_average is the string 'false', not a bool).
param = dict(
    bagging_freq=1,
    bagging_fraction=0.8,
    feature_fraction=0.9,
    boost_from_average='false',
    num_leaves=80,
    boost='gbdt',
    learning_rate=0.05,
    metric='auc',
    tree_learner='serial',
    objective='binary',
    random_state=869,
    n_jobs=-1,
    verbosity=-1,
)

In [8]:
import lightgbm as lgb
from sklearn import preprocessing

# Column layout of the raw frames: the first N_INFO_COLS columns are customer
# information, the remaining ones are product flag columns (the code treats the
# value 1 as "set"; presumably it means the customer holds that product).
BASE_COLUMNS = ['ID', 'join_date', 'sex', 'marital_status', 'birth_year', 'branch_code',
      'occupation_code', 'occupation_category_code', 'P5DA', 'RIBP', '8NN1',
      '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ', 'LJR9', 'N2MW', 'AHXO',
      'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D', 'J9JW', 'GHYX', 'ECY3']
N_INFO_COLS = 8

# Columns that are cast to object and then label-encoded before training.
ENCODED_COLUMNS = ['sex', 'marital_status', 'branch_code', 'occupation_code',
          'occupation_category_code','day_of_week_name','join_date','product_pred']

# Columns that are never used as model features.
NON_FEATURE_COLUMNS = ['IS_748L', 'IS_T4MS', 'IS_33_34_to_38',
     'IS_1993_1982_1984_and_T4MS', 'IS_month_4_5_and_T4MS',
     'ID', 'target', 'ID X PCODE','date']

# One run per tuple yielded by get_train_test_names: build the expanded
# train/test tables, engineer features, fit LightGBM, write predictions to `name`.
# run_no is a dedicated counter: the previous `i` was reset on every iteration
# and overwritten by the inner loops, so the banner always read "FOLD 1 / 5".
for run_no, (train, test, submission, name) in enumerate(
    get_train_test_names(train_, test_, submission_), start=1):

  print(40* '=')
  print(f' RUN {run_no} -> {name} ')

  # ---- Training table -------------------------------------------------------
  # For every flag that is set in a row: clear it, then emit the masked row once
  # with that product as candidate (target 1) and once for every product whose
  # flag was not set in the original row (target 0).
  np_data = []
  train_columns = train.columns
  for v in tqdm(train.values):
    info = list(v[:N_INFO_COLS])
    binary = list(v[N_INFO_COLS:])
    index_n = [k for k, flag in enumerate(binary) if flag == 1]
    set_flags = set(index_n)
    for held_out in index_n:
      binary_0 = binary.copy()
      binary_0[held_out] = 0
      for k in range(len(binary)):
        if k == held_out or k not in set_flags:
          np_data.append(info + binary_0 + [train_columns[N_INFO_COLS + k], int(k == held_out)])

  # Column names go to the constructor so an empty expansion does not crash.
  df_data = pd.DataFrame(np_data, columns=BASE_COLUMNS + ['product_pred', 'target'])

  # ---- Test table -----------------------------------------------------------
  # One candidate row per product whose flag is not 1; flags are left untouched.
  np_data_test = []
  test_columns = test.columns
  for v in tqdm(test.values):
    info = list(v[:N_INFO_COLS])
    binary = list(v[N_INFO_COLS:])
    for k, flag in enumerate(binary):
      if flag != 1:
        np_data_test.append(info + binary + [test_columns[N_INFO_COLS + k]])

  df_data_test = pd.DataFrame(np_data_test, columns=BASE_COLUMNS + ['product_pred'])

  # ---- Date features (same steps, same column order, for both tables) --------
  for frame in (df_data, df_data_test):
    # The three '/'-separated pieces of the raw join_date string; a missing date
    # stays NaN (x == x is False only for NaN).
    for pos, part_col in enumerate(('date1', 'date2', 'date3')):
      frame[part_col] = frame['join_date'].apply(
          lambda x, pos=pos: x.split('/')[pos] if (x == x) else np.nan)

    # NOTE(review): the format is inferred by pandas. If the file stores dates
    # day-first, ambiguous dates are parsed month-first here - TODO confirm.
    frame['join_date'] = pd.to_datetime(frame['join_date'])

    frame['day_of_week'] = frame['join_date'].dt.dayofweek
    frame['day_of_week_name'] = frame['join_date'].dt.day_name()
    frame['age'] = np.abs(frame['join_date'].dt.year - frame['birth_year'])

  df_data = process(df_data)
  df_data_test = process(df_data_test)

  for col in ENCODED_COLUMNS:
    df_data[col] = df_data[col].astype('object')
    df_data_test[col] = df_data_test[col].astype('object')
  print('---------------------------------------------------------------------------------------------')
  print('df_data shape : ' , df_data.shape,'df_data_test shape : ' , df_data_test.shape)
  print('---------------------------------------------------------------------------------------------')

  train = df_data
  test = df_data_test
  # Explicit copy taken BEFORE label encoding: the submission key needs the
  # original ID / product code strings, and a copy can be modified safely.
  df_answer = df_data_test[['ID', 'product_pred']].copy()

  fts = [x for x in df_data.columns if x not in NON_FEATURE_COLUMNS]

  # Fit each encoder on train + test values so both share one code space.
  for f in ENCODED_COLUMNS:
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(train[f].values) + list(test[f].values))
    train[f] = lbl.transform(list(train[f].values))
    test[f] = lbl.transform(list(test[f].values))

  trn_data = lgb.Dataset(train[fts], label=train['target'], categorical_feature=['branch_code','occupation_code', 'occupation_category_code'])

  print(15 * '+')
  print('Start Training ....')

  # Fixed 1200 boosting rounds; no validation set is passed to lgb.train here.
  lgb_model = lgb.train(param, trn_data, 1200)

  preds_proba = lgb_model.predict(test[fts],num_iteration=lgb_model.best_iteration)

  # ---- Output file ----------------------------------------------------------
  df_answer['Label'] = preds_proba
  df_answer['ID X PCODE'] = df_answer['ID'] + ' X ' + df_answer['product_pred']
  df_answer = df_answer[['Label', 'ID X PCODE']]

  # Keep the sample-submission rows that received no prediction, followed by the
  # predictions. pd.concat replaces DataFrame.append, which pandas 2.0 removed.
  not_predicted = submission[~submission['ID X PCODE'].isin(df_answer['ID X PCODE'])]
  df_answer = pd.concat([not_predicted, df_answer]).reset_index(drop=True)
  df_answer.to_csv(name, index=False)

  2%|▏         | 391/23305 [00:00<00:05, 3907.02it/s]

 FOLD 1 / 5 


100%|██████████| 23305/23305 [00:07<00:00, 3034.29it/s]
100%|██████████| 13306/13306 [00:01<00:00, 8814.11it/s]


---------------------------------------------------------------------------------------------
df_data shape :  (1037829, 44) df_data_test shape :  (260226, 43)
---------------------------------------------------------------------------------------------
+++++++++++++++
Start Training ....


  2%|▏         | 419/23305 [00:00<00:05, 4182.85it/s]

 FOLD 1 / 5 


100%|██████████| 23305/23305 [00:06<00:00, 3400.58it/s]
100%|██████████| 13237/13237 [00:01<00:00, 9065.37it/s]


---------------------------------------------------------------------------------------------
df_data shape :  (1038970, 44) df_data_test shape :  (259085, 43)
---------------------------------------------------------------------------------------------
+++++++++++++++
Start Training ....


  1%|▏         | 294/23306 [00:00<00:07, 2925.93it/s]

 FOLD 1 / 5 


100%|██████████| 23306/23306 [00:08<00:00, 2859.03it/s]
100%|██████████| 13158/13158 [00:01<00:00, 9177.38it/s]


---------------------------------------------------------------------------------------------
df_data shape :  (1040173, 44) df_data_test shape :  (257882, 43)
---------------------------------------------------------------------------------------------
+++++++++++++++
Start Training ....


  2%|▏         | 355/23306 [00:00<00:06, 3543.52it/s]

 FOLD 1 / 5 


100%|██████████| 23306/23306 [00:06<00:00, 3377.41it/s]
100%|██████████| 13289/13289 [00:01<00:00, 8657.19it/s]


---------------------------------------------------------------------------------------------
df_data shape :  (1038134, 44) df_data_test shape :  (259921, 43)
---------------------------------------------------------------------------------------------
+++++++++++++++
Start Training ....


  0%|          | 0/23306 [00:00<?, ?it/s]

 FOLD 1 / 5 


100%|██████████| 23306/23306 [00:07<00:00, 3303.54it/s]
100%|██████████| 13363/13363 [00:01<00:00, 12363.41it/s]


---------------------------------------------------------------------------------------------
df_data shape :  (1037114, 44) df_data_test shape :  (260941, 43)
---------------------------------------------------------------------------------------------
+++++++++++++++
Start Training ....


  1%|          | 263/29132 [00:00<00:11, 2622.23it/s]

 FOLD 1 / 5 


100%|██████████| 29132/29132 [00:11<00:00, 2631.73it/s]
100%|██████████| 10000/10000 [00:00<00:00, 11978.65it/s]


---------------------------------------------------------------------------------------------
df_data shape :  (1298055, 44) df_data_test shape :  (197147, 43)
---------------------------------------------------------------------------------------------
+++++++++++++++
Start Training ....


In [9]:
from google.colab import drive

# Mount Google Drive inside the Colab VM so later cells can copy files to it.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [10]:
# -p creates missing parent directories and does not fail when the target
# already exists, so this cell can be re-run safely.
!mkdir -p '/content/drive/My Drive/Zimnat/STACK_LGBM_BASELINE_2'

In [11]:
# Copy the five per-fold prediction files and the full-train prediction file
# (written by the training loop via df_answer.to_csv(name)) to the Drive folder.
!cp 1_fold0.csv  '/content/drive/My Drive/Zimnat/STACK_LGBM_BASELINE_2'
!cp 1_fold1.csv  '/content/drive/My Drive/Zimnat/STACK_LGBM_BASELINE_2'
!cp 1_fold2.csv  '/content/drive/My Drive/Zimnat/STACK_LGBM_BASELINE_2'
!cp 1_fold3.csv  '/content/drive/My Drive/Zimnat/STACK_LGBM_BASELINE_2'
!cp 1_fold4.csv  '/content/drive/My Drive/Zimnat/STACK_LGBM_BASELINE_2'
!cp 1_main.csv  '/content/drive/My Drive/Zimnat/STACK_LGBM_BASELINE_2'