In [None]:
!pip install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/90/86/c3dcb600b4f9e7584ed90ea9d30a717fb5c0111574675f442c3e7bc19535/catboost-0.24.1-cp36-none-manylinux1_x86_64.whl (66.1MB)
[K     |████████████████████████████████| 66.1MB 44kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.24.1


In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import copy

In [None]:
from catboost import CatBoostClassifier ,Pool
from sklearn.preprocessing import LabelEncoder

In [None]:
import warnings
# Install an 'ignore' filter for every warning category, so nothing emitted by the
# cells below (pandas, sklearn, catboost, ...) is shown.
warnings.simplefilter('ignore')

In [None]:
# Read the three input files from the current working directory.
train_, test_, submission_ = (
    pd.read_csv(csv_name)
    for csv_name in ('Train.csv', 'Test.csv', 'SampleSubmission.csv')
)

In [None]:
from sklearn.model_selection import KFold

def get_train_test_names(train_, test_, submission_):
  """Yield (train, test, submission, output_filename) tuples for stacking.

  Five tuples come from an unshuffled 5-fold split of `train_`; a sixth and last
  tuple is the full `train_` together with the real `test_`.

  For each fold the held-out rows are expanded into a pseudo test set: every
  product flag equal to 1 in a row produces one copy of that row with that single
  flag set to 0. The first 8 columns are treated as customer info and the
  remaining ones as product flags. The 'ID' column of the pseudo test set is
  replaced by '<fold>_<row number>'.

  Parameters:
    train_: DataFrame whose rows are split into folds.
    test_: DataFrame yielded unchanged with the final tuple.
    submission_: object passed through unchanged in every tuple.

  Yields:
    (train DataFrame, test DataFrame, submission_, csv file name) where the file
    name is '0_fold<k>.csv' for the folds and '0_main.csv' for the final tuple.
  """
  column_names = ['ID', 'join_date', 'sex', 'marital_status', 'birth_year', 'branch_code',
                  'occupation_code', 'occupation_category_code', 'P5DA', 'RIBP', '8NN1',
                  '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ', 'LJR9', 'N2MW', 'AHXO',
                  'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D', 'J9JW', 'GHYX', 'ECY3']

  kf = KFold(n_splits=5, shuffle=False)
  for r, (train_index, test_index) in enumerate(kf.split(train_)):
    held_out = train_.iloc[test_index]

    rows = []
    for v in held_out.values:
      info = list(v[:8])
      binary = list(v[8:])
      for i, flag in enumerate(binary):
        if flag == 1:
          # Hide exactly one owned product per generated row.
          masked = list(binary)
          masked[i] = 0
          rows.append(info + masked)

    # Passing `columns=` up front keeps this valid even when no row was generated.
    X_test = pd.DataFrame(rows, columns=column_names)
    X_test['ID'] = [str(r) + '_' + str(i) for i in range(X_test.shape[0])]

    yield train_.iloc[train_index], X_test, submission_, '0_fold' + str(r) + '.csv'
  yield train_, test_, submission_, '0_main.csv'

In [None]:
def process(df):
  """Add hand-crafted 0/1 indicator columns to `df` and return it.

  The frame is modified in place (new columns are added, and 'date2', 'date3'
  and 'age' are cast to int) and the same object is returned.

  Required columns: 'branch_code', 'occupation_category_code', 'birth_year',
  'date2', 'date3', 'age'. 'date2', 'date3' and 'age' must be castable to int.

  Columns added, in this order:
    IS_<code> for branch codes 30H5, 748L, 1X1H, XX25, O67J, BOAS;
    IS_<code> for occupation category codes 90QI, 56SI;
    IS_1982_1993_1984              birth_year is one of 1993, 1984, 1982;
    IS_5_4                         date2 is 4 or 5;
    IS_33_34_to_38                 age is between 33 and 38 inclusive;
    IS_1993_1982_1984_and_748L     IS_748L and IS_1982_1993_1984;
    IS_1993_1982_1984_and_T4MS     IS_90QI and IS_1982_1993_1984;
    IS_1993_1982_1984_and_month4   date2 == 4 and IS_1982_1993_1984;
    IS_1993_1982_1984_and_month5   date2 == 5 and IS_1982_1993_1984.
  """
  for code in ('30H5', '748L', '1X1H', 'XX25', 'O67J', 'BOAS'):
    df['IS_' + code] = (df['branch_code'] == code).astype('int')

  for code in ('90QI', '56SI'):
    df['IS_' + code] = (df['occupation_category_code'] == code).astype('int')

  df['IS_1982_1993_1984'] = df['birth_year'].isin([1993, 1984, 1982]).astype('int')

  df['date3'] = df['date3'].astype('int')

  df['date2'] = df['date2'].astype('int')
  df['IS_5_4'] = df['date2'].isin([4, 5]).astype('int')

  df['age'] = df['age'].astype('int')
  # 'age' is an integer column here, so the inclusive range equals {33, ..., 38}.
  df['IS_33_34_to_38'] = df['age'].between(33, 38).astype('int')

  # Interaction flags: the product of two 0/1 columns is their logical AND. This
  # avoids a slow row-by-row DataFrame.apply and also works on an empty frame.
  born_in_target_year = df['IS_1982_1993_1984']
  df['IS_1993_1982_1984_and_748L'] = df['IS_748L'] * born_in_target_year
  # NOTE: the column name says T4MS but the condition has always tested
  # occupation_category_code == '90QI'; the name is kept for compatibility.
  df['IS_1993_1982_1984_and_T4MS'] = df['IS_90QI'] * born_in_target_year
  df['IS_1993_1982_1984_and_month4'] = (df['date2'] == 4).astype('int') * born_in_target_year
  df['IS_1993_1982_1984_and_month5'] = (df['date2'] == 5).astype('int') * born_in_target_year

  return df

### Get folds

In [None]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import LabelEncoder

# Column layout assigned to the rows built from `train` / `test` below:
# 8 customer-info columns followed by one 0/1 flag per product.
INFO_COLUMNS = ['ID', 'join_date', 'sex', 'marital_status', 'birth_year', 'branch_code',
                'occupation_code', 'occupation_category_code']
PRODUCT_COLUMNS = ['P5DA', 'RIBP', '8NN1', '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ', 'LJR9',
                   'N2MW', 'AHXO', 'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D', 'J9JW', 'GHYX',
                   'ECY3']
CATEGORICAL_COLUMNS = ['sex', 'marital_status', 'branch_code', 'occupation_code',
                       'occupation_category_code']
# Order in which the raw columns enter the feature matrix.
FEATURE_ORDER = PRODUCT_COLUMNS + ['ID', 'ID2', 'join_date'] + CATEGORICAL_COLUMNS + ['birth_year']
# Identifier / helper columns that are never fed to the model.
DROPPED_COLUMNS = ['ID', 'ID2', 'date1']
N_INNER_SPLITS = 10


def _join_date_part(value, position):
  """Return the integer at `position` of a '/'-separated date string; NaN stays NaN."""
  return int(value.split('/')[position]) if value == value else np.nan


for j, (train, test, submission, name) in enumerate(
    get_train_test_names(train_, test_, submission_), start=1):
  print(30*'=|')
  # `j` now advances with the outer loop (it used to be reset to 1 every iteration).
  print('##### {} / 6 #####'.format(j))

  # ---- Training rows: one row per (customer, owned product). The owned product
  # is zeroed in the flags and becomes the target 'product_pred'; 'ID2' is a
  # running row number starting at 1.
  train_rows = []
  row_number = 0
  for v in train.values:
    info = list(v[:8])
    binary = list(v[8:])
    for i, flag in enumerate(binary):
      if flag == 1:
        row_number += 1
        masked = list(binary)
        masked[i] = 0
        train_rows.append(info + masked + [train.columns[8 + i], row_number])
  X_train = pd.DataFrame(train_rows,
                         columns=INFO_COLUMNS + PRODUCT_COLUMNS + ['product_pred', 'ID2'])

  # ---- Test rows are kept as they are. Products a test row already has are
  # remembered as '<ID> X <product>' so their label can be forced to 1.0 later.
  test_product_names = list(test.columns[8:])
  test_rows = []
  true_values = set()
  for row_number, v in enumerate(test.values, start=1):
    binary = list(v[8:])
    test_rows.append(list(v[:8]) + binary + [row_number])
    for k, flag in enumerate(binary):
      if flag == 1:
        true_values.add(v[0] + ' X ' + test_product_names[k])
  X_test = pd.DataFrame(test_rows, columns=INFO_COLUMNS + PRODUCT_COLUMNS + ['ID2'])

  y_train = X_train[['product_pred']]

  # Reorder the columns; the object cast reproduces the dtypes of the previous
  # numpy-concatenate based assembly.
  X_train = X_train[FEATURE_ORDER].astype(object)
  X_test = X_test[FEATURE_ORDER].astype(object)

  # ---- Date-derived features.
  for frame in (X_train, X_test):
    for position, column in enumerate(['date1', 'date2', 'date3']):
      frame[column] = frame['join_date'].apply(_join_date_part, args=(position,))

    # NOTE(review): `process` treats the second '/'-separated field (date2) as the
    # month, i.e. day/month/year, so the dates are parsed day-first to agree with
    # that. The previous default parse was month-first for ambiguous dates.
    # Confirm against the raw data.
    frame['join_date'] = pd.to_datetime(frame['join_date'], dayfirst=True)

    frame['day_of_week'] = frame['join_date'].dt.dayofweek
    frame['day_of_week_name'] = frame['join_date'].dt.day_name()
    frame['age'] = np.abs(frame['join_date'].dt.year - frame['birth_year'])
    frame['date_diff'] = frame['date3'] - frame['birth_year']

  X_train = process(X_train.fillna(0))
  X_test = process(X_test.fillna(0))
  y_train = y_train.fillna(0)

  # ---- Label-encode the string columns on train + test together so both share
  # one mapping. A single pass is enough: re-encoding already-encoded integer
  # codes, as was done before, returns the same codes.
  for frame in (X_train, X_test):
    frame['day_of_week_name'] = frame['day_of_week_name'].astype('str')
    frame['join_date'] = frame['join_date'].astype('str')

  n_train = X_train.shape[0]
  data = pd.concat([X_train, X_test])  # DataFrame.append no longer exists in pandas 2.x
  feature_encoder = LabelEncoder()
  for col in CATEGORICAL_COLUMNS + ['day_of_week_name', 'join_date']:
    data.loc[:, col] = feature_encoder.fit_transform(data.loc[:, col])
  # Positional split; unlike data[-n_test:], this is also right when n_test == 0.
  X_train = data.iloc[:n_train].copy()
  X_test = data.iloc[n_train:].copy()

  # ---- Encode the target product codes as integer class ids.
  target_encoder = LabelEncoder()
  y_train = pd.DataFrame({'target': target_encoder.fit_transform(y_train.iloc[:, 0])})
  n_classes = len(target_encoder.classes_)

  for col in CATEGORICAL_COLUMNS:
    X_train[col] = X_train[col].astype('object')
    X_test[col] = X_test[col].astype('object')

  # The model matrices do not change between inner folds, so build them once.
  train_features = X_train.drop(columns=DROPPED_COLUMNS)
  test_features = X_test.drop(columns=DROPPED_COLUMNS)
  catfs = [train_features.columns.get_loc(col) for col in CATEGORICAL_COLUMNS]

  # ---- Inner cross-validation grouped by customer ID, so every expanded row of
  # one customer lands on the same side of each split.
  grk = GroupKFold(n_splits=N_INNER_SPLITS)
  groups = X_train['ID']
  sub_preds = np.zeros((X_test.shape[0], n_classes))

  for i, (tr_index, ts_index) in enumerate(grk.split(train_features, groups=groups)):
    print(45*'*')
    print(f"FOLD {i+1}/{grk.n_splits}")
    model = CatBoostClassifier(
        learning_rate=0.05,
        task_type="GPU",
        devices='0:1',
        iterations=2000,
        use_best_model=True,
        verbose=100,
    )
    model.fit(
        Pool(train_features.iloc[tr_index], y_train.iloc[tr_index], cat_features=catfs),
        eval_set=Pool(train_features.iloc[ts_index], y_train.iloc[ts_index], cat_features=catfs),
    )

    # Average the test-set probabilities over the inner folds.
    sub_preds += model.predict_proba(test_features) / grk.n_splits

  # ---- Long-format answer: one '<ID> X <product>' row per test row and class.
  product_codes = target_encoder.inverse_transform(np.arange(n_classes))
  answer_rows = [
      [customer_id + ' X ' + product, probability]
      for customer_id, probabilities in zip(X_test['ID'], sub_preds)
      for product, probability in zip(product_codes, probabilities)
  ]
  df_answer = pd.DataFrame(answer_rows, columns=['ID X PCODE', 'Label'])

  # Products the test row already has get label 1.0. A set lookup with one .loc
  # assignment replaces the per-row list scan and chained `.iloc` assignment.
  df_answer.loc[df_answer['ID X PCODE'].isin(true_values), 'Label'] = 1.0

  df_answer.to_csv(name, index=False)

=|=|=|=|=|=|=|=|=|=|=|=|=|=|=|=|=|=|=|=|=|=|=|=|=|=|=|=|=|=|
##### 1 / 6 #####
*********************************************
FOLD 1/10
0:	learn: 2.5416965	test: 2.5347101	best: 2.5347101 (0)	total: 50.7ms	remaining: 1m 41s
100:	learn: 0.4791630	test: 0.4904612	best: 0.4904612 (100)	total: 3.09s	remaining: 58.1s
200:	learn: 0.4138721	test: 0.4365990	best: 0.4365990 (200)	total: 5.92s	remaining: 53s
300:	learn: 0.3826733	test: 0.4165238	best: 0.4165238 (300)	total: 8.74s	remaining: 49.3s
400:	learn: 0.3632828	test: 0.4068005	best: 0.4068005 (400)	total: 11.5s	remaining: 46s
500:	learn: 0.3504028	test: 0.4015926	best: 0.4015926 (500)	total: 14.3s	remaining: 42.7s
600:	learn: 0.3392900	test: 0.3980803	best: 0.3980803 (600)	total: 17s	remaining: 39.6s
700:	learn: 0.3294324	test: 0.3954939	best: 0.3954939 (700)	total: 19.7s	remaining: 36.5s
800:	learn: 0.3198046	test: 0.3935596	best: 0.3935596 (800)	total: 22.4s	remaining: 33.5s
900:	learn: 0.3110316	test: 0.3922747	best: 0.3922208 (897)	tot

In [None]:
# Google Colab only: mount the user's Google Drive at /content/drive.
# The call prompts interactively for an authorization code.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
!mkdir '/content/drive/My Drive/Zimnat/stack3'

In [None]:
!cp 0_fold0.csv  '/content/drive/My Drive/Zimnat/stack3'
!cp 0_fold1.csv  '/content/drive/My Drive/Zimnat/stack3'
!cp 0_fold2.csv  '/content/drive/My Drive/Zimnat/stack3'
!cp 0_fold3.csv  '/content/drive/My Drive/Zimnat/stack3'
!cp 0_fold4.csv  '/content/drive/My Drive/Zimnat/stack3'
!cp 0_main.csv  '/content/drive/My Drive/Zimnat/stack3'