# GET DATA

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import copy

In [2]:
train = pd.read_csv('Data/Train.csv')
test = pd.read_csv('Data/Test.csv')
submission = pd.read_csv('Data/SampleSubmission.csv')

# NEW FORMAT

In [3]:
X_train = []
X_train_columns = train.columns
c = 0
for v in train.values:
  info = v[:8]
  binary = v[8:]
  index = [k for k, i in enumerate(binary) if i == 1]
  for i in index:
    c+=1
    for k in range(len(binary)):
      if k == i:
        binary_transformed = list(copy.copy(binary))
        binary_transformed[i] = 0
        X_train.append(list(info) + binary_transformed + [X_train_columns[8+k]] + [c])

X_train = pd.DataFrame(X_train)
X_train.columns = ['ID', 'join_date', 'sex', 'marital_status', 'birth_year', 'branch_code',
       'occupation_code', 'occupation_category_code', 'P5DA', 'RIBP', '8NN1',
       '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ', 'LJR9', 'N2MW', 'AHXO',
       'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D', 'J9JW', 'GHYX', 'ECY3', 'product_pred', 'ID2']

In [4]:
X_test = []
true_values = []
c = 0
for v in test.values:
  c += 1
  info = v[:8]
  binary = v[8:]
  index = [k for k, i in enumerate(binary) if i == 1]
  X_test.append(list(info) + list(binary) + [c])
  for k in test.columns[8:][index]:
    true_values.append(v[0] + ' X ' + k)

X_test = pd.DataFrame(X_test)
X_test.columns = ['ID', 'join_date', 'sex', 'marital_status', 'birth_year', 'branch_code',
       'occupation_code', 'occupation_category_code', 'P5DA', 'RIBP', '8NN1',
       '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ', 'LJR9', 'N2MW', 'AHXO',
       'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D', 'J9JW', 'GHYX', 'ECY3', 'ID2']

# TRANSFORM DATA

In [5]:
features_train = []
features_test = []
columns = []

append_features = ['P5DA', 'RIBP', '8NN1', '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ', 'LJR9', 
'N2MW', 'AHXO','BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D', 'J9JW', 'GHYX', 
'ECY3', 'ID', 'ID2', 'join_date', 'sex', 'marital_status', 'branch_code', 'occupation_code', 'occupation_category_code',
'birth_year']
for v in append_features:
  features_train.append(X_train[v].values.reshape(-1, 1))
  features_test.append(X_test[v].values.reshape(-1, 1))
  columns.append(np.array([v]))

y_train = X_train[['product_pred']]

In [6]:
features_train = np.concatenate(features_train, axis=1)
features_test = np.concatenate(features_test, axis=1)
columns = np.concatenate(np.array(columns))

X_train = pd.DataFrame(features_train)
X_train.columns = columns
X_test = pd.DataFrame(features_test)
X_test.columns = columns

# NEW FEATURES

In [7]:
X_train['date1'] = X_train['join_date'].apply(lambda x: int(x.split('/')[0]) if (x == x) else np.nan)
X_train['date2'] = X_train['join_date'].apply(lambda x: int(x.split('/')[1]) if (x == x) else np.nan)
X_train['date3'] = X_train['join_date'].apply(lambda x: int(x.split('/')[2]) if (x == x) else np.nan)
X_train.drop('join_date', axis=1, inplace=True)

X_test['date1'] = X_test['join_date'].apply(lambda x: int(x.split('/')[0]) if (x == x) else np.nan)
X_test['date2'] = X_test['join_date'].apply(lambda x: int(x.split('/')[1]) if (x == x) else np.nan)
X_test['date3'] = X_test['join_date'].apply(lambda x: int(x.split('/')[2]) if (x == x) else np.nan)
X_test.drop('join_date', axis=1, inplace=True)

X_train['date_diff'] = X_train['date3'] - X_train['birth_year']
X_test['date_diff'] = X_test['date3'] - X_test['birth_year']

# CHANGE TYPES

In [8]:
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)
y_train = y_train.fillna(0)

In [9]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data = X_train.append(X_test)
for v in ['sex', 'marital_status', 'branch_code', 'occupation_code', 'occupation_category_code',]:
  data.loc[:,v] = le.fit_transform(data.loc[:,v])
X_train = data[:X_train.shape[0]]
X_test = data[-X_test.shape[0]:]

In [10]:
le.fit(y_train.iloc[:,0])
y_train = pd.DataFrame(le.transform(y_train.iloc[:,0]))
y_train.columns = ['target']

# MODEL

In [11]:
from catboost import CatBoostClassifier
model = CatBoostClassifier()
model.fit(X_train.drop(columns=['ID', 'ID2']), y_train, cat_features=['sex','marital_status','branch_code','occupation_code','occupation_category_code'])

Learning rate set to 0.098003
0:	learn: 1.7837255	total: 5.54s	remaining: 1h 32m 11s
1:	learn: 1.5780548	total: 9.05s	remaining: 1h 15m 16s
2:	learn: 1.4111625	total: 12.8s	remaining: 1h 10m 55s
3:	learn: 1.3028152	total: 14.9s	remaining: 1h 1m 49s
4:	learn: 1.1926574	total: 18.8s	remaining: 1h 2m 28s
5:	learn: 1.1052790	total: 22.3s	remaining: 1h 1m 35s
6:	learn: 1.0376100	total: 26.2s	remaining: 1h 1m 56s
7:	learn: 0.9787207	total: 30.2s	remaining: 1h 2m 26s
8:	learn: 0.9277719	total: 34s	remaining: 1h 2m 23s
9:	learn: 0.8868196	total: 37.7s	remaining: 1h 2m 16s
10:	learn: 0.8507237	total: 41s	remaining: 1h 1m 28s
11:	learn: 0.8163409	total: 44.5s	remaining: 1h 1m 3s
12:	learn: 0.7856310	total: 47.8s	remaining: 1h 26s
13:	learn: 0.7592917	total: 50.8s	remaining: 59m 41s
14:	learn: 0.7338182	total: 53.9s	remaining: 59m 2s
15:	learn: 0.7121350	total: 57s	remaining: 58m 26s
16:	learn: 0.6930758	total: 1m	remaining: 57m 56s
17:	learn: 0.6747357	total: 1m 3s	remaining: 57m 42s
18:	learn: 

152:	learn: 0.3901988	total: 9m 38s	remaining: 53m 21s
153:	learn: 0.3897936	total: 9m 41s	remaining: 53m 16s
154:	learn: 0.3893269	total: 9m 45s	remaining: 53m 10s
155:	learn: 0.3888015	total: 9m 49s	remaining: 53m 6s
156:	learn: 0.3885350	total: 9m 53s	remaining: 53m 5s
157:	learn: 0.3884837	total: 9m 57s	remaining: 53m 6s
158:	learn: 0.3881946	total: 10m 2s	remaining: 53m 7s
159:	learn: 0.3877708	total: 10m 6s	remaining: 53m 5s
160:	learn: 0.3875652	total: 10m 10s	remaining: 53m 2s
161:	learn: 0.3873420	total: 10m 14s	remaining: 53m 1s
162:	learn: 0.3872082	total: 10m 19s	remaining: 53m
163:	learn: 0.3868208	total: 10m 23s	remaining: 53m
164:	learn: 0.3866008	total: 10m 28s	remaining: 53m
165:	learn: 0.3863091	total: 10m 32s	remaining: 52m 58s
166:	learn: 0.3859148	total: 10m 37s	remaining: 53m
167:	learn: 0.3853802	total: 10m 40s	remaining: 52m 52s
168:	learn: 0.3852431	total: 10m 44s	remaining: 52m 51s
169:	learn: 0.3850010	total: 10m 49s	remaining: 52m 50s
170:	learn: 0.3848087	t

300:	learn: 0.3591652	total: 19m 56s	remaining: 46m 17s
301:	learn: 0.3590773	total: 19m 59s	remaining: 46m 13s
302:	learn: 0.3589843	total: 20m 4s	remaining: 46m 10s
303:	learn: 0.3589553	total: 20m 9s	remaining: 46m 8s
304:	learn: 0.3588800	total: 20m 13s	remaining: 46m 5s
305:	learn: 0.3588587	total: 20m 18s	remaining: 46m 3s
306:	learn: 0.3587530	total: 20m 22s	remaining: 45m 58s
307:	learn: 0.3584969	total: 20m 25s	remaining: 45m 54s
308:	learn: 0.3581107	total: 20m 28s	remaining: 45m 48s
309:	learn: 0.3580402	total: 20m 33s	remaining: 45m 46s
310:	learn: 0.3578170	total: 20m 39s	remaining: 45m 46s
311:	learn: 0.3577466	total: 20m 45s	remaining: 45m 46s
312:	learn: 0.3575154	total: 20m 51s	remaining: 45m 46s
313:	learn: 0.3574646	total: 20m 56s	remaining: 45m 44s
314:	learn: 0.3573012	total: 21m	remaining: 45m 41s
315:	learn: 0.3571376	total: 21m 4s	remaining: 45m 37s
316:	learn: 0.3568873	total: 21m 8s	remaining: 45m 34s
317:	learn: 0.3568059	total: 21m 13s	remaining: 45m 30s
318

448:	learn: 0.3425643	total: 30m 47s	remaining: 37m 47s
449:	learn: 0.3425185	total: 30m 51s	remaining: 37m 42s
450:	learn: 0.3423478	total: 30m 56s	remaining: 37m 39s
451:	learn: 0.3422682	total: 31m 1s	remaining: 37m 37s
452:	learn: 0.3422006	total: 31m 7s	remaining: 37m 34s
453:	learn: 0.3421278	total: 31m 13s	remaining: 37m 33s
454:	learn: 0.3421037	total: 31m 18s	remaining: 37m 30s
455:	learn: 0.3420619	total: 31m 23s	remaining: 37m 26s
456:	learn: 0.3420026	total: 31m 27s	remaining: 37m 22s
457:	learn: 0.3419493	total: 31m 31s	remaining: 37m 18s
458:	learn: 0.3419043	total: 31m 36s	remaining: 37m 15s
459:	learn: 0.3418169	total: 31m 40s	remaining: 37m 10s
460:	learn: 0.3416701	total: 31m 44s	remaining: 37m 7s
461:	learn: 0.3415399	total: 31m 48s	remaining: 37m 3s
462:	learn: 0.3414867	total: 31m 53s	remaining: 36m 59s
463:	learn: 0.3413205	total: 31m 58s	remaining: 36m 55s
464:	learn: 0.3412565	total: 32m 1s	remaining: 36m 51s
465:	learn: 0.3411320	total: 32m 6s	remaining: 36m 47

596:	learn: 0.3299057	total: 41m 58s	remaining: 28m 20s
597:	learn: 0.3298346	total: 42m 3s	remaining: 28m 16s
598:	learn: 0.3297779	total: 42m 7s	remaining: 28m 12s
599:	learn: 0.3296507	total: 42m 11s	remaining: 28m 7s
600:	learn: 0.3295176	total: 42m 16s	remaining: 28m 4s
601:	learn: 0.3294188	total: 42m 21s	remaining: 28m
602:	learn: 0.3292929	total: 42m 25s	remaining: 27m 55s
603:	learn: 0.3292533	total: 42m 29s	remaining: 27m 51s
604:	learn: 0.3292137	total: 42m 33s	remaining: 27m 46s
605:	learn: 0.3290463	total: 42m 37s	remaining: 27m 43s
606:	learn: 0.3289349	total: 42m 42s	remaining: 27m 38s
607:	learn: 0.3287812	total: 42m 46s	remaining: 27m 34s
608:	learn: 0.3287720	total: 42m 51s	remaining: 27m 31s
609:	learn: 0.3287174	total: 42m 56s	remaining: 27m 27s
610:	learn: 0.3285899	total: 43m	remaining: 27m 22s
611:	learn: 0.3285504	total: 43m 5s	remaining: 27m 18s
612:	learn: 0.3285239	total: 43m 9s	remaining: 27m 14s
613:	learn: 0.3284702	total: 43m 14s	remaining: 27m 11s
614:	l

744:	learn: 0.3176566	total: 53m 9s	remaining: 18m 11s
745:	learn: 0.3175473	total: 53m 14s	remaining: 18m 7s
746:	learn: 0.3175165	total: 53m 19s	remaining: 18m 3s
747:	learn: 0.3173723	total: 53m 24s	remaining: 17m 59s
748:	learn: 0.3172837	total: 53m 30s	remaining: 17m 55s
749:	learn: 0.3172003	total: 53m 35s	remaining: 17m 51s
750:	learn: 0.3171095	total: 53m 41s	remaining: 17m 48s
751:	learn: 0.3170674	total: 53m 45s	remaining: 17m 43s
752:	learn: 0.3170348	total: 53m 51s	remaining: 17m 39s
753:	learn: 0.3170008	total: 53m 56s	remaining: 17m 35s
754:	learn: 0.3167279	total: 54m	remaining: 17m 31s
755:	learn: 0.3167013	total: 54m 4s	remaining: 17m 27s
756:	learn: 0.3166651	total: 54m 9s	remaining: 17m 23s
757:	learn: 0.3165803	total: 54m 13s	remaining: 17m 18s
758:	learn: 0.3165367	total: 54m 17s	remaining: 17m 14s
759:	learn: 0.3164749	total: 54m 21s	remaining: 17m 9s
760:	learn: 0.3164003	total: 54m 26s	remaining: 17m 5s
761:	learn: 0.3163587	total: 54m 31s	remaining: 17m 1s
762:

891:	learn: 0.3063573	total: 1h 5m 4s	remaining: 7m 52s
892:	learn: 0.3062487	total: 1h 5m 9s	remaining: 7m 48s
893:	learn: 0.3062199	total: 1h 5m 14s	remaining: 7m 44s
894:	learn: 0.3061592	total: 1h 5m 20s	remaining: 7m 39s
895:	learn: 0.3060281	total: 1h 5m 24s	remaining: 7m 35s
896:	learn: 0.3059728	total: 1h 5m 28s	remaining: 7m 31s
897:	learn: 0.3058163	total: 1h 5m 32s	remaining: 7m 26s
898:	learn: 0.3058023	total: 1h 5m 36s	remaining: 7m 22s
899:	learn: 0.3056832	total: 1h 5m 40s	remaining: 7m 17s
900:	learn: 0.3056045	total: 1h 5m 45s	remaining: 7m 13s
901:	learn: 0.3055778	total: 1h 5m 49s	remaining: 7m 9s
902:	learn: 0.3055463	total: 1h 5m 54s	remaining: 7m 4s
903:	learn: 0.3055017	total: 1h 6m	remaining: 7m
904:	learn: 0.3053947	total: 1h 6m 5s	remaining: 6m 56s
905:	learn: 0.3053549	total: 1h 6m 12s	remaining: 6m 52s
906:	learn: 0.3051015	total: 1h 6m 17s	remaining: 6m 47s
907:	learn: 0.3049942	total: 1h 6m 22s	remaining: 6m 43s
908:	learn: 0.3049422	total: 1h 6m 26s	remai

<catboost.core.CatBoostClassifier at 0x1ebc3f8ad60>

In [12]:
proba = model.predict_proba(X_test.drop(columns=['ID','ID2'], axis=1))
y_test = pd.DataFrame(proba)
print(y_test.columns)
y_test.columns = le.inverse_transform(y_test.columns)

RangeIndex(start=0, stop=21, step=1)


# SUBMIT

In [13]:
print(y_test.columns)
answer_mass = []
for i in range(X_test.shape[0]):
  id = X_test['ID'].iloc[i]
  for c in y_test.columns:
    answer_mass.append([id + ' X ' + str(c), y_test[c].iloc[i]])
    

df_answer = pd.DataFrame(answer_mass)
df_answer.columns = ['ID X PCODE', 'Label']
for i in range(df_answer.shape[0]):
  if df_answer['ID X PCODE'].iloc[i] in true_values:
    df_answer['Label'].iloc[i] = 1.0

Index(['66FJ', '7POT', '8NN1', 'AHXO', 'BSTQ', 'ECY3', 'FM3X', 'GHYX', 'GYSR',
       'J9JW', 'JWFN', 'JZ9D', 'K6QO', 'LJR9', 'N2MW', 'P5DA', 'PYUQ', 'QBOL',
       'RIBP', 'RVSZ', 'SOP4'],
      dtype='object')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [14]:
df_answer.reset_index(drop=True, inplace=True)
df_answer.to_csv('submission.csv', index=False)