# GET DATA

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import copy

In [2]:
train = pd.read_csv('Data/Train.csv')
test = pd.read_csv('Data/Test.csv')
submission = pd.read_csv('Data/SampleSubmission.csv')

# NEW FORMAT

In [3]:
X_train = []
X_train_columns = train.columns
c = 0
for v in train.values:
  info = v[:8]
  binary = v[8:]
  index = [k for k, i in enumerate(binary) if i == 1]
  for i in index:
    c+=1
    for k in range(len(binary)):
      if k == i:
        binary_transformed = list(copy.copy(binary))
        binary_transformed[i] = 0
        X_train.append(list(info) + binary_transformed + [X_train_columns[8+k]] + [c])

X_train = pd.DataFrame(X_train)
X_train.columns = ['ID', 'join_date', 'sex', 'marital_status', 'birth_year', 'branch_code',
       'occupation_code', 'occupation_category_code', 'P5DA', 'RIBP', '8NN1',
       '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ', 'LJR9', 'N2MW', 'AHXO',
       'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D', 'J9JW', 'GHYX', 'ECY3', 'product_pred', 'ID2']

In [4]:
X_test = []
true_values = []
c = 0
for v in test.values:
  c += 1
  info = v[:8]
  binary = v[8:]
  index = [k for k, i in enumerate(binary) if i == 1]
  X_test.append(list(info) + list(binary) + [c])
  for k in test.columns[8:][index]:
    true_values.append(v[0] + ' X ' + k)

X_test = pd.DataFrame(X_test)
X_test.columns = ['ID', 'join_date', 'sex', 'marital_status', 'birth_year', 'branch_code',
       'occupation_code', 'occupation_category_code', 'P5DA', 'RIBP', '8NN1',
       '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ', 'LJR9', 'N2MW', 'AHXO',
       'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D', 'J9JW', 'GHYX', 'ECY3', 'ID2']

# TRANSFORM DATA

In [5]:
features_train = []
features_test = []
columns = []

append_features = ['P5DA', 'RIBP', '8NN1', '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ', 'LJR9', 
'N2MW', 'AHXO','BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D', 'J9JW', 'GHYX', 
'ECY3', 'ID', 'ID2', 'join_date', 'sex', 'marital_status', 'branch_code', 'occupation_code', 'occupation_category_code',
'birth_year']
for v in append_features:
  features_train.append(X_train[v].values.reshape(-1, 1))
  features_test.append(X_test[v].values.reshape(-1, 1))
  columns.append(np.array([v]))

y_train = X_train[['product_pred']]

In [6]:
features_train = np.concatenate(features_train, axis=1)
features_test = np.concatenate(features_test, axis=1)
columns = np.concatenate(np.array(columns))

X_train = pd.DataFrame(features_train)
X_train.columns = columns
X_test = pd.DataFrame(features_test)
X_test.columns = columns

# NEW FEATURES

In [7]:
X_train['date1'] = X_train['join_date'].apply(lambda x: int(x.split('/')[0]) if (x == x) else np.nan)
X_train['date2'] = X_train['join_date'].apply(lambda x: int(x.split('/')[1]) if (x == x) else np.nan)
X_train['date3'] = X_train['join_date'].apply(lambda x: int(x.split('/')[2]) if (x == x) else np.nan)
X_train.drop('join_date', axis=1, inplace=True)

X_test['date1'] = X_test['join_date'].apply(lambda x: int(x.split('/')[0]) if (x == x) else np.nan)
X_test['date2'] = X_test['join_date'].apply(lambda x: int(x.split('/')[1]) if (x == x) else np.nan)
X_test['date3'] = X_test['join_date'].apply(lambda x: int(x.split('/')[2]) if (x == x) else np.nan)
X_test.drop('join_date', axis=1, inplace=True)

X_train['date_diff'] = X_train['date3'] - X_train['birth_year']
X_test['date_diff'] = X_test['date3'] - X_test['birth_year']

# CHANGE TYPES

In [8]:
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)
y_train = y_train.fillna(0)

In [9]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data = X_train.append(X_test)
for v in ['sex', 'marital_status', 'branch_code', 'occupation_code', 'occupation_category_code',]:
  data.loc[:,v] = le.fit_transform(data.loc[:,v])
X_train = data[:X_train.shape[0]]
X_test = data[-X_test.shape[0]:]

In [10]:
le.fit(y_train.iloc[:,0])
y_train = pd.DataFrame(le.transform(y_train.iloc[:,0]))
y_train.columns = ['target']

# MODEL

In [11]:
from catboost import CatBoostClassifier
model = CatBoostClassifier(iterations=1500)
model.fit(X_train.drop(columns=['ID', 'ID2']), y_train, cat_features=['sex','marital_status','branch_code','occupation_code','occupation_category_code'])

Learning rate set to 0.069573
0:	learn: 2.1154156	total: 4.94s	remaining: 2h 3m 23s
1:	learn: 1.8522402	total: 9.44s	remaining: 1h 57m 50s
2:	learn: 1.6665665	total: 14.3s	remaining: 1h 58m 47s
3:	learn: 1.5406106	total: 16.9s	remaining: 1h 45m 4s
4:	learn: 1.4240590	total: 21.7s	remaining: 1h 48m 12s
5:	learn: 1.3298118	total: 26s	remaining: 1h 47m 59s
6:	learn: 1.2534370	total: 30.7s	remaining: 1h 49m 10s
7:	learn: 1.1870799	total: 35.3s	remaining: 1h 49m 50s
8:	learn: 1.1273847	total: 40.3s	remaining: 1h 51m 10s
9:	learn: 1.0777335	total: 45.2s	remaining: 1h 52m 9s
10:	learn: 1.0318508	total: 49.8s	remaining: 1h 52m 18s
11:	learn: 0.9886763	total: 53.7s	remaining: 1h 50m 54s
12:	learn: 0.9527627	total: 57.9s	remaining: 1h 50m 22s
13:	learn: 0.9175153	total: 1m 1s	remaining: 1h 48m 38s
14:	learn: 0.8870951	total: 1m 5s	remaining: 1h 48m 27s
15:	learn: 0.8599942	total: 1m 9s	remaining: 1h 47m 22s
16:	learn: 0.8352045	total: 1m 13s	remaining: 1h 46m 38s
17:	learn: 0.8122672	total: 1m 1

144:	learn: 0.4219073	total: 10m 29s	remaining: 1h 37m 58s
145:	learn: 0.4217904	total: 10m 34s	remaining: 1h 37m 59s
146:	learn: 0.4214912	total: 10m 39s	remaining: 1h 38m 6s
147:	learn: 0.4211131	total: 10m 44s	remaining: 1h 38m 8s
148:	learn: 0.4209815	total: 10m 49s	remaining: 1h 38m 8s
149:	learn: 0.4203588	total: 10m 54s	remaining: 1h 38m 8s
150:	learn: 0.4198124	total: 10m 58s	remaining: 1h 37m 58s
151:	learn: 0.4186434	total: 11m 1s	remaining: 1h 37m 45s
152:	learn: 0.4182991	total: 11m 6s	remaining: 1h 37m 43s
153:	learn: 0.4178312	total: 11m 10s	remaining: 1h 37m 39s
154:	learn: 0.4175049	total: 11m 14s	remaining: 1h 37m 30s
155:	learn: 0.4169124	total: 11m 18s	remaining: 1h 37m 23s
156:	learn: 0.4166088	total: 11m 23s	remaining: 1h 37m 27s
157:	learn: 0.4160817	total: 11m 28s	remaining: 1h 37m 25s
158:	learn: 0.4149831	total: 11m 32s	remaining: 1h 37m 24s
159:	learn: 0.4145012	total: 11m 38s	remaining: 1h 37m 28s
160:	learn: 0.4140618	total: 11m 43s	remaining: 1h 37m 29s
161

284:	learn: 0.3749906	total: 21m 8s	remaining: 1h 30m 6s
285:	learn: 0.3747923	total: 21m 14s	remaining: 1h 30m 9s
286:	learn: 0.3745695	total: 21m 20s	remaining: 1h 30m 11s
287:	learn: 0.3744659	total: 21m 26s	remaining: 1h 30m 12s
288:	learn: 0.3743426	total: 21m 30s	remaining: 1h 30m 7s
289:	learn: 0.3741691	total: 21m 34s	remaining: 1h 30m 1s
290:	learn: 0.3739700	total: 21m 39s	remaining: 1h 29m 58s
291:	learn: 0.3738770	total: 21m 43s	remaining: 1h 29m 52s
292:	learn: 0.3738060	total: 21m 47s	remaining: 1h 29m 47s
293:	learn: 0.3737605	total: 21m 52s	remaining: 1h 29m 43s
294:	learn: 0.3736846	total: 21m 57s	remaining: 1h 29m 39s
295:	learn: 0.3733762	total: 22m	remaining: 1h 29m 32s
296:	learn: 0.3732244	total: 22m 5s	remaining: 1h 29m 28s
297:	learn: 0.3731133	total: 22m 10s	remaining: 1h 29m 26s
298:	learn: 0.3730524	total: 22m 16s	remaining: 1h 29m 26s
299:	learn: 0.3728586	total: 22m 21s	remaining: 1h 29m 26s
300:	learn: 0.3726533	total: 22m 25s	remaining: 1h 29m 21s
301:	le

424:	learn: 0.3583158	total: 32m 4s	remaining: 1h 21m 6s
425:	learn: 0.3581638	total: 32m 10s	remaining: 1h 21m 6s
426:	learn: 0.3580341	total: 32m 14s	remaining: 1h 21m 1s
427:	learn: 0.3579576	total: 32m 18s	remaining: 1h 20m 54s
428:	learn: 0.3577996	total: 32m 22s	remaining: 1h 20m 49s
429:	learn: 0.3577738	total: 32m 26s	remaining: 1h 20m 43s
430:	learn: 0.3576627	total: 32m 30s	remaining: 1h 20m 37s
431:	learn: 0.3575911	total: 32m 35s	remaining: 1h 20m 33s
432:	learn: 0.3575197	total: 32m 38s	remaining: 1h 20m 27s
433:	learn: 0.3573769	total: 32m 42s	remaining: 1h 20m 20s
434:	learn: 0.3573029	total: 32m 47s	remaining: 1h 20m 16s
435:	learn: 0.3572507	total: 32m 50s	remaining: 1h 20m 9s
436:	learn: 0.3571565	total: 32m 54s	remaining: 1h 20m 3s
437:	learn: 0.3569489	total: 32m 59s	remaining: 1h 19m 58s
438:	learn: 0.3569448	total: 33m 2s	remaining: 1h 19m 52s
439:	learn: 0.3568784	total: 33m 7s	remaining: 1h 19m 48s
440:	learn: 0.3568197	total: 33m 13s	remaining: 1h 19m 46s
441:	

564:	learn: 0.3460023	total: 42m 54s	remaining: 1h 11m
565:	learn: 0.3459807	total: 42m 59s	remaining: 1h 10m 56s
566:	learn: 0.3459131	total: 43m 5s	remaining: 1h 10m 53s
567:	learn: 0.3458554	total: 43m 9s	remaining: 1h 10m 49s
568:	learn: 0.3458243	total: 43m 14s	remaining: 1h 10m 44s
569:	learn: 0.3457457	total: 43m 18s	remaining: 1h 10m 39s
570:	learn: 0.3454630	total: 43m 22s	remaining: 1h 10m 34s
571:	learn: 0.3453506	total: 43m 26s	remaining: 1h 10m 29s
572:	learn: 0.3452895	total: 43m 31s	remaining: 1h 10m 25s
573:	learn: 0.3452414	total: 43m 36s	remaining: 1h 10m 21s
574:	learn: 0.3451680	total: 43m 41s	remaining: 1h 10m 16s
575:	learn: 0.3451202	total: 43m 45s	remaining: 1h 10m 12s
576:	learn: 0.3450772	total: 43m 51s	remaining: 1h 10m 9s
577:	learn: 0.3450719	total: 43m 58s	remaining: 1h 10m 8s
578:	learn: 0.3449093	total: 44m 4s	remaining: 1h 10m 6s
579:	learn: 0.3447954	total: 44m 9s	remaining: 1h 10m 2s
580:	learn: 0.3447596	total: 44m 13s	remaining: 1h 9m 57s
581:	learn

707:	learn: 0.3355195	total: 54m 13s	remaining: 1h 39s
708:	learn: 0.3354223	total: 54m 17s	remaining: 1h 34s
709:	learn: 0.3353791	total: 54m 21s	remaining: 1h 28s
710:	learn: 0.3353421	total: 54m 24s	remaining: 1h 23s
711:	learn: 0.3352977	total: 54m 29s	remaining: 1h 18s
712:	learn: 0.3352023	total: 54m 32s	remaining: 1h 12s
713:	learn: 0.3351271	total: 54m 37s	remaining: 1h 7s
714:	learn: 0.3349961	total: 54m 40s	remaining: 1h 2s
715:	learn: 0.3349281	total: 54m 44s	remaining: 59m 56s
716:	learn: 0.3348915	total: 54m 49s	remaining: 59m 52s
717:	learn: 0.3348701	total: 54m 53s	remaining: 59m 47s
718:	learn: 0.3347867	total: 54m 57s	remaining: 59m 42s
719:	learn: 0.3346068	total: 55m	remaining: 59m 36s
720:	learn: 0.3345324	total: 55m 5s	remaining: 59m 31s
721:	learn: 0.3344589	total: 55m 11s	remaining: 59m 27s
722:	learn: 0.3344231	total: 55m 15s	remaining: 59m 23s
723:	learn: 0.3342961	total: 55m 20s	remaining: 59m 18s
724:	learn: 0.3342624	total: 55m 25s	remaining: 59m 15s
725:	le

853:	learn: 0.3263975	total: 1h 5m 20s	remaining: 49m 25s
854:	learn: 0.3263466	total: 1h 5m 24s	remaining: 49m 20s
855:	learn: 0.3262735	total: 1h 5m 29s	remaining: 49m 15s
856:	learn: 0.3262630	total: 1h 5m 33s	remaining: 49m 11s
857:	learn: 0.3262452	total: 1h 5m 37s	remaining: 49m 6s
858:	learn: 0.3262052	total: 1h 5m 41s	remaining: 49m 1s
859:	learn: 0.3261969	total: 1h 5m 46s	remaining: 48m 56s
860:	learn: 0.3261570	total: 1h 5m 50s	remaining: 48m 52s
861:	learn: 0.3260874	total: 1h 5m 55s	remaining: 48m 47s
862:	learn: 0.3260342	total: 1h 5m 59s	remaining: 48m 42s
863:	learn: 0.3259420	total: 1h 6m 3s	remaining: 48m 37s
864:	learn: 0.3258934	total: 1h 6m 6s	remaining: 48m 31s
865:	learn: 0.3258585	total: 1h 6m 10s	remaining: 48m 26s
866:	learn: 0.3257852	total: 1h 6m 14s	remaining: 48m 21s
867:	learn: 0.3257497	total: 1h 6m 18s	remaining: 48m 16s
868:	learn: 0.3256963	total: 1h 6m 22s	remaining: 48m 11s
869:	learn: 0.3256165	total: 1h 6m 27s	remaining: 48m 7s
870:	learn: 0.32556

995:	learn: 0.3177226	total: 1h 16m 3s	remaining: 38m 29s
996:	learn: 0.3176595	total: 1h 16m 8s	remaining: 38m 24s
997:	learn: 0.3175873	total: 1h 16m 12s	remaining: 38m 19s
998:	learn: 0.3175589	total: 1h 16m 16s	remaining: 38m 15s
999:	learn: 0.3174812	total: 1h 16m 21s	remaining: 38m 10s
1000:	learn: 0.3173316	total: 1h 16m 25s	remaining: 38m 5s
1001:	learn: 0.3172260	total: 1h 16m 29s	remaining: 38m 1s
1002:	learn: 0.3171744	total: 1h 16m 34s	remaining: 37m 56s
1003:	learn: 0.3171415	total: 1h 16m 40s	remaining: 37m 52s
1004:	learn: 0.3170943	total: 1h 16m 44s	remaining: 37m 47s
1005:	learn: 0.3170801	total: 1h 16m 49s	remaining: 37m 43s
1006:	learn: 0.3170223	total: 1h 16m 54s	remaining: 37m 38s
1007:	learn: 0.3169641	total: 1h 16m 58s	remaining: 37m 34s
1008:	learn: 0.3168897	total: 1h 17m 2s	remaining: 37m 29s
1009:	learn: 0.3168770	total: 1h 17m 7s	remaining: 37m 24s
1010:	learn: 0.3167891	total: 1h 17m 13s	remaining: 37m 20s
1011:	learn: 0.3167145	total: 1h 17m 17s	remaining:

1133:	learn: 0.3102746	total: 1h 26m 35s	remaining: 27m 56s
1134:	learn: 0.3102322	total: 1h 26m 41s	remaining: 27m 52s
1135:	learn: 0.3101900	total: 1h 26m 45s	remaining: 27m 47s
1136:	learn: 0.3101251	total: 1h 26m 50s	remaining: 27m 43s
1137:	learn: 0.3100276	total: 1h 26m 54s	remaining: 27m 38s
1138:	learn: 0.3099990	total: 1h 26m 59s	remaining: 27m 34s
1139:	learn: 0.3099297	total: 1h 27m 4s	remaining: 27m 29s
1140:	learn: 0.3098865	total: 1h 27m 8s	remaining: 27m 24s
1141:	learn: 0.3098059	total: 1h 27m 12s	remaining: 27m 20s
1142:	learn: 0.3097457	total: 1h 27m 16s	remaining: 27m 15s
1143:	learn: 0.3096459	total: 1h 27m 21s	remaining: 27m 11s
1144:	learn: 0.3095663	total: 1h 27m 25s	remaining: 27m 6s
1145:	learn: 0.3095033	total: 1h 27m 30s	remaining: 27m 1s
1146:	learn: 0.3094525	total: 1h 27m 34s	remaining: 26m 57s
1147:	learn: 0.3094405	total: 1h 27m 38s	remaining: 26m 52s
1148:	learn: 0.3093842	total: 1h 27m 42s	remaining: 26m 47s
1149:	learn: 0.3093318	total: 1h 27m 46s	rem

1271:	learn: 0.3014597	total: 1h 36m 41s	remaining: 17m 19s
1272:	learn: 0.3013908	total: 1h 36m 45s	remaining: 17m 15s
1273:	learn: 0.3012259	total: 1h 36m 50s	remaining: 17m 10s
1274:	learn: 0.3011909	total: 1h 36m 55s	remaining: 17m 6s
1275:	learn: 0.3011506	total: 1h 37m 1s	remaining: 17m 1s
1276:	learn: 0.3011236	total: 1h 37m 6s	remaining: 16m 57s
1277:	learn: 0.3010622	total: 1h 37m 12s	remaining: 16m 53s
1278:	learn: 0.3009690	total: 1h 37m 17s	remaining: 16m 48s
1279:	learn: 0.3009205	total: 1h 37m 22s	remaining: 16m 44s
1280:	learn: 0.3008790	total: 1h 37m 26s	remaining: 16m 39s
1281:	learn: 0.3008156	total: 1h 37m 30s	remaining: 16m 34s
1282:	learn: 0.3007860	total: 1h 37m 34s	remaining: 16m 30s
1283:	learn: 0.3007294	total: 1h 37m 38s	remaining: 16m 25s
1284:	learn: 0.3006791	total: 1h 37m 43s	remaining: 16m 21s
1285:	learn: 0.3006423	total: 1h 37m 47s	remaining: 16m 16s
1286:	learn: 0.3005847	total: 1h 37m 52s	remaining: 16m 11s
1287:	learn: 0.3005742	total: 1h 37m 58s	rem

1410:	learn: 0.2931962	total: 1h 47m 28s	remaining: 6m 46s
1411:	learn: 0.2931867	total: 1h 47m 33s	remaining: 6m 42s
1412:	learn: 0.2931467	total: 1h 47m 37s	remaining: 6m 37s
1413:	learn: 0.2930685	total: 1h 47m 41s	remaining: 6m 32s
1414:	learn: 0.2930528	total: 1h 47m 45s	remaining: 6m 28s
1415:	learn: 0.2930137	total: 1h 47m 49s	remaining: 6m 23s
1416:	learn: 0.2928857	total: 1h 47m 53s	remaining: 6m 19s
1417:	learn: 0.2928350	total: 1h 47m 58s	remaining: 6m 14s
1418:	learn: 0.2927906	total: 1h 48m 3s	remaining: 6m 10s
1419:	learn: 0.2927811	total: 1h 48m 6s	remaining: 6m 5s
1420:	learn: 0.2927021	total: 1h 48m 11s	remaining: 6m
1421:	learn: 0.2926383	total: 1h 48m 14s	remaining: 5m 56s
1422:	learn: 0.2925601	total: 1h 48m 19s	remaining: 5m 51s
1423:	learn: 0.2923881	total: 1h 48m 24s	remaining: 5m 47s
1424:	learn: 0.2923211	total: 1h 48m 29s	remaining: 5m 42s
1425:	learn: 0.2922732	total: 1h 48m 34s	remaining: 5m 38s
1426:	learn: 0.2922557	total: 1h 48m 37s	remaining: 5m 33s
1427

<catboost.core.CatBoostClassifier at 0x182fc595c10>

In [12]:
proba = model.predict_proba(X_test.drop(columns=['ID','ID2'], axis=1))
y_test = pd.DataFrame(proba)
print(y_test.columns)
y_test.columns = le.inverse_transform(y_test.columns)

RangeIndex(start=0, stop=21, step=1)


# SUBMIT

In [13]:
print(y_test.columns)
answer_mass = []
for i in range(X_test.shape[0]):
  id = X_test['ID'].iloc[i]
  for c in y_test.columns:
    answer_mass.append([id + ' X ' + str(c), y_test[c].iloc[i]])
    

df_answer = pd.DataFrame(answer_mass)
df_answer.columns = ['ID X PCODE', 'Label']
for i in range(df_answer.shape[0]):
  if df_answer['ID X PCODE'].iloc[i] in true_values:
    df_answer['Label'].iloc[i] = 1.0

Index(['66FJ', '7POT', '8NN1', 'AHXO', 'BSTQ', 'ECY3', 'FM3X', 'GHYX', 'GYSR',
       'J9JW', 'JWFN', 'JZ9D', 'K6QO', 'LJR9', 'N2MW', 'P5DA', 'PYUQ', 'QBOL',
       'RIBP', 'RVSZ', 'SOP4'],
      dtype='object')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [14]:
df_answer.reset_index(drop=True, inplace=True)
df_answer.to_csv('submission.csv', index=False)