# GET DATA

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import copy

In [2]:
train = pd.read_csv('Data/Train.csv')
test = pd.read_csv('Data/Test.csv')
submission = pd.read_csv('Data/SampleSubmission.csv')

# NEW FORMAT

In [3]:
X_train = []
X_train_columns = train.columns
c = 0
for v in train.values:
  info = v[:8]
  binary = v[8:]
  index = [k for k, i in enumerate(binary) if i == 1]
  for i in index:
    c+=1
    for k in range(len(binary)):
      if k == i:
        binary_transformed = list(copy.copy(binary))
        binary_transformed[i] = 0
        X_train.append(list(info) + binary_transformed + [X_train_columns[8+k]] + [c])

X_train = pd.DataFrame(X_train)
X_train.columns = ['ID', 'join_date', 'sex', 'marital_status', 'birth_year', 'branch_code',
       'occupation_code', 'occupation_category_code', 'P5DA', 'RIBP', '8NN1',
       '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ', 'LJR9', 'N2MW', 'AHXO',
       'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D', 'J9JW', 'GHYX', 'ECY3', 'product_pred', 'ID2']

In [4]:
X_test = []
true_values = []
c = 0
for v in test.values:
  c += 1
  info = v[:8]
  binary = v[8:]
  index = [k for k, i in enumerate(binary) if i == 1]
  X_test.append(list(info) + list(binary) + [c])
  for k in test.columns[8:][index]:
    true_values.append(v[0] + ' X ' + k)

X_test = pd.DataFrame(X_test)
X_test.columns = ['ID', 'join_date', 'sex', 'marital_status', 'birth_year', 'branch_code',
       'occupation_code', 'occupation_category_code', 'P5DA', 'RIBP', '8NN1',
       '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ', 'LJR9', 'N2MW', 'AHXO',
       'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D', 'J9JW', 'GHYX', 'ECY3', 'ID2']

# TRANSFORM DATA

In [5]:
features_train = []
features_test = []
columns = []

append_features = ['P5DA', 'RIBP', '8NN1', '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ', 'LJR9', 
'N2MW', 'AHXO','BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D', 'J9JW', 'GHYX', 
'ECY3', 'ID', 'ID2', 'join_date', 'sex', 'marital_status', 'branch_code', 'occupation_code', 'occupation_category_code',
'birth_year']
for v in append_features:
  features_train.append(X_train[v].values.reshape(-1, 1))
  features_test.append(X_test[v].values.reshape(-1, 1))
  columns.append(np.array([v]))

y_train = X_train[['product_pred']]

In [6]:
features_train = np.concatenate(features_train, axis=1)
features_test = np.concatenate(features_test, axis=1)
columns = np.concatenate(np.array(columns))

X_train = pd.DataFrame(features_train)
X_train.columns = columns
X_test = pd.DataFrame(features_test)
X_test.columns = columns

# NEW FEATURES

In [7]:
X_train['date1'] = X_train['join_date'].apply(lambda x: int(x.split('/')[0]) if (x == x) else np.nan)
X_train['date2'] = X_train['join_date'].apply(lambda x: int(x.split('/')[1]) if (x == x) else np.nan)
X_train['date3'] = X_train['join_date'].apply(lambda x: int(x.split('/')[2]) if (x == x) else np.nan)
X_train.drop('join_date', axis=1, inplace=True)

X_test['date1'] = X_test['join_date'].apply(lambda x: int(x.split('/')[0]) if (x == x) else np.nan)
X_test['date2'] = X_test['join_date'].apply(lambda x: int(x.split('/')[1]) if (x == x) else np.nan)
X_test['date3'] = X_test['join_date'].apply(lambda x: int(x.split('/')[2]) if (x == x) else np.nan)
X_test.drop('join_date', axis=1, inplace=True)

X_train['date_diff'] = X_train['date3'] - X_train['birth_year']
X_test['date_diff'] = X_test['date3'] - X_test['birth_year']

# CHANGE TYPES

In [8]:
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)
y_train = y_train.fillna(0)

In [9]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data = X_train.append(X_test)
for v in ['sex', 'marital_status', 'branch_code', 'occupation_code', 'occupation_category_code',]:
  data.loc[:,v] = le.fit_transform(data.loc[:,v])
X_train = data[:X_train.shape[0]]
X_test = data[-X_test.shape[0]:]

In [10]:
le.fit(y_train.iloc[:,0])
y_train = pd.DataFrame(le.transform(y_train.iloc[:,0]))
y_train.columns = ['target']

# MODEL

In [11]:
from catboost import CatBoostClassifier
model = CatBoostClassifier(learning_rate=0.1)
model.fit(X_train.drop(columns=['ID', 'ID2']), y_train, cat_features=['sex','marital_status','branch_code','occupation_code','occupation_category_code'])

0:	learn: 1.7619774	total: 3.74s	remaining: 1h 2m 21s
1:	learn: 1.5594487	total: 7.33s	remaining: 1h 58s
2:	learn: 1.3942133	total: 11s	remaining: 1h 55s
3:	learn: 1.2872962	total: 13.1s	remaining: 54m 17s
4:	learn: 1.1779875	total: 16.7s	remaining: 55m 28s
5:	learn: 1.0917268	total: 20.1s	remaining: 55m 36s
6:	learn: 1.0248950	total: 23.8s	remaining: 56m 15s
7:	learn: 0.9667826	total: 27.4s	remaining: 56m 31s
8:	learn: 0.9155176	total: 30.8s	remaining: 56m 30s
9:	learn: 0.8750616	total: 34.4s	remaining: 56m 41s
10:	learn: 0.8422215	total: 37.8s	remaining: 56m 36s
11:	learn: 0.8126774	total: 41.2s	remaining: 56m 33s
12:	learn: 0.7792049	total: 44.1s	remaining: 55m 49s
13:	learn: 0.7529412	total: 47s	remaining: 55m 11s
14:	learn: 0.7287705	total: 50s	remaining: 54m 45s
15:	learn: 0.7069389	total: 53.5s	remaining: 54m 50s
16:	learn: 0.6878295	total: 56.4s	remaining: 54m 22s
17:	learn: 0.6704002	total: 59.4s	remaining: 54m 1s
18:	learn: 0.6563127	total: 1m 2s	remaining: 53m 48s
19:	learn:

153:	learn: 0.3924124	total: 9m 53s	remaining: 54m 17s
154:	learn: 0.3919370	total: 9m 56s	remaining: 54m 13s
155:	learn: 0.3915933	total: 10m	remaining: 54m 6s
156:	learn: 0.3913248	total: 10m 4s	remaining: 54m 4s
157:	learn: 0.3909678	total: 10m 8s	remaining: 54m 4s
158:	learn: 0.3901881	total: 10m 12s	remaining: 53m 59s
159:	learn: 0.3894499	total: 10m 15s	remaining: 53m 52s
160:	learn: 0.3889973	total: 10m 19s	remaining: 53m 50s
161:	learn: 0.3885644	total: 10m 24s	remaining: 53m 49s
162:	learn: 0.3882297	total: 10m 28s	remaining: 53m 47s
163:	learn: 0.3876793	total: 10m 32s	remaining: 53m 44s
164:	learn: 0.3871666	total: 10m 36s	remaining: 53m 40s
165:	learn: 0.3867154	total: 10m 39s	remaining: 53m 35s
166:	learn: 0.3862813	total: 10m 44s	remaining: 53m 33s
167:	learn: 0.3859314	total: 10m 47s	remaining: 53m 26s
168:	learn: 0.3857783	total: 10m 51s	remaining: 53m 22s
169:	learn: 0.3855610	total: 10m 55s	remaining: 53m 22s
170:	learn: 0.3853684	total: 11m	remaining: 53m 21s
171:	le

301:	learn: 0.3575888	total: 20m 17s	remaining: 46m 55s
302:	learn: 0.3573328	total: 20m 21s	remaining: 46m 50s
303:	learn: 0.3569368	total: 20m 25s	remaining: 46m 44s
304:	learn: 0.3565830	total: 20m 29s	remaining: 46m 42s
305:	learn: 0.3565210	total: 20m 34s	remaining: 46m 39s
306:	learn: 0.3564611	total: 20m 38s	remaining: 46m 34s
307:	learn: 0.3563516	total: 20m 42s	remaining: 46m 30s
308:	learn: 0.3560170	total: 20m 46s	remaining: 46m 26s
309:	learn: 0.3556194	total: 20m 50s	remaining: 46m 22s
310:	learn: 0.3554922	total: 20m 54s	remaining: 46m 19s
311:	learn: 0.3552531	total: 20m 58s	remaining: 46m 15s
312:	learn: 0.3552367	total: 21m 2s	remaining: 46m 11s
313:	learn: 0.3550582	total: 21m 7s	remaining: 46m 8s
314:	learn: 0.3549600	total: 21m 11s	remaining: 46m 4s
315:	learn: 0.3547523	total: 21m 16s	remaining: 46m 2s
316:	learn: 0.3546511	total: 21m 20s	remaining: 45m 59s
317:	learn: 0.3545838	total: 21m 24s	remaining: 45m 54s
318:	learn: 0.3543450	total: 21m 28s	remaining: 45m 5

449:	learn: 0.3394586	total: 30m 51s	remaining: 37m 42s
450:	learn: 0.3392384	total: 30m 55s	remaining: 37m 38s
451:	learn: 0.3391964	total: 30m 59s	remaining: 37m 34s
452:	learn: 0.3390868	total: 31m 3s	remaining: 37m 30s
453:	learn: 0.3389702	total: 31m 8s	remaining: 37m 26s
454:	learn: 0.3389438	total: 31m 12s	remaining: 37m 23s
455:	learn: 0.3388695	total: 31m 17s	remaining: 37m 19s
456:	learn: 0.3388251	total: 31m 20s	remaining: 37m 14s
457:	learn: 0.3387229	total: 31m 24s	remaining: 37m 10s
458:	learn: 0.3386889	total: 31m 28s	remaining: 37m 5s
459:	learn: 0.3386166	total: 31m 32s	remaining: 37m 1s
460:	learn: 0.3385685	total: 31m 37s	remaining: 36m 58s
461:	learn: 0.3385335	total: 31m 41s	remaining: 36m 54s
462:	learn: 0.3384925	total: 31m 46s	remaining: 36m 50s
463:	learn: 0.3380774	total: 31m 50s	remaining: 36m 46s
464:	learn: 0.3380575	total: 31m 55s	remaining: 36m 43s
465:	learn: 0.3380150	total: 31m 58s	remaining: 36m 38s
466:	learn: 0.3377771	total: 32m 2s	remaining: 36m 3

597:	learn: 0.3265230	total: 42m 26s	remaining: 28m 32s
598:	learn: 0.3264916	total: 42m 31s	remaining: 28m 28s
599:	learn: 0.3263404	total: 42m 36s	remaining: 28m 24s
600:	learn: 0.3262599	total: 42m 41s	remaining: 28m 20s
601:	learn: 0.3261457	total: 42m 45s	remaining: 28m 16s
602:	learn: 0.3260491	total: 42m 50s	remaining: 28m 12s
603:	learn: 0.3258739	total: 42m 54s	remaining: 28m 8s
604:	learn: 0.3257657	total: 42m 59s	remaining: 28m 4s
605:	learn: 0.3257239	total: 43m 5s	remaining: 28m
606:	learn: 0.3256170	total: 43m 9s	remaining: 27m 56s
607:	learn: 0.3254522	total: 43m 14s	remaining: 27m 52s
608:	learn: 0.3254248	total: 43m 19s	remaining: 27m 48s
609:	learn: 0.3253805	total: 43m 23s	remaining: 27m 44s
610:	learn: 0.3253334	total: 43m 28s	remaining: 27m 40s
611:	learn: 0.3251569	total: 43m 33s	remaining: 27m 36s
612:	learn: 0.3251038	total: 43m 38s	remaining: 27m 32s
613:	learn: 0.3250625	total: 43m 43s	remaining: 27m 29s
614:	learn: 0.3249810	total: 43m 48s	remaining: 27m 25s


745:	learn: 0.3153787	total: 54m 45s	remaining: 18m 38s
746:	learn: 0.3153154	total: 54m 50s	remaining: 18m 34s
747:	learn: 0.3152005	total: 54m 55s	remaining: 18m 30s
748:	learn: 0.3151449	total: 55m	remaining: 18m 26s
749:	learn: 0.3150005	total: 55m 6s	remaining: 18m 22s
750:	learn: 0.3148848	total: 55m 11s	remaining: 18m 17s
751:	learn: 0.3147761	total: 55m 16s	remaining: 18m 13s
752:	learn: 0.3147390	total: 55m 22s	remaining: 18m 9s
753:	learn: 0.3146879	total: 55m 27s	remaining: 18m 5s
754:	learn: 0.3146364	total: 55m 31s	remaining: 18m 1s
755:	learn: 0.3145548	total: 55m 36s	remaining: 17m 57s
756:	learn: 0.3144878	total: 55m 41s	remaining: 17m 52s
757:	learn: 0.3144171	total: 55m 45s	remaining: 17m 48s
758:	learn: 0.3143309	total: 55m 50s	remaining: 17m 43s
759:	learn: 0.3143068	total: 55m 54s	remaining: 17m 39s
760:	learn: 0.3142737	total: 55m 58s	remaining: 17m 34s
761:	learn: 0.3142101	total: 56m 3s	remaining: 17m 30s
762:	learn: 0.3141684	total: 56m 8s	remaining: 17m 26s
76

891:	learn: 0.3041019	total: 1h 6m 20s	remaining: 8m 1s
892:	learn: 0.3040310	total: 1h 6m 26s	remaining: 7m 57s
893:	learn: 0.3039978	total: 1h 6m 31s	remaining: 7m 53s
894:	learn: 0.3039373	total: 1h 6m 36s	remaining: 7m 48s
895:	learn: 0.3039257	total: 1h 6m 42s	remaining: 7m 44s
896:	learn: 0.3039067	total: 1h 6m 46s	remaining: 7m 40s
897:	learn: 0.3038186	total: 1h 6m 50s	remaining: 7m 35s
898:	learn: 0.3037619	total: 1h 6m 55s	remaining: 7m 31s
899:	learn: 0.3036729	total: 1h 7m 1s	remaining: 7m 26s
900:	learn: 0.3036118	total: 1h 7m 5s	remaining: 7m 22s
901:	learn: 0.3035847	total: 1h 7m 10s	remaining: 7m 17s
902:	learn: 0.3034894	total: 1h 7m 15s	remaining: 7m 13s
903:	learn: 0.3033868	total: 1h 7m 19s	remaining: 7m 9s
904:	learn: 0.3031610	total: 1h 7m 23s	remaining: 7m 4s
905:	learn: 0.3031313	total: 1h 7m 28s	remaining: 7m
906:	learn: 0.3030539	total: 1h 7m 34s	remaining: 6m 55s
907:	learn: 0.3029731	total: 1h 7m 38s	remaining: 6m 51s
908:	learn: 0.3028825	total: 1h 7m 43s	r

<catboost.core.CatBoostClassifier at 0x24955bb48e0>

In [12]:
proba = model.predict_proba(X_test.drop(columns=['ID','ID2'], axis=1))
y_test = pd.DataFrame(proba)
print(y_test.columns)
y_test.columns = le.inverse_transform(y_test.columns)

RangeIndex(start=0, stop=21, step=1)


# SUBMIT

In [13]:
print(y_test.columns)
answer_mass = []
for i in range(X_test.shape[0]):
  id = X_test['ID'].iloc[i]
  for c in y_test.columns:
    answer_mass.append([id + ' X ' + str(c), y_test[c].iloc[i]])
    

df_answer = pd.DataFrame(answer_mass)
df_answer.columns = ['ID X PCODE', 'Label']
for i in range(df_answer.shape[0]):
  if df_answer['ID X PCODE'].iloc[i] in true_values:
    df_answer['Label'].iloc[i] = 1.0

Index(['66FJ', '7POT', '8NN1', 'AHXO', 'BSTQ', 'ECY3', 'FM3X', 'GHYX', 'GYSR',
       'J9JW', 'JWFN', 'JZ9D', 'K6QO', 'LJR9', 'N2MW', 'P5DA', 'PYUQ', 'QBOL',
       'RIBP', 'RVSZ', 'SOP4'],
      dtype='object')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [14]:
df_answer.reset_index(drop=True, inplace=True)
df_answer.to_csv('submission.csv', index=False)