# GET DATA

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import copy

In [2]:
train = pd.read_csv('Data/Train.csv')
test = pd.read_csv('Data/Test.csv')
submission = pd.read_csv('Data/SampleSubmission.csv')

# NEW FORMAT

In [3]:
X_train = []
X_train_columns = train.columns
c = 0
for v in train.values:
  info = v[:8]
  binary = v[8:]
  index = [k for k, i in enumerate(binary) if i == 1]
  for i in index:
    c+=1
    for k in range(len(binary)):
      if k == i:
        binary_transformed = list(copy.copy(binary))
        binary_transformed[i] = 0
        X_train.append(list(info) + binary_transformed + [X_train_columns[8+k]] + [c])

X_train = pd.DataFrame(X_train)
X_train.columns = ['ID', 'join_date', 'sex', 'marital_status', 'birth_year', 'branch_code',
       'occupation_code', 'occupation_category_code', 'P5DA', 'RIBP', '8NN1',
       '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ', 'LJR9', 'N2MW', 'AHXO',
       'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D', 'J9JW', 'GHYX', 'ECY3', 'product_pred', 'ID2']

In [4]:
X_test = []
true_values = []
c = 0
for v in test.values:
  c += 1
  info = v[:8]
  binary = v[8:]
  index = [k for k, i in enumerate(binary) if i == 1]
  X_test.append(list(info) + list(binary) + [c])
  for k in test.columns[8:][index]:
    true_values.append(v[0] + ' X ' + k)

X_test = pd.DataFrame(X_test)
X_test.columns = ['ID', 'join_date', 'sex', 'marital_status', 'birth_year', 'branch_code',
       'occupation_code', 'occupation_category_code', 'P5DA', 'RIBP', '8NN1',
       '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ', 'LJR9', 'N2MW', 'AHXO',
       'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D', 'J9JW', 'GHYX', 'ECY3', 'ID2']

# TRANSFORM DATA

In [5]:
features_train = []
features_test = []
columns = []

append_features = ['P5DA', 'RIBP', '8NN1', '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ', 'LJR9', 
'N2MW', 'AHXO','BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D', 'J9JW', 'GHYX', 
'ECY3', 'ID', 'ID2', 'join_date', 'sex', 'marital_status', 'branch_code', 'occupation_code', 'occupation_category_code',
'birth_year']
for v in append_features:
  features_train.append(X_train[v].values.reshape(-1, 1))
  features_test.append(X_test[v].values.reshape(-1, 1))
  columns.append(np.array([v]))

y_train = X_train[['product_pred']]

In [6]:
features_train = np.concatenate(features_train, axis=1)
features_test = np.concatenate(features_test, axis=1)
columns = np.concatenate(np.array(columns))

X_train = pd.DataFrame(features_train)
X_train.columns = columns
X_test = pd.DataFrame(features_test)
X_test.columns = columns

# NEW FEATURES

In [7]:
X_train['date1'] = X_train['join_date'].apply(lambda x: int(x.split('/')[0]) if (x == x) else np.nan)
X_train['date2'] = X_train['join_date'].apply(lambda x: int(x.split('/')[1]) if (x == x) else np.nan)
X_train['date3'] = X_train['join_date'].apply(lambda x: int(x.split('/')[2]) if (x == x) else np.nan)
X_train.drop('join_date', axis=1, inplace=True)

X_test['date1'] = X_test['join_date'].apply(lambda x: int(x.split('/')[0]) if (x == x) else np.nan)
X_test['date2'] = X_test['join_date'].apply(lambda x: int(x.split('/')[1]) if (x == x) else np.nan)
X_test['date3'] = X_test['join_date'].apply(lambda x: int(x.split('/')[2]) if (x == x) else np.nan)
X_test.drop('join_date', axis=1, inplace=True)

X_train['date_diff'] = X_train['date3'] - X_train['birth_year']
X_test['date_diff'] = X_test['date3'] - X_test['birth_year']

# CHANGE TYPES

In [8]:
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)
y_train = y_train.fillna(0)

In [9]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data = X_train.append(X_test)
for v in ['sex', 'marital_status', 'branch_code', 'occupation_code', 'occupation_category_code',]:
  data.loc[:,v] = le.fit_transform(data.loc[:,v])
X_train = data[:X_train.shape[0]]
X_test = data[-X_test.shape[0]:]

In [10]:
le.fit(y_train.iloc[:,0])
y_train = pd.DataFrame(le.transform(y_train.iloc[:,0]))
y_train.columns = ['target']

# MODEL

In [11]:
from catboost import CatBoostClassifier
model = CatBoostClassifier(learning_rate=0.075)
model.fit(X_train.drop(columns=['ID', 'ID2']), y_train, cat_features=['sex','marital_status','branch_code','occupation_code','occupation_category_code'])

0:	learn: 2.0491936	total: 3.79s	remaining: 1h 3m 7s
1:	learn: 1.7922363	total: 7.36s	remaining: 1h 1m 12s
2:	learn: 1.6060794	total: 11.1s	remaining: 1h 1m 36s
3:	learn: 1.4829923	total: 13.4s	remaining: 55m 29s
4:	learn: 1.3691801	total: 17.4s	remaining: 57m 45s
5:	learn: 1.2778013	total: 21.3s	remaining: 58m 54s
6:	learn: 1.2041165	total: 25.5s	remaining: 1h 20s
7:	learn: 1.1395631	total: 29.4s	remaining: 1h 43s
8:	learn: 1.0818572	total: 33.3s	remaining: 1h 1m 9s
9:	learn: 1.0340127	total: 37.2s	remaining: 1h 1m 23s
10:	learn: 0.9891952	total: 41.1s	remaining: 1h 1m 33s
11:	learn: 0.9479926	total: 44.6s	remaining: 1h 1m 12s
12:	learn: 0.9140368	total: 48.3s	remaining: 1h 1m 8s
13:	learn: 0.8803380	total: 51.4s	remaining: 1h 17s
14:	learn: 0.8513709	total: 55s	remaining: 1h 11s
15:	learn: 0.8257738	total: 58.3s	remaining: 59m 42s
16:	learn: 0.8023292	total: 1m 1s	remaining: 59m 33s
17:	learn: 0.7862266	total: 1m 3s	remaining: 57m 27s
18:	learn: 0.7639367	total: 1m 6s	remaining: 57m 

153:	learn: 0.4113292	total: 10m 33s	remaining: 58m 2s
154:	learn: 0.4105789	total: 10m 39s	remaining: 58m 6s
155:	learn: 0.4102777	total: 10m 45s	remaining: 58m 12s
156:	learn: 0.4098321	total: 10m 49s	remaining: 58m 9s
157:	learn: 0.4094904	total: 10m 54s	remaining: 58m 6s
158:	learn: 0.4093230	total: 10m 59s	remaining: 58m 6s
159:	learn: 0.4083525	total: 11m 2s	remaining: 57m 57s
160:	learn: 0.4077675	total: 11m 6s	remaining: 57m 51s
161:	learn: 0.4069576	total: 11m 10s	remaining: 57m 46s
162:	learn: 0.4064272	total: 11m 14s	remaining: 57m 43s
163:	learn: 0.4061833	total: 11m 19s	remaining: 57m 42s
164:	learn: 0.4057273	total: 11m 25s	remaining: 57m 47s
165:	learn: 0.4054703	total: 11m 30s	remaining: 57m 49s
166:	learn: 0.4051588	total: 11m 36s	remaining: 57m 52s
167:	learn: 0.4045944	total: 11m 42s	remaining: 58m
168:	learn: 0.4044370	total: 11m 48s	remaining: 58m 3s
169:	learn: 0.4038419	total: 11m 52s	remaining: 57m 57s
170:	learn: 0.4027858	total: 11m 56s	remaining: 57m 52s
171:

301:	learn: 0.3690924	total: 21m 52s	remaining: 50m 33s
302:	learn: 0.3690728	total: 21m 57s	remaining: 50m 30s
303:	learn: 0.3689845	total: 22m 2s	remaining: 50m 27s
304:	learn: 0.3687307	total: 22m 6s	remaining: 50m 21s
305:	learn: 0.3685384	total: 22m 10s	remaining: 50m 16s
306:	learn: 0.3684692	total: 22m 14s	remaining: 50m 12s
307:	learn: 0.3683345	total: 22m 18s	remaining: 50m 7s
308:	learn: 0.3682083	total: 22m 22s	remaining: 50m 2s
309:	learn: 0.3681769	total: 22m 27s	remaining: 49m 59s
310:	learn: 0.3681118	total: 22m 32s	remaining: 49m 56s
311:	learn: 0.3680338	total: 22m 37s	remaining: 49m 54s
312:	learn: 0.3679114	total: 22m 44s	remaining: 49m 54s
313:	learn: 0.3678720	total: 22m 50s	remaining: 49m 53s
314:	learn: 0.3678165	total: 22m 54s	remaining: 49m 48s
315:	learn: 0.3677416	total: 22m 59s	remaining: 49m 44s
316:	learn: 0.3675192	total: 23m 3s	remaining: 49m 40s
317:	learn: 0.3672192	total: 23m 7s	remaining: 49m 36s
318:	learn: 0.3671113	total: 23m 13s	remaining: 49m 34

449:	learn: 0.3529498	total: 35m 9s	remaining: 42m 58s
450:	learn: 0.3527504	total: 35m 14s	remaining: 42m 54s
451:	learn: 0.3524742	total: 35m 18s	remaining: 42m 48s
452:	learn: 0.3521572	total: 35m 23s	remaining: 42m 44s
453:	learn: 0.3521112	total: 35m 27s	remaining: 42m 38s
454:	learn: 0.3520177	total: 35m 32s	remaining: 42m 34s
455:	learn: 0.3518801	total: 35m 37s	remaining: 42m 30s
456:	learn: 0.3517359	total: 35m 42s	remaining: 42m 25s
457:	learn: 0.3517030	total: 35m 46s	remaining: 42m 20s
458:	learn: 0.3515681	total: 35m 51s	remaining: 42m 15s
459:	learn: 0.3514014	total: 35m 56s	remaining: 42m 11s
460:	learn: 0.3513982	total: 36m 2s	remaining: 42m 7s
461:	learn: 0.3513135	total: 36m 6s	remaining: 42m 3s
462:	learn: 0.3511812	total: 36m 11s	remaining: 41m 58s
463:	learn: 0.3511289	total: 36m 15s	remaining: 41m 52s
464:	learn: 0.3510201	total: 36m 19s	remaining: 41m 47s
465:	learn: 0.3510103	total: 36m 23s	remaining: 41m 42s
466:	learn: 0.3508228	total: 36m 28s	remaining: 41m 3

597:	learn: 0.3396331	total: 47m 58s	remaining: 32m 15s
598:	learn: 0.3395640	total: 48m 3s	remaining: 32m 10s
599:	learn: 0.3395188	total: 48m 7s	remaining: 32m 4s
600:	learn: 0.3394801	total: 48m 10s	remaining: 31m 59s
601:	learn: 0.3393690	total: 48m 15s	remaining: 31m 54s
602:	learn: 0.3392662	total: 48m 19s	remaining: 31m 49s
603:	learn: 0.3392139	total: 48m 24s	remaining: 31m 44s
604:	learn: 0.3390447	total: 48m 29s	remaining: 31m 39s
605:	learn: 0.3389485	total: 48m 34s	remaining: 31m 35s
606:	learn: 0.3389263	total: 48m 40s	remaining: 31m 31s
607:	learn: 0.3388262	total: 48m 46s	remaining: 31m 26s
608:	learn: 0.3385589	total: 48m 50s	remaining: 31m 21s
609:	learn: 0.3384624	total: 48m 55s	remaining: 31m 16s
610:	learn: 0.3381566	total: 48m 59s	remaining: 31m 11s
611:	learn: 0.3381257	total: 49m 5s	remaining: 31m 7s
612:	learn: 0.3380101	total: 49m 9s	remaining: 31m 2s
613:	learn: 0.3379362	total: 49m 14s	remaining: 30m 57s
614:	learn: 0.3378770	total: 49m 18s	remaining: 30m 52s

745:	learn: 0.3275042	total: 1h 9s	remaining: 20m 29s
746:	learn: 0.3273716	total: 1h 14s	remaining: 20m 24s
747:	learn: 0.3272947	total: 1h 19s	remaining: 20m 19s
748:	learn: 0.3272304	total: 1h 23s	remaining: 20m 14s
749:	learn: 0.3271122	total: 1h 27s	remaining: 20m 9s
750:	learn: 0.3270813	total: 1h 31s	remaining: 20m 4s
751:	learn: 0.3267954	total: 1h 36s	remaining: 19m 59s
752:	learn: 0.3267497	total: 1h 41s	remaining: 19m 54s
753:	learn: 0.3266968	total: 1h 46s	remaining: 19m 49s
754:	learn: 0.3266814	total: 1h 52s	remaining: 19m 45s
755:	learn: 0.3266067	total: 1h 58s	remaining: 19m 40s
756:	learn: 0.3264770	total: 1h 1m 3s	remaining: 19m 36s
757:	learn: 0.3264543	total: 1h 1m 10s	remaining: 19m 31s
758:	learn: 0.3263644	total: 1h 1m 14s	remaining: 19m 26s
759:	learn: 0.3263316	total: 1h 1m 19s	remaining: 19m 22s
760:	learn: 0.3262668	total: 1h 1m 24s	remaining: 19m 17s
761:	learn: 0.3261508	total: 1h 1m 28s	remaining: 19m 11s
762:	learn: 0.3260772	total: 1h 1m 33s	remaining: 1

888:	learn: 0.3174568	total: 1h 11m 41s	remaining: 8m 57s
889:	learn: 0.3174379	total: 1h 11m 46s	remaining: 8m 52s
890:	learn: 0.3173977	total: 1h 11m 51s	remaining: 8m 47s
891:	learn: 0.3173686	total: 1h 11m 56s	remaining: 8m 42s
892:	learn: 0.3172678	total: 1h 12m	remaining: 8m 37s
893:	learn: 0.3172324	total: 1h 12m 5s	remaining: 8m 32s
894:	learn: 0.3171153	total: 1h 12m 9s	remaining: 8m 27s
895:	learn: 0.3170850	total: 1h 12m 15s	remaining: 8m 23s
896:	learn: 0.3170204	total: 1h 12m 21s	remaining: 8m 18s
897:	learn: 0.3169412	total: 1h 12m 27s	remaining: 8m 13s
898:	learn: 0.3169070	total: 1h 12m 32s	remaining: 8m 9s
899:	learn: 0.3168771	total: 1h 12m 38s	remaining: 8m 4s
900:	learn: 0.3168285	total: 1h 12m 44s	remaining: 7m 59s
901:	learn: 0.3168083	total: 1h 12m 49s	remaining: 7m 54s
902:	learn: 0.3167001	total: 1h 12m 53s	remaining: 7m 49s
903:	learn: 0.3166875	total: 1h 12m 58s	remaining: 7m 44s
904:	learn: 0.3165893	total: 1h 13m 3s	remaining: 7m 40s
905:	learn: 0.3165099	t

<catboost.core.CatBoostClassifier at 0x241e6549ac0>

In [12]:
proba = model.predict_proba(X_test.drop(columns=['ID','ID2'], axis=1))
y_test = pd.DataFrame(proba)
print(y_test.columns)
y_test.columns = le.inverse_transform(y_test.columns)

RangeIndex(start=0, stop=21, step=1)


# SUBMIT

In [13]:
print(y_test.columns)
answer_mass = []
for i in range(X_test.shape[0]):
  id = X_test['ID'].iloc[i]
  for c in y_test.columns:
    answer_mass.append([id + ' X ' + str(c), y_test[c].iloc[i]])
    

df_answer = pd.DataFrame(answer_mass)
df_answer.columns = ['ID X PCODE', 'Label']
for i in range(df_answer.shape[0]):
  if df_answer['ID X PCODE'].iloc[i] in true_values:
    df_answer['Label'].iloc[i] = 1.0

Index(['66FJ', '7POT', '8NN1', 'AHXO', 'BSTQ', 'ECY3', 'FM3X', 'GHYX', 'GYSR',
       'J9JW', 'JWFN', 'JZ9D', 'K6QO', 'LJR9', 'N2MW', 'P5DA', 'PYUQ', 'QBOL',
       'RIBP', 'RVSZ', 'SOP4'],
      dtype='object')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [14]:
df_answer.reset_index(drop=True, inplace=True)
df_answer.to_csv('submission.csv', index=False)