# GET DATA

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import copy

In [2]:
# Read the training set, the test set and the sample submission from the Data/ folder.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in ('Data/Train.csv', 'Data/Test.csv', 'Data/SampleSubmission.csv')
)

# NEW FORMAT

In [3]:
# Expand every training customer into one sample per product flagged 1:
# that product is masked to 0 in the feature vector and its column name
# becomes the label ('product_pred'). 'ID2' is a running sample counter.
X_train = []
X_train_columns = train.columns
c = 0
for v in train.values:
  info = v[:8]    # first 8 columns: ID and customer attributes
  binary = v[8:]  # remaining columns: one 0/1 flag per product
  index = [k for k, i in enumerate(binary) if i == 1]
  for i in index:
    c += 1
    # list() already makes an independent copy, so the source row is never mutated.
    binary_transformed = list(binary)
    binary_transformed[i] = 0
    X_train.append(list(info) + binary_transformed + [X_train_columns[8 + i]] + [c])

# Passing the labels to the constructor also works when no sample was produced.
X_train = pd.DataFrame(X_train, columns=[
       'ID', 'join_date', 'sex', 'marital_status', 'birth_year', 'branch_code',
       'occupation_code', 'occupation_category_code', 'P5DA', 'RIBP', '8NN1',
       '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ', 'LJR9', 'N2MW', 'AHXO',
       'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D', 'J9JW', 'GHYX', 'ECY3', 'product_pred', 'ID2'])

In [4]:
# Keep one row per test customer (features unchanged, 'ID2' = row counter) and
# remember every 'ID X product' pair that is already flagged 1 in the test file.
X_test, true_values = [], []
product_names = test.columns[8:]
c = 0
for c, row in enumerate(test.values, start=1):
  attributes, flags = row[:8], row[8:]
  held = [pos for pos, flag in enumerate(flags) if flag == 1]
  X_test.append(list(attributes) + list(flags) + [c])
  true_values.extend(row[0] + ' X ' + name for name in product_names[held])

X_test = pd.DataFrame(X_test)
X_test.columns = [
       'ID', 'join_date', 'sex', 'marital_status', 'birth_year', 'branch_code',
       'occupation_code', 'occupation_category_code', 'P5DA', 'RIBP', '8NN1',
       '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ', 'LJR9', 'N2MW', 'AHXO',
       'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D', 'J9JW', 'GHYX', 'ECY3', 'ID2']

# TRANSFORM DATA

In [5]:
# Columns carried into the model frames, in their final order.
# 'product_pred' is left out on purpose: it is the label, extracted below.
append_features = [
    'P5DA', 'RIBP', '8NN1', '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ', 'LJR9',
    'N2MW', 'AHXO', 'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D', 'J9JW', 'GHYX',
    'ECY3', 'ID', 'ID2', 'join_date', 'sex', 'marital_status', 'branch_code',
    'occupation_code', 'occupation_category_code', 'birth_year',
]

# One (n_rows, 1) array per selected column, plus a one-element array holding its name.
features_train = [X_train[name].values.reshape(-1, 1) for name in append_features]
features_test = [X_test[name].values.reshape(-1, 1) for name in append_features]
columns = [np.array([name]) for name in append_features]

y_train = X_train[['product_pred']]

In [6]:
# Flatten the per-column name arrays into a single 1-D array of labels.
columns = np.concatenate(columns)

# Glue the column arrays side by side and rebuild both frames with identical labels.
features_train = np.concatenate(features_train, axis=1)
features_test = np.concatenate(features_test, axis=1)
X_train = pd.DataFrame(features_train, columns=columns)
X_test = pd.DataFrame(features_test, columns=columns)

# NEW FEATURES

In [7]:
def _add_date_features(frame):
  """Split 'join_date' into integer parts and derive 'date_diff', modifying `frame` in place.

  'join_date' is split on '/'; its three fields become 'date1', 'date2' and
  'date3' (missing dates stay NaN). The original column is then dropped and
  'date_diff' is set to 'date3' minus 'birth_year'.
  """
  for position, column in enumerate(('date1', 'date2', 'date3')):
    frame[column] = frame['join_date'].apply(
        # `x == x` is False only for NaN, i.e. a missing join date.
        lambda x, position=position: int(x.split('/')[position]) if (x == x) else np.nan
    )
  frame.drop('join_date', axis=1, inplace=True)
  frame['date_diff'] = frame['date3'] - frame['birth_year']


_add_date_features(X_train)
_add_date_features(X_test)

# CHANGE TYPES

In [8]:
# Replace every remaining missing value with 0 in both feature frames and in the target.
X_train, X_test, y_train = (frame.fillna(0) for frame in (X_train, X_test, y_train))

In [9]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# Encode train and test together so a category gets the same integer code in both.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
n_train = X_train.shape[0]
data = pd.concat([X_train, X_test])
for v in ['sex', 'marital_status', 'branch_code', 'occupation_code', 'occupation_category_code',]:
  # The encoder is refitted for every column; only the codes are kept.
  data.loc[:,v] = le.fit_transform(data.loc[:,v])
# Split back by position. Slicing from n_train (rather than with a negative
# offset) stays correct even when the test frame has no rows.
X_train = data.iloc[:n_train]
X_test = data.iloc[n_train:]

In [10]:
# Refit the shared encoder on the product labels and store the integer codes as 'target'.
# `le` keeps this fit, so it can later translate codes back to product names.
product_labels = y_train.iloc[:, 0]
y_train = pd.DataFrame({'target': le.fit(product_labels).transform(product_labels)})

# MODEL

In [None]:
from catboost import CatBoostClassifier

# Columns CatBoost should treat as categorical rather than numeric.
categorical_columns = ['sex','marital_status','branch_code','occupation_code','occupation_category_code']

model = CatBoostClassifier(iterations=2000)
# 'ID' and 'ID2' are row identifiers, so they are excluded from the training features.
model.fit(
    X_train.drop(columns=['ID', 'ID2']),
    y_train,
    cat_features=categorical_columns,
)

Learning rate set to 0.054559
0:	learn: 2.3045876	total: 4.28s	remaining: 2h 22m 33s
1:	learn: 2.0324586	total: 8.02s	remaining: 2h 13m 32s
2:	learn: 1.8435748	total: 12.8s	remaining: 2h 22m 20s
3:	learn: 1.7121058	total: 15.5s	remaining: 2h 9m 13s
4:	learn: 1.5907272	total: 20.1s	remaining: 2h 13m 47s
5:	learn: 1.4920917	total: 24.4s	remaining: 2h 14m 59s
6:	learn: 1.4109468	total: 28.2s	remaining: 2h 13m 35s
7:	learn: 1.3401344	total: 32.1s	remaining: 2h 13m 18s
8:	learn: 1.2774178	total: 35.8s	remaining: 2h 11m 59s
9:	learn: 1.2227315	total: 39.6s	remaining: 2h 11m 29s
10:	learn: 1.1730231	total: 44s	remaining: 2h 12m 40s
11:	learn: 1.1268018	total: 47.8s	remaining: 2h 11m 57s
12:	learn: 1.0843030	total: 51s	remaining: 2h 9m 53s
13:	learn: 1.0449043	total: 54.3s	remaining: 2h 8m 28s
14:	learn: 1.0106200	total: 57.7s	remaining: 2h 7m 21s
15:	learn: 0.9804906	total: 1m 1s	remaining: 2h 7m 23s
16:	learn: 0.9508918	total: 1m 5s	remaining: 2h 7m 51s
17:	learn: 0.9260242	total: 1m 10s	rem

145:	learn: 0.4404635	total: 11m 9s	remaining: 2h 21m 43s
146:	learn: 0.4400884	total: 11m 14s	remaining: 2h 21m 44s
147:	learn: 0.4399542	total: 11m 20s	remaining: 2h 21m 50s
148:	learn: 0.4392280	total: 11m 24s	remaining: 2h 21m 46s
149:	learn: 0.4384105	total: 11m 28s	remaining: 2h 21m 28s
150:	learn: 0.4375344	total: 11m 32s	remaining: 2h 21m 17s
151:	learn: 0.4366661	total: 11m 36s	remaining: 2h 21m 5s
152:	learn: 0.4360923	total: 11m 40s	remaining: 2h 21m
153:	learn: 0.4356064	total: 11m 46s	remaining: 2h 21m 4s
154:	learn: 0.4352912	total: 11m 51s	remaining: 2h 21m 3s
155:	learn: 0.4349360	total: 11m 56s	remaining: 2h 21m 5s
156:	learn: 0.4346473	total: 12m 1s	remaining: 2h 21m 5s
157:	learn: 0.4341978	total: 12m 6s	remaining: 2h 21m 11s
158:	learn: 0.4338346	total: 12m 11s	remaining: 2h 21m 14s
159:	learn: 0.4334461	total: 12m 15s	remaining: 2h 21m
160:	learn: 0.4327345	total: 12m 20s	remaining: 2h 20m 54s
161:	learn: 0.4323253	total: 12m 24s	remaining: 2h 20m 48s
162:	learn: 0

In [None]:
# Class probabilities for every test row; the identifier columns are dropped
# exactly as in the training call (no redundant `axis` next to `columns=`).
proba = model.predict_proba(X_test.drop(columns=['ID', 'ID2']))
y_test = pd.DataFrame(proba)
print(y_test.columns)
# `le` was last fitted on the training target, so this turns the positional
# column labels 0..k-1 into product names.
# NOTE(review): assumes predict_proba column j corresponds to encoded class j —
# confirm against model.classes_.
y_test.columns = le.inverse_transform(y_test.columns)

# SUBMIT

In [None]:
print(y_test.columns)

# One submission row per (test customer, product), customers in test order and
# products in y_test column order: ['<ID> X <product>', predicted probability].
test_ids = X_test['ID'].values
product_codes = [str(code) for code in y_test.columns]
probabilities = y_test.values
answer_mass = [
    [customer_id + ' X ' + product_code, probabilities[row, col]]
    for row, customer_id in enumerate(test_ids)
    for col, product_code in enumerate(product_codes)
]

df_answer = pd.DataFrame(answer_mass, columns=['ID X PCODE', 'Label'])

# Pairs already flagged 1 in the test file are certain, so force their label to 1.0.
# A single .loc assignment replaces the chained `df['Label'].iloc[i] = ...`, which
# may write to a temporary copy (and never updates the frame under Copy-on-Write),
# and isin() avoids scanning the whole `true_values` list once per row.
already_held = df_answer['ID X PCODE'].isin(true_values)
df_answer.loc[already_held, 'Label'] = 1.0

In [None]:
# Renumber the rows from 0 and write the submission file without the index column.
df_answer = df_answer.reset_index(drop=True)
df_answer.to_csv('submission.csv', index=False)