### 1-Imports

In [76]:
import gc
import os

import numpy as np
import optuna
import pandas as pd

import catboost as cb
import lightgbm as lgb
import xgboost as xgb

from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVR

### 2-Test whether the files are loaded

In [4]:
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for file_path in (os.path.join(dirname, name) for name in filenames):
        print(file_path)

/kaggle/input/amexfeather/test_data_f32.ftr
/kaggle/input/amexfeather/train_data.ftr
/kaggle/input/amexfeather/train_data_f32.ftr
/kaggle/input/amexfeather/test_data.ftr
/kaggle/input/amex-default-prediction/sample_submission.csv
/kaggle/input/amex-default-prediction/train_data.csv
/kaggle/input/amex-default-prediction/test_data.csv
/kaggle/input/amex-default-prediction/train_labels.csv


### 3-Read the training dataset in feather dataset
Here I use the Feather version of the Amex dataset because the default CSV dataset cannot be read with pandas.

In [5]:
# Load the full training table from the Feather copy of the dataset and
# display its (rows, columns) shape.
train_dataset_ = pd.read_feather('../input/amexfeather/train_data.ftr')
train_dataset_.shape

(5531451, 191)

### 4-Reduce the dataset size

Keep only the latest statement features for each customer

In [6]:
# Keep a single row per customer: the last row of each customer_ID group,
# re-indexed by customer_ID and sorted on that index.
# NOTE(review): tail(1) takes the last row in file order — presumably the rows
# are already chronological by S_2 within each customer; confirm.
train_data = (
    train_dataset_
    .groupby('customer_ID')
    .tail(1)
    .set_index('customer_ID', drop=True)
    .sort_index()
)
train_data.shape

(458913, 190)

### 5-Remove original dataset

In [7]:
# Drop the full multi-row frame now that train_data holds the reduced copy,
# then trigger garbage collection (the displayed value is gc.collect()'s return).
del train_dataset_
gc.collect()

46

### 6-Do the same for the test data

In [13]:
# Same reduction as for the training data: load the Feather file, keep the last
# row of each customer_ID group, index by customer_ID, then discard the raw frame.
test_data_ = pd.read_feather('../input/amexfeather/test_data.ftr')
test_data = (
    test_data_
    .groupby('customer_ID')
    .tail(1)
    .set_index('customer_ID', drop=True)
    .sort_index()
)
del test_data_
gc.collect()
test_data.shape

(924621, 189)

### 7-Check for missing values

In [14]:
# Number of missing entries in each column of the reduced training frame.
missing_values_count = train_data.isna().sum()
missing_values_count

S_2            0
P_2         2969
D_39           0
B_1            0
B_2           31
           ...  
D_142     378598
D_143       2830
D_144          0
D_145       2830
target         0
Length: 190, dtype: int64

### 8-Separate label and features

In [15]:
# Label is the 'target' column; features are every remaining column.
Y = train_data["target"]
x = train_data.drop(columns=["target"])

In [16]:
# Preview the first labels; the Series is indexed by customer_ID.
Y.head()

customer_ID
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a    0
00000fd6641609c6ece5454664794f0340ad84dddce9a267a310b5ae68e9d8e5    0
00001b22f846c82c51f6e3958ccd81970162bae8b007e80662ef27519fcc18c1    0
000041bdba6ecadd89a52d11886e8eaaec9325906c9723355abb5ca523658edc    0
00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8ad51ca8b8c4a24cefed    0
Name: target, dtype: int64

### 9-Look for features with 80% or more missing values

In [17]:
# Percentage of missing entries per feature, rounded to two decimals.
missing_values = x.isnull().sum().to_numpy()
missing_percentages = np.round(missing_values * 100 / len(x), 2)

# Split the feature names (in column order) by the 80% threshold.
mostly_missing = missing_percentages >= 80
columns_missing_values_80 = list(x.columns[mostly_missing])
fillable_columns = list(x.columns[~mostly_missing])
print(columns_missing_values_80)

['D_42', 'D_49', 'D_66', 'D_73', 'D_76', 'R_9', 'B_29', 'D_87', 'D_88', 'D_106', 'R_26', 'D_108', 'D_110', 'D_111', 'B_39', 'B_42', 'D_132', 'D_134', 'D_135', 'D_136', 'D_137', 'D_138', 'D_142']


### 10-Remove the columns filtered above, along with S_2 (the date column)

In [18]:
# Drop the mostly-missing columns together with S_2 from the training features.
# Not idempotent: re-running this cell fails because the columns are already gone.
removable_columns = [*columns_missing_values_80, 'S_2']
x = x.drop(columns=removable_columns)
x.shape

(458913, 165)

### 11-Drop the selected columns (80% or more missing values) from the test data

In [19]:
# Remove the same columns from the test set so it matches the training features.
test_data = test_data.drop(columns=removable_columns)
test_data.shape

(924621, 165)

### 12-Fill missing values

#### For numerical columns, use the median

In [20]:
# Hard-coded list of the numerical feature columns that get median imputation.
numerical_columns = np.array(['P_2','S_3','B_2','D_41','D_43','B_3','D_44','D_45','D_46','D_48','D_50','D_53','S_7','D_56','S_9','B_6','B_8','D_52','P_3','D_54','D_55','B_13','D_59','D_61','B_15','D_62','B_16','B_17','D_77','B_19','B_20','D_69','B_22','D_70','D_72','D_74','R_7','B_25','B_26','D_78','D_79','D_80','B_27','D_81','R_12','D_82','D_105','S_27','D_83','R_14','D_84','D_86','R_20','B_33','D_89','D_91','S_22','S_23','S_24','S_25','S_26','D_102','D_103','D_104','D_107','B_37','R_27','D_109','D_112','B_40','D_113','D_115','D_118','D_119','D_121','D_122','D_123','D_124','D_125','D_128','D_129','B_41','D_130','D_131','D_133','D_139','D_140','D_141','D_143','D_144','D_145', 'S_12', 'S_17'])
# Replace missing values in each listed training column with that column's median.
for col in numerical_columns:
    x[col] = x[col].fillna(x[col].median())

In [21]:
# Impute the test set with medians taken from the training features, so no
# statistic is computed on the test data itself.
for col in numerical_columns:
    train_median = x[col].median()
    test_data[col] = test_data[col].fillna(train_median)

#### For categorical columns, used mode

In [22]:
# Categorical feature columns that get mode imputation.
categorical_columns = np.array([
    'D_68', 'B_30', 'B_38', 'D_64', 'D_114',
    'D_116', 'D_117', 'D_120', 'D_126',
])

# Fill gaps in each training column with the first of its most frequent values.
for col in categorical_columns:
    most_frequent = x[col].mode()[0]
    x[col] = x[col].fillna(most_frequent)

In [23]:

# Impute the test set with the modes of the training features.
for col in categorical_columns:
    train_mode = x[col].mode()[0]
    test_data[col] = test_data[col].fillna(train_mode)

In [24]:
# Print the remaining missing-value count for every test column;
# to_string() prevents pandas from eliding rows in the middle of the listing.
print(test_data.isnull().sum().to_string())

P_2      0
D_39     0
B_1      0
B_2      0
R_1      0
S_3      0
D_41     0
B_3      0
D_43     0
D_44     0
B_4      0
D_45     0
B_5      0
R_2      0
D_46     0
D_47     0
D_48     0
B_6      0
B_7      0
B_8      0
D_50     0
D_51     0
B_9      0
R_3      0
D_52     0
P_3      0
B_10     0
D_53     0
S_5      0
B_11     0
S_6      0
D_54     0
R_4      0
S_7      0
B_12     0
S_8      0
D_55     0
D_56     0
B_13     0
R_5      0
D_58     0
S_9      0
B_14     0
D_59     0
D_60     0
D_61     0
B_15     0
S_11     0
D_62     0
D_63     0
D_64     0
D_65     0
B_16     0
B_17     0
B_18     0
B_19     0
B_20     0
D_68     0
S_12     0
R_6      0
S_13     0
B_21     0
D_69     0
B_22     0
D_70     0
D_71     0
D_72     0
S_15     0
B_23     0
P_4      0
D_74     0
D_75     0
B_24     0
R_7      0
D_77     0
B_25     0
B_26     0
D_78     0
D_79     0
R_8      0
S_16     0
D_80     0
R_10     0
R_11     0
B_27     0
D_81     0
D_82     0
S_17     0
R_12     0
B_28     0
R_13     0

### 13-Ordinal encodings for categorical columns

In [25]:
# Columns to ordinal-encode. D_66 is filtered out because it was removed with
# the mostly-missing columns earlier in the notebook.
categorical_cols = [
    col
    for col in ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126',
                'D_63', 'D_64', 'D_66', 'D_68']
    if col != 'D_66'
]

# Fit the encoder on the training features and replace the columns in place.
enc = OrdinalEncoder()
x[categorical_cols] = enc.fit_transform(x[categorical_cols])

In [26]:
# Encode the test set with the encoder fitted on the training features
# (transform only, no refit).
test_data[categorical_cols] = enc.transform(test_data[categorical_cols])

### 14-Standardize datasets

In [27]:
scaler = StandardScaler()

# Learn the scaling statistics from the training features only, then scale them.
# The same fitted scaler is reused for the test set in a later cell.
# x is rebound from a DataFrame to the transformed array here.
x = scaler.fit(x).transform(x)

In [28]:
# Save the customer IDs from the index before the next cell replaces test_data
# with the scaler's output; they are needed later to build the submission files.
customer_IDs = test_data.index.values

In [29]:
# Scale the test set with the statistics learned from the training features.
test_data = scaler.transform(test_data)

### 15-PCA

In [30]:
# Shape of the training features before PCA.
x.shape

(458913, 165)

In [31]:
# A float n_components asks PCA to keep as many components as needed to
# explain that fraction (95%) of the variance.
pca = PCA(.95)

# Fit on training set only.
pca.fit(x)
# Project the training features onto the retained components.
x = pca.transform(x)

In [32]:
# Shape of the training features after PCA.
x.shape

(458913, 111)

In [33]:
# Shape of the scaled test set before the PCA projection.
test_data.shape

(924621, 165)

In [34]:
# Display the scaled test array (NumPy elides the middle of large arrays).
test_data

array([[-0.25011797, -0.19915317, -0.55454289, ..., -0.44946748,
        -0.26547586, -0.30640945],
       [ 0.77145726, -0.18655624, -0.52956973, ..., -0.4678029 ,
        -0.28439404, -0.28321486],
       [ 0.23320794, -0.4942514 , -0.59473664, ...,  2.18438644,
         2.22898916,  0.14552584],
       ...,
       [-1.60901441, -0.49844405, -0.51485635, ..., -0.45211838,
        -0.25911566, -0.31200914],
       [-0.24279485, -0.41595693, -0.38593284, ..., -0.46343995,
        -0.2446559 , -0.28736854],
       [-0.67852048, -0.49007772, -0.59966034, ..., -0.45338359,
        -0.25035598, -0.30679764]])

In [35]:
# Project the test set with the PCA fitted on the training features.
test_data = pca.transform(test_data)

In [36]:
# Shape of the test set after the PCA projection.
test_data.shape

(924621, 111)

### 16-Split the training dataset

In [37]:
# Hold out 30% of the training rows for validation. The split is stratified on
# the label and uses a fixed random_state so it is reproducible.
# train_test_split is imported in the imports cell at the top of the notebook.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    Y,
    test_size=0.3,
    random_state=42,
    stratify=Y,
)

### 17-Amex metric for validation

In [38]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Score predictions with the metric used for validation in this notebook.

    The result is the mean of two components, both computed with rows whose
    target is 0 weighted 20 and all other rows weighted 1:

    * the normalized weighted Gini coefficient of the ranking, and
    * the share of target == 1 rows found within the first 4% of the total
      weight when rows are ordered by descending prediction.

    Parameters
    ----------
    y_true : pd.DataFrame
        Frame with a 'target' column.
    y_pred : pd.DataFrame
        Frame with a 'prediction' column. It is joined to ``y_true``
        column-wise with ``pd.concat``, so both frames must share an index.

    Returns
    -------
    float
        ``0.5 * (normalized_gini + top_four_percent_capture_rate)``.
    """

    def _ranked_with_weights(labels: pd.DataFrame, scores: pd.DataFrame) -> pd.DataFrame:
        # Join labels and scores, order by descending score, attach row weights.
        ranked = pd.concat([labels, scores], axis='columns')
        ranked = ranked.sort_values('prediction', ascending=False)
        ranked['weight'] = ranked['target'].apply(lambda t: 20 if t == 0 else 1)
        return ranked

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        ranked = _ranked_with_weights(y_true, y_pred)
        four_pct_cutoff = int(0.04 * ranked['weight'].sum())
        inside_cutoff = ranked['weight'].cumsum() <= four_pct_cutoff
        is_positive = ranked['target'] == 1
        return (is_positive & inside_cutoff).sum() / is_positive.sum()

    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        ranked = _ranked_with_weights(y_true, y_pred)
        weight = ranked['weight']
        weighted_positives = ranked['target'] * weight
        # Cumulative weight share of a random ordering vs. cumulative share of
        # weighted positives actually found by this ordering.
        random_share = (weight / weight.sum()).cumsum()
        lorentz = weighted_positives.cumsum() / weighted_positives.sum()
        return ((lorentz - random_share) * weight).sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # Normalize by the Gini obtained when the labels themselves are the scores.
        perfect_scores = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, perfect_scores)

    gini = normalized_weighted_gini(y_true, y_pred)
    captured = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (gini + captured)

## **Models**

### 18-Support Vector Machine

In [71]:
# LinearSVR is used instead of sklearn's SVR because this dataset is large and
# LinearSVR trains faster on it.
# Candidate C values were picked by hand from earlier runs.
param_grid = {'C': [6, 7, 8]}
clf = LinearSVR(random_state=42)

# Grid search over C with 2-fold cross-validation on all cores; keep the
# estimator refitted with the best C.
svc_random = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    cv=2,
    verbose=0,
    n_jobs=-1,
)
svc_random.fit(x_train, y_train)
train_model_SVR = svc_random.best_estimator_



In [72]:
# Score the hold-out split. Both frames get a fresh default index so that
# amex_metric can join them row by row.
predictions = train_model_SVR.predict(x_test)
predictions_df = pd.DataFrame({'prediction': predictions})
y_test_df = pd.DataFrame({'target': y_test.to_numpy()})
amex_value_SVR = amex_metric(y_test_df, predictions_df)

In [73]:
# amex_metric of the LinearSVR model on the hold-out split.
amex_value_SVR

0.5574198592191982

### 19-KNN

In [78]:
# KNN gets its own split: it is fitted on 10% of the rows and scored on the
# remaining 90%.
# NOTE(review): this differs from the 70/30 split used for the other models,
# so the KNN score is not measured on the same hold-out rows.
x_train_knn, x_test_knn, y_train_knn, y_test_knn = train_test_split(
    x,
    Y,
    train_size=0.1,
    random_state=42,
    stratify=Y,
)
train_model_knn = KNeighborsRegressor(n_neighbors=3)
train_model_knn.fit(x_train_knn, y_train_knn)

predictions = train_model_knn.predict(x_test_knn)
predictions_df = pd.DataFrame({'prediction': predictions})
y_test_df = pd.DataFrame({'target': y_test_knn.to_numpy()})
amex_value_KNN = amex_metric(y_test_df, predictions_df)

In [79]:
# amex_metric of the KNN model on its own 90% hold-out split.
amex_value_KNN

0.6042892971838837

### 20-XGBoost

In [44]:
# XGBoost regressor with a squared-error objective; its raw outputs are used as
# ranking scores for amex_metric.
# NOTE(review): newer XGBoost releases deprecate tree_method="gpu_hist" in
# favour of tree_method="hist" with device="cuda" — confirm against the
# installed version.
modelXGB = xgb.XGBRegressor(
    learning_rate=0.02,
    n_estimators=15,
    objective="reg:squarederror",
    nthread=3,
    tree_method="gpu_hist"  # this enables GPU.
)

#### Predict test set split of training set

In [45]:
# Fit on the training split and predict the hold-out split.
# NOTE(review): fit() presumably returns the estimator itself, so
# train_model_XGB and modelXGB would be the same object — confirm.
train_model_XGB = modelXGB.fit(x_train, y_train)
predictions = train_model_XGB.predict(x_test)
predictions

array([0.37006435, 0.43671736, 0.571397  , ..., 0.39499098, 0.6276235 ,
       0.37807113], dtype=float32)

#### Evaluate

In [46]:
# Wrap labels and predictions in frames with the column names amex_metric
# expects; .values discards the customer_ID index so both share a default index.
y_test_df = pd.DataFrame({'target': y_test.values})
predictions_df = pd.DataFrame({'prediction': predictions})

In [47]:
# amex_metric of the XGBoost model on the hold-out split.
amex_value_XGB = amex_metric(y_test_df, predictions_df)
amex_value_XGB

0.7316698122563788

#### Real test set

In [48]:
# Refit the same modelXGB instance on all training rows and predict the real
# test set.
# NOTE(review): refitting modelXGB presumably also changes train_model_XGB if
# both names refer to the same estimator — confirm before reusing it.
train_model_XGB_real = modelXGB.fit(x, Y)
predictions_real = train_model_XGB_real.predict(test_data)
predictions_real

In [49]:
# Pair each saved customer ID with its XGBoost prediction for the submission.
prediction_real_df = pd.DataFrame({'customer_ID': customer_IDs, 'prediction': predictions_real})
prediction_real_df.head()

In [50]:
# Write the XGBoost submission file without the frame's index column.
prediction_real_df.to_csv('submission.csv', index=False)

### 21-LightGBM

In [51]:
# LightGBM regressor with library-default hyperparameters, fitted on the
# training split.
train_model_LGBM = lgb.LGBMRegressor()
train_model_LGBM.fit(x_train, y_train)

LGBMRegressor()

In [52]:
# Predict the hold-out split; the outputs are unbounded regression scores
# (the displayed sample includes a value above 1).
predictions = train_model_LGBM.predict(x_test)
predictions

array([0.00738904, 0.31322581, 0.77325805, ..., 0.09681491, 1.03475812,
       0.00917068])

In [53]:
# Wrap labels and predictions in frames with the column names amex_metric
# expects; .values discards the customer_ID index so both share a default index.
y_test_df = pd.DataFrame({'target': y_test.values})
predictions_df = pd.DataFrame({'prediction': predictions})

In [54]:
# amex_metric of the LightGBM model on the hold-out split.
amex_value_LGBM = amex_metric(y_test_df, predictions_df)
amex_value_LGBM

0.7609647614422164

In [55]:
# Refit LightGBM on all training rows, predict the real test set and write the
# LightGBM submission file. This overwrites predictions_real and
# prediction_real_df from the XGBoost section.
train_model_LGBM.fit(x, Y)
predictions_real = train_model_LGBM.predict(test_data)
prediction_real_df = pd.DataFrame({'customer_ID': customer_IDs, 'prediction': predictions_real})
prediction_real_df.to_csv('submission_lgbm.csv', index=False)

### 22-CatBoost

In [56]:
# CatBoost regressor with default hyperparameters (training log silenced),
# fitted on the training split.
train_model_CB = cb.CatBoostRegressor(verbose=False)
train_model_CB.fit(x_train, y_train)

<catboost.core.CatBoostRegressor at 0x7fae6515b0d0>

In [67]:
# Predict the hold-out split; the outputs are unbounded regression scores
# (the displayed sample includes values below 0 and above 1).
predictions = train_model_CB.predict(x_test)
predictions

array([ 0.00516924,  0.28681417,  0.85095616, ...,  0.08526685,
        1.02116478, -0.01824901])

In [68]:
# Wrap labels and predictions in frames with the column names amex_metric
# expects; .values discards the customer_ID index so both share a default index.
y_test_df = pd.DataFrame({'target': y_test.values})
predictions_df = pd.DataFrame({'prediction': predictions})

In [69]:
# amex_metric of the CatBoost model on the hold-out split.
amex_value_CB = amex_metric(y_test_df, predictions_df)
amex_value_CB

0.7676634510874343

In [None]:
# NOTE(review): predictions_real was last assigned in the LightGBM section, so
# this frame holds LightGBM predictions, not CatBoost ones. The following cell
# rebuilds it from CatBoost output; this cell looks like a leftover — confirm
# and remove.
prediction_real_df = pd.DataFrame({'customer_ID': customer_IDs, 'prediction': predictions_real})

In [None]:
# Refit CatBoost on all training rows and predict the real test set.
train_model_CB.fit(x, Y)
predictions_real = train_model_CB.predict(test_data)
prediction_real_df = pd.DataFrame({'customer_ID': customer_IDs, 'prediction': predictions_real})
# Write to a CatBoost-specific file. The previous name, 'submission_lgbm.csv',
# overwrote the LightGBM submission produced in the LightGBM section.
prediction_real_df.to_csv('submission_catboost.csv', index=False)

### 23-CatBoost with Optuna

In [None]:
def objective(trial):
    """Optuna objective: fit CatBoost with sampled settings and score it.

    Hyperparameters are drawn from ``trial``, a ``CatBoostRegressor`` is fitted
    on the notebook-level ``x_train`` / ``y_train`` and scored with
    ``amex_metric`` on ``x_test`` / ``y_test``. The score is printed and
    returned.

    NOTE(review): no eval_set is passed to ``fit``, so the sampled ``od_type``
    and ``od_wait`` settings may have no effect — confirm against CatBoost's
    overfitting-detector documentation.
    """
    sampled_params = {
        "iterations": trial.suggest_int("iterations", 100, 1000),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
        "depth": trial.suggest_int("depth", 4, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-8, 100.0, log=True),
        "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian"]),
        "random_strength": trial.suggest_float("random_strength", 1e-8, 10.0, log=True),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 10.0),
        "od_type": trial.suggest_categorical("od_type", ["IncToDec", "Iter"]),
        "od_wait": trial.suggest_int("od_wait", 10, 50),
    }
    model = cb.CatBoostRegressor(verbose=False, **sampled_params)
    model.fit(x_train, y_train)

    # Fresh default indexes on both frames so amex_metric can join them by row.
    holdout_truth = pd.DataFrame({'target': y_test.values})
    holdout_scores = pd.DataFrame({'prediction': model.predict(x_test)})

    score = amex_metric(holdout_truth, holdout_scores)
    print(score)
    return score

In [None]:
# Only show Optuna warnings and errors; per-trial scores are printed by objective().
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Seeded TPE sampler, maximizing the value returned by objective over 100 trials.
sampler = optuna.samplers.TPESampler(seed=1)
study = optuna.create_study(study_name="catboost", direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=100)

### **Validation of Results**

For model validation, *amex_metric* was used because it is the metric suggested in the evaluation section of the given question.
The results on the validation splits (from train_test_split) are as follows:

In [80]:
# One (model name, hold-out amex_metric) pair per model, in the order trained.
model_scores = [
    ('Support Vector Regressor', amex_value_SVR),
    ('KNN', amex_value_KNN),
    ('XGBoost', amex_value_XGB),
    ('LightGBM', amex_value_LGBM),
    ('CatBoost', amex_value_CB),
]
validation_results = pd.DataFrame(model_scores, columns=['Model', 'amex_metric Value'])
validation_results

Unnamed: 0,Model,amex_metric Value
0,Support Vector Regressor,0.55742
1,KNN,0.604289
2,XGBoost,0.73167
3,LightGBM,0.760965
4,CatBoost,0.767663
