In [None]:
import gc
from pathlib import Path

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from catboost import CatBoostClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# data viz
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams

# styling
plt.style.use("ggplot")
rcParams['figure.figsize'] = (12,  6)

RANDOM_SEED = 6    # Set a random seed for reproducibility!

In [None]:
# Free up memory
gc.collect()

# Initial Details

In [None]:
cat_columns = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']

# Reduce Dataset Size

In [None]:
# Read the training labels and replace the string customer_ID with an int64
# built from its last 16 hexadecimal digits.
targets = pd.read_csv('/kaggle/input/amex-default-prediction/train_labels.csv').assign(
    customer_ID=lambda frame: frame['customer_ID']
    .apply(lambda cid: int(cid[-16:], 16))
    .astype('int64')
)

print(f"Target Shape: {targets.shape}")

# Persist the compact labels as parquet for the later cells.
targets.to_parquet('/kaggle/working/train_labels.pqt')

In [None]:
def reduce_size(df, cat_columns):
    """Shrink the memory footprint of a raw statement frame.

    The passed frame is modified in place while the columns are converted;
    the returned frame is a sorted copy of it.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows containing a string ``customer_ID``, a date column ``S_2``,
        the string-coded columns ``D_63`` / ``D_64`` and the numeric-coded
        categorical columns selected from ``cat_columns``.
    cat_columns : list of str
        Categorical column names. The first seven and the last two entries
        are treated as numeric codes and shifted by fixed offsets, so the
        order of this list matters.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by ``customer_ID``, ``year``, ``month``, ``day`` with a
        fresh 0..n-1 index.
    """
    # Customer id: keep only the last 16 hex digits, stored as int64.
    df['customer_ID'] = (
        df['customer_ID'].apply(lambda cid: int(cid[-16:], 16)).astype('int64')
    )

    # Statement date: split into compact int8 parts, then discard the original.
    statement_date = pd.to_datetime(df['S_2'])
    df['year'] = (statement_date.dt.year - 2000).astype('int8')
    df['month'] = statement_date.dt.month.astype('int8')
    df['day'] = statement_date.dt.day.astype('int8')
    del df['S_2']

    # String-coded categoricals: label-encode, with 1 reserved for
    # missing or unmapped values.
    string_codes = {
        'D_63': {'CL': 2, 'CO': 3, 'CR': 4, 'XL': 5, 'XM': 6, 'XZ': 7},
        'D_64': {'-1': 2, 'O': 3, 'R': 4, 'U': 5},
    }
    for name, codes in string_codes.items():
        df[name] = df[name].map(codes).fillna(1).astype('int8')

    # Numeric-coded categoricals: shift each by its own offset and use 1 for
    # missing values.
    shifted_columns = cat_columns[:7] + cat_columns[-2:]
    offsets = (2, 1, 2, 2, 3, 2, 3, 2, 2)
    for name, offset in zip(shifted_columns, offsets):
        df[name] = (df[name] + offset).fillna(1).astype('int8')

    # Every remaining 64-bit column (except the customer id) is halved.
    for name in df.columns:
        if name == 'customer_ID':
            continue
        dtype_name = str(df[name].dtype)
        if dtype_name == 'int64':
            df[name] = df[name].astype('int32')
        elif dtype_name == 'float64':
            df[name] = df[name].astype('float32')

    # Chronological order within each customer (ascending).
    ordered = df.sort_values(['customer_ID', 'year', 'month', 'day'])
    return ordered.reset_index(drop=True)

In [None]:
# Stream the training CSV in chunks of 500000 rows, shrink each chunk with
# reduce_size and append it to a single parquet file.
pq_writer = None
try:
    for idx, df in enumerate(pd.read_csv('/kaggle/input/amex-default-prediction/train_data.csv', chunksize=500000)):
        print(df.shape)
        df = reduce_size(df, cat_columns)

        table = pa.Table.from_pandas(df)
        if pq_writer is None:
            # The first chunk's schema defines the schema of the whole file.
            pq_writer = pq.ParquetWriter('/kaggle/working/train_data.pqt', table.schema)
        pq_writer.write_table(table)
finally:
    # Close the writer even when a chunk fails, so the file handle is not
    # left open for the rest of the kernel session.
    if pq_writer is not None:
        pq_writer.close()

In [None]:
del targets
gc.collect()

In [None]:
# Load test data reduce size and write to a parquet file
# chunksize = 3000000 rows
# Each chunk is written to its own file, /kaggle/working/test_data_<idx>.pqt,
# where <idx> is the 0-based chunk number; the prediction cell reads these
# files back.
# NOTE(review): reduce_size sorts within a chunk only, so a customer's rows
# can presumably end up split across two consecutive chunk files — confirm
# that downstream code accounts for this.
for idx, df in enumerate(pd.read_csv('/kaggle/input/amex-default-prediction/test_data.csv', chunksize=3000000)):
    print(df.shape)
    df = reduce_size(df, cat_columns)
    
    df.to_parquet(f'/kaggle/working/test_data_{idx}.pqt')

# Load Prepared Dataset

In [None]:
# Load training data
train = pd.read_parquet('/kaggle/working/train_data.pqt')
train.shape

In [None]:
# Load training labels
train_labels = pd.read_parquet('/kaggle/working/train_labels.pqt')
train_labels.shape

# EDA

In [None]:
train.describe()

In [None]:
train.info(verbose=True, show_counts=True)

In [None]:
# Print duplicate rows count
train.duplicated().sum()

In [None]:
train_labels.target.value_counts(normalize=True)

In [None]:
# Bar chart of the number of label rows per target class.
ax = train_labels['target'].value_counts().plot.bar(rot=0)
ax.set_title("Value counts of the target variable")
ax.set_xlabel("Default or not")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Attach each customer's target to every one of their statement rows and
# compute the pairwise correlation matrix of all columns.
temp_df = train.merge(train_labels, on='customer_ID', how='left')
crr = temp_df.corr()

# The merged frame is a second full copy of the training data and is only
# needed for the correlation matrix, so release it straight away.
del temp_df
gc.collect();

In [None]:
# For every pair of columns (earlier, later) in crr whose correlation is
# above 0.6, record the earlier column. A column appears once per partner,
# so the list may contain duplicates.
cols = list(crr.columns)
cor_cols = [
    earlier
    for position, later in enumerate(cols)
    for earlier in cols[:position]
    if crr.loc[later, earlier] > 0.6
]

In [None]:
cor_cols = set(cor_cols)
print(len(cor_cols))
list(cor_cols)

In [None]:
# Percentage of missing values in each training column.
nan_percentage = (train.isna().sum() / len(train.index)) * 100
# Columns where more than 80% of the rows are missing.
drop_columns = nan_percentage[nan_percentage > 80].index.tolist()
drop_columns

# Feature Engineering

In [None]:
def feature_eng(df, labels, is_train=True):
    """Turn statement-level rows into one feature row per customer.

    Steps: drop sparse and redundant columns, impute (median) and min-max
    scale the numeric columns, aggregate each customer's rows, and, for
    training data, attach the target.

    Numeric columns are averaged over a customer's rows. Categorical columns
    (``cat_columns``) keep the value of the customer's most recent row:
    averaging the integer label codes would produce meaningless fractional
    values, and CatBoost does not accept floating-point columns as
    ``cat_features``.

    The imputers and scalers fitted during a training call are remembered on
    the function object and reused by later ``is_train=False`` calls, so test
    data is transformed with the training statistics instead of being
    rescaled per chunk. If no training call has happened yet, a test call
    fits on the frame it is given.

    Parameters
    ----------
    df : pandas.DataFrame
        Statement rows with ``customer_ID``, ``year``, ``month``, ``day`` and
        the feature columns. The caller's frame is not modified.
    labels : pandas.DataFrame or None
        Frame with ``customer_ID`` and ``target`` columns. Only read when
        ``is_train`` is true.
    is_train : bool, default True
        Whether to fit the preprocessing on ``df`` and attach ``target``.

    Returns
    -------
    pandas.DataFrame
        One row per customer, indexed by ``customer_ID``, with a trailing
        ``target`` column when ``is_train`` is true.

    Raises
    ------
    ValueError
        If ``is_train`` is true and a customer in ``df`` has no row in
        ``labels``.
    """
    key_columns = ['customer_ID', 'year', 'month', 'day']

    # Drop Columns With High NaN Rates
    high_nan_columns = ['D_42','D_49','D_73','D_76','R_9','B_29','D_87','D_88','D_106','R_26','D_108','D_110','D_111','B_39','B_42','D_132','D_134','D_135','D_136','D_137','D_138','D_142']

    # Remove Highly Correlated Features => threshold = 0.8
    # NOTE(review): the correlation scan earlier in the notebook uses a 0.6
    # threshold; this list is hard-coded, so confirm which threshold produced it.
    correlated_columns = ['D_74','B_11','B_18','S_22','D_103','D_104','D_48','D_58','B_1','B_7','S_3','B_12','D_131','D_118','D_141','D_115','B_16','D_139','D_62','B_14','B_2','D_79']

    # drop() returns a new frame, so no separate copy of df is needed.
    df_local = df.drop(columns=high_nan_columns + correlated_columns)

    # Handle Missing Values (median) and Scaling (min-max), column by column
    skips = cat_columns + key_columns
    numeric_columns = [col for col in df_local.columns if col not in skips]

    stored = getattr(feature_eng, '_fitted_preprocessing', {})
    fitted = {}
    for col in numeric_columns:
        if not is_train and col in stored:
            # Reuse the statistics learned from the training data.
            imputer, scaler = stored[col]
            values = scaler.transform(imputer.transform(df_local[[col]]))
        else:
            imputer = SimpleImputer(strategy='median')
            scaler = MinMaxScaler()
            values = scaler.fit_transform(imputer.fit_transform(df_local[[col]]))
            fitted[col] = (imputer, scaler)
        df_local[col] = values.ravel()
    if is_train:
        feature_eng._fitted_preprocessing = fitted

    # Collapse the time series of each customer into a single row:
    # mean for numeric columns, most recent value for categorical columns.
    df_local = df_local.sort_values(key_columns)
    feature_order = [col for col in df_local.columns if col not in key_columns]
    categorical_columns = [col for col in cat_columns if col in df_local.columns]
    per_customer = df_local.groupby('customer_ID')
    df_local = pd.concat(
        [
            per_customer[numeric_columns].mean(),
            per_customer[categorical_columns].last(),
        ],
        axis=1,
    )[feature_order]  # restore the original column order

    if is_train:
        # Join labels and data by customer_ID rather than by row position.
        target_by_customer = labels.set_index('customer_ID')['target']
        unlabeled = df_local.index.difference(target_by_customer.index)
        if len(unlabeled) > 0:
            raise ValueError(
                f"{len(unlabeled)} customer_ID values in df have no matching label"
            )
        df_local['target'] = target_by_customer.reindex(df_local.index)

    return df_local

In [None]:
train = feature_eng(train, train_labels)

In [None]:
train.shape

# Model implementation

In [None]:
# Hold out 20% of the rows of `train` as a local validation set.
# The split is shuffled, stratified on the target so both parts keep the same
# class balance, and seeded with RANDOM_SEED so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train.drop(columns=['target']),
    train['target'],
    test_size=0.2,
    shuffle=True,
    stratify=train['target'],
    random_state=RANDOM_SEED
)

In [None]:
# CatBoost classifier with 3000 trees. The seed declared at the top of the
# notebook is passed in so that repeated runs train the same model.
model = CatBoostClassifier(
    n_estimators=3000,
    verbose=False,
    loss_function="CrossEntropy",
    random_seed=RANDOM_SEED,
)
# To train on all labelled rows instead of the training split, fit on
# train.drop(columns=['target']) and train['target'].
model.fit(X_train, y_train, cat_features=cat_columns)

# Evaluation

In [None]:
# COMPETITION METRIC FROM Konstantin Yakovlev
# https://www.kaggle.com/kyakovlev
# https://www.kaggle.com/competitions/amex-default-prediction/discussion/327534
def amex_metric_mod(y_true, y_pred):
    """Compute the competition metric for binary labels and scores.

    The result is the mean of two terms:

    * the share of all positives found among the highest-scored rows that
      make up 4% of the total weight (negatives weigh 20, positives 1);
    * the weighted Gini of the ranking by ``y_pred`` divided by the weighted
      Gini of the ranking by ``y_true``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels (0 or 1).
    y_pred : array-like
        Predicted scores, same length as ``y_true``.

    Returns
    -------
    float
    """
    # Two columns: column 0 holds the labels, column 1 the scores.
    pairs = np.array([y_true, y_pred]).T

    def ranked_by(column):
        # Rows ordered by the chosen column, largest value first.
        return pairs[pairs[:, column].argsort()[::-1]]

    def class_weights(rows):
        # Negatives count 20 times as much as positives.
        return np.where(rows[:, 0] == 0, 20, 1)

    def weighted_gini(rows):
        w = class_weights(rows)
        random_curve = np.cumsum(w / np.sum(w))
        weighted_positives = rows[:, 0] * w
        lorentz = np.cumsum(weighted_positives) / np.sum(weighted_positives)
        return np.sum((lorentz - random_curve) * w)

    by_score = ranked_by(1)

    # Positives captured within the top 4% of cumulative weight.
    score_weights = class_weights(by_score)
    cutoff = int(0.04 * np.sum(score_weights))
    captured = by_score[np.cumsum(score_weights) <= cutoff]
    top_four = np.sum(captured[:, 0]) / np.sum(by_score[:, 0])

    # Gini of the model ranking, normalised by the Gini of a perfect ranking.
    gini_ratio = weighted_gini(by_score) / weighted_gini(ranked_by(0))

    return 0.5 * (gini_ratio + top_four)

In [None]:
# Score the held-out split with the competition metric.
# NOTE(review): assumes column 1 of predict_proba is the probability of
# target == 1 — confirm against model.classes_.
preds = model.predict_proba(X_test)
amex_metric_mod(y_test, preds[:,1])

In [None]:
# Predict every test chunk found in the working directory. The chunk files
# are discovered on disk instead of assuming a fixed number of them, so the
# cell keeps working if the chunk size or the test set changes.
chunk_paths = sorted(
    Path('/kaggle/working').glob('test_data_*.pqt'),
    key=lambda path: int(path.stem.rsplit('_', 1)[-1]),  # numeric chunk order
)
if not chunk_paths:
    raise FileNotFoundError(
        "No test_data_*.pqt files found in /kaggle/working; "
        "run the test-data reduction cell first"
    )

predictions = []
for chunk_path in chunk_paths:
    idx = int(chunk_path.stem.rsplit('_', 1)[-1])
    df = pd.read_parquet(chunk_path)
    test = feature_eng(df, None, is_train=False)
    preds = model.predict_proba(test)
    # 'mapping' is the hashed customer_ID carried in the index of `test`.
    temp_df = pd.DataFrame(np.expand_dims(preds[:,1], axis=1), columns=['preds'])
    temp_df['mapping'] = list(test.index)
    temp_df.to_parquet(f'/kaggle/working/pred_{idx}.pqt')
    predictions.append(temp_df)

    del df, test, preds, temp_df
    gc.collect()
    print(f'Chunk is finished: {idx}')

# A customer that shows up in more than one chunk gets one prediction per
# chunk; those predictions are averaged into a single row here.
predictions_df = pd.concat(predictions)
predictions_df = predictions_df.groupby('mapping', as_index=False).mean()
predictions_df = predictions_df.sort_values(['mapping']).reset_index(drop=True)
predictions_df.to_parquet('/kaggle/working/predictions.pqt')

In [None]:
predictions_df = pd.read_parquet('/kaggle/working/predictions.pqt')

In [None]:
# Build the submission: hash the sample file's customer_ID the same way as
# the data, look up each customer's prediction, then drop the helper columns.
sub = (
    pd.read_csv('/kaggle/input/amex-default-prediction/sample_submission.csv')
    .assign(
        mapping=lambda frame: frame['customer_ID']
        .apply(lambda cid: int(cid[-16:], 16))
        .astype('int64')
    )
    .merge(predictions_df, on='mapping', how='left')
    .assign(prediction=lambda frame: frame['preds'])
    .drop(columns=['mapping', 'preds'])
)

In [None]:
sub

In [None]:
sub.to_csv('/kaggle/working/submission.csv', index=False)

In [None]:
del model, predictions, predictions_df, sub