<a href="https://www.kaggle.com/code/foocheechuan/amex-default-prediction-improvement?scriptVersionId=163360553" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amex-default-prediction/sample_submission.csv
/kaggle/input/amex-default-prediction/train_data.csv
/kaggle/input/amex-default-prediction/test_data.csv
/kaggle/input/amex-default-prediction/train_labels.csv
/kaggle/input/amexfeather/test_data_f32.ftr
/kaggle/input/amexfeather/train_data.ftr
/kaggle/input/amexfeather/train_data_f32.ftr
/kaggle/input/amexfeather/test_data.ftr
/kaggle/input/amex-data-integer-dtypes-parquet-format/train.parquet
/kaggle/input/amex-data-integer-dtypes-parquet-format/test.parquet


# This notebook is an improvement on the baseline model
https://www.kaggle.com/code/foocheechuan/amexdefaultprediction

<a id="table-of-content"></a>
# Table of Contents
### [1. Setup](#setup)
- [Import Libraries](#import-libraries)
- [Dataset](#dataset)

### [2. Handling Missing Values](#missing)
- [Removes columns with >50% missing values](#50%-missing)
- [Simple Imputer](#simple-imputer)

### [Go to end](#end)

# Setup
<a id="setup"></a>

# Import Libraries
<a id="import-libraries"></a>

In [14]:
# data preparation
import pandas as pd
import numpy as np
import gc

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# data preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score  
from sklearn.metrics import precision_score                         
from sklearn.metrics import recall_score
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier, log_evaluation


# Dataset
<a id="dataset"></a>

- The original dataset provided in csv is too large (50GB)
- The data cannot fit into memory
- [Amex-Feather-Dataset](https://www.kaggle.com/datasets/munumbutt/amexfeather) provided by [@munum](https://www.kaggle.com/munumbutt) is a [feather file](https://arrow.apache.org/docs/python/feather.html) that is smaller than the equivalent csv file

# Read Data

In [3]:
# Load the full training table; train.shape = (5531451,190)
train = pd.read_parquet(
    '/kaggle/input/amex-data-integer-dtypes-parquet-format/train.parquet'
)

# Keep only the first 100 rows for experiments, to prevent insufficient memory
train_small = train.iloc[:100]

In [None]:
# ====================================================
# Read & preprocess data and save it to disk
# ====================================================
def _aggregate_by_customer(df, num_features, cat_features):
    """Collapse the rows of ``df`` into one feature row per ``customer_ID``.

    Numeric columns are aggregated with mean/std/min/max/last and categorical
    columns with count/last/nunique. Every output column is named
    ``<feature>_<aggregation>``; ``customer_ID`` is returned as a regular column.

    Args:
        df: DataFrame containing ``customer_ID`` plus all listed feature columns.
        num_features: names of the numeric columns to aggregate.
        cat_features: names of the categorical columns to aggregate.

    Returns:
        DataFrame with one row per ``customer_ID`` holding both feature groups.
    """
    # NOTE(review): 'last' takes the final row of each group in the frame's
    # current order — presumably the data is already sorted by date; confirm.
    num_agg = df.groupby("customer_ID")[num_features].agg(['mean', 'std', 'min', 'max', 'last'])

    # Flatten the (feature, aggregation) MultiIndex into single column names
    num_agg.columns = ['_'.join(col) for col in num_agg.columns]
    num_agg.reset_index(inplace=True)

    cat_agg = df.groupby("customer_ID")[cat_features].agg(['count', 'last', 'nunique'])
    cat_agg.columns = ['_'.join(col) for col in cat_agg.columns]
    cat_agg.reset_index(inplace=True)

    return num_agg.merge(cat_agg, how='inner', on='customer_ID')


def read_preprocess_data(
    train_path='/kaggle/input/amex-data-integer-dtypes-parquet-format/train.parquet',
    test_path='/kaggle/input/amex-data-integer-dtypes-parquet-format/test.parquet',
    labels_path='/kaggle/input/amex-default-prediction/train_labels.csv',
    output_dir='/kaggle/output/Amex_preprocessing',
):
    """Build per-customer aggregated features for train and test and save them.

    Reads the train and test parquet files, aggregates every feature by
    ``customer_ID``, joins the training labels onto the training features and
    writes ``train_fe.parquet`` and ``test_fe.parquet`` into ``output_dir``.

    Args:
        train_path: parquet file with the training rows.
        test_path: parquet file with the test rows.
        labels_path: csv file with the training labels, keyed by ``customer_ID``.
        output_dir: directory that receives the two output files. It is created
            if it does not exist. NOTE(review): the notebook header says only
            /kaggle/working/ is preserved as notebook output — pass a directory
            under it if the files must survive the session.

    Returns:
        None. The results are written to disk.
    """
    import os  # local import so this cell does not depend on an earlier cell

    # Create the target directory up front: to_parquet does not create missing
    # directories, and failing here is cheaper than failing after aggregation.
    os.makedirs(output_dir, exist_ok=True)

    cat_features = [
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68",
    ]

    # train.shape = (5531451,190)
    train = pd.read_parquet(train_path)

    # removes id and time from the feature list; working on the column index
    # avoids materialising any extra copy or slice of the large frame
    features = train.columns.drop(['customer_ID', 'S_2']).to_list()
    num_features = [col for col in features if col not in cat_features]

    print('Starting training feature engineer...')
    # Aggregating the rows by customer_ID reduces the number of rows and adds many features
    train = _aggregate_by_customer(train, num_features, cat_features)

    # read target file and join it to the features using customer_ID
    train_labels = pd.read_csv(labels_path)
    train = train.merge(train_labels, how='inner', on='customer_ID')
    del train_labels
    gc.collect()

    # repeat for testing set
    test = pd.read_parquet(test_path)
    print('Starting test feature engineer...')
    test = _aggregate_by_customer(test, num_features, cat_features)
    gc.collect()

    # Save files to disk
    train.to_parquet(os.path.join(output_dir, 'train_fe.parquet'))
    test.to_parquet(os.path.join(output_dir, 'test_fe.parquet'))

## Work in Progress
- Model Optimization
- Try different aggregation methods
- Stratified KFold validation
- Feature Engineering

# References
<a id="references"></a>
- [AMEX EDA which makes sense](https://www.kaggle.com/code/ambrosm/amex-eda-which-makes-sense)
- [AMEX - Light GBM](https://www.kaggle.com/code/lixinqi98/amex-lightgbm/notebook)
- [AMEX Default Prediction - EDA & Prediction](https://www.kaggle.com/code/aryanml007/amex-default-prediction-eda-prediction)
- [Amex LGBM Dart CV 0.7963](https://www.kaggle.com/code/ragnar123/amex-lgbm-dart-cv-0-7963)

# [Back](#table-of-content)
<a id="end"></a>