In [5]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd 
from pathlib import Path

import matplotlib.pyplot as plt
from scipy.stats import chi2
import statsmodels.discrete.discrete_model as dm_

%matplotlib inline


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
data_path = '../data/home-credit-default-risk/'

In [10]:
def load_data(path):
    credit_card_balance = pd.read_csv(Path(path, 'credit_card_balance.csv'))
    installment_payments = pd.read_csv(Path(path, 'installments_payments.csv'))
    pos_cash_balance = pd.read_csv(Path(path, 'POS_CASH_balance.csv'))
    previous_application = pd.read_csv(Path(path, 'previous_application.csv'))
    return credit_card_balance, installment_payments, pos_cash_balance, previous_application

In [11]:
credit_card_balance = pd.read_csv(Path(data_path, 'credit_card_balance.csv'))


In [12]:
print(credit_card_balance['AMT_DRAWINGS_ATM_CURRENT'][credit_card_balance['AMT_DRAWINGS_ATM_CURRENT'] < 0])
print(credit_card_balance['AMT_DRAWINGS_CURRENT'][credit_card_balance['AMT_DRAWINGS_CURRENT'] < 0])
print(credit_card_balance['CNT_DRAWINGS_ATM_CURRENT'][credit_card_balance['CNT_DRAWINGS_ATM_CURRENT'].isna()])

2047409   -6827.31
Name: AMT_DRAWINGS_ATM_CURRENT, dtype: float64
438776    -1687.50
747302     -519.57
3284667   -6211.62
Name: AMT_DRAWINGS_CURRENT, dtype: float64
45        NaN
47        NaN
49        NaN
52        NaN
60        NaN
           ..
3840272   NaN
3840303   NaN
3840306   NaN
3840307   NaN
3840310   NaN
Name: CNT_DRAWINGS_ATM_CURRENT, Length: 749816, dtype: float64


In [13]:
def credit_card_cleaning(df, **kwargs):
    fill_missing = kwargs.get('fill_missing', False)
    fill_value = kwargs.get('fill_value', 0)
    
    df['AMT_DRAWINGS_ATM_CURRENT'][df['AMT_DRAWINGS_ATM_CURRENT'] < 0] = np.nan
    df['AMT_DRAWINGS_CURRENT'][df['AMT_DRAWINGS_CURRENT'] < 0] = np.nan

    if fill_missing:
        df.fillna(fill_value, inplace=True)
    
    return df

In [14]:
credit_card_test = credit_card_balance.copy()
credit_card_cleaned = credit_card_cleaning(credit_card_test, fill_missing=True, fill_value=-1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [15]:
print(credit_card_cleaned['AMT_DRAWINGS_ATM_CURRENT'][credit_card_cleaned['AMT_DRAWINGS_ATM_CURRENT'] < 0])
print(credit_card_cleaned['AMT_DRAWINGS_CURRENT'][credit_card_cleaned['AMT_DRAWINGS_CURRENT'] < 0])
print(credit_card_cleaned['CNT_DRAWINGS_ATM_CURRENT'][credit_card_cleaned['CNT_DRAWINGS_ATM_CURRENT'].isna()])

45        -1.0
47        -1.0
49        -1.0
52        -1.0
60        -1.0
          ... 
3840272   -1.0
3840303   -1.0
3840306   -1.0
3840307   -1.0
3840310   -1.0
Name: AMT_DRAWINGS_ATM_CURRENT, Length: 749817, dtype: float64
438776    -1.0
747302    -1.0
3284667   -1.0
Name: AMT_DRAWINGS_CURRENT, dtype: float64
Series([], Name: CNT_DRAWINGS_ATM_CURRENT, dtype: float64)


In [64]:
installment_payments = pd.read_csv(Path(data_path, 'installments_payments.csv'))

In [76]:
print(installment_payments['DAYS_ENTRY_PAYMENT'][installment_payments['DAYS_ENTRY_PAYMENT'].isna()])


3764207    NaN
3764208    NaN
3764209    NaN
3764210    NaN
3764211    NaN
            ..
13605396   NaN
13605397   NaN
13605398   NaN
13605399   NaN
13605400   NaN
Name: DAYS_ENTRY_PAYMENT, Length: 2905, dtype: float64


In [68]:
def installment_payment_cleaning(df, **kwargs):
    fill_missing = kwargs.get('fill_missing', False)
    fill_value = kwargs.get('fill_value', 0)

    if fill_missing:
        df.fillna(fill_value, inplace=True)

    return df

In [85]:
installment_payments_test = installment_payments.copy()
installment_payments_cleaned = installment_payment_cleaning(installment_payments_test, fill_missing=True, fill_value=-1)
print(installment_payments_cleaned['DAYS_ENTRY_PAYMENT'][installment_payments_cleaned['DAYS_ENTRY_PAYMENT'].isna()])
print(installment_payments_cleaned['DAYS_ENTRY_PAYMENT'][installment_payments_cleaned['DAYS_ENTRY_PAYMENT'] == -1])


Series([], Name: DAYS_ENTRY_PAYMENT, dtype: float64)
3764207    -1.0
3764208    -1.0
3764209    -1.0
3764210    -1.0
3764211    -1.0
           ... 
13605396   -1.0
13605397   -1.0
13605398   -1.0
13605399   -1.0
13605400   -1.0
Name: DAYS_ENTRY_PAYMENT, Length: 2907, dtype: float64


In [78]:
pos_cash_balance = pd.read_csv(Path(data_path, 'POS_CASH_balance.csv'))

In [81]:
print(pos_cash_balance['CNT_INSTALMENT'][pos_cash_balance['CNT_INSTALMENT'].isna()])


709       NaN
759       NaN
1887      NaN
1899      NaN
1910      NaN
           ..
9998668   NaN
9998696   NaN
9999114   NaN
9999116   NaN
9999511   NaN
Name: CNT_INSTALMENT, Length: 26071, dtype: float64


In [82]:
def pos_cash_cleaning(df, **kwargs):
    fill_missing = kwargs.get('fill_missing', False)
    fill_value = kwargs.get('fill_value', 0)

    if fill_missing:
        df.fillna(fill_value, inplace=True)

    return df

In [84]:
pos_cash_balance_test = pos_cash_balance.copy()
pos_cash_balance_cleaned = pos_cash_cleaning(pos_cash_balance_test, fill_missing=True, fill_value=-1)
print(pos_cash_balance_cleaned['CNT_INSTALMENT'][pos_cash_balance_cleaned['CNT_INSTALMENT'].isna()])
print(pos_cash_balance_cleaned['CNT_INSTALMENT'][pos_cash_balance_cleaned['CNT_INSTALMENT'] == -1])


Series([], Name: CNT_INSTALMENT, dtype: float64)
709       -1.0
759       -1.0
1887      -1.0
1899      -1.0
1910      -1.0
          ... 
9998668   -1.0
9998696   -1.0
9999114   -1.0
9999116   -1.0
9999511   -1.0
Name: CNT_INSTALMENT, Length: 26071, dtype: float64


In [96]:
previous_application = pd.read_csv(Path(data_path, 'previous_application.csv'))

In [97]:
print(previous_application['DAYS_FIRST_DRAWING'][previous_application['DAYS_FIRST_DRAWING'] == 365243])
print(previous_application['DAYS_FIRST_DRAWING'][previous_application['DAYS_FIRST_DRAWING'].isna()])

# print(previous_application['AMT_DRAWINGS_CURRENT'][previous_application['AMT_DRAWINGS_CURRENT'] < 0])
# print(previous_application['CNT_DRAWINGS_ATM_CURRENT'][previous_application['CNT_DRAWINGS_ATM_CURRENT'].isna()])

0          365243.0
1          365243.0
2          365243.0
3          365243.0
5          365243.0
             ...   
1670209    365243.0
1670210    365243.0
1670211    365243.0
1670212    365243.0
1670213    365243.0
Name: DAYS_FIRST_DRAWING, Length: 934444, dtype: float64
4         NaN
6         NaN
7         NaN
8         NaN
9         NaN
           ..
1670197   NaN
1670203   NaN
1670204   NaN
1670205   NaN
1670207   NaN
Name: DAYS_FIRST_DRAWING, Length: 673065, dtype: float64


In [100]:
def prev_application_cleaning(df, **kwargs):
    fill_missing = kwargs.get('fill_missing', False)
    fill_value = kwargs.get('fill_value', 0)

    df['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace=True)
    df['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace=True)
    df['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace=True)
    df['DAYS_LAST_DUE'].replace(365243, np.nan, inplace=True)
    df['DAYS_TERMINATION'].replace(365243, np.nan, inplace=True)

    if fill_missing:
        df.fillna(fill_value, inplace=True)

    return df

In [103]:
previous_application_test = previous_application.copy()
previous_application_cleaned = prev_application_cleaning(previous_application_test, fill_missing=True, fill_value=-1)
print(previous_application_cleaned['DAYS_FIRST_DRAWING'][previous_application_cleaned['DAYS_FIRST_DRAWING'] == 365243])
print(previous_application_cleaned['DAYS_FIRST_DRAWING'][previous_application_cleaned['DAYS_FIRST_DRAWING'].isna()])
print(previous_application_cleaned['DAYS_FIRST_DRAWING'][previous_application_cleaned['DAYS_FIRST_DRAWING'] == -1])


Series([], Name: DAYS_FIRST_DRAWING, dtype: float64)
Series([], Name: DAYS_FIRST_DRAWING, dtype: float64)
0         -1.0
1         -1.0
2         -1.0
3         -1.0
4         -1.0
          ... 
1670209   -1.0
1670210   -1.0
1670211   -1.0
1670212   -1.0
1670213   -1.0
Name: DAYS_FIRST_DRAWING, Length: 1607509, dtype: float64
