In [1]:
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

import lightgbm as lgb
import xgboost as xgb
import missingno as msno
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc

In [2]:
# Notebook display limits: show up to 222 rows and 50 columns when a frame is rendered.
pd.set_option("display.max_rows", 222)
pd.set_option("display.max_columns", 50)

In [14]:
# Directory holding the competition CSV files; change it here to relocate the dataset.
DATA_DIR = "../geekbrains-competitive-data-analysis"

# Plain string paths (built by concatenation so the values stay exactly as before).
path_applications_history = DATA_DIR + "/applications_history.csv"
path_bki = DATA_DIR + "/bki.csv"
path_payments = DATA_DIR + "/payments.csv"
path_client_profile = DATA_DIR + "/client_profile.csv"
path_train = DATA_DIR + "/train.csv"
path_test = DATA_DIR + "/test.csv"

In [3]:
def create_freq_feature(data: pd.DataFrame,
                        feature: str
                       ) -> pd.DataFrame:
    """Replace a column with the relative frequency of each of its values.

    Every value in ``data[feature]`` is swapped for the number of rows that
    share that value divided by the total number of rows in ``data``.
    Missing values are not counted by ``value_counts`` and therefore end up
    with a frequency of 0.0.

    Args:
        data: Frame holding the column to encode. It is modified in place.
        feature: Name of the column to overwrite with frequencies.

    Returns:
        The same ``data`` object, with ``feature`` stored as floats.
    """
    column = data[feature]
    occurrences = column.value_counts()
    total_rows = data.shape[0]

    # Values absent from the counts (i.e. NaN) map to NaN and are then zeroed.
    encoded = column.map(occurrences).astype('float').fillna(0.0)
    data[feature] = encoded / total_rows

    return data

In [4]:
# applications_history

def preprocessing_applications_history(data: pd.DataFrame,
                                       copy: bool = True) -> pd.DataFrame:
    """Encode the applications-history table and aggregate it per application.

    Steps, in order:
      * ``NAME_CONTRACT_TYPE`` becomes 1 for ``'Cash'`` and 0 otherwise.
      * Missing ``AMOUNT_ANNUITY`` / ``AMOUNT_GOODS_PAYMENT`` are set to 0.
      * ``AMOUNT_PAYMENT`` is dropped.
      * ``NAME_CONTRACT_STATUS`` is mapped to a fixed score
        (Approved 1, Canceled 0, Refused 0.5, Unused offer 0.5).
      * The remaining categorical columns listed below are frequency-encoded
        with ``create_freq_feature``.
      * Every remaining missing value is set to 0.
      * Rows are grouped by ``APPLICATION_NUMBER`` and each column is reduced
        with its maximum.

    Args:
        data: Raw applications-history frame.
        copy: When True (default) the input is copied first. When False the
            ``NAME_CONTRACT_TYPE`` column of the caller's frame is overwritten.

    Returns:
        A frame indexed by ``APPLICATION_NUMBER`` holding per-column maxima.
    """
    if copy:
        data = data.copy()

    # ---NAME_CONTRACT_TYPE---
    data['NAME_CONTRACT_TYPE'] = (data['NAME_CONTRACT_TYPE'] == 'Cash').astype(int)

    # ---AMOUNT_ANNUITY---
    # ---AMOUNT_GOODS_PAYMENT---
    data = data.fillna(value={'AMOUNT_ANNUITY': 0,
                              'AMOUNT_GOODS_PAYMENT': 0})

    # ---AMOUNT_PAYMENT---
    # `data` is already a fresh frame here (fillna above), so a non-inplace
    # drop has the same effect for the caller.
    data = data.drop(columns=['AMOUNT_PAYMENT'])

    # ---NAME_CONTRACT_STATUS---
    # Hand-assigned scores, not frequencies. Statuses missing from this table
    # map to NaN and are zeroed by the final fillna(0).
    status_score = pd.Series(data=[1, 0, 0.5, 0.5],
                             index=['Approved', 'Canceled', 'Refused', 'Unused offer'],
                             dtype='float')
    data['NAME_CONTRACT_STATUS'] = data['NAME_CONTRACT_STATUS'].map(status_score).astype('float')

    # Frequency-encode the categorical columns. Each call touches only its own
    # column and the row count is constant, so the order does not matter.
    for feature in ('NAME_TYPE_SUITE',
                    'NAME_PAYMENT_TYPE',
                    'CODE_REJECT_REASON',
                    'NAME_CLIENT_TYPE',
                    'NAME_GOODS_CATEGORY',
                    'NAME_PORTFOLIO',
                    'NAME_PRODUCT_TYPE',
                    'NAME_YIELD_GROUP'):
        data = create_freq_feature(data, feature)

    # ---another---
    data = data.fillna(0)

    # The string alias "max" selects the built-in groupby reduction; passing
    # np.max is deprecated in recent pandas and emits a FutureWarning.
    data = pd.pivot_table(data,
                          index=['APPLICATION_NUMBER'],
                          aggfunc='max')

    return data

In [10]:
# bki

def preprocessing_bki(data: pd.DataFrame,
                      copy: bool = True) -> pd.DataFrame:
    """Encode the credit-bureau (bki) table and aggregate it per application.

    Steps, in order:
      * ``CREDIT_ACTIVE``, ``CREDIT_CURRENCY`` and ``CREDIT_TYPE`` are
        frequency-encoded with ``create_freq_feature``.
      * Missing ``DAYS_CREDIT_ENDDATE`` is filled with the column mean.
      * Missing ``DAYS_ENDDATE_FACT`` is filled with the (already filled)
        ``DAYS_CREDIT_ENDDATE`` of the same row.
      * Missing ``AMT_CREDIT_MAX_OVERDUE`` is set to 0.
      * ``AMT_ANNUITY`` is dropped and every remaining missing value is set to 0.
      * Rows are grouped by ``APPLICATION_NUMBER`` and each column is reduced
        with its maximum.

    Args:
        data: Raw bki frame.
        copy: When True (default) the input is copied first. When False the
            encoded / filled columns are written into the caller's frame.

    Returns:
        A frame indexed by ``APPLICATION_NUMBER`` holding per-column maxima.
    """
    if copy:
        data = data.copy()

    # ---CREDIT_ACTIVE---
    # ---CREDIT_CURRENCY---
    # ---CREDIT_TYPE---
    for feature in ('CREDIT_ACTIVE', 'CREDIT_CURRENCY', 'CREDIT_TYPE'):
        data = create_freq_feature(data, feature)

    # ---DAYS_CREDIT_ENDDATE---
    # Series.mean() skips NaN by default and returns a scalar, which avoids
    # the deprecated positional lookup `mean[0]` on a label-indexed Series.
    enddate_mean = data['DAYS_CREDIT_ENDDATE'].mean()
    data['DAYS_CREDIT_ENDDATE'] = data['DAYS_CREDIT_ENDDATE'].fillna(enddate_mean)

    # ---DAYS_ENDDATE_FACT---
    # Row-wise fallback: fillna with a Series aligns on the index.
    data['DAYS_ENDDATE_FACT'] = data['DAYS_ENDDATE_FACT'].fillna(data['DAYS_CREDIT_ENDDATE'])

    # ---AMT_CREDIT_MAX_OVERDUE---
    data['AMT_CREDIT_MAX_OVERDUE'] = data['AMT_CREDIT_MAX_OVERDUE'].fillna(0)

    # ---another---
    data = data.drop(columns=['AMT_ANNUITY'])
    data = data.fillna(0)

    # The string alias "max" selects the built-in groupby reduction; passing
    # np.max is deprecated in recent pandas and emits a FutureWarning.
    data = pd.pivot_table(data,
                          index=['APPLICATION_NUMBER'],
                          aggfunc='max')

    return data

In [6]:
# payments

def preprocessing_payments(data: pd.DataFrame,
                           copy: bool = True) -> pd.DataFrame:
    """Aggregate the payments table to one row per application.

    Missing values are replaced with 0, then rows are grouped by
    ``APPLICATION_NUMBER`` and every other column is reduced with its median.

    Args:
        data: Raw payments frame. It is never modified.
        copy: Kept for interface compatibility with the other preprocessing
            functions. It has no effect here, because ``fillna`` already
            returns a new frame and nothing writes to the input.

    Returns:
        A frame indexed by ``APPLICATION_NUMBER`` holding per-column medians.
    """
    # ---another---
    # fillna without inplace returns a new frame, so an explicit copy of the
    # input would only duplicate the data a second time.
    filled = data.fillna(0)

    # The string alias "median" selects the built-in groupby reduction; passing
    # np.median is deprecated in recent pandas and emits a FutureWarning.
    return pd.pivot_table(filled,
                          index=['APPLICATION_NUMBER'],
                          aggfunc='median')

In [7]:
# client_profile

def preprocessing_client_profile(data: pd.DataFrame,
                                 copy: bool = True) -> pd.DataFrame:
    """Encode the client-profile table and remove all missing values.

    Steps, in order:
      * ``GENDER`` becomes 1 for ``'F'`` and 0 otherwise.
      * ``FAMILY_STATUS`` and ``EDUCATION_LEVEL`` are frequency-encoded with
        ``create_freq_feature``.
      * Missing ``OWN_CAR_AGE`` is set to 0; missing
        ``EXTERNAL_SCORING_RATING_1`` / ``EXTERNAL_SCORING_RATING_3`` are set
        to the minimum observed in their own column.
      * Every remaining missing value is set to 0.

    Args:
        data: Raw client-profile frame.
        copy: When True (default) the input is copied first. When False the
            ``GENDER`` and frequency-encoded columns of the caller's frame
            are overwritten.

    Returns:
        The encoded frame with no missing values (rows are not aggregated).
    """
    frame = data.copy() if copy else data

    # ---GENDER---
    frame['GENDER'] = frame['GENDER'].eq('F').astype(int)

    # ---FAMILY_STATUS---
    # ---EDUCATION_LEVEL---
    for categorical in ('FAMILY_STATUS', 'EDUCATION_LEVEL'):
        frame = create_freq_feature(frame, categorical)

    # ---OWN_CAR_AGE---
    # ---EXTERNAL_SCORING_RATING_1---
    # ---EXTERNAL_SCORING_RATING_3---
    fill_values = {'OWN_CAR_AGE': 0}
    for rating in ('EXTERNAL_SCORING_RATING_1', 'EXTERNAL_SCORING_RATING_3'):
        # Column minimum is taken before any filling of that column.
        fill_values[rating] = frame[rating].min()
    frame = frame.fillna(value=fill_values)

    # ---another---
    return frame.fillna(0)

In [15]:
#read data

# Load every source table; files are read in the order the paths are listed.
(df_client_profile,
 df_applications_history,
 df_bki,
 df_payments,
 df_train,
 df_test) = map(pd.read_csv,
                (path_client_profile,
                 path_applications_history,
                 path_bki,
                 path_payments,
                 path_train,
                 path_test))

In [None]:
def data_merege():
    