In [15]:
# import
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
import scipy.stats as stats
import statsmodels.api as sm

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split

In [16]:
# Load the source credit dataset: semicolon-separated, cp1251-encoded CSV.
data = pd.read_csv(
    "data/Credit_OTP.csv",
    sep=';',
    encoding='cp1251',
)

In [17]:
def small_categories_aggregation(column, n_samples, value='Укрупненная категория', dataset=None):
    '''
    Merge the rare values of one column into a single replacement value.

    The frame is modified in place; nothing is returned.

    :param column: name of the column in ``dataset`` to process
    :param n_samples: observation threshold; every value that occurs fewer
        than ``n_samples`` times in the column is overwritten
    :param value: replacement written over the rare values
    :param dataset: DataFrame to modify; when omitted, the module-level
        ``data`` frame is used
    :return: None
    '''
    if dataset is None:
        # Looked up at call time, so the function can be defined before the
        # global frame exists and always sees its current binding.
        dataset = data

    counts = dataset[column].value_counts()
    rare_values = counts[counts < n_samples].index

    # .loc is the boolean-mask setter; .at only addresses a single scalar cell.
    dataset.loc[dataset[column].isin(rare_values), column] = value
        
def data_processing(data, dummies=False):
    '''
    Clean the credit dataset and encode its categorical features.

    The frame passed in is modified in place: columns are dropped, missing
    values filled, rare or extreme values recoded, and derived columns added
    (AGE_CATEGORY, WITHOUT_DELAY, REG_ADDRESS_PROVINCE_FREQ,
    FACT_ADDRESS_PROVINCE_FREQ, SOLVENCY).

    :param data: DataFrame holding the columns referenced below
    :param dummies: if True, one-hot encode the object-typed columns with
        ``pd.get_dummies`` and return the resulting new frame (TARGET, when
        present, is cast to int first). Otherwise every object-typed column
        except TARGET is replaced by the string form of an integer code
        assigned in order of first appearance, and ``data`` itself is returned.
    :return: the processed DataFrame
    '''

    def fill_missing(column, filler):
        # Assign the result back: ``data[col].fillna(..., inplace=True)`` works
        # on an intermediate Series and does not reliably update the frame.
        data[column] = data[column].fillna(filler)

    fill_missing('PREVIOUS_CARD_NUM_UTILIZED', 0.0)
    data.drop(['AGREEMENT_RK', 'DL_DOCUMENT_FL'], axis=1, inplace=True)

    # Missing values become their own level, then levels rarer than the
    # threshold are merged. ``dataset=data`` makes the helper edit the frame
    # given to this function rather than whatever the global ``data`` is.
    for column, threshold in [('GEN_INDUSTRY', 85), ('GEN_TITLE', 200),
                              ('JOB_DIR', 102), ('TP_PROVINCE', 90)]:
        fill_missing(column, 'Пропуски')
        small_categories_aggregation(column, threshold, dataset=data)

    fill_missing('ORG_TP_STATE', 'Пропуски')
    # Fold the least frequent ORG_TP_STATE level into 'Частная компания'.
    rarest_state = data['ORG_TP_STATE'].value_counts().index[-1]
    data.loc[data['ORG_TP_STATE'] == rarest_state, 'ORG_TP_STATE'] = 'Частная компания'

    fill_missing('ORG_TP_FCAPITAL', 'С участием')

    # Missing REGION_NM takes the most frequent region.
    fill_missing('REGION_NM', data['REGION_NM'].value_counts().index[0])

    # WORK_TIME values of 468 and above are discarded and, together with the
    # original gaps, replaced by the mean of the remaining values.
    data['WORK_TIME'] = data['WORK_TIME'].mask(data['WORK_TIME'] >= 468)
    fill_missing('WORK_TIME', data['WORK_TIME'].mean())

    # Columns treated as categorical are stored as strings.
    for column in ['TARGET', 'SOCSTATUS_WORK_FL', 'SOCSTATUS_PENS_FL', 'GENDER',
                   'EDUCATION', 'MARITAL_STATUS', 'GEN_INDUSTRY', 'GEN_TITLE',
                   'ORG_TP_STATE', 'ORG_TP_FCAPITAL', 'JOB_DIR', 'REG_ADDRESS_PROVINCE',
                   'FACT_ADDRESS_PROVINCE', 'POSTAL_ADDRESS_PROVINCE', 'TP_PROVINCE', 'REGION_NM',
                   'REG_FACT_FL', 'FACT_POST_FL', 'REG_POST_FL', 'REG_FACT_POST_FL',
                   'REG_FACT_POST_TP_FL', 'FL_PRESENCE_FL', 'AUTO_RUS_FL', 'HS_PRESENCE_FL',
                   'COT_PRESENCE_FL', 'GAR_PRESENCE_FL', 'LAND_PRESENCE_FL', 'GPF_DOCUMENT_FL',
                   'FACT_PHONE_FL', 'REG_PHONE_FL', 'GEN_PHONE_FL', 'CHILD_TOTAL', 'DEPENDANTS']:
        if column in data.columns:
            data[column] = data[column].astype('str')

    # Age bands are derived from the unclipped AGE; ages outside every band
    # keep the fallback label '0'.
    data['AGE_CATEGORY'] = '0'
    data.loc[data['AGE'] <= 30, 'AGE_CATEGORY'] = '1'
    for label, low, high in [('2', 31, 40), ('3', 41, 50), ('4', 51, 60), ('5', 61, 70)]:
        data.loc[data['AGE'].between(low, high), 'AGE_CATEGORY'] = label

    # Cap AGE to the 22..64 range.
    data['AGE'] = data['AGE'].clip(lower=22, upper=64)

    data.loc[data['EDUCATION'].isin(['Ученая степень', 'Два и более высших образования']),
             'EDUCATION'] = 'Высшее'

    data.loc[data['FAMILY_INCOME'] == 'до 5000 руб.', 'FAMILY_INCOME'] = 'от 5000 до 10000 руб.'

    # Monetary columns use a decimal comma in text form; convert them to float.
    for column in ['PERSONAL_INCOME', 'CREDIT', 'FST_PAYMENT', 'LOAN_AVG_DLQ_AMT', 'LOAN_MAX_DLQ_AMT']:
        if column in data.columns:
            amounts = data[column]
            if not pd.api.types.is_numeric_dtype(amounts):
                amounts = amounts.str.replace(',', '.')
            data[column] = amounts.astype('float')

    # Runs after the string cast above, so missing provinces count as 'nan'.
    small_categories_aggregation('REG_ADDRESS_PROVINCE', 55, dataset=data)

    data.loc[data['OWN_AUTO'] == 2, 'OWN_AUTO'] = 1

    # Snap infrequent loan terms onto neighbouring ones.
    data['TERM'] = data['TERM'].replace({19: 20, 22: 20, 23: 20, 16: 20, 30: 36, 13: 14})

    # Negative living terms are replaced by their absolute value.
    data['FACT_LIVING_TERM'] = data['FACT_LIVING_TERM'].abs()

    # Values of 6 and above collapse to 5.
    data.loc[data['LOAN_NUM_CLOSED'] >= 6, 'LOAN_NUM_CLOSED'] = 5

    data['LOAN_NUM_PAYM'] = data['LOAN_NUM_PAYM'].clip(upper=40).replace({35: 36, 38: 36, 39: 36})

    data['LOAN_DLQ_NUM'] = data['LOAN_DLQ_NUM'].clip(upper=5)

    # NOTE(review): despite its name the flag is '0' when LOAN_MAX_DLQ_AMT is
    # zero and '1' otherwise - confirm the intended polarity.
    data['WITHOUT_DELAY'] = (data['LOAN_MAX_DLQ_AMT'] != 0).astype('int').astype('str')

    data.loc[data['PREVIOUS_CARD_NUM_UTILIZED'] == 2.0, 'PREVIOUS_CARD_NUM_UTILIZED'] = 1.0

    # Frequency encoding: each province is mapped to its row count.
    data['REG_ADDRESS_PROVINCE_FREQ'] = data['REG_ADDRESS_PROVINCE'].map(
        data['REG_ADDRESS_PROVINCE'].value_counts())
    data['FACT_ADDRESS_PROVINCE_FREQ'] = data['FACT_ADDRESS_PROVINCE'].map(
        data['FACT_ADDRESS_PROVINCE'].value_counts())

    # Income divided by the per-period share of the credit amount.
    data['SOLVENCY'] = data['PERSONAL_INCOME'] / (data['CREDIT'] / data['TERM'])

    categorical_columns = [c for c in data.columns if data[c].dtype.name == 'object']

    if dummies:
        if 'TARGET' in data.columns:
            data['TARGET'] = data['TARGET'].astype('int')
        return pd.get_dummies(data)

    for column in categorical_columns:
        if column == 'TARGET':
            # Keep the original class labels: numbering them by first
            # appearance would swap the classes whenever the first row
            # belongs to class 1.
            continue
        codes = {level: code for code, level in enumerate(dict.fromkeys(data[column]))}
        data[column] = data[column].map(codes).astype('str')

    return data

In [18]:
# data_processing modifies `data` in place; with dummies=False it returns that same frame.
processed_data = data_processing(data, dummies=False)

In [19]:
# Preview the first rows of the processed frame.
processed_data.head()

Unnamed: 0,TARGET,AGE,SOCSTATUS_WORK_FL,SOCSTATUS_PENS_FL,GENDER,CHILD_TOTAL,DEPENDANTS,EDUCATION,MARITAL_STATUS,GEN_INDUSTRY,...,LOAN_DLQ_NUM,LOAN_MAX_DLQ,LOAN_AVG_DLQ_AMT,LOAN_MAX_DLQ_AMT,PREVIOUS_CARD_NUM_UTILIZED,AGE_CATEGORY,WITHOUT_DELAY,REG_ADDRESS_PROVINCE_FREQ,FACT_ADDRESS_PROVINCE_FREQ,SOLVENCY
0,0,49,0,0,0,0,0,0,0,0,...,2,1,1580.0,1580.0,0.0,0,0,253,250,3.75
1,0,32,0,0,0,1,1,1,0,0,...,1,1,4020.0,4020.0,0.0,1,0,127,127,3.325635
2,0,52,0,0,0,2,2,2,0,1,...,0,0,0.0,0.0,0.0,2,1,202,203,3.260279
3,0,39,0,0,0,3,0,3,0,2,...,3,1,1589.923333,1590.0,0.0,1,0,407,404,17.664058
4,0,30,0,0,1,4,2,1,0,3,...,2,1,1152.15,2230.0,0.0,3,0,127,127,6.548431


In [22]:
# Persist the processed frame; the row index is written too (to_csv default),
# so it comes back as an unnamed first column when the file is read again.
processed_data.to_csv('data/processed_data.csv', encoding='utf-8')

In [20]:
# Split the frame returned by data_processing explicitly, rather than relying
# on the global `data` having been mutated in place by that call.
X_train, X_test, y_train, y_test = train_test_split(processed_data.drop('TARGET', axis=1),
                                                    processed_data['TARGET'],
                                                    test_size=0.3, random_state=42)

In [21]:
# Preview the training features; the index shows the rows were shuffled by the split.
X_train.head()

Unnamed: 0,AGE,SOCSTATUS_WORK_FL,SOCSTATUS_PENS_FL,GENDER,CHILD_TOTAL,DEPENDANTS,EDUCATION,MARITAL_STATUS,GEN_INDUSTRY,GEN_TITLE,...,LOAN_DLQ_NUM,LOAN_MAX_DLQ,LOAN_AVG_DLQ_AMT,LOAN_MAX_DLQ_AMT,PREVIOUS_CARD_NUM_UTILIZED,AGE_CATEGORY,WITHOUT_DELAY,REG_ADDRESS_PROVINCE_FREQ,FACT_ADDRESS_PROVINCE_FREQ,SOLVENCY
51,37,0,0,1,3,0,1,0,3,9,...,0,0,0.0,0.0,0.0,1,1,292,292,7.914523
4750,23,0,0,1,4,2,2,3,0,0,...,0,0,0.0,0.0,0.0,3,1,407,404,9.2827
2304,40,0,0,0,3,0,0,3,4,9,...,0,0,0.0,0.0,0.0,1,1,331,324,4.376155
13811,43,0,0,1,3,0,3,0,11,3,...,0,0,0.0,0.0,0.0,0,1,130,119,16.026668
7717,56,1,1,0,5,0,1,3,8,8,...,0,0,0.0,0.0,0.0,2,1,459,454,3.163074
