In [144]:
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
import scipy.stats as stats
import statsmodels.api as sm

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from IPython.display import display
plt.rc('font', family='Verdana')

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import StratifiedKFold

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import roc_auc_score

In [153]:
data = pd.read_csv("data/Credit_OTP.csv", encoding='cp1251', sep = ';')

def small_categories_aggregation(column, n_samples, value='Укрупненная категория', dataset=data):
    '''
    Укрупняет категории и числовые столбцы в датасете
    :param column: имя стоблца в датасете data
    :param n_samples: порог наблюдений, ниже которого категории объединяются в одну
    :param value: значение, которым будет заполнено
    :param dataset: датасет для изменения
    :return:
    '''

    tmp_small_columns = dataset[column].value_counts()[dataset[column].value_counts() < n_samples].index

    for col in tmp_small_columns:
        dataset.at[dataset[column] == col, column] = value
    
data['PREVIOUS_CARD_NUM_UTILIZED'].fillna(0.0, inplace=True)

data.drop('AGREEMENT_RK', axis=1, inplace=True)

data.drop('DL_DOCUMENT_FL', axis=1, inplace=True)

# imputers
data['GEN_INDUSTRY'].fillna('Пропуски', inplace=True)
data['GEN_TITLE'].fillna('Пропуски', inplace=True)
data['ORG_TP_STATE'].fillna('Пропуски', inplace=True)
data['ORG_TP_FCAPITAL'].fillna('Пропуски', inplace=True)
data['JOB_DIR'].fillna('Пропуски', inplace=True)
data['TP_PROVINCE'].fillna('Пропуски', inplace=True)
data['REGION_NM'].fillna(data['REGION_NM'].value_counts().index[0], inplace=True)

data.at[data['WORK_TIME'] >= 468, 'WORK_TIME'] = np.NaN
data['WORK_TIME'].fillna(data['WORK_TIME'].dropna().median(), inplace=True)

for i in ['TARGET', 'SOCSTATUS_WORK_FL', 'SOCSTATUS_PENS_FL', 'GENDER', 
      'EDUCATION', 'MARITAL_STATUS', 'GEN_INDUSTRY', 'GEN_TITLE',
      'ORG_TP_STATE', 'ORG_TP_FCAPITAL', 'JOB_DIR', 'REG_ADDRESS_PROVINCE',
      'FACT_ADDRESS_PROVINCE', 'POSTAL_ADDRESS_PROVINCE', 'TP_PROVINCE', 'REGION_NM',
      'REG_FACT_FL', 'FACT_POST_FL', 'REG_POST_FL', 'REG_FACT_POST_FL', 
      'REG_FACT_POST_TP_FL', 'FL_PRESENCE_FL', 'AUTO_RUS_FL', 'HS_PRESENCE_FL',
      'COT_PRESENCE_FL', 'GAR_PRESENCE_FL', 'LAND_PRESENCE_FL', 'DL_DOCUMENT_FL',
      'GPF_DOCUMENT_FL', 'FACT_PHONE_FL', 'REG_PHONE_FL', 'GEN_PHONE_FL']:
    if i in data.columns:
        data[i]=data[i].astype('str')

for i in ['PERSONAL_INCOME', 'CREDIT', 'FST_PAYMENT', 'LOAN_AVG_DLQ_AMT', 'LOAN_MAX_DLQ_AMT']:
    if i in data.columns:
        data[i] = data[i].str.replace(',', '.').astype('float')

categorical_columns = [c for c in data.columns if data[c].dtype.name == 'object']
numerical_columns   = [c for c in data.columns if data[c].dtype.name != 'object']

data.at[data['EDUCATION'] == 'Ученая степень', 'EDUCATION'] = 'Высшее'
data.at[data['EDUCATION'] == 'Два и более высших образования', 'EDUCATION'] = 'Высшее'

small_categories_aggregation('GEN_INDUSTRY', 100)

data.at[data['GEN_TITLE'] == 'Партнер', 'GEN_TITLE'] = 'Другое'
data.at[data['ORG_TP_STATE'] == 'Частная ком. с инос. капиталом', 'ORG_TP_STATE'] = 'Частная компания'
data.at[data['JOB_DIR'] == 'Реклама и маркетинг', 'JOB_DIR'] = 'Реклама и юр. услуги'
data.at[data['JOB_DIR'] == 'Юридическая служба', 'JOB_DIR'] = 'Реклама и юр. услуги'
data.at[data['TP_PROVINCE'] == 'Кабардино-Балкария', 'TP_PROVINCE'] = 'Ставропольский край'
data.at[data['POSTAL_ADDRESS_PROVINCE'] == 'Эвенкийский АО', 'POSTAL_ADDRESS_PROVINCE'] = 'Красноярский край'
data.at[data['POSTAL_ADDRESS_PROVINCE'] == 'Агинский Бурятский АО', 'POSTAL_ADDRESS_PROVINCE'] = 'Красноярский край'
data.at[data['POSTAL_ADDRESS_PROVINCE'] == 'Усть-Ордынский Бурятский АО', 'POSTAL_ADDRESS_PROVINCE'] = 'Красноярский край'

data['SOLVENCY'] = data['PERSONAL_INCOME']/(data['CREDIT']/data['TERM'])

dummies = False

if dummies:

    data = pd.get_dummies(data)

else:

    for cat in categorical_columns:

        tmp_dic = dict.fromkeys(data[cat])
        i = 0

        for key in tmp_dic:

            tmp_dic[key] = i
            i += 1

        data[cat] = data[cat].map(tmp_dic)

    for cat in categorical_columns:
        data[cat] = data[cat].astype('str')

In [155]:
y = data.loc[:, 'TARGET']
data.drop(['TARGET'], axis=1, inplace=True)
X = data.loc[:, 'AGE':'GEN_PHONE_FL']
logreg_fullset = LogisticRegression().fit(X, y)
forest_fullset = RandomForestClassifier(n_estimators=500, max_depth=12, random_state=152, n_jobs=-1).fit(X, y)

In [161]:
Series(forest_fullset.predict(X)).value_counts()

0    15079
1      144
dtype: int64

In [163]:
forest_fullset.predict_proba(X)[:, 1]

array([ 0.08422696,  0.10831912,  0.08157296, ...,  0.09908449,
        0.03067536,  0.12645127])

In [169]:
forest_fullset.predict(X).astype('int')

array([0, 0, 0, ..., 0, 0, 0])

In [171]:
roc_auc_score(y.astype('int'), forest_fullset.predict(X).astype('int'))

0.53973509933774833

In [149]:
y = data.loc[:, 'TARGET_1']
data.drop(['TARGET_0', 'TARGET_1'], axis=1, inplace=True)
X = data.loc[:, 'AGE':'GEN_PHONE_FL_1']
logreg_fullset = LogisticRegression().fit(X, y)
forest_fullset = RandomForestClassifier(n_estimators=500, max_depth=12, random_state=152, n_jobs=-1).fit(X, y)

KeyError: 'the label [TARGET_1] is not in the [columns]'

In [150]:
data_dummies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15223 entries, 0 to 15222
Columns: 441 entries, AGE to GEN_PHONE_FL_1
dtypes: float64(8), int64(11), uint8(422)
memory usage: 8.3 MB


In [151]:
roc_auc_score(y, forest_fullset.predict(X))

ValueError: Data is not binary and pos_label is not specified

In [102]:
roc_auc_score(y, logreg_fullset.predict(X))

0.50128280086377641