In [27]:
import pandas as pd              #Обработка табличных данных
import numpy as np               #Алгоритмы линейной алгебры
import matplotlib.pyplot as plt  #Различные способы визуализации


%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [28]:
# Load the application features and the default flag; both files are indexed by `id`.
df = pd.read_csv('application_info.csv', index_col='id')
y = pd.read_csv('default_flg.csv', index_col='id')

# Row ids of every sample split, read from `sample_cd` before that column is dropped.
train_index    = df.index[df.sample_cd == 'train']
validate_index = df.index[df.sample_cd == 'validate']
test_index     = df.index[df.sample_cd == 'test']
# Train and validate rows together (computed once; used to fit the final model).
train_valid    = df.index[df.sample_cd.isin(['train', 'validate'])]

# The split marker and the application date are removed from the feature table.
df = df.drop(['sample_cd', 'application_dt'], axis=1)
category_columns = [col for col in df.columns if df.dtypes[col] == 'object']
# Assign the filled column back instead of calling fillna(inplace=True) on the
# attribute: the chained in-place form is not guaranteed to modify `df`.
df['education_cd'] = df['education_cd'].fillna('SCH')
# One-hot encode the object columns (first level dropped) and swap them in.
data_category = pd.get_dummies(df[category_columns], drop_first=True)
df = pd.concat([df.drop(category_columns, axis=1), data_category], axis=1)

# Report every pair of distinct features whose correlation exceeds 0.6.
CorrKoef = df.corr()
CorField = []
for i in CorrKoef:
    for j in CorrKoef.index[CorrKoef[i] > 0.6]:
        if i != j:
            CorField.append(j)
            # The printed number is the correlation coefficient itself; the
            # "r^2" label is kept unchanged so existing output stays comparable.
            print('%s-->%s: r^2=%f' % (i, j, CorrKoef.loc[j, i]))

home_address_cd-->work_address_cd: r^2=0.740874
work_address_cd-->home_address_cd: r^2=0.740874
car_own_flg_Y-->car_type_flg_Y: r^2=0.700206
car_type_flg_Y-->car_own_flg_Y: r^2=0.700206


In [29]:
def WoE(target, feature, nbuck=10):
    """Plot the Weight of Evidence (WoE) of each bucket against the bucket's mean feature value.

    The feature is split into ``nbuck`` buckets by percentile rank (missing
    feature values are ranked first). For every bucket the mean of ``target``
    (``tr``) is compared with the overall mean:

        WoE = log(tr_bucket / tr_all) - log((1 - tr_bucket) / (1 - tr_all))

    and drawn with error bars of ``2 / sqrt(tr * (1 - tr) * cnt)``, where
    ``cnt`` is the number of rows in the bucket.

    Parameters
    ----------
    target : pandas.Series
        Target values; its mean is used as a rate, so a 0/1 flag is expected.
    feature : pandas.Series
        Feature values sharing the index of ``target``.
    nbuck : int, default 10
        Number of rank buckets.

    Returns
    -------
    None
        The function only draws and shows a matplotlib figure.
    """
    # Work on a private frame so the caller's Series are left untouched.
    data = pd.DataFrame()
    data['target'] = target.copy()
    data['feature'] = feature.copy()
    n_rows = data.shape[0]
    # Percentile rank lies in (0, 1]; shifting by 1/n keeps the top rank inside
    # the last bucket (index nbuck - 1). 1.0 forces float division, as in IV().
    data['feature_buck'] = data['feature'].rank(method='min', pct=True, na_option='top').apply(
        lambda x: int((x - 1.0 / n_rows) * nbuck))
    # Overall mean of the target.
    tr = data['target'].mean()
    # Aggregate per bucket. Named aggregation replaces the dict-renaming form
    # of agg(), which current pandas rejects for a single column.
    grouped = data.groupby('feature_buck')
    stats = grouped['target'].agg(tr='mean', cnt='size').join(
        grouped['feature'].agg(feature_av='mean'))
    # Smooth extreme bucket rates so that the logarithms stay finite.
    stats['tr'] = stats['tr'].clip(0.001, 0.999)
    # WoE of every bucket relative to the overall rate.
    stats['WoE'] = np.log(stats['tr'] / tr) - np.log((1 - stats['tr']) / (1 - tr))
    # Error estimate of the WoE.
    stats['WoE_d'] = 2 / np.sqrt(stats['tr'] * (1 - stats['tr']) * stats['cnt'])
    # Buckets without a usable mean (e.g. feature entirely missing) are not drawn.
    stats = stats.dropna()
    # Draw the plot.
    plt.grid(True)
    plt.errorbar(stats['feature_av'], stats['WoE'], yerr=np.array(stats['WoE_d']))
    plt.xlabel('feature')
    plt.ylabel('WoE')
    plt.show()

def IV(target, feature, nbuck=10):
    """Return the Information Value (IV) of ``feature`` with respect to ``target``.

    The feature is split into ``nbuck`` buckets by percentile rank (missing
    feature values are ranked first). With ``tr_b`` the mean target of a
    bucket, ``tr`` the overall mean and ``cnt_b`` the bucket size, the result is

        sum_b (tr_b/tr - (1-tr_b)/(1-tr))
              * (log(tr_b/tr) - log((1-tr_b)/(1-tr)))
              * cnt_b / cnt

    where ``cnt`` is the number of non-missing target values.

    Parameters
    ----------
    target : pandas.Series
        Target values; its mean is used as a rate, so a 0/1 flag is expected.
    feature : pandas.Series
        Feature values sharing the index of ``target``.
    nbuck : int, default 10
        Number of rank buckets.

    Returns
    -------
    float
        Sum of the per-bucket IV contributions.
    """
    # Work on a private frame so the caller's Series are left untouched.
    data = pd.DataFrame()
    data['target'] = target.copy()
    data['feature'] = feature.copy()
    n_rows = data.shape[0]
    # Percentile rank lies in (0, 1]; shifting by 1/n keeps the top rank inside
    # the last bucket (index nbuck - 1).
    data['feature_buck'] = data['feature'].rank(method='min', pct=True, na_option='top').apply(
        lambda x: int((x - 1.0 / n_rows) * nbuck))
    # Overall mean of the target and number of non-missing observations.
    tr = data['target'].mean()
    cnt = data['target'].count()
    # Aggregate per bucket. Named aggregation replaces the dict-renaming form
    # of agg(), which current pandas rejects for a single column.
    stats = data.groupby('feature_buck')['target'].agg(tr='mean', cnt='size')
    # Smooth extreme bucket rates so that the logarithms stay finite.
    bucket_tr = stats['tr'].clip(0.001, 0.999)
    # Per-bucket contribution: rate difference * WoE * share of observations.
    rate_diff = bucket_tr / tr - (1 - bucket_tr) / (1 - tr)
    woe = np.log(bucket_tr / tr) - np.log((1 - bucket_tr) / (1 - tr))
    contribution = rate_diff * woe * (stats['cnt'] / cnt)
    return contribution.sum()

In [30]:
# Log-transform the columns below in place (a log transform of `age` was
# tried as well and left disabled).
for log_column in ('income', 'SNA', 'region_rating'):
    df[log_column] = np.log(df[log_column])
# Collapse home_address_cd into a 0/1 flag: 1 when the code is below 2.
df['home_address_cd'] = (df['home_address_cd'] < 2).astype(int)

#### Логистическая регрессия

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn import cross_validation, svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB


from sklearn.metrics import roc_curve, auc
import pylab as pl



In [35]:
# IV of appl_rej_cnt on the train split for three encodings:
# raw value, 0/1 flag (value < 1), and natural logarithm.
train_target = y.loc[train_index].default_flg
train_rej_cnt = df.loc[train_index].appl_rej_cnt
for rej_cnt_variant in (
    train_rej_cnt,
    train_rej_cnt.apply(lambda x: int(x < 1)),
    train_rej_cnt.apply(lambda x: np.log(x)),
):
    print (IV(train_target, rej_cnt_variant, nbuck=10))

0.10974374576408974
0.10279484257106726
0.10974374576408974


In [38]:
# Gradient boosting baseline: fit on the train split, report ROC AUC on train and validate.
# random_state is fixed so that re-running the cell reproduces the same scores.
model_GBC = GradientBoostingClassifier(max_depth=5, random_state=42)
model_GBC.fit(df.loc[train_index], y.loc[train_index].default_flg)
# Column 1 of predict_proba is used as the score passed to roc_auc_score.
AUC_train = roc_auc_score(y.loc[train_index].default_flg, model_GBC.predict_proba(df.loc[train_index])[:,1])
AUC_validate = roc_auc_score(y.loc[validate_index].default_flg, model_GBC.predict_proba(df.loc[validate_index])[:,1])
print('%.6f %.6f ' %  (AUC_train, AUC_validate))

0.763284 0.719684 


In [None]:
0.761730 0.719217 5.000000
age 0.761730 0.719340 5.000000
+home 0.760745 0.720149 5.000000
+SNA 0.760417 0.716843 5.000000
+log SNA 0.760745 0.720173 5.000000\0.763114 0.721024 

0.763114 0.720948  best try

In [66]:
# Keep an independent snapshot of the feature table. The call parentheses are
# required: without them `tmp` would hold the bound method, not a copy.
tmp = df.copy()

In [43]:
# NOTE(review): scratch arithmetic; presumably sizing 5 folds over 180816 rows.
# The recorded output does not match this expression - TODO confirm or delete.
180816/5*5
14465

144652.8

#cross-valid

Unnamed: 0_level_0,1,2,3,4,5
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
180817,0.088132,0.081459,0.075695,0.09522,0.084646
180818,0.127968,0.156485,0.149582,0.133868,0.108715
180819,0.068228,0.071375,0.094486,0.048655,0.076838
180820,0.094123,0.096569,0.116317,0.077346,0.07564
180821,0.053462,0.05599,0.061577,0.059218,0.05713


In [52]:
# Average the five prediction columns of `pred` with equal weights
# (presumably one column per cross-validation fold - TODO confirm).
fold_columns = ['1', '2', '3', '4', '5']
blend_sum = pred[fold_columns[0]]
for fold_column in fold_columns[1:]:
    blend_sum = blend_sum + pred[fold_column]
p6 = pd.DataFrame(blend_sum / 5)
p6.head()

1    0.054975
2    0.054538
3    0.056775
4    0.054259
5    0.052743
dtype: float64

In [38]:
# ROC AUC of the blended scores on the validate rows.
validate_target = y.loc[validate_index].default_flg
validate_blend = p6.loc[validate_index]
roc_auc_score(validate_target, validate_blend)

0.72084797710094184

In [8]:

# Re-score the current model_GBC on the validate split; AUC_train is reused
# from an earlier cell.
validate_proba = model_GBC.predict_proba(df.loc[validate_index])
AUC_validate = roc_auc_score(y.loc[validate_index].default_flg, validate_proba[:, 1])
print('%.6f %.6f ' % (AUC_train, AUC_validate))

Int64Index([    1,     2,     3,     4,     5,     6,     7,     8,     9,
               10,
            ...
            66080, 66081, 66082, 66083, 66084, 66085, 66086, 66087, 66088,
            66089],
           dtype='int64', name='id', length=66089)

# final

In [32]:
# Final model: refit on train + validate rows and score the test rows.
# random_state is fixed so that the submission can be reproduced.
model_GBC = GradientBoostingClassifier(max_depth=5, random_state=42)
model_GBC.fit(df.loc[train_valid], y.loc[train_valid].default_flg)
# Column 1 of predict_proba is kept as the test score.
test_pd = model_GBC.predict_proba(df.loc[test_index])[:,1]
# Quick sanity check: largest predicted score.
test_pd.max()

0.73201600878449136

In [33]:
# Wrap the test scores in a one-column frame indexed by the test ids and save it.
test_pd = pd.DataFrame(
    data=test_pd,
    index=test_index,
    columns=['score'],
)
test_pd.to_csv(path_or_buf='my_submission.csv')

In [50]:

# Save the blended scores as the submission. The single column is written
# under the header 'score' so the file has the same layout as the one
# produced from test_pd (p6's own column is labelled 0).
p6.to_csv('my_submission.csv', header=['score'])
p6.head()

Unnamed: 0_level_0,0
id,Unnamed: 1_level_1
180817,0.08503
180818,0.135324
180819,0.071916
180820,0.091999
180821,0.057475


In [168]:
# Preview the first five rows of the submission frame.
test_pd.head(n=5)

Unnamed: 0_level_0,score
id,Unnamed: 1_level_1
180817,0.070596
180818,0.136011
180819,0.069017
180820,0.076142
180821,0.056463


In [240]:
# Gradient boosting: compare tree depths by ROC AUC on the train and validate splits.
print ('%8s %8s %4s' % ('Train','Validate','n'))
for n in [4,5,6]:
    # Fixed random_state keeps the comparison between depths repeatable.
    model_GBC = GradientBoostingClassifier(max_depth=n, random_state=42)
    model_GBC.fit(df.loc[train_index], y.loc[train_index].default_flg)
    AUC_train = roc_auc_score(y.loc[train_index].default_flg, model_GBC.predict_proba(df.loc[train_index])[:,1])
    AUC_validate = roc_auc_score(y.loc[validate_index].default_flg, model_GBC.predict_proba(df.loc[validate_index])[:,1])
    # The depth is an integer: print it as one, aligned with the header widths.
    print('%8.6f %8.6f %4d' %  (AUC_train, AUC_validate, n))

   Train Validate    n
0.751094 0.720173 4.000000
0.763114 0.720981 5.000000
0.780682 0.719704 6.000000


In [34]:
# Logistic regression: sweep the inverse regularisation strength C (L2 penalty)
# and report ROC AUC on the train and validate splits.
print ('%8s %8s %4s' % ('Train','Validate','C'))
lr_train_X, lr_train_y = df.loc[train_index], y.loc[train_index].default_flg
lr_valid_X, lr_valid_y = df.loc[validate_index], y.loc[validate_index].default_flg
for C in [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100]:
    LR = LogisticRegression(C=C, penalty='l2')
    LR.fit(lr_train_X, lr_train_y)
    # Column 1 of predict_proba is used as the score.
    train_scores = LR.predict_proba(lr_train_X)[:, 1]
    valid_scores = LR.predict_proba(lr_valid_X)[:, 1]
    AUC_train = roc_auc_score(lr_train_y, train_scores)
    AUC_validate = roc_auc_score(lr_valid_y, valid_scores)
    print('%.6f %.6f %f' %  (AUC_train, AUC_validate, C))

   Train Validate    C
0.630605 0.620242 0.000010
0.705415 0.682263 0.000100
0.729035 0.711230 0.001000
0.730102 0.715028 0.010000
0.730218 0.715564 0.100000
0.730202 0.715605 1.000000
0.730198 0.715606 10.000000
0.730198 0.715601 100.000000
