In [493]:
import numpy as np
import shap
from sklearn.inspection import permutation_importance
import pandas as pd
from sklearn.decomposition import PCA
from utils_ import feat_imp,reduce_mem_usage,make_feat,freq_enc,fill_cat_p,st_scal
from sklearn.model_selection import cross_val_score,train_test_split
import datetime
from sklearn.metrics import roc_auc_score,roc_curve
import lightgbm
import xgboost
import catboost
from sklearn.base import clone
from itertools import combinations
from sklearn.model_selection import KFold, StratifiedKFold,GroupKFold,cross_validate
from scipy.stats import ttest_rel

In [225]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

In [449]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [469]:
def make_pca(x, n=5):
    
    x_train=x.copy()
    
    for col in x_train.columns:
        
        x_train[col]=st_scal(x_train[col])
        
        pca = PCA(n_components=n)
        embedding = pca.fit_transform(x_train.fillna(0.001))
        emb_frame=pd.DataFrame(embedding)
        columns=emb_frame.columns.astype(str)
        emb_frame.columns=columns+'_'+str(x_train.shape[1])
        
        x_train=pd.concat((x_train.reset_index(drop=True),emb_frame), axis=1)
        
        return x_train
        
    

In [447]:
def make_feat(data,feat_list,flag=['s','p','d','m'] ):
    
    x=data.copy()   
    x.fillna(0.001, inplace=True)

    

    for item in combinations(feat_list, 2):
        if 's' in flag :
            feat_name = item[0] + '_' + item[1] + 'sum'
            x[feat_name] = x[item[0]] + x[item[1]]

        if 'p' in flag:
            feat_name = item[0] + '_' + item[1] + 'mult'
            x[feat_name] = x[item[0]] * x[item[1]]

        if 'd' in flag:
            feat_name = item[0] + '_' + item[1] + 'div'
            x[feat_name] = x[item[0]] / x[item[1]]

        if 'm' in flag :
            feat_name = item[0] + '_' + item[1] + 'min'
            x[feat_name] = x[item[0]]-x[item[1]]

        x = x.replace(np.inf, np.nan)
        x = x.replace(-np.inf, np.nan)

    return x

In [494]:
class Credit_Profile:
    
    def __init__(self, x):
        
        self.data_prep =  self.prep_data(x)
        self.data=self.data_prep.drop('TARGET',axis=1)
        self.target=self.data_prep.TARGET
        self.trained_models=[]
        
 
    
    def prep_data(self,x):
        
        
        bki = self.prep_bki()
        client_profile=self.prep_client_profile()
        payments=self.prep_payments()
        applications_history=self.prep_applications_history()

        data_list={
            'bki_gr':bki,
            'client_profile':client_profile,
               'payments_gr':payments,
               'applications_history_gr':applications_history
              }

        total = x.copy()

        for data in data_list.values():

            total=total.merge(data,on='APPLICATION_NUMBER',how='left')

        total=total.replace(np.inf,np.nan)
        total=total.replace(-np.inf,np.nan)
        train_prep=total

        train_prep.EDUCATION_LEVEL=fill_cat_p(train_prep.EDUCATION_LEVEL)
        train_prep.GENDER=fill_cat_p(train_prep.GENDER)
        train_prep.FAMILY_STATUS=fill_cat_p(train_prep.FAMILY_STATUS)
        train_prep.CODE_REJECT_REASON=fill_cat_p(train_prep.CODE_REJECT_REASON)
        train_prep.NAME_PAYMENT_TYPE=fill_cat_p(train_prep.NAME_PAYMENT_TYPE)
        train_prep.NAME_CONTRACT_STATUS=fill_cat_p(train_prep.NAME_CONTRACT_STATUS)

        t=train_prep.groupby('APPLICATION_NUMBER', as_index=False).PREV_APPLICATION_NUMBER_x.count()
        train_prep=train_prep.merge(t, on='APPLICATION_NUMBER',how='left')    
        train_prep['TOTAL_AMOUNT_CREDIT_d'] = train_prep.TOTAL_SALARY/train_prep.AMOUNT_CREDIT_x
        train_prep['TOTAL_AMOUNT_CREDIT_m'] = train_prep.TOTAL_SALARY-train_prep.AMOUNT_CREDIT_x
        train_prep['TOTAL_FAMILY_SIZE'] = train_prep.TOTAL_SALARY/train_prep.FAMILY_SIZE
        train_prep['AMOUNT_CREDIT_FAMILY_SIZE'] = train_prep.AMOUNT_CREDIT_x/train_prep.FAMILY_SIZE 
        train_prep['TOTAL_AMOUNT_CREDIT_p'] = (train_prep.TOTAL_SALARY-train_prep.AMOUNT_CREDIT_x)/train_prep.TOTAL_SALARY

        train_prep['DAYS_AMOUNT_CREDIT'] = train_prep.AMOUNT_CREDIT_x/train_prep.DAYS_CREDIT
        train_prep['DAY_OVERDUE_AMOUNT_CREDIT'] = train_prep.AMOUNT_CREDIT_x/train_prep.CREDIT_DAY_OVERDUE
        train_prep['CREDIT_ENDDATE_AMOUNT_CREDIT'] = train_prep.AMOUNT_CREDIT_x/train_prep.DAYS_CREDIT_ENDDATE
        train_prep['ENDDATE_FACT_AMOUNT_CREDIT'] = train_prep.AMOUNT_CREDIT_x/train_prep.DAYS_ENDDATE_FACT
        train_prep['CREDIT_BUREAU_AMOUNT_CREDIT'] = train_prep.AMOUNT_CREDIT_x/train_prep.AMT_REQ_CREDIT_BUREAU
        train_prep['FIRST_DRAWING_DAYS_CREDIT'] = train_prep.DAYS_FIRST_DRAWING/train_prep.DAYS_CREDIT
        train_prep['DAYS_INSTALMENT_FIRST_DRAWING'] = train_prep.DAYS_INSTALMENT/train_prep.DAYS_FIRST_DRAWING
        train_prep['GENDER_EDUCATION_LEVEL'] = train_prep.GENDER+train_prep.EDUCATION_LEVEL
        train_prep['GENDER_FAMILY_STATUS'] = train_prep.GENDER+train_prep.FAMILY_STATUS

        train_prep=pd.get_dummies(train_prep)
        train_prep=train_prep.groupby('APPLICATION_NUMBER', as_index=False).mean()
        train_prep=train_prep.drop('APPLICATION_NUMBER', axis=1)

        train_prep=train_prep.replace(np.inf,np.nan)
        train_prep=train_prep.replace(-np.inf,np.nan)
        
        
        return train_prep

        
    def prep_bki(self):
    
        bki = pd.read_csv('bki.csv')
    
        CREDIT_TYPE_list=['Consumer credit', 'Credit card', 'Car loan', 'Mortgage',
               'Loan for business development', 'Microloan']   
        cat_feat=['APPLICATION_NUMBER']
        cat_feat.extend(bki.select_dtypes(include='object').columns.tolist())
        bki_gr=bki.groupby(cat_feat, as_index=False).mean()
  
    
        return bki_gr
    
    def prep_client_profile(self):
    
        client_profile = pd.read_csv('client_profile.csv ')

        client_profile.FAMILY_STATUS.replace('Unknown','Married', inplace=True)
        client_profile = pd.read_csv('client_profile.csv ')
        client_profile.fillna(0.01)
        client_profile.FAMILY_STATUS.replace('Unknown','Married', inplace=True)
        client_profile['SALARY_AMOUNT_ANNUITY']= client_profile.TOTAL_SALARY-client_profile.AMOUNT_ANNUITY
        client_profile['SALARY_AMOUNT_ANNUITY_SIZE']= client_profile.TOTAL_SALARY-\
        client_profile.AMOUNT_ANNUITY/client_profile.FAMILY_SIZE    
        client_profile.GENDER.fillna('F',inplace=True)    
        client_profile['TOTAL_SALARY_POPULATION']= client_profile.TOTAL_SALARY*client_profile.REGION_POPULATION
        client_profile['AGE_AMOUNT_ANNUITY']= client_profile.AMOUNT_ANNUITY/client_profile.AGE
        client_profile['AGE_AMOUNT_CREDIT']= client_profile.AMOUNT_CREDIT/client_profile.AGE
        client_profile['AGE_CAR_AGE']= client_profile.OWN_CAR_AGE/client_profile.AGE
        client_profile['LAST_JOB_CAR_AGE']= client_profile.OWN_CAR_AGE/client_profile.DAYS_ON_LAST_JOB
        client_profile['AGE_SALARY']= client_profile.TOTAL_SALARY/client_profile.AGE
        client_profile['LAST_JOB_AMOUNT_ANNUITY']= client_profile.AMOUNT_ANNUITY/client_profile.DAYS_ON_LAST_JOB
        client_profile['LAST_JOB_AMOUNT_CREDIT']= client_profile.AMOUNT_CREDIT/client_profile.DAYS_ON_LAST_JOB
        client_profile['LAST_JOB_SALARY']= client_profile.TOTAL_SALARY/client_profile.DAYS_ON_LAST_JOB
        client_profile['LAST_JOB_AGE']= client_profile.AGE/client_profile.DAYS_ON_LAST_JOB
        client_profile['CREDIT_SCORING_RATING_1']= client_profile.AMOUNT_CREDIT*client_profile.EXTERNAL_SCORING_RATING_1
        client_profile['CREDIT_SCORING_RATING_2']= client_profile.AMOUNT_CREDIT*client_profile.EXTERNAL_SCORING_RATING_2
        client_profile['CREDIT_SCORING_RATING_3']= client_profile.AMOUNT_CREDIT*client_profile.EXTERNAL_SCORING_RATING_3   
        client_profile['ANNUITY_SCORING_RATING_1']= client_profile.AMOUNT_ANNUITY*client_profile.EXTERNAL_SCORING_RATING_1
        client_profile['ANNUITY_SCORING_RATING_2']= client_profile.AMOUNT_ANNUITY*client_profile.EXTERNAL_SCORING_RATING_2
        client_profile['ANNUITY_SCORING_RATING_3']= client_profile.AMOUNT_ANNUITY*client_profile.EXTERNAL_SCORING_RATING_3
        client_profile['CREDIT_SCORING_RATING_p']= client_profile['EXTERNAL_SCORING_RATING_1']*client_profile['EXTERNAL_SCORING_RATING_2']*client_profile['EXTERNAL_SCORING_RATING_3']
        client_profile['AMT_REQ_CREDIT_BUREAU']=(client_profile.AMT_REQ_CREDIT_BUREAU_HOUR+
                                             24*client_profile.AMT_REQ_CREDIT_BUREAU_HOUR+
                                             30*24*client_profile.AMT_REQ_CREDIT_BUREAU_DAY+
                                             30*24*30*client_profile.AMT_REQ_CREDIT_BUREAU_MON+
                                             30*24*30*3*client_profile.AMT_REQ_CREDIT_BUREAU_DAY+
                                             30*24*30*3*4*client_profile.AMT_REQ_CREDIT_BUREAU_DAY)
        return client_profile
    
    def prep_payments(self):
    
        payments= pd.read_csv('payments.csv')

        payments.DAYS_ENTRY_PAYMENT.fillna(payments.DAYS_INSTALMENT, inplace=True)
        payments.AMT_PAYMENT.fillna(payments.AMT_INSTALMENT, inplace=True)
        payments=pd.get_dummies(payments)
        cat_feat=['APPLICATION_NUMBER']
        cat_feat.extend(payments.select_dtypes(include='object').columns.tolist())
        payments_gr=payments.groupby(cat_feat, as_index=False).mean()

        return payments_gr
    
    def prep_applications_history(self):
    
        applications_history= pd.read_csv('applications_history.csv ')

        cat_feat=['APPLICATION_NUMBER']
        cat_feat.extend(applications_history.select_dtypes(include='object').columns.tolist())

        applications_history_gr=applications_history.groupby(cat_feat, as_index=False).mean()

        return applications_history_gr
    
    
#   генерация новых фичей.
#   генерация происходит итерационно по заранее определенным маякам
#   маяки определяются путем удаления малозначимых фичей до момента, когда скор начинает значимо снижатся
#   в качестве основы для будущих фичей выступает топ-20 значимых фичей маяка
#   опционально можно добавить к конечному датасету pca

    def make_features(self,data,filtr_, iter_,preparation=False, pca=False, n_pca=5, *args, **kwargs):
        
        data_f=data.copy()
        
        if preparation:
            
            data_f=self.prep_data(data_f)
            data_f=data_f[filtr_[i]]
        
        for i in range(iter_):
                        
            
            data_f=make_feat(data_f, filtr_[i][:20])
            
        data_f=data_f[filtr_[iter_]]
            
        if pca:
            
            data_f=make_pca(data_f,n_pca)    
            
        return data_f
    
    
    
    def make_predict(self,test,
                     models,models_params=[],n=5, gen_feats=True,make_feat_params={},*args, **kwargs):
        
        predict=[]
        ооf_score=[]
        trained_models=[]
        
     
       
        test_prep=self.prep_data(test)
        
        if gen_feats:
            
            test_prep=self.make_features(test_prep,**make_feat_params)
            self.data=self.make_features(self.data,**make_feat_params)
            
               
        for k in range(n):
    
            kf=StratifiedKFold(n_splits=7, random_state=101+k)
            
               
            for train_index, test_index in kf.split(self.data,self.target):
                
                X_train, X_test = self.data.iloc[train_index], self.data.iloc[test_index]
                y_train, y_test = self.target[train_index], self.target[test_index]
       
        
                for model, param in zip(models,models_params):
                    
                    model.fit(X=X_train,y=y_train)
                    pred=model.predict_proba(test_prep)[:,1]
                    oof_pred=model.predict_proba(X_test)[:,1]
                    predict.append(pred)
                    ооf_score_pred=roc_auc_score(y_test,oof_pred)
                    ооf_score.append(ооf_score_pred)
                    self.trained_models.append((clone(model),ооf_score_pred))
                    
                    
#         возможны варианты

        print(f'ооf_score_mean: {np.mean(ооf_score)}, ооf_score_std: {np.std(ооf_score)}')
    
    
        return np.mean(predict,axis=0)
    
    def prepare_submit(self,test,pred_params,get_pred=True):
               
            pred_test=test.copy()
           
            if get_pred:
            
                pred_test=make_predict(pred_test,**pred_params)
            
            df_test= pd.DataFrame({'APPLICATION_NUMBER':test['APPLICATION_NUMBER'], 'TARGET': pred_test}, )
            df_test.to_csv("sample_submit.csv", index=False)
                
    


In [502]:
if __name__ == "__main__":

    credit_profile=Credit_Profile(train)

In [486]:
credit_profile.data

Unnamed: 0,BUREAU_ID,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,...,GENDER_FAMILY_STATUS_FMarried,GENDER_FAMILY_STATUS_FSeparated,GENDER_FAMILY_STATUS_FSingle / not married,GENDER_FAMILY_STATUS_FWidow,GENDER_FAMILY_STATUS_MCivil marriage,GENDER_FAMILY_STATUS_MMarried,GENDER_FAMILY_STATUS_MSeparated,GENDER_FAMILY_STATUS_MSingle / not married,GENDER_FAMILY_STATUS_MWidow,GENDER_FAMILY_STATUS_XNAMarried
0,6.191950e+06,807.250000,0.0,473.500000,788.750000,854.1075,0.0,1.693390e+05,8.192700e+04,10662.855,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,,,,,,,,,,,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5.965158e+06,2037.500000,0.0,1308.000000,1276.000000,19305.0000,0.0,3.917700e+05,,,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110088,6.100735e+06,666.416667,0.0,10877.111111,538.416667,5252.2200,0.0,4.234575e+05,3.383752e+05,0.000,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
110089,6.788849e+06,1140.500000,0.0,1708.625000,806.250000,,0.0,2.698644e+06,2.198827e+06,247496.445,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
110090,5.709132e+06,1011.000000,0.0,816.000000,,,0.0,6.750000e+05,4.169835e+05,0.000,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
110091,,,,,,,,,,,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [487]:
imp_feat0=pd.read_csv('models/feature0_2021-04-22_044833').features.tolist()
imp_feat1=pd.read_csv('models/feature1_2021-04-22_830130').features.tolist()
imp_feat2=pd.read_csv('models/feature3_2021-04-23_428041').features.tolist()

filtr_=[imp_feat0,imp_feat1,imp_feat2]

In [498]:
xg_params = {
    "booster": "gbtree",
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "learning_rate": 0.06,
#     'tree_method':'gpu_hist', 
#     'gpu_id':0,
#     'colsample_bynode':0.8,
#     'subsample':0.95,
#     'num_parallel_tree':3,
    'reg_alpha':0.6,
    'min_child_weight':30,
#     'max_delta_step':50,
#     "early_stopping_rounds": 100,
    "n_estimators": 600,
    "reg_lambda": 100,
    "max_depth": 4,
    "gamma": 4,
    "nthread": 6,
    "seed": 101
}


In [499]:
cb_params = {
#     "n_estimators": 2000,
    "learning_rate": 0.03,
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "CPU",
    "max_bin": 20,
    "verbose": 0,
    'min_data_in_leaf': 100,
#     'boosting_type':'Ordered',
#     'grow_policy':'SymmetricTree',
#     'cat_features':cat_feat,
    "max_depth": 6,
    "l2_leaf_reg": 200,
    "early_stopping_rounds": 200,
    "thread_count": 10,
    'random_seed': 101,
}

In [500]:
xgb = xgboost.XGBClassifier(**xg_params)
cgb = catboost.CatBoostClassifier(**cb_params)
models=[cgb,xgb]
models_params=[cb_params,xg_params]

In [501]:
make_feat_params={'filtr_':filtr_,'iter_': 2}

In [468]:

credit_profile.make_predict(test,models,models_params,make_feat_params=make_feat_params)

ооf_score_mean: 0.7286592293148201, ооf_score_std: 0.0035364693573328547


array([0.07099273, 0.05581475, 0.04247319, ..., 0.08529904, 0.06033155,
       0.08521349])