# Модель кредитного скоринга для магистратуры Альфа-Банк + МФТИ
### разработал Журавлев Александр, студент НГУ

In [None]:
from gc import collect
from scipy import stats
import pandas as pd
#import pyarrow
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold , cross_validate
from lightgbm import plot_importance
from catboost import CatBoostClassifier
from xgboost import XGBClassifier 
import xgboost as xgb
import optuna
from optuna.samplers import TPESampler
from optuna.integration import CatBoostPruningCallback

# Импорт данных
### Импорт в 2 датафрейма: оперативной памяти не хватило, чтобы обработать все данные в одном

In [None]:
# First group of training shards (files 7-11), stacked into a single frame
# with a fresh 0..n-1 index.
train_data_0 = pd.concat(
    [pd.read_parquet(f'train_data_{part}.pq') for part in (7, 8, 9, 10, 11)],
    ignore_index=True,
)
# Disabled experiments, kept for reference: dropping rows with rn > 25 and
# dropping the pre_since_opened / pre_since_confirmed / pre_till_pclose columns.
#train_data_0.drop(train_data_0[train_data_0['rn']>25].index,axis=0,inplace=True)
#train_data_0.drop(['pre_since_opened','pre_since_confirmed','pre_till_pclose'],axis=1,inplace=True)


In [None]:
# Second group of training shards; the shard order (1, 0, 2..6) is kept
# exactly as it was so the row order of the frame does not change.
train_data_1 = pd.concat(
    [pd.read_parquet(f'train_data_{part}.pq') for part in (1, 0, 2, 3, 4, 5, 6)],
    ignore_index=True,
)

# Both test shards in one frame.
test_data = pd.concat(
    [pd.read_parquet(f'test_data_{part}.pq') for part in (0, 1)],
    ignore_index=True,
)

# Target table for training; test_target.csv is read twice, once per name.
target_train = pd.read_csv('train_target.csv')
target_test = pd.read_csv('test_target.csv')
test_id = pd.read_csv('test_target.csv')

# Show every row and every column when a frame is displayed.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# Разведывательный анализ исходных данных

In [None]:
# Summary statistics of the first training frame (displayed as the cell output).
train_data_0.describe()
# Same check for the second frame, currently disabled:
#train_data_1.describe()

In [None]:
# Column dtypes, non-null counts and memory usage of the first training frame.
train_data_0.info()
# Same check for the second frame, currently disabled:
#train_data_1.info()

In [None]:
# Number of missing values in each column of the first training frame.
train_data_0.isna().sum()

### Агрегирование категориальных признаков в сумму фиктивных переменных

In [None]:
    
def aggregations(data_frame: pd.DataFrame) -> pd.DataFrame:
    """Count, per ``id``, how often each value of every feature occurs.

    Every column except ``id`` and ``rn`` is one-hot encoded and the dummy
    columns are summed inside each ``id`` group.

    Args:
        data_frame: Frame with an ``id`` column, an ``rn`` column and the
            feature columns to encode. It is not modified.

    Returns:
        A frame with one row per ``id``: the ``id`` column first, followed
        by one ``<column>_<value>`` count column per dummy.

    Raises:
        ValueError: If ``id`` or ``rn`` is not among the columns.
    """
    feature_columns = list(data_frame.columns.values)
    feature_columns.remove("id")
    feature_columns.remove("rn")

    dummies = pd.get_dummies(data_frame[feature_columns], columns=feature_columns)

    # Group the dummies directly by the id column instead of first gluing them
    # onto a copy of the raw frame and dropping the raw columns again: same
    # result without the extra full-size intermediate frame.
    counts = dummies.groupby(data_frame["id"]).sum()
    return counts.reset_index(drop=False)

In [None]:
# Aggregate the first training frame and attach the target table by id.
features_0 = aggregations(train_data_0)
train_df_0 = target_train.merge(features_0, on='id')

# Model inputs are all merged columns except the key and the label.
feature_cols0 = train_df_0.columns.tolist()
feature_cols0.remove("id")
feature_cols0.remove("flag")

# Label vector for this half.
y0 = train_df_0['flag'].to_numpy()

In [None]:
# Aggregate the second training frame and attach the target table by id.
features_1 = aggregations(train_data_1)
train_df_1 = target_train.merge(features_1, on='id')

# Model inputs are all merged columns except the key and the label.
feature_cols1 = train_df_1.columns.tolist()
feature_cols1.remove("id")
feature_cols1.remove("flag")

# Label vector for this half.
y1 = train_df_1['flag'].to_numpy()

In [None]:

# Wrap both label vectors in single-column frames so they can be stacked
# in the same order as the feature frames.
y0df = pd.DataFrame({'flag': y0})
y1df = pd.DataFrame({'flag': y1})

# Stack the two halves: first half on top, second half below.
train_df = pd.concat([train_df_0, train_df_1])
y = pd.concat([y0df, y1df])['flag'].to_numpy()

In [None]:

# Replace missing values with 0, modifying train_df in place.
# presumably these come from dummy columns that exist in only one of the two
# halves stacked into train_df — TODO confirm
train_df.fillna(0,inplace=True)

### Сопоставляем колонки, чтобы размерность признаков в train и test совпадала

In [None]:
# Aggregate the test set the same way as the training frames.
features_sub = aggregations(test_data)


feature_cols_sub = list(features_sub.columns.values)
feature_cols_sub.remove('id')

# Keep only the features present in both training halves and in the test set.
# sorted() pins the column order: iterating a bare set of strings gives an
# order that can change between interpreter runs (hash randomisation), which
# would make the feature order of X, and so the fitted models, irreproducible.
feature_cols_both = sorted(set(feature_cols0) & set(feature_cols1) & set(feature_cols_sub))

X = train_df[feature_cols_both]

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(X, y,
                                                  test_size=0.2, stratify=y,
                                                  random_state=42)
# Drop the large intermediates and trigger garbage collection to free memory.
del features_0, features_1, train_df_0, train_df_1, train_data_0, train_data_1
collect()

# CatBoost


## catboost optuna optimization

In [None]:
def objective(trial):
    """Optuna objective for CatBoost: worst-fold ROC AUC on 4 stratified folds.

    Samples a CatBoost configuration from ``trial``, cross-validates it on the
    module-level ``X`` / ``y`` and returns the lowest per-fold score, so a
    trial is judged by its weakest fold rather than its average.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The minimum of the per-fold ``roc_auc`` test scores.
    """
    param = {
        'iterations': 1500,
        # The learning rate is continuous, so it is sampled as a float
        # (log scale, since the range spans two orders of magnitude).
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        "objective": trial.suggest_categorical("objective", ["Logloss"]),
        'od_wait': trial.suggest_int('od_wait', 1, 100),
        # CatBoost accepts tree depths up to 16 only.
        "depth": trial.suggest_int("depth", 1, 16),
        # suggest_float replaces the deprecated suggest_uniform.
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 100),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 100),
        'random_strength': trial.suggest_float('random_strength', 1, 100),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        "task_type": "GPU",
    }
    # Bootstrap-specific parameters are only sampled for the type they apply to.
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        # subsample is a fraction of the rows, so it cannot exceed 1.
        param["subsample"] = trial.suggest_float("subsample", 0.01, 1.0, log=True)

    cv = StratifiedKFold(shuffle=True, random_state=42, n_splits=4)

    model = CatBoostClassifier(**param)

    # Only the scores are needed, so the fitted estimators are not kept.
    cross_valid = cross_validate(model, X, y, scoring='roc_auc', cv=cv)

    print(cross_valid['test_score'])

    return min(cross_valid['test_score'])

In [None]:
# TPE sampler with a fixed seed for the hyperparameter search.
sampler = TPESampler(seed=1)
study = optuna.create_study(
    direction="maximize",
    sampler=sampler,
    study_name="catboost",
)
# Run 200 trials of the objective defined above.
study.optimize(objective, n_trials=200)

# XGBoost

## baseline модель

In [None]:
# Baseline XGBoost configuration.
# Previously tried and currently disabled: 'lambda' = 33, 'num_round' = 1000,
# 'early_stopping_rounds' = 150.
params = dict(
    device='cuda',
    verbosity=1,
    alpha=48,
    eta=0.003,                # learning rate
    max_depth=49,
    gamma=2,                  # larger -> less overfitting
    min_child_weight=1.5,     # larger -> more conservative (default is 1)
    subsample=0.5,
    tree_method='hist',
    objective='binary:logistic',
    eval_metric='auc',
    seed=42,
    n_estimators=16000,
)

# Fit on the training split, reporting the metric on the hold-out split.
model2 = XGBClassifier(**params)
model2.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True)

In [None]:
# ROC AUC of model2 on the hold-out split, using the positive-class probability.
roc_auc_score(y_val, model2.predict_proba(X_val)[:,1])
# Scores recorded from earlier runs; the labels appear to refer to the
# row-number (rn) cut-off and NaN handling used for that run:
# 0.771165896084853  rn >20
# 0.7717232977884556 rn >23
# 0.7717340881028871 rn >24
# 0.7714754061297703 rn >25, without na cols
# 0.7722474759161622 rn >25 BEST,  0.7719390458455615 0.77177
# 0.772066303790381 rn >26
# 0.7719874717352808 rn >27
# 0.7720687457916587 rn >30
# 0.7720268479841006 rn >42 0.7718465577887845
# 0.7718815699102753 all features
# 0.771790192380146 rn >36
# 0.772010203020203 rn not cut off 
# 0.7721663311507744 rn not cut off and na to 0
#0.7684390792514973

## xgboost optuna


In [None]:
def objective(trial):
    """Optuna objective for XGBoost: mean test AUC from ``xgb.cv``.

    Samples a booster configuration from ``trial``, cross-validates it on the
    module-level ``X`` / ``y`` with early stopping and Optuna pruning, and
    returns the mean test AUC of the last row of the CV history.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The last ``test-auc-mean`` value reported by ``xgb.cv``.
    """
    dtrain = xgb.DMatrix(X, label=y)

    # The native xgb.cv API takes the number of boosting rounds through its
    # num_boost_round argument (default 10), not through an "n_estimators"
    # entry in the params dict. The value is still sampled under the name
    # "n_estimators" so the keys in study.best_params stay the same.
    num_rounds = trial.suggest_int("n_estimators", 100, 15000)

    param = {
        'random_state': trial.suggest_categorical('random_state', [42]),
        "device": "cuda",
        "verbosity": 1,
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 50.0, log=True),
    }
    # Tree-specific parameters are only sampled for the tree boosters.
    if param["booster"] in ("gbtree", "dart"):
        param["max_depth"] = trial.suggest_int("max_depth", 1, 45)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
    # Dropout parameters only exist for the dart booster.
    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    # Lets Optuna stop unpromising trials based on the CV test AUC.
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "test-auc")
    history = xgb.cv(
        param,
        dtrain,
        num_boost_round=num_rounds,
        callbacks=[pruning_callback],
        early_stopping_rounds=50,
    )
    mean_auc = history["test-auc-mean"].values[-1]
    return mean_auc


In [None]:
# Hyperband pruning for the XGBoost search; 500 trials, maximising the objective.
pruner = optuna.pruners.HyperbandPruner()
study = optuna.create_study(direction="maximize", pruner=pruner)
study.optimize(objective, n_trials=500)

# Report how many trials ran and the best one found.
print(f"Number of finished trials: {len(study.trials)}")

trial = study.best_trial
print("Best trial:")
print(f"  Value: {trial.value}")

print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

# Отправка результатов

In [None]:

# Build the submission features with the aggregation function defined in this
# notebook (`aggregations`); `extract_count_aggregations` is not defined here.
test_features = aggregations(test_data)

# Select the model's feature columns and fill gaps with 0. The non-inplace
# fillna avoids modifying a column-selected frame in place.
X_sub = test_features[feature_cols_both].fillna(0)

# Class probabilities from the fitted model; column 1 is used for the score.
preds = model2.predict_proba(X_sub)

In [None]:
# Number of distinct per-column NaN counts in X_sub.
# presumably a sanity check that should show 1 (every column has zero NaNs)
# after the fill — TODO confirm
X_sub.isna().sum().nunique()

In [None]:
# Pair each test id with its positive-class probability.
# NOTE(review): this assumes the rows of test_id are in the same order as the
# rows that produced preds — confirm.
submission_ids = test_id["id"].to_numpy()
positive_scores = preds[:, 1]
submission = pd.DataFrame({"id": submission_ids, "score": positive_scores})

In [None]:
# Write the submission without the row index. `index` is documented as a
# boolean, so pass False explicitly instead of relying on None being falsy.
submission.to_csv('submission.csv', index=False)