In [26]:
import pandas as pd 
import seaborn as sns
import optuna
import numpy as np
import itertools
import dill

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve, auc, confusion_matrix, classification_report
from matplotlib import pyplot as plt
from optuna.pruners import HyperbandPruner
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
import warnings
warnings.filterwarnings("ignore")
optuna.logging.set_verbosity(optuna.logging.WARNING)
from datetime import datetime

In [34]:
# Directory prefix used for the input parquet (and reused when saving the model).
path = 'train_data/'

# Read the dataset this notebook works with, then show its size and first rows.
df_final = pd.read_parquet(f'{path}df_final.parquet')
print(df_final.shape)
df_final.head()

FileNotFoundError: [Errno 2] No such file or directory: 'train_data/df_final.parquet'

In [None]:
# Print the column listing, dtypes and non-null counts of the loaded frame.
df_final.info()

In [None]:
# Relative frequency of each value of the target column `flag`.
df_final['flag'].value_counts(normalize=True)

In [None]:
# Separate the target column `flag` from the remaining columns.
y = df_final['flag']
X = df_final.drop(columns=['flag'])

# Hold out 20% of the rows; stratifying on `y` keeps the `flag` ratio in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)



In [None]:
# Candidate classifiers, each created with default settings and seed 42.
# (CatBoost names its seed argument `random_seed`.)
models = [
    estimator_cls(**seed_kwargs)
    for estimator_cls, seed_kwargs in (
        (LogisticRegression, {'random_state': 42}),
        (RandomForestClassifier, {'random_state': 42}),
        (CatBoostClassifier, {'random_seed': 42}),
    )
]

In [None]:
# Baseline comparison: 3-fold cross-validated ROC-AUC of every candidate on the
# training split, with the `id` column removed from the features.
for candidate in models:
    started_at = datetime.now()
    print('Текущее время:', started_at)
    fold_scores = cross_val_score(
        candidate,
        X_train.drop(columns=['id']),
        y_train,
        cv=3,
        scoring='roc_auc',
        n_jobs=-1,
    )
    elapsed = datetime.now() - started_at
    print(' время ', elapsed)
    print('Модель: ', candidate)
    print('Roc-auc:', round(fold_scores.mean(), 4))
    print()

In [37]:
# CatBoostClassifier gave the best results

In [None]:
# Base CatBoost estimator (seed 42) used by the Optuna objective defined later in this notebook.
model = CatBoostClassifier(random_seed=42)

In [None]:
def filter_data(df):
    """Return a copy of ``df`` without the ``id`` column.

    The input frame is left untouched. pandas raises ``KeyError`` when the
    ``id`` column is not present.
    """
    unused_columns = ['id']
    return df.copy().drop(columns=unused_columns)

In [None]:
def new_features(df):
    """Return a copy of ``df`` with two columns derived from the ``is_zero_loans*`` columns.

    * ``is_zero_loans_risk``   - per row, how many of the five ``is_zero_loans*``
      values are below 0.6.
    * ``is_zero_loans_points`` - weighted sum of the same five columns with
      weights 1, 1.5, 2, 2.5 and 3 (in the order listed below).

    The input frame is not modified.
    """
    loan_columns = [
        'is_zero_loans5',
        'is_zero_loans530',
        'is_zero_loans3060',
        'is_zero_loans6090',
        'is_zero_loans90',
    ]
    loan_weights = [1, 1.5, 2, 2.5, 3]

    enriched = df.copy()
    enriched['is_zero_loans_risk'] = enriched[loan_columns].lt(0.6).sum(axis=1)

    # Accumulate strictly left to right so the arithmetic (and NaN propagation)
    # matches a plain chained `a*1 + b*1.5 + ...` expression.
    weighted_terms = [enriched[name] * weight
                      for name, weight in zip(loan_columns, loan_weights)]
    points = weighted_terms[0]
    for term in weighted_terms[1:]:
        points = points + term
    enriched['is_zero_loans_points'] = points

    return enriched

In [None]:
# Preprocessing chain: first drop `id`, then append the engineered loan features.
preprocessor = Pipeline([
    ('filter', FunctionTransformer(func=filter_data)),
    ('new_features', FunctionTransformer(func=new_features)),
])

In [None]:
def objective(trial):
    """Optuna objective: mean 3-fold ROC-AUC of the CatBoost pipeline.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``learning_rate`` (0.005-0.5),
        ``min_data_in_leaf`` (35-40) and ``depth`` (8-10); ``objective`` is a
        single-choice categorical fixed to ``"CrossEntropy"``.

    Returns
    -------
    float
        Mean ``roc_auc`` over 3 cross-validation folds on ``X_train`` / ``y_train``.
    """
    params = {
        "objective": trial.suggest_categorical("objective", ["CrossEntropy"]),
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.5),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 35, 40),
        "depth": trial.suggest_int("depth", 8, 10),
        }

    # Build a fresh estimator per trial from the base model's parameters instead
    # of calling set_params on the shared global `model`; otherwise every trial
    # leaves its hyper-parameters behind in notebook state.
    trial_model = CatBoostClassifier(**{**model.get_params(), **params})

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', trial_model)
    ])

    start = datetime.now()
    print('Текущая время ', start)
    score = cross_val_score(pipeline, X_train, y_train, cv=3, n_jobs=-1, scoring='roc_auc').mean()
    print(' время ', datetime.now()-start)
    return score

# `pruner` has to be a pruner *instance*, so HyperbandPruner is instantiated here
# rather than passed as a class.
# NOTE(review): `objective` never calls trial.report()/should_prune(), so the
# pruner currently has no intermediate values to act on.
study = optuna.create_study(direction="maximize", pruner=HyperbandPruner())
study.optimize(objective, n_trials=100)

# Report the best trial found by the search.
print('Лучшие гиперпараметры:', study.best_trial.params)
print('Лучшее значение roc-auc:', study.best_trial.value)


In [None]:
# CatBoost estimator with fixed hyper-parameters and seed 42.
catboost_best = CatBoostClassifier(
    objective='CrossEntropy',
    learning_rate=0.06553798310275494,
    max_depth=9,
    min_data_in_leaf=37,
    random_seed=42,
)


In [None]:
# Full pipeline: preprocessing followed by the configured CatBoost classifier.
catboost_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', catboost_best),
])


In [None]:
# Fit on the training split only. The cells that follow score this pipeline on
# X_test / y_test; fitting on the full X, y would let the model see the hold-out
# rows and inflate every reported test metric.
start = datetime.now()
print('Текущая время ', start)
catboost_pipeline.fit(X_train, y_train)
print(' время ', datetime.now()-start)

In [None]:
# Class predictions of the fitted pipeline for the hold-out features.
catboost_pred_test = catboost_pipeline.predict(X_test)

In [None]:
# Column 1 of predict_proba, i.e. the probability of the second class in the
# classifier's class order (presumably flag == 1 - verify against classes_).
catboost_probs = catboost_pipeline.predict_proba(X_test)[:, 1]

In [None]:
# Report ROC-AUC of the predicted probabilities against the hold-out labels.
print('Итоговая CatBoostClassifier')
test_roc_auc = roc_auc_score(y_test, catboost_probs)
print('Значение roc_auc на тестовой выборке: ', round(test_roc_auc, 4))

In [None]:
# Read feature names and importances from the fitted classifier step of the
# pipeline, then order them from least to most important.
catboost_feature_names = catboost_pipeline.named_steps['classifier'].feature_names_

catboost_importances = (
    pd.Series(
        catboost_pipeline.named_steps['classifier'].feature_importances_,
        index=catboost_feature_names,
    )
    .sort_values(ascending=True)
)

In [None]:
# Horizontal bar chart of the 15 largest importances (the series is sorted
# ascending, so they are its last 15 entries).
ax = catboost_importances.iloc[-15:].plot(kind='barh', figsize=(10, 4))
ax.set(title="CatBoostClassifier. Важность признаков (топ-15)")
ax.figure.tight_layout()


In [None]:
# ROC curve of the predicted probabilities on the hold-out labels.
fpr_cat, tpr_cat, treshold_cat = roc_curve(y_test, catboost_probs)
roc_auc_cat = auc(fpr_cat, tpr_cat)

plt.figure(figsize=(5, 4))
plt.plot(
    fpr_cat, tpr_cat,
    color='darkorange',
    label=f'CatBoostClassifier (area = {roc_auc_cat:.3f})',
)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.gca().set(
    xlim=[0.0, 1.0],
    ylim=[0.0, 1.05],
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='График ROC-кривой',
)
plt.legend(loc="lower right")
plt.show()


In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix; rows are labelled as true labels and columns as
        predicted labels on the plot.
    classes : sequence
        Tick labels for both axes.
    normalize : bool, default False
        When True, every row is divided by its sum so cells show fractions
        (printed with two decimals). When False the values are printed
        with the ``'d'`` format, as given.
    title : str
        Plot title.
    cmap : matplotlib colormap
        Colormap passed to ``plt.imshow``.
    """
    cm = np.asarray(cm)
    if normalize:
        # Actually normalize the matrix; without this step `normalize=True`
        # would only change the number format. Rows that sum to zero stay at
        # zero instead of turning into NaN/inf.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    
# Plot raw counts of hold-out labels vs. predicted classes.
plot_confusion_matrix(confusion_matrix(y_test, catboost_pred_test), classes=['0','1'], title='CatBoostClassifier')

In [None]:
# Text report built by sklearn's classification_report for the hold-out predictions.
print(classification_report(y_test, catboost_pred_test))

In [None]:
# Pair every hold-out `id` with its predicted class; the frame keeps X_test's index.
test_sub = pd.DataFrame({
    'id': X_test['id'],
    'predictions': catboost_pred_test,
})

test_sub.head()


In [None]:
# Write the predictions to CSV in the working directory.
# NOTE(review): the row index is written as an extra unnamed first column;
# pass index=False if only `id` and `predictions` are wanted - confirm with the consumer.
test_sub.to_csv('test_sub.csv')

In [None]:
def filter_data(df):
    """Return a copy of ``df`` without the ``id`` column.

    The input frame is left untouched. pandas raises ``KeyError`` when the
    ``id`` column is not present.
    """
    unused_columns = ['id']
    return df.copy().drop(columns=unused_columns)


In [None]:
def new_features(df):
    """Return a copy of ``df`` with two columns derived from the ``is_zero_loans*`` columns.

    * ``is_zero_loans_risk``   - per row, how many of the five ``is_zero_loans*``
      values are below 0.6.
    * ``is_zero_loans_points`` - weighted sum of the same five columns with
      weights 1, 1.5, 2, 2.5 and 3 (in the order listed below).

    The input frame is not modified.
    """
    loan_columns = [
        'is_zero_loans5',
        'is_zero_loans530',
        'is_zero_loans3060',
        'is_zero_loans6090',
        'is_zero_loans90',
    ]
    loan_weights = [1, 1.5, 2, 2.5, 3]

    enriched = df.copy()
    enriched['is_zero_loans_risk'] = enriched[loan_columns].lt(0.6).sum(axis=1)

    # Accumulate strictly left to right so the arithmetic (and NaN propagation)
    # matches a plain chained `a*1 + b*1.5 + ...` expression.
    weighted_terms = [enriched[name] * weight
                      for name, weight in zip(loan_columns, loan_weights)]
    points = weighted_terms[0]
    for term in weighted_terms[1:]:
        points = points + term
    enriched['is_zero_loans_points'] = points

    return enriched

In [None]:
# Preprocessing chain: first drop `id`, then append the engineered loan features.
preprocessor = Pipeline([
    ('filter', FunctionTransformer(func=filter_data)),
    ('new_features', FunctionTransformer(func=new_features)),
])

In [None]:
# CatBoost estimator with fixed hyper-parameters and seed 42.
catboost_best = CatBoostClassifier(
    objective='CrossEntropy',
    learning_rate=0.06553798310275494,
    max_depth=9,
    min_data_in_leaf=37,
    random_seed=42,
)


In [None]:
# Full pipeline: preprocessing followed by the configured CatBoost classifier.
catboost_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', catboost_best),
])


In [None]:
# Train the pipeline on all available rows (X, y) and print how long it took.
fit_started = datetime.now()
print('Текущая время ', fit_started)
catboost_pipeline.fit(X, y)
fit_elapsed = datetime.now() - fit_started
print(' время ', fit_elapsed)

In [None]:
# Serialize the fitted pipeline together with descriptive metadata using dill.
with open(path + 'credit_risk', 'wb') as out_file:
    dill.dump(
        dict(
            model=catboost_pipeline,
            metadata=dict(
                name='Credit_risk',
                author='Zulfiya Usmonova',
                version='1',
                date='10.07.2024',
            ),
        ),
        out_file,
    )
