In [5]:
import pandas as pd
import numpy as np

from sklearn.cluster import DBSCAN, OPTICS
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score,fbeta_score, f1_score, accuracy_score, recall_score, precision_score, confusion_matrix

from catboost import CatBoostClassifier
import shap

## Определяем типы переменных

In [8]:
data = pd.read_csv('df_promejut.csv')

#отбираем строковые переменные (КАТЕГОРИАЛЬНЫЕ)
xxx = data.select_dtypes(include = 'object')
xxx.head()

'''
смотрим: client_id - не признак
dt - дата
остальные: категориальные.
Правда, среди них могут быть и бинарные признаки. Выделим их:
'''
trash = ['client_id']
dates = ['dt']
xxx_categoricals = list(set(xxx.columns) - set(trash) - set(dates))

binaries = []
categoricals = []
for fea in xxx_categoricals:
    l = len(data[fea].value_counts())
    if l == 2:
        binaries.append(fea)
    elif l <= 1:
        trash.append(fea)
    else: #больше 2 уникальных значений за исключением nan
        categoricals.append(fea)
    
#отбираем числовые переменные
xxx = data.select_dtypes(include = 'number')
xxx.head()

'''
И как следовало ожидать, остальные 284 - числовые
target - исключаем из рассмотрения
'''
xxx.drop('target', axis = 1, inplace = True)
trash.append('target')

continuous = []
for fea in xxx.columns:
    l = len(data[fea].value_counts())
    if l == 2:
        binaries.append(fea)
    elif l <= 1:
        trash.append(fea)
    elif l>2 and l<=10: #больше 2 уникальных значений, но не более 10 за исключением nan
        categoricals.append(fea)
    else:
        continuous.append(fea)

categoricals.remove('hdb_bki_total_currency')
categoricals.remove('hdb_bki_active_oth_cnt')
continuous.append('hdb_bki_total_currency')
continuous.append('hdb_bki_active_oth_cnt')

del data

# '''
# НА ВЫХОДЕ:

# binaries - бинарные признаки
# categoricals - категориальные признаки
# continuous - непрерывные признаки
# dates - даты
# trash - мусор
# '''

## Заполням пропуски

In [None]:
df = pd.read_csv('df_promejut.csv')

Удалим строки с NA

In [4]:
df[df['gender'].isna()]

Unnamed: 0,client_id,dt,accountsalary_flag,country,profit_income_out_rur_amt_6m,clientoutflowstatus,curbal_usd_amt_cm_avg,stratsegfactor,srvpackage,clientsegment,...,transaction_category_cash_percent_cnt_2m,transaction_category_hotels_sum_cnt_m2,transaction_category_sporting_goods_inc_amt_2m,transaction_category_airplane_tickets_percent_amt_2m,transaction_category_transport_percent_amt_2m,transaction_category_restaurants_percent_amt_2m,transaction_category_auto_services_percent_amt_2m,transaction_category_supermarket_sum_cnt_m2,transaction_category_auto_services_sum_cnt_m2,target
11,253b139f225a1206fcee5d69e30bc4c0914c48c31d6d06...,2021-08-31,,,,,,,,,...,,,,,,,,,,0.0
18,0176c793b0c90c07d928dde2bf6c6736da7e110e4aa622...,2021-06-30,,,,,,,,,...,,,,,,,,,,0.0
21,27aaa655aa279c4f325827610cf42b989b724d8af5abd9...,2021-02-28,,,,,,,,,...,,,,,,,,,,0.0
80,4b476be0765aae37cde3e4c297a94438b8e3b1f4abffe6...,2021-10-31,,,,,,,,,...,,,,,,,,,,0.0
112,be74c0eed79feea5d6799eae5260bb6c96036f78d17dcb...,2021-09-30,,,,,,,,,...,,,,,,,,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
252677,a6ab1e3458ddc0da67ea317c0dcdb625d85f67c0ed1b5b...,2022-04-30,,,,,,,,,...,,,,,,,,,,1.0
252679,4500fa926a64fbbf1682c9d22353a9bcb6bb5de5d3da6f...,2021-06-30,,,,,,,,,...,,,,,,,,,,1.0
252680,dc465cd303afb7387fd49f699c034ecd39f7b6d7ea56d3...,2022-05-31,,,,,,,,,...,,,,,,,,,,1.0
252693,643b1ea29c7e2a6d9429d89a8cfd3278b56c32d4098ae0...,2021-07-31,,,,,,,,,...,,,,,,,,,,1.0


In [None]:
df.dropna(subset=['gender', 'accountsalary_flag'], inplace=True)

In [None]:
number_list = list(set(continuous).union(set(binaries)))

for i in number_list:
    if 'amount' in i or 'avg_' in i or 'profit_' in i or 'amt' in i or 'sum' in i or 'transaction' in i or 'cnt' in i or 'count' in i or 'diff' in i or 'active' in i or 'limit' in i or 'total_currency' in i or  'total_products' in i:
        df[i]=df[i].fillna(0)
        
for i in number_list:
    if 'life_time_days' in i:
        df[i]=df[i].fillna(0)

for i in number_list:
    if 'days' in i:
        df[i]=df[i].fillna(value=df[i].max()+1)
        
for i in number_list:
    if 'age' in i or 'total_inc' in i:
        df[i]=df[i].fillna(value=df[i].median())
        
for i in categoricals:
        df[i]=df[i].fillna(value='NA')
        
assert df.isna().sum().sum() == 0

## Делим на трейн и тест

In [None]:
X = df.drop('target', axis = 1)
Y = df['target']

In [None]:
rng = np.random.RandomState(0)

X_train,X_test,y_train, y_test=train_test_split(X,Y,random_state=rng,stratify=Y) 

df_train = X_train.copy()
df_train['target']=y_train.copy()
df_test = X_test.copy()
df_test['target']=y_test.copy()

df_train.to_csv('df_train.csv',index=False)
df_test.to_csv('df_test.csv',index=False)

## Ищем аномалии в трейне

In [None]:
X = df_train[continuous]

In [None]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
X=pd.DataFrame(X)
X.index = ind_x

In [None]:
# eps_vals и min_samples_vals подобраны через Gridsearch так, чтобы доля аномалий была 1%

eps_vals = [30] #0.5, 1, 3, 5, 10, 15, 25, 40, 60, 80, 100
min_samples_vals = [10] #3, 5, 7, 8, 10, 12, 15, 20, 25, 30

col1 = []
col2 = []
col3 = []
for eps in tqdm(eps_vals):
    for min_samp in tqdm(min_samples_vals):
        model = DBSCAN(eps = eps, min_samples = min_samp)
        res = model.fit_predict(X)
        res_outl = np.where(res == -1)[0]
        outl_frac = len(res_outl)/len(res)
        
        col1.append(eps)
        col2.append(min_samp)
        col3.append(outl_frac)
    
ress = pd.DataFrame({
    'eps': col1,
    'min_samples': col2,
    'frac_outliers': col3
})

ress.to_excel('find_outl_with_DBSCAN_ress.xlsx', index = False)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [39]:
X['outl']=res
X['outl'].sum()

-2138

In [None]:
df_train['outl']=X['outl'].copy()

In [43]:
df_train[df_train['outl']==-1]['target'].sum()

1031.0

In [47]:
df_train[df_train['outl']!=-1].drop('outl',axis=1).to_csv('df_train_woutout.csv',index=False)

Обучим baseline Catboost, чтобы посмотреть, как повлияло удаление потенциальных аномалий

In [9]:
all_feats = list(set(continuous) | set(categoricals) | set(binaries))
all_cats = list(set(categoricals) | set(binaries))

### После удаления наблюдений, подозрительных на выбросы, качество модели падает

До удаления выбросов:

In [12]:
df_train = pd.read_csv('df_train.csv')
df_test = pd.read_csv('df_test.csv')

In [18]:
X = df_train.drop(['client_id', 'dt', 'target'],axis=1)#.to_numpy()
y = df_train['target'].to_numpy()

#Заменим NAN в категориальных переменных на пустые строки для совместимости с моделью
for x in all_cats:
    X[x].fillna('NA', inplace = True)
    X[x]=X[x].astype('str')

for x in binaries:
    try:
        X[x] = X[x].astype(int)
    except:
        continue
        
model = CatBoostClassifier()
model.fit(X, y, verbose=100, plot=False, cat_features=all_cats)

Learning rate set to 0.096758
0:	learn: 0.5506481	total: 473ms	remaining: 7m 52s
100:	learn: 0.1371325	total: 24s	remaining: 3m 33s
200:	learn: 0.1308619	total: 46.3s	remaining: 3m 4s
300:	learn: 0.1255685	total: 1m 8s	remaining: 2m 38s
400:	learn: 0.1211015	total: 1m 30s	remaining: 2m 15s
500:	learn: 0.1168431	total: 1m 52s	remaining: 1m 52s
600:	learn: 0.1130944	total: 2m 14s	remaining: 1m 29s
700:	learn: 0.1096478	total: 2m 36s	remaining: 1m 6s
800:	learn: 0.1066820	total: 2m 58s	remaining: 44.3s
900:	learn: 0.1037201	total: 3m 19s	remaining: 21.9s
999:	learn: 0.1012555	total: 3m 41s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7fada84b07f0>

In [13]:
X_t = df_test.drop(['client_id', 'dt', 'target'],axis=1)#.to_numpy()
y_t = df_test['target'].to_numpy()
    
for x in binaries:
    try:
        X_t[x] = X_t[x].astype(int)
    except:
        continue

In [57]:
pred_proba = model.predict_proba(X_t)[:,1]
print('gini: ', round(2*roc_auc_score(y_t, pred_proba)-1,3))
pred = model.predict(X_t)
print('f1_score: ', round(f1_score(y_t, pred),3))
print('accuracy_score: ', round(accuracy_score(y_t, pred),3))
print('recall_score: ', round(recall_score(y_t, pred),3))
print('precision_score: ', round(precision_score(y_t, pred),3))

gini:  0.755
f1_score:  0.277
accuracy_score:  0.954
recall_score:  0.176
precision_score:  0.653


После удаления выбросов:

In [6]:
df_train_woutout = pd.read_csv('df_train_woutout.csv')

In [10]:
X1 = df_train_woutout.drop(['client_id', 'dt', 'target'],axis=1)#.to_numpy()
y1 = df_train_woutout['target'].to_numpy()

#Заменим NAN в категориальных переменных на пустые строки для совместимости с моделью
for x in all_cats:
    X1[x].fillna('NA', inplace = True)
    X1[x]=X1[x].astype('str')

model1 = CatBoostClassifier()
model1.fit(X1, y1, verbose=100, plot=False, cat_features=all_cats)

Learning rate set to 0.096291
0:	learn: 0.5476182	total: 412ms	remaining: 6m 51s
100:	learn: 0.1336594	total: 23.6s	remaining: 3m 30s
200:	learn: 0.1274579	total: 46.3s	remaining: 3m 4s
300:	learn: 0.1223811	total: 1m 8s	remaining: 2m 39s
400:	learn: 0.1178050	total: 1m 32s	remaining: 2m 17s
500:	learn: 0.1140364	total: 1m 54s	remaining: 1m 54s
600:	learn: 0.1104348	total: 2m 17s	remaining: 1m 31s
700:	learn: 0.1071851	total: 2m 39s	remaining: 1m 8s
800:	learn: 0.1040342	total: 3m 2s	remaining: 45.3s
900:	learn: 0.1011864	total: 3m 24s	remaining: 22.5s
999:	learn: 0.0986484	total: 3m 47s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7f8ae2ce6dc0>

In [50]:
pred_proba = model1.predict_proba(X_t)[:,1]
print('gini: ', round(2*roc_auc_score(y_t, pred_proba)-1,3))
pred = model1.predict(X_t)
print('f1_score: ', round(f1_score(y_t, pred),3))
print('accuracy_score: ', round(accuracy_score(y_t, pred),3))
print('recall_score: ', round(recall_score(y_t, pred),3))
print('precision_score: ', round(precision_score(y_t, pred),3))

gini:  0.755
f1_score:  0.274
accuracy_score:  0.953
recall_score:  0.174
precision_score:  0.642


Качество упало, значит, мы удалили значимые наблюдения, а не выбросы