In [90]:
import numpy as np
import pandas as pd

https://archive.ics.uci.edu/ml/datasets/Bank+Marketing 

Датасет посвящен банковским клиентам, которые согласились на предложение после телефонных звонков.

Задача: тестируем метод **random negative sampling**. Он применяется для задач Look-alike ("выглядят подобными", поиск потенциальных клиентов, также эта задача называется "Positive and unlabeled"). 

Сначала обучаем классификатор в обычном режиме. 
Затем искусственно создаем ситуацию, когда часть данных имеет положительный label, a часть данных не размечена.  

**1. Загрузка датасета**

In [91]:
data = pd.read_csv('/Users/Alisa/Downloads/bank+marketing/bank/bank-full.csv', ';')

In [92]:
data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [93]:
data.shape

(45211, 17)

In [94]:
data['y'].value_counts() #есть дисбаланс классов

no     39922
yes     5289
Name: y, dtype: int64

In [95]:
data.duplicated().sum() #нет дублей

0

In [96]:
np.where(pd.isnull(data))  #проверим на пропуски

(array([], dtype=int64), array([], dtype=int64))

In [97]:
np.where(data.applymap(lambda x: x == ''))

(array([], dtype=int64), array([], dtype=int64))

**2. Обучаем классификатор в обычном режиме**

In [98]:
data['y'] = np.where(data['y'] == 'yes', 1, 0)

In [99]:
df_0 = data.loc[data['y'] == 0] #исправляем дисбаланс классов

In [100]:
df_0.shape

(39922, 17)

In [101]:
df_0 = df_0.sample(5289)
df_1 = data.loc[data['y'] == 1]

In [102]:
frames = [df_0, df_1]
data = pd.concat(frames)
data = data.sample(frac = 1, random_state = 1) #шафлируем датасет

In [103]:
X = data.drop(columns=['y'])
y = data['y']

Делим на train и test

In [104]:
from sklearn.model_selection import train_test_split

In [105]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

Создаем пайплайн для обработки признаков: категориальные признаки будут кодироваться при помощи OneHotEncoder: в виде матрицы 0 и 1. Для этого сначала разделяем признаки на категориальные и непрерывные (количественные).

In [106]:
categorical_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
continuous_columns = ['age', 'balance', 'day', 'duration', 'pdays', 'previous']

In [107]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

In [108]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]
    

class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        
        for col_ in self.columns:
            if col_ not in test_columns:
                X[col_] = 0
        return X[self.columns]

In [109]:
final_transformers = list()

for cat_col in categorical_columns:
    cat_transformer = Pipeline([
                ('selector', FeatureSelector(column=cat_col)),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    
    final_transformers.append((cat_col, cat_transformer))
    
for cont_col in continuous_columns:
    cont_transformer = Pipeline([
                ('selector', NumberSelector(key=cont_col))
            ])
    
    final_transformers.append((cont_col, cont_transformer))

In [110]:
feats = FeatureUnion(final_transformers)

feature_processing = Pipeline([('feats', feats)])

Готовим таблицу для сравнения моделей

In [111]:
models_results = {
    'data_type': [],
    'threshold': [],
    'f_score': [],
    'precision': [],
    'recall': [],
}

**Обучаем CatBoostclassifier**

In [112]:
from catboost import CatBoostClassifier

In [113]:
catboost_pipeline = Pipeline([
    ('features', feats),
    ('catboost_classifier', CatBoostClassifier(iterations=25, random_state=1)),
])

In [114]:
catboost_pipeline.fit(X_train, y_train)

Learning rate set to 0.5
0:	learn: 0.4928351	total: 5.2ms	remaining: 125ms
1:	learn: 0.4387065	total: 11ms	remaining: 126ms
2:	learn: 0.4079422	total: 16ms	remaining: 117ms
3:	learn: 0.3902545	total: 21.7ms	remaining: 114ms
4:	learn: 0.3815947	total: 29.1ms	remaining: 117ms
5:	learn: 0.3677395	total: 36.8ms	remaining: 116ms
6:	learn: 0.3570835	total: 42.6ms	remaining: 110ms
7:	learn: 0.3478791	total: 49.7ms	remaining: 106ms
8:	learn: 0.3407937	total: 56.1ms	remaining: 99.7ms
9:	learn: 0.3366552	total: 61.9ms	remaining: 92.8ms
10:	learn: 0.3330748	total: 68.7ms	remaining: 87.4ms
11:	learn: 0.3289751	total: 74.2ms	remaining: 80.4ms
12:	learn: 0.3249500	total: 79.4ms	remaining: 73.2ms
13:	learn: 0.3173694	total: 85.4ms	remaining: 67.1ms
14:	learn: 0.3133153	total: 90.5ms	remaining: 60.4ms
15:	learn: 0.3119181	total: 96ms	remaining: 54ms
16:	learn: 0.3069874	total: 102ms	remaining: 47.8ms
17:	learn: 0.3045061	total: 107ms	remaining: 41.5ms
18:	learn: 0.3017829	total: 113ms	remaining: 35.7m

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('job',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='job')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='job'))])),
                                                ('marital',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='marital')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='marital'))])),
                                                ('education',
                                                 Pipeline(steps=[('selector',
                       

In [115]:
y_preds = catboost_pipeline.predict(X_test)

Оценим модель

In [116]:
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve

In [117]:
precision, recall, thresholds = precision_recall_curve(y_test, y_preds)

fscore = (2 * precision * recall) / (precision + recall)

ix = np.argmax(fscore)
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

Best Threshold=1, F-Score=0.859, Precision=0.841, Recall=0.878


In [118]:
models_results['data_type'].append('Normal')
models_results['threshold'].append(thresholds[ix])
models_results['f_score'].append(fscore[ix])
models_results['precision'].append(precision[ix])
models_results['recall'].append(recall[ix])

**3. Подготовим датасет для тестирования random negative sampling.** 
Поделим датасет на сегмент с позитивными метками и сегмент c неразмеченными данными

In [119]:
data = pd.read_csv('/Users/Alisa/Downloads/bank+marketing/bank/bank-full.csv', ';')

In [120]:
data_yes = data[data['y'] == 'yes']

In [121]:
data_yes.head(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
83,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
86,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
87,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
129,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
168,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes


In [122]:
data_yes.shape

(5289, 17)

In [123]:
data_positive = data_yes.iloc[0:2700] # выбираем позитивные размеченные

In [124]:
data_yes_unlabel = data_yes.iloc[2700:] #позитивные неразмеченные

In [125]:
data_yes_unlabel.shape

(2589, 17)

In [126]:
data_no = data[data['y'] == 'no'] #негативные будут неразмеченными данными 

In [127]:
data_no = data_no.sample(5289)

In [128]:
frames = [data_yes_unlabel, data_no]

In [129]:
data_unlabel = pd.concat(frames)
data_unlabel = data_unlabel.sample(frac = 1)

In [130]:
data_unlabel.head(3)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
41566,41,admin.,divorced,secondary,no,6046,yes,yes,telephone,14,sep,185,2,-1,0,unknown,yes
24866,37,housemaid,single,tertiary,no,561,no,no,cellular,18,nov,188,1,165,2,failure,no
40158,38,management,married,tertiary,no,1187,yes,yes,cellular,5,jun,76,2,123,1,failure,no


**4. Применим random negative sampling для построения классификатора**

Теперь мы причислим все unlabel к условно негативным данным

In [131]:
data_unlabel['y'].values[:] = 0

In [132]:
data_unlabel.head() #неразмеченные данные, которые считаются условно негативными

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
41566,41,admin.,divorced,secondary,no,6046,yes,yes,telephone,14,sep,185,2,-1,0,unknown,0
24866,37,housemaid,single,tertiary,no,561,no,no,cellular,18,nov,188,1,165,2,failure,0
40158,38,management,married,tertiary,no,1187,yes,yes,cellular,5,jun,76,2,123,1,failure,0
22459,36,management,divorced,tertiary,no,276,yes,no,cellular,22,aug,131,2,-1,0,unknown,0
42629,47,admin.,single,secondary,no,3696,no,no,cellular,12,jan,758,4,-1,0,unknown,0


In [133]:
data_positive.head(5) #размеченные позитивные данные

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
83,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
86,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
87,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
129,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
168,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes


In [134]:
data_positive['y'].values[:] = 1

In [135]:
frames = [data_unlabel, data_positive] #обЪединяем датасеты и шафлируем данные
data_UP = pd.concat(frames)
data_UP = data_UP.sample(frac = 1)

In [136]:
X_train = data_UP.drop(columns=['y'])
y_train = data_UP['y']

Обучаем Catboost-модель и оцениваем результаты

In [137]:
catboost_pipeline.fit(X_train, y_train)

Learning rate set to 0.5
0:	learn: 0.4393159	total: 6.38ms	remaining: 153ms
1:	learn: 0.3389506	total: 13.6ms	remaining: 156ms
2:	learn: 0.3167870	total: 20.1ms	remaining: 148ms
3:	learn: 0.3006233	total: 29ms	remaining: 152ms
4:	learn: 0.2943925	total: 36.6ms	remaining: 146ms
5:	learn: 0.2855723	total: 44.8ms	remaining: 142ms
6:	learn: 0.2826512	total: 52.1ms	remaining: 134ms
7:	learn: 0.2698045	total: 60ms	remaining: 128ms
8:	learn: 0.2645818	total: 66.3ms	remaining: 118ms
9:	learn: 0.2613276	total: 74.8ms	remaining: 112ms
10:	learn: 0.2589121	total: 81.9ms	remaining: 104ms
11:	learn: 0.2539773	total: 88.9ms	remaining: 96.3ms
12:	learn: 0.2513306	total: 95.5ms	remaining: 88.1ms
13:	learn: 0.2477300	total: 102ms	remaining: 80.5ms
14:	learn: 0.2448226	total: 110ms	remaining: 73ms
15:	learn: 0.2432623	total: 116ms	remaining: 65.4ms
16:	learn: 0.2370212	total: 124ms	remaining: 58.3ms
17:	learn: 0.2350679	total: 130ms	remaining: 50.7ms
18:	learn: 0.2291266	total: 138ms	remaining: 43.5ms
1

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('job',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='job')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='job'))])),
                                                ('marital',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='marital')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='marital'))])),
                                                ('education',
                                                 Pipeline(steps=[('selector',
                       

In [138]:
y_preds = catboost_pipeline.predict(X_test)

In [139]:
precision, recall, thresholds = precision_recall_curve(y_test, y_preds)

fscore = (2 * precision * recall) / (precision + recall)

ix = np.argmax(fscore)
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

Best Threshold=0, F-Score=0.659, Precision=0.491, Recall=1.000


In [140]:
models_results['data_type'].append('Random_negative_sampling')
models_results['threshold'].append(thresholds[ix])
models_results['f_score'].append(fscore[ix])
models_results['precision'].append(precision[ix])
models_results['recall'].append(recall[ix])

**5. Сравниваем результаты**

In [141]:
models_results = pd.DataFrame(models_results)
models_results

Unnamed: 0,data_type,threshold,f_score,precision,recall
0,Normal,1,0.859187,0.841445,0.877692
1,Random_negative_sampling,0,0.659062,0.491493,1.0
