In [1]:
import pandas as pd
import numpy as np

In [2]:
from catboost import CatBoostClassifier
from sklearn.metrics import precision_recall_curve, roc_curve, roc_auc_score, confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split

В проекте использованы реальные данные сотового оператора Мегафон

Исходные данные представляют собой **2 таблицы**: 

**Первая таблица** содержит данные о пользователях и услугах, которые им предлагались. Каждому пользователю может быть сделано несколько предложений в разное время, каждое из которых он может или принять, или отклонить. id - id пользователя, vas_id - услуга, buy_time - время предложения, target - целевая переменная, показывает, согласился ли абонент на услугу. Первая таблица разделена на **train** (data_train) и **test** (data_test). 

**Вторая таблица** - информация о пользователях: анонимизированный набор признаков, характеризующий профиль потребления абонента. Эти данные привязаны к определенному времени, поскольку профиль абонента может меняться с течением времени. Из нее данные будут извлекаться по id пользователя. 

In [82]:
import dask.dataframe as dd 

Вес второй таблицы составляет 20 гигабайт, что усложняет работу. Для ее загрузки устанавливаем специальную библиотеку dask.
Кроме того, в данном случае выбираю работу на собственном компьютере, а не на виртуальной машине google colab, так как объем памяти гугл-диска ограничен, скорость загрузки данных на него медленная. 

In [43]:
df_train = pd.read_csv('/Users/features/data_train.csv')
df_train.head()

Unnamed: 0.1,Unnamed: 0,id,vas_id,buy_time,target
0,0,540968,8.0,1537131600,0.0
1,1,1454121,4.0,1531688400,0.0
2,2,2458816,1.0,1534107600,0.0
3,3,3535012,5.0,1535922000,0.0
4,4,1693214,1.0,1535922000,0.0


In [44]:
df_train.shape

(831653, 5)

Посмотрим на **соотношение классов 0 и 1**

In [45]:
df_train['target'].value_counts()

0.0    771467
1.0     60186
Name: target, dtype: int64

Выберем часть данных (насколько позволяет вычислительная мощность компьютера), предварительно изменив соотношение классов.

In [46]:
df_train_0 = df_train.loc[df_train['target'] == 0]

In [47]:
df_train_0['target'].value_counts()

0.0    771467
Name: target, dtype: int64

In [48]:
df_train_0 = df_train_0.sample(frac = 0.025, random_state = 1)

In [49]:
df_train_0.shape

(19287, 5)

In [50]:
df_train_1 = df_train.loc[df_train['target'] == 1]

In [51]:
df_train_1.shape

(60186, 5)

In [52]:
df_train_1 = df_train_1.sample(frac = 0.35, random_state = 1) 

In [53]:
df_train_1.shape

(21065, 5)

Получаем **объединенный датасет**,  удаляем лишние столбцы с индексами

In [54]:
frames = [df_train_0, df_train_1]
df_train = pd.concat(frames)

In [55]:
df_train.shape

(40352, 5)

In [56]:
from sklearn.utils import shuffle

In [57]:
df_train = shuffle(df_train)

In [59]:
df_train = df_train.drop(df_train.columns[0], axis=1)

In [60]:
df_train.head(3)

Unnamed: 0,id,vas_id,buy_time,target
223664,3330027,4.0,1541365200,1.0
688341,2650728,6.0,1546203600,0.0
354454,347992,6.0,1532898000,1.0


Теперь нам необходимо дополнить датасет признаками: у нас есть таблица с признаками (анонимизированными), нужно выбрать из нее данные нужных пользователей

In [83]:
df_features = dd.read_csv('/Users/features/features.csv', sep='\t')

In [84]:
df_features = df_features.drop(df_features.columns[0], axis=1)

Признаков более 250, это, вероятно, избыточное количество. В качестве вынужденной меры случайным образом, но относительно равномерно, избавлюсь от значительной части признаков (иначе вычисления не производятся при доступной мощности).

In [85]:
list_of_cols = np.arange(0, 252, 12) 
list_of_cols

array([  0,  12,  24,  36,  48,  60,  72,  84,  96, 108, 120, 132, 144,
       156, 168, 180, 192, 204, 216, 228, 240])

In [86]:
df_features = df_features[['id', 'buy_time','0', '12', '24', '36', '48', '60', '72', '84', '96', '108', '120', '132', '144', '156', '168', '180', '192', '204', '216', '228', '240', '252']]

In [87]:
df_features.head(10)

Unnamed: 0,id,buy_time,0,12,24,36,48,60,72,84,...,144,156,168,180,192,204,216,228,240,252
0,2013026,1531688400,18.910029,-2.558912,-0.009327,-0.700765,-10.717958,-0.265234,-2.60662,-0.134088,...,-70.470802,-2.548856,-51.984826,-0.00189,0.533317,0.039208,-8.3e-05,-33.302382,-512.437331,0.0
1,2014722,1539550800,36.690029,-2.558912,-0.009327,-0.700765,-10.717958,-0.265234,-2.60662,-0.134088,...,57.495858,35.451144,-51.984826,-0.00189,-0.466683,0.039208,-8.3e-05,-33.302382,-509.437331,0.0
2,2015199,1545598800,-67.019971,-2.558912,-0.009327,1.839235,-10.716588,0.244766,-2.60662,-0.134088,...,-43.587468,-2.548856,12.548504,-0.00189,-0.466683,0.039208,-8.3e-05,-19.302382,-512.437331,0.0
3,2021765,1534107600,7.010029,-2.558912,-0.009327,-0.700765,-10.717958,-0.265234,-2.60662,-0.134088,...,11.079198,-2.548856,-51.984826,-0.00189,0.533317,0.039208,-8.3e-05,-33.302382,-512.437331,1.0
4,2027465,1533502800,-90.439971,-2.558912,-0.009327,-0.700765,-10.717958,-0.265234,-2.60662,-0.134088,...,-116.020802,52.451144,-51.984826,-0.00189,0.533317,0.039208,-8.3e-05,-33.302382,-496.437331,0.0
5,2028410,1534107600,-96.799971,-2.558912,-0.009327,-0.700765,-10.717958,0.054766,-2.60662,-0.134088,...,-4.070802,-2.548856,-18.551496,-0.00189,0.533317,0.039208,-8.3e-05,-33.302382,-512.437331,0.0
6,2030773,1544994000,284.560029,-2.558912,-0.009327,-0.700765,-10.717958,-0.265234,-2.60662,-0.134088,...,-116.020802,26.451144,-51.984826,-0.00189,-0.466683,0.039208,-8.3e-05,-33.302382,-512.437331,1.0
7,2032337,1537736400,92.550029,31.341088,-0.009327,-0.700765,-10.678718,0.704766,31.29338,-0.134088,...,-100.404136,-2.548856,-37.718159,-0.00189,-0.466683,0.039208,-8.3e-05,-33.302382,1694.562669,1.0
8,2033486,1545598800,-93.749971,-2.558912,-0.009327,-0.700765,-10.717958,0.034766,-2.60662,-0.134088,...,-77.4208,-2.548856,-45.834824,-0.00189,-0.466683,0.039208,-8.3e-05,-33.302382,-510.437331,1.0
9,2034066,1544389200,228.520029,-2.558912,-0.009327,-0.700765,-10.717958,0.174766,-2.60662,-0.134088,...,2.395862,-2.548856,-51.984826,-0.00189,0.533317,0.039208,-8.3e-05,-33.302382,-512.437331,0.0


In [66]:
df_train_id = df_train['id']
df_train_id.head(3)

223664    3330027
688341    2650728
354454     347992
Name: id, dtype: int64

Находим нужных пользователей:

In [67]:
df_features_train = df_features[df_features['id'].isin(df_train_id)]
df_features_train.head(5)

Unnamed: 0,id,buy_time,0,12,24,36,48,60,72,84,...,144,156,168,180,192,204,216,228,240,252
19,2070757,1540760400,-96.799971,-2.558912,-0.009327,-0.700765,-10.717958,-0.265234,-2.60662,-0.134088,...,-116.020802,47.451144,-51.984826,-0.00189,0.533317,0.039208,-8.3e-05,-33.302382,-512.437331,1.0
27,2085648,1539550800,68.440029,-2.558912,-0.009327,-0.700765,-10.711879,0.214766,-2.60662,-0.134088,...,14.829198,-2.548856,157.998514,-0.00189,-0.466683,0.039208,-8.3e-05,-30.302382,-512.437331,0.0
123,2508658,1537736400,-84.579971,-2.558912,-0.009327,-0.700765,-10.717958,0.284766,-2.60662,-0.134088,...,-76.204132,-2.548856,15.08184,-0.00189,-0.466683,0.039208,-8.3e-05,-33.302382,-512.437331,0.0
224,2931982,1534107600,-96.799971,-2.558912,-0.009327,-0.700765,-10.717958,-0.005234,-2.60662,-0.134088,...,-69.520802,-2.548856,-41.001493,-0.00189,0.533317,0.039208,-8.3e-05,-1.302382,-511.437331,1.0
500,3749290,1534712400,-96.799971,-2.558912,-0.009327,-0.700765,-10.717958,0.234766,-2.60662,-0.134088,...,-54.487469,-2.548856,-23.784826,-0.00189,-0.466683,0.039208,-8.3e-05,-33.302382,-512.437331,1.0


Осуществляем соединение таблиц по id и времени

In [68]:
df_train_feat = dd.multi.merge_asof(df_train, df_features_train.sort_values('buy_time'), by='id', on='buy_time', direction='nearest')

Сохраняем данные: **объединенный train датасет с профилем выбранных пользователей**

In [69]:
df_train_feat.to_csv('/Users/features/train_feat_balanced.csv', index=False, single_file=True)

['C:/Users/features/train_feat_balanced.csv']

In [142]:
df_train_feat = pd.read_csv('/Users/features/train_feat_balanced.csv')

In [143]:
df_train_feat['target'].value_counts()

1.0    21065
0.0    19287
Name: target, dtype: int64

In [144]:
df_train_feat.head(20)

Unnamed: 0,buy_time,id,vas_id,target,0,12,24,36,48,60,...,144,156,168,180,192,204,216,228,240,252
0,1531083600,630641,4.0,0.0,-92.559971,-2.558912,-0.009327,-0.700765,-10.717958,0.284766,...,200.845868,-2.548856,-51.984826,-0.00189,-0.466683,0.039208,-8.3e-05,-33.302382,-105.437331,1.0
1,1531083600,3982682,2.0,0.0,38.230029,-2.558912,-0.009327,1.419235,-10.717958,0.314766,...,43.212538,-2.548856,-13.301496,-0.00189,-0.466683,0.039208,-8.3e-05,-33.302382,-512.437331,1.0
2,1531083600,2784493,1.0,1.0,-96.799971,-2.558912,-0.009327,-0.700765,-10.717958,0.144766,...,-96.18747,-2.548856,-41.734826,-0.00189,-0.466683,0.039208,-8.3e-05,-33.302382,-446.437331,0.0
3,1531083600,921434,2.0,0.0,-96.799971,-2.558912,-0.009327,-0.700765,-10.717958,-0.265234,...,-116.020802,-2.548856,-51.984826,-0.00189,0.533317,0.039208,-8.3e-05,-33.302382,-511.437331,0.0
4,1531083600,2914443,1.0,0.0,-45.959971,-2.558912,-0.009327,-0.700765,-10.717958,-0.265234,...,-104.270803,-2.548856,-47.234827,-0.00189,-0.466683,0.039208,-8.3e-05,-33.302382,-512.437331,0.0
5,1531083600,3577906,2.0,0.0,-81.959971,-2.558912,-0.009327,14.139235,-10.717958,-0.005234,...,559.895798,-2.548856,-51.818159,-0.00189,0.533317,0.039208,-8.3e-05,-33.302382,3256.562669,0.0
6,1531083600,3474769,2.0,0.0,-2.549971,-2.558912,-0.009327,-0.700765,-10.717958,-0.155234,...,28.262538,-2.548856,28.448508,-0.00189,-0.466683,0.039208,-8.3e-05,-33.302382,-504.437331,0.0
7,1531083600,4214780,2.0,0.0,497.950029,-2.558912,-0.009327,0.829235,-10.717958,-0.155234,...,297.879198,-2.548856,87.681844,-0.00189,-0.466683,0.039208,-8.3e-05,-2.302382,-363.437331,1.0
8,1531083600,338838,2.0,0.0,-96.799971,-2.558912,-0.009327,-0.700765,-10.717958,0.514766,...,310.762528,-2.548856,23.481844,-0.00189,-0.466683,0.039208,-8.3e-05,-33.302382,8312.562669,0.0
9,1531083600,966530,4.0,1.0,-71.379971,-2.558912,-0.009327,-0.700765,-10.717958,-0.265234,...,192.695868,-2.548856,35.048504,-0.00189,0.533317,0.039208,-8.3e-05,-33.302382,346.562669,1.0


In [145]:
df_train_feat.shape

(40352, 26)

Столбец '252' - выглядит как target. Это вызывает вопросы. С target он не совпадает. Возможно, стоит его убрать. Попробуем его оставить. 

Здесь нам необходимо провести **проверку на пропуски**: 

In [146]:
df_train_feat.isnull().sum().sum()

0

Приведем признаки к **единому масштабу**:

In [147]:
from sklearn import preprocessing

In [148]:
scaler = preprocessing.StandardScaler()

In [149]:
df_train_feat_scaled = scaler.fit_transform(df_train_feat[['0', '12', '24', '36', '48', '60', '72', '84', '96', '108', '120', '132', '144', '156', '168', '180', '192', '204', '216', '228', '240', '252']])

In [150]:
scaled = pd.DataFrame(df_train_feat_scaled, columns = ['0', '12', '24', '36', '48', '60', '72', '84', '96', '108', '120', '132', '144', '156', '168', '180', '192', '204', '216', '228', '240', '252'])
scaled.head(3)

Unnamed: 0,0,12,24,36,48,60,72,84,96,108,...,144,156,168,180,192,204,216,228,240,252
0,-0.489226,-0.072862,-0.004978,-0.161858,-0.032645,0.876874,-0.073895,-0.01378,-0.061762,-0.24948,...,1.100575,-0.17601,-0.599613,-0.010451,-0.938542,0.204053,-0.004978,-0.033266,-0.059386,1.697322
1,0.194283,-0.072862,-0.004978,0.339066,-0.032645,0.96958,-0.073895,-0.01378,-0.061762,0.529213,...,0.240201,-0.17601,-0.149743,-0.010451,-0.938542,0.204053,-0.004978,-0.033266,-0.317511,1.697322
2,-0.511384,-0.072862,-0.004978,-0.161858,-0.032645,0.444245,-0.073895,-0.01378,-0.061762,-0.24948,...,-0.520654,-0.17601,-0.48041,-0.010451,-0.938542,0.204053,-0.004978,-0.033266,-0.275653,-0.580678


In [151]:
df_train_f = df_train_feat[['buy_time', 'id', 'vas_id']].copy()

In [152]:
df_train_feat_scaled = pd.concat([df_train_f, scaled], axis = 1)
df_train_feat_scaled.head(7)

Unnamed: 0,buy_time,id,vas_id,0,12,24,36,48,60,72,...,144,156,168,180,192,204,216,228,240,252
0,1531083600,630641,4.0,-0.489226,-0.072862,-0.004978,-0.161858,-0.032645,0.876874,-0.073895,...,1.100575,-0.17601,-0.599613,-0.010451,-0.938542,0.204053,-0.004978,-0.033266,-0.059386,1.697322
1,1531083600,3982682,2.0,0.194283,-0.072862,-0.004978,0.339066,-0.032645,0.96958,-0.073895,...,0.240201,-0.17601,-0.149743,-0.010451,-0.938542,0.204053,-0.004978,-0.033266,-0.317511,1.697322
2,1531083600,2784493,1.0,-0.511384,-0.072862,-0.004978,-0.161858,-0.032645,0.444245,-0.073895,...,-0.520654,-0.17601,-0.48041,-0.010451,-0.938542,0.204053,-0.004978,-0.033266,-0.275653,-0.580678
3,1531083600,921434,2.0,-0.511384,-0.072862,-0.004978,-0.161858,-0.032645,-0.822742,-0.073895,...,-0.628905,-0.17601,-0.599613,-0.010451,1.065482,0.204053,-0.004978,-0.033266,-0.316877,-0.580678
4,1531083600,2914443,1.0,-0.245694,-0.072862,-0.004978,-0.161858,-0.032645,-0.822742,-0.073895,...,-0.564773,-0.17601,-0.544372,-0.010451,-0.938542,0.204053,-0.004978,-0.033266,-0.317511,-0.580678
5,1531083600,3577906,2.0,-0.43383,-0.072862,-0.004978,3.344612,-0.032645,-0.019287,-0.073895,...,3.060294,-0.17601,-0.597674,-0.010451,1.065482,0.204053,-0.004978,-0.033266,2.072844,-0.580678
6,1531083600,3474769,2.0,-0.018834,-0.072862,-0.004978,-0.161858,-0.032645,-0.482819,-0.073895,...,0.158603,-0.17601,0.335792,-0.010451,-0.938542,0.204053,-0.004978,-0.033266,-0.312437,-0.580678


Здесь мы могли бы снизить размерность признаков, но будем считать, что в данном случае количество признаков было и так достаточно уменьшено. 

In [153]:
df_train_feat_scaled_time = df_train_feat_scaled

Изменим формат времени

In [154]:
import datetime 

In [155]:
df_train_feat_scaled['buy_time'] = df_train_feat_scaled['buy_time'].apply(lambda x: datetime.datetime.fromtimestamp(x))

In [156]:
df_train_feat_scaled.head(3)

Unnamed: 0,buy_time,id,vas_id,0,12,24,36,48,60,72,...,144,156,168,180,192,204,216,228,240,252
0,2018-07-09,630641,4.0,-0.489226,-0.072862,-0.004978,-0.161858,-0.032645,0.876874,-0.073895,...,1.100575,-0.17601,-0.599613,-0.010451,-0.938542,0.204053,-0.004978,-0.033266,-0.059386,1.697322
1,2018-07-09,3982682,2.0,0.194283,-0.072862,-0.004978,0.339066,-0.032645,0.96958,-0.073895,...,0.240201,-0.17601,-0.149743,-0.010451,-0.938542,0.204053,-0.004978,-0.033266,-0.317511,1.697322
2,2018-07-09,2784493,1.0,-0.511384,-0.072862,-0.004978,-0.161858,-0.032645,0.444245,-0.073895,...,-0.520654,-0.17601,-0.48041,-0.010451,-0.938542,0.204053,-0.004978,-0.033266,-0.275653,-0.580678


Подготовим **тестовый датасет**: выбираем из таблицы признаков необходимые id и объединение таблицы по id и buy_time

In [103]:
df_test = pd.read_csv('/Users/features/data_test.csv')
df_test.head(5)

Unnamed: 0.1,Unnamed: 0,id,vas_id,buy_time
0,0,3130519,2.0,1548018000
1,1,2000860,4.0,1548018000
2,2,1099444,2.0,1546808400
3,3,1343255,5.0,1547413200
4,4,1277040,2.0,1546808400


In [104]:
df_test = df_test.drop(df_test.columns[0], axis=1)

In [105]:
df_test_id = df_test['id']
df_test_id.head(7)

0    3130519
1    2000860
2    1099444
3    1343255
4    1277040
5     720364
6    3195325
Name: id, dtype: int64

In [106]:
df_features_test = df_features[df_features['id'].isin(df_test_id)]

In [107]:
df_test_feat = dd.multi.merge_asof(df_test, df_features_test.sort_values('buy_time'), by='id', on='buy_time', direction='nearest')

Сохраняем **тестовый датасет**

In [108]:
df_test_feat.to_csv('/Users/features/test_feat.csv', index=False, single_file=True)

['C:/Users/features/test_feat.csv']

In [117]:
df_test_feat = pd.read_csv('/Users/features/test_feat.csv')

In [118]:
df_test_feat.shape

(71231, 25)

Проверяем **пропуски** в тестовом датасете, **масштабируем** признаки

In [119]:
df_test_feat.isnull().sum().sum()

0

In [120]:
df_test_feat_scaled = scaler.transform(df_test_feat[['0', '12', '24', '36', '48', '60', '72', '84', '96', '108', '120', '132', '144', '156', '168', '180', '192', '204', '216', '228', '240', '252']])

In [121]:
scaled_2 = pd.DataFrame(df_test_feat_scaled, columns = ['0', '12', '24', '36', '48', '60', '72', '84', '96', '108', '120', '132', '144', '156', '168', '180', '192', '204', '216', '228', '240', '252'])
scaled_2.head(3)

Unnamed: 0,0,12,24,36,48,60,72,84,96,108,...,144,156,168,180,192,204,216,228,240,252
0,-0.433883,-0.072862,-0.004978,-0.161858,-0.032645,-0.822742,-0.073895,-0.01378,-0.061762,0.017363,...,-0.625721,-0.17601,-0.592829,-0.010451,1.065482,0.204053,-0.004978,-0.033266,-0.317511,1.697322
1,1.725187,-0.072862,-0.004978,-0.161858,-0.032645,-0.822742,-0.073895,-0.01378,-0.061762,-0.160462,...,-0.410673,-0.17601,-0.599613,-0.010451,-0.938542,0.204053,-0.004978,-0.033266,-0.313706,-0.580678
2,0.108681,-0.072862,-0.004978,-0.161858,-0.032645,2.174763,-0.073895,-0.01378,-0.061762,-0.24948,...,0.143957,-0.17601,0.354205,-0.010451,1.065482,0.204053,-0.004978,-0.033266,-0.317511,-0.580678


In [122]:
df_test_f = df_test_feat[['buy_time', 'id', 'vas_id']].copy()

In [123]:
df_test_feat_scaled = pd.concat([df_test_f, scaled_2], axis = 1)
df_test_feat_scaled.head(3)

Unnamed: 0,buy_time,id,vas_id,0,12,24,36,48,60,72,...,144,156,168,180,192,204,216,228,240,252
0,1546808400,1099444,2.0,-0.433883,-0.072862,-0.004978,-0.161858,-0.032645,-0.822742,-0.073895,...,-0.625721,-0.17601,-0.592829,-0.010451,1.065482,0.204053,-0.004978,-0.033266,-0.317511,1.697322
1,1546808400,1277040,2.0,1.725187,-0.072862,-0.004978,-0.161858,-0.032645,-0.822742,-0.073895,...,-0.410673,-0.17601,-0.599613,-0.010451,-0.938542,0.204053,-0.004978,-0.033266,-0.313706,-0.580678
2,1546808400,3726752,1.0,0.108681,-0.072862,-0.004978,-0.161858,-0.032645,2.174763,-0.073895,...,0.143957,-0.17601,0.354205,-0.010451,1.065482,0.204053,-0.004978,-0.033266,-0.317511,-0.580678


Изменяем формат времени

In [124]:
df_test_feat_scaled['buy_time'] = df_test_feat_scaled['buy_time'].apply(lambda x: datetime.datetime.fromtimestamp(x))

In [125]:
df_test_feat_scaled.head(3)

Unnamed: 0,buy_time,id,vas_id,0,12,24,36,48,60,72,...,144,156,168,180,192,204,216,228,240,252
0,2019-01-07,1099444,2.0,-0.433883,-0.072862,-0.004978,-0.161858,-0.032645,-0.822742,-0.073895,...,-0.625721,-0.17601,-0.592829,-0.010451,1.065482,0.204053,-0.004978,-0.033266,-0.317511,1.697322
1,2019-01-07,1277040,2.0,1.725187,-0.072862,-0.004978,-0.161858,-0.032645,-0.822742,-0.073895,...,-0.410673,-0.17601,-0.599613,-0.010451,-0.938542,0.204053,-0.004978,-0.033266,-0.313706,-0.580678
2,2019-01-07,3726752,1.0,0.108681,-0.072862,-0.004978,-0.161858,-0.032645,2.174763,-0.073895,...,0.143957,-0.17601,0.354205,-0.010451,1.065482,0.204053,-0.004978,-0.033266,-0.317511,-0.580678


Начнем **обучать модели**. 
Выберем модели, которые хороши для бинарной классификации с множеством признаков, - CatBoost и XGBoost. 

In [128]:
X = df_train_feat_scaled
y = df_train_feat['target']

In [135]:
df_train_feat['target'].value_counts()

1.0    21065
0.0    19287
Name: target, dtype: int64

In [129]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, random_state = 1)

In [130]:
from catboost import CatBoostClassifier

In [131]:
model_cat = CatBoostClassifier(
    random_seed=1,
    iterations=200,
    learning_rate=0.05
)

model_cat.fit(X_train, y_train)


0:	learn: 0.6489658	total: 219ms	remaining: 43.6s
1:	learn: 0.6152063	total: 257ms	remaining: 25.4s
2:	learn: 0.5863056	total: 292ms	remaining: 19.2s
3:	learn: 0.5586754	total: 311ms	remaining: 15.2s
4:	learn: 0.5299183	total: 325ms	remaining: 12.7s
5:	learn: 0.5057343	total: 336ms	remaining: 10.9s
6:	learn: 0.4860171	total: 345ms	remaining: 9.52s
7:	learn: 0.4701553	total: 354ms	remaining: 8.5s
8:	learn: 0.4553110	total: 363ms	remaining: 7.7s
9:	learn: 0.4421273	total: 372ms	remaining: 7.07s
10:	learn: 0.4289106	total: 382ms	remaining: 6.56s
11:	learn: 0.4183124	total: 391ms	remaining: 6.13s
12:	learn: 0.4062986	total: 401ms	remaining: 5.76s
13:	learn: 0.3972990	total: 415ms	remaining: 5.51s
14:	learn: 0.3915413	total: 428ms	remaining: 5.28s
15:	learn: 0.3854717	total: 439ms	remaining: 5.04s
16:	learn: 0.3804127	total: 447ms	remaining: 4.82s
17:	learn: 0.3744607	total: 457ms	remaining: 4.62s
18:	learn: 0.3679881	total: 467ms	remaining: 4.44s
19:	learn: 0.3651143	total: 476ms	remaining

171:	learn: 0.3133082	total: 2.01s	remaining: 327ms
172:	learn: 0.3132746	total: 2.02s	remaining: 315ms
173:	learn: 0.3131999	total: 2.03s	remaining: 304ms
174:	learn: 0.3131387	total: 2.04s	remaining: 292ms
175:	learn: 0.3130965	total: 2.05s	remaining: 280ms
176:	learn: 0.3130597	total: 2.06s	remaining: 268ms
177:	learn: 0.3129102	total: 2.07s	remaining: 256ms
178:	learn: 0.3128286	total: 2.08s	remaining: 244ms
179:	learn: 0.3126798	total: 2.09s	remaining: 232ms
180:	learn: 0.3126285	total: 2.1s	remaining: 221ms
181:	learn: 0.3125802	total: 2.11s	remaining: 209ms
182:	learn: 0.3124939	total: 2.12s	remaining: 197ms
183:	learn: 0.3124304	total: 2.13s	remaining: 185ms
184:	learn: 0.3123587	total: 2.14s	remaining: 173ms
185:	learn: 0.3123210	total: 2.15s	remaining: 162ms
186:	learn: 0.3121856	total: 2.17s	remaining: 151ms
187:	learn: 0.3121109	total: 2.18s	remaining: 139ms
188:	learn: 0.3120342	total: 2.2s	remaining: 128ms
189:	learn: 0.3119938	total: 2.21s	remaining: 117ms
190:	learn: 0.

<catboost.core.CatBoostClassifier at 0x24735c0d4c0>

In [132]:
preds = model_cat.predict_proba(X_test)[:, 1]
preds[:10]

array([0.91071222, 0.97649529, 0.08682014, 0.11601149, 0.09778327,
       0.91052438, 0.17879344, 0.90427808, 0.80067923, 0.14568389])

In [133]:
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix

In [134]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

fscore = (2 * precision * recall) / (precision + recall)

ix = np.argmax(fscore)
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

Best Threshold=0.6039962112537884, F-Score=0.896, Precision=0.882, Recall=0.909


In [136]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [157]:
import xgboost as xgb

params = {
    'max_depth': 4,
    'eta': 0.01,
    'subsample': 0.4,
    'min_child_weight': 7,
    'n': 580,
}

In [168]:
X = df_train_feat_scaled.drop(['buy_time'], axis = 1)
y = df_train_feat['target']

In [169]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, random_state = 1)

In [170]:
model_xgb = xgb.XGBClassifier(params=params, random_seed = 1)

In [171]:
model_xgb.fit(X_train, y_train)

Parameters: { "params", "random_seed" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1,
              params={'eta': 0.01, 'max_depth': 4, 'min_child_weight': 7,
                      'n': 580, 'subsample': 0.4},
              predictor='auto', random_seed=1, random_state=0, ...)

In [172]:
preds = model_xgb.predict_proba(X_test)[:, 1]
preds[:10]

array([0.7211797 , 0.6622319 , 0.22508654, 0.13426228, 0.22755662,
       0.9187575 , 0.2397975 , 0.9511618 , 0.8576836 , 0.16798325],
      dtype=float32)

In [173]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

fscore = (2 * precision * recall) / (precision + recall)

ix = np.argmax(fscore)
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

Best Threshold=0.5345136523246765, F-Score=0.823, Precision=0.860, Recall=0.788


Таким образом, отдаем предпочтение модели **СatBoost**

Есть подозрение, что в таблице признаков в колонке 252 скрыт таргет. Проверим себя. Обучим модель без этой колонки. 

In [175]:
df_train_feat_scaled_without_252 = df_train_feat_scaled.drop(['252'], axis = 1)

In [176]:
X = df_train_feat_scaled_without_252
y = df_train_feat['target']

In [177]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, random_state = 1)

In [178]:
model_cat_2 = CatBoostClassifier(
    random_seed=1,
    iterations=200,
    learning_rate=0.05
)

model_cat_2.fit(X_train, y_train)

0:	learn: 0.6535070	total: 9.37ms	remaining: 1.86s
1:	learn: 0.6141578	total: 18.8ms	remaining: 1.86s
2:	learn: 0.5837308	total: 27.7ms	remaining: 1.82s
3:	learn: 0.5531385	total: 37.2ms	remaining: 1.82s
4:	learn: 0.5264003	total: 46.8ms	remaining: 1.82s
5:	learn: 0.5048367	total: 56.3ms	remaining: 1.82s
6:	learn: 0.4840947	total: 65.5ms	remaining: 1.81s
7:	learn: 0.4664190	total: 74.8ms	remaining: 1.79s
8:	learn: 0.4529799	total: 83.7ms	remaining: 1.77s
9:	learn: 0.4412161	total: 92.5ms	remaining: 1.76s
10:	learn: 0.4279703	total: 101ms	remaining: 1.74s
11:	learn: 0.4180022	total: 110ms	remaining: 1.72s
12:	learn: 0.4088818	total: 119ms	remaining: 1.72s
13:	learn: 0.4003136	total: 128ms	remaining: 1.7s
14:	learn: 0.3927442	total: 137ms	remaining: 1.69s
15:	learn: 0.3864884	total: 146ms	remaining: 1.68s
16:	learn: 0.3817418	total: 155ms	remaining: 1.67s
17:	learn: 0.3758048	total: 164ms	remaining: 1.66s
18:	learn: 0.3692048	total: 173ms	remaining: 1.65s
19:	learn: 0.3659883	total: 184m

175:	learn: 0.3117130	total: 1.77s	remaining: 241ms
176:	learn: 0.3115604	total: 1.78s	remaining: 231ms
177:	learn: 0.3114837	total: 1.79s	remaining: 222ms
178:	learn: 0.3113627	total: 1.8s	remaining: 212ms
179:	learn: 0.3112406	total: 1.81s	remaining: 202ms
180:	learn: 0.3112141	total: 1.82s	remaining: 191ms
181:	learn: 0.3111367	total: 1.83s	remaining: 182ms
182:	learn: 0.3110582	total: 1.85s	remaining: 172ms
183:	learn: 0.3109567	total: 1.86s	remaining: 162ms
184:	learn: 0.3108767	total: 1.88s	remaining: 153ms
185:	learn: 0.3107923	total: 1.9s	remaining: 143ms
186:	learn: 0.3107527	total: 1.91s	remaining: 133ms
187:	learn: 0.3106546	total: 1.93s	remaining: 123ms
188:	learn: 0.3106427	total: 1.94s	remaining: 113ms
189:	learn: 0.3105601	total: 1.95s	remaining: 103ms
190:	learn: 0.3105239	total: 1.97s	remaining: 92.9ms
191:	learn: 0.3103776	total: 1.99s	remaining: 82.9ms
192:	learn: 0.3102510	total: 2.01s	remaining: 72.8ms
193:	learn: 0.3101636	total: 2.02s	remaining: 62.5ms
194:	learn

<catboost.core.CatBoostClassifier at 0x247001468e0>

In [179]:
preds = model_cat_2.predict_proba(X_test)[:, 1]
preds[:10]

array([0.91265781, 0.9749781 , 0.07804904, 0.11892619, 0.1019637 ,
       0.90353968, 0.17863238, 0.91234112, 0.80493021, 0.13532812])

In [180]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

fscore = (2 * precision * recall) / (precision + recall)

ix = np.argmax(fscore)
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

Best Threshold=0.5967780948808138, F-Score=0.895, Precision=0.882, Recall=0.909


Показатели изменились незначительно. Следовательно 252 не была колонкой, которая слишком сильно влияла на модель. 

**Сохраняем модель**

In [174]:
import pickle

In [193]:
from pathlib import Path

In [203]:
filepath = r'C:/Users/features/model_Pickle.pkl'
pickle.dump(model_cat, open(filepath, 'wb'))

**Сохраняем датасет с прогнозом**: вероятностью подключения услуги

In [182]:
preds_new = model_cat.predict_proba(df_test_feat_scaled)[:, 1]

In [184]:
preds_new = pd.DataFrame(preds_new, columns = ['positive_probability'])

In [185]:
preds_new.head(3)

Unnamed: 0,positive_probability
0,0.018962
1,0.013876
2,0.009507


In [190]:
answers_test = pd.concat([df_test, preds_new], axis = 1)

In [191]:
answers_test.head(3)

Unnamed: 0,id,vas_id,buy_time,positive_probability
0,3130519,2.0,1548018000,0.018962
1,2000860,4.0,1548018000,0.013876
2,1099444,2.0,1546808400,0.009507


In [204]:
answers_test.to_csv('C:/Users/features/answers_test.csv', sep='\t')