In [1]:
import os
import numpy as np
import pandas as pd
from textwrap import dedent
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
from category_encoders.target_encoder import TargetEncoder
import lightgbm as lgbm
from lightgbm import LGBMRegressor
import catboost as cat
from catboost import CatBoostRegressor
import xgboost as xgb
from xgboost import XGBRegressor

# Notebook-wide configuration.
# NOTE(review): presumably meant to cap the numexpr thread pool; numexpr may read this
# variable only when it is first imported, and pandas is already imported above --
# confirm the setting actually takes effect here (otherwise set it before the imports).
os.environ['NUMEXPR_MAX_THREADS'] = '16'
# Raise pandas display limits so wide/long frames are rendered without truncation.
pd.set_option('display.max_columns', 1500)
pd.set_option('display.max_rows', 500)

In [2]:
# Load the three pre-built samples (train/validation, test, out-of-time) and give each
# a fresh 0..n-1 index.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
df_train_val, df_test, df_oot = (
    pd.read_pickle(sample_path).reset_index(drop=True)
    for sample_path in (
        'Final_sample/train.pkl',
        'Final_sample/test.pkl',
        'Final_sample/oot.pkl',
    )
)

# Preprocessing

In [3]:
# Preview the first rows of the train/validation frame (rendered as the cell output).
df_train_val.head()

Unnamed: 0,billing_account_currency,billing_account_id,billing_account_is_crm_account,billing_account_is_fraud,billing_account_is_isv,billing_account_is_var,billing_account_person_type,billing_account_state,billing_account_usage_status,billing_record_cost_rub,billing_record_credit_rub,billing_record_msk_date,billing_record_total_rub,br_week_day,br_week_num,crm_partner_manager,crm_segment,days_not_used,next_14d_cons,next_1d_cons,next_21d_cons,next_30d_cons,next_45d_cons,next_7d_cons,prev_14d_cons,prev_1d_cons,prev_21d_cons,prev_30d_cons,prev_45d_cons,prev_7d_cons,sku_group_is_Adjustments,sku_group_is_Cloud_Native,sku_group_is_Data_Storage_and_Analytics,sku_group_is_Infrastructure,sku_group_is_ML_and_AI,sku_group_is_Marketplace,sku_group_is_Support,sku_group_is_Tracker,sku_lazy,sku_name_is_compute_image,sku_name_is_compute_snapshot,sku_name_is_compute_vm_cpu_50_v2,sku_name_is_compute_vm_cpu_c05,sku_name_is_compute_vm_cpu_c05_v2,sku_name_is_compute_vm_cpu_c100,sku_name_is_compute_vm_cpu_c100_v2,sku_name_is_compute_vm_cpu_c20_v2,sku_name_is_compute_vm_ram,sku_name_is_compute_vm_ram_preemptible_v2,sku_name_is_compute_vm_ram_v2,sku_name_is_cr_bucket_used_space_standard,sku_name_is_marketplace_windows_cpu_c05,sku_name_is_marketplace_windows_cpu_c100,sku_name_is_mdb_cluster_network_hdd_pg,sku_name_is_mdb_cluster_network_nvme_mysql,sku_name_is_mdb_cluster_network_nvme_pg,sku_name_is_mdb_cluster_pg_v2_ram,sku_name_is_nbs_network_hdd_allocated,sku_name_is_nbs_network_nvme_allocated,sku_name_is_network_egress_inet,sku_name_is_network_ingress_inet,sku_name_is_network_ingress_inet_antiddos_qrator,sku_name_is_network_public_fips,sku_name_is_network_public_fips_deallocated,sku_name_is_network_public_fips_lb,sku_name_is_nlb_balancer_active,sku_name_is_nlb_vip_bytes_ingress,sku_name_is_other,sku_name_is_storage_api_delete,sku_name_is_storage_api_get_standard,sku_name_is_storage_api_head_standard,sku_name_is_storage_api_network_cloud_egress,sku_name_is_storage_api_network_cloud_ingres
s,sku_name_is_storage_api_network_inet_egress,sku_name_is_storage_api_network_inet_ingress,sku_name_is_storage_api_post_cold,sku_name_is_storage_api_put_cold,sku_name_is_storage_api_put_standard,sku_name_is_storage_bucket_used_space_cold,sku_name_is_storage_bucket_used_space_standard,sku_name_is_support_standard_fixed_consumption_v1,sku_service_is_adjustments,sku_service_is_api_gateway,sku_service_is_cloud_ai,sku_service_is_cloud_network,sku_service_is_compute,sku_service_is_cr,sku_service_is_datalens,sku_service_is_dns,sku_service_is_iot,sku_service_is_kms,sku_service_is_marketplace,sku_service_is_mdb,sku_service_is_mk8s,sku_service_is_monitoring,sku_service_is_nlb,sku_service_is_serverless,sku_service_is_storage,sku_service_is_support,sku_service_is_tracker,sku_service_is_ydb,sku_service_is_ymq,sku_subservice_is_clickhouse,sku_subservice_is_cpu,sku_subservice_is_delete,sku_subservice_is_egress_inet,sku_subservice_is_fips,sku_subservice_is_get,sku_subservice_is_head,sku_subservice_is_image,sku_subservice_is_ingress_inet,sku_subservice_is_mongo,sku_subservice_is_mysql,sku_subservice_is_nbs_hdd,sku_subservice_is_nbs_ssd,sku_subservice_is_nlb,sku_subservice_is_postgres,sku_subservice_is_put,sku_subservice_is_ram,sku_subservice_is_redis,sku_subservice_is_snapshot,sku_subservice_is_speech,sku_subservice_is_standard,sku_subservice_is_storage,sku_subservice_is_used_space,sku_subservice_is_windows,sku_subservice_name_is_other,vm_age_days_avg,vm_age_days_max,vm_age_days_min,vm_age_days_sum,vm_cores_avg,vm_cores_max,vm_cores_min,vm_cores_sum,vm_count,vm_gpus_avg,vm_gpus_max,vm_gpus_min,vm_gpus_sum,vm_memory_avg,vm_memory_max,vm_memory_min,vm_memory_sum
0,,dn2003nke4qtrmuhn4vm,,,,,,,,0.0,0.0,2021-04-01,0.0,5,0,,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0,0,0.0,0.0,0.0,0.0
1,,dn2004lij1dk31jjokbt,,,,,,,,0.0,0.0,2021-04-06,0.0,3,0,,,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0,0,0.0,0.0,0.0,0.0
2,RUB,dn20062cndjuoar4rqbm,0.0,0.0,0.0,0.0,individual,suspended,trial,3.665497,-3.665497,2021-04-07,0.0,4,0,No Partner Manager,Mass,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0,0,0.0,0.0,0.0,0.0
3,,dn200nk6o8l8450r6ci0,,,,,,,,0.0,0.0,2021-04-03,0.0,7,0,,,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0,0,0.0,0.0,0.0,0.0
4,,dn200o00ug2bjafsui19,,,,,,,,0.0,0.0,2021-04-02,0.0,6,0,,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0,0,0.0,0.0,0.0,0.0


In [4]:
# Column bookkeeping: identifiers, look-ahead ("future") columns, target and feature groups.
cols_all = df_train_val.columns.tolist()
cols_id = ['billing_account_id', 'billing_record_msk_date']
# Look-ahead consumption columns: never usable as features (the target is one of them).
cols_future = ['next_14d_cons', 'next_1d_cons', 'next_21d_cons',
               'next_30d_cons', 'next_45d_cons', 'next_7d_cons']
cols_target = ['next_14d_cons']

# Build the feature lists in DataFrame column order. Going through list(set(...)) would
# make the order depend on string hashing, i.e. change between kernel restarts, and the
# feature order ends up in the persisted X_* frames.
_cols_excluded = set(cols_id) | set(cols_future)
cols_all_features = list(dict.fromkeys(
    col for col in cols_all if col not in _cols_excluded
))
# Categorical features are the object-dtype columns among the features.
_cols_object = set(df_train_val.dtypes[df_train_val.dtypes == object].index.tolist())
cols_cat_features = [col for col in cols_all_features if col in _cols_object]
cols_num_features = [col for col in cols_all_features if col not in _cols_object]

In [5]:
# For every sample: replace missing categorical values with the "MISSING" label,
# then cast the categorical columns to str.
for _frame in (df_train_val, df_test, df_oot):
    _frame.loc[:, cols_cat_features] = _frame[cols_cat_features].fillna("MISSING")
    _frame[cols_cat_features] = _frame[cols_cat_features].astype(str)

In [6]:
# Columns to ignore: those with exactly one distinct value in the train/validation
# frame (nunique() does not count NaN by default), plus 'br_week_num'.
_unique_counts = df_train_val.nunique()
_constant_cols = _unique_counts[_unique_counts.eq(1)].index.tolist()
cols_ignore = _constant_cols + ['br_week_num']
cols_ignore

['billing_account_is_fraud',
 'sku_group_is_Tracker',
 'sku_name_is_network_ingress_inet',
 'sku_name_is_storage_api_delete',
 'sku_name_is_storage_api_network_cloud_egress',
 'sku_name_is_storage_api_network_cloud_ingress',
 'sku_name_is_storage_api_network_inet_ingress',
 'sku_service_is_dns',
 'sku_service_is_tracker',
 'sku_subservice_is_delete',
 'sku_subservice_is_ingress_inet',
 'br_week_num']

In [7]:
# Remove the ignored columns from every feature list. Filtering with a comprehension
# keeps the incoming order of each list instead of reshuffling it through a set
# (set iteration order for strings is not stable between interpreter runs).
_cols_ignored = set(cols_ignore)
cols_all_features = [col for col in cols_all_features if col not in _cols_ignored]
cols_cat_features = [col for col in cols_cat_features if col not in _cols_ignored]
cols_num_features = [col for col in cols_num_features if col not in _cols_ignored]

#### PSI

In [8]:
def usr_pv_psi(dts1, dts2, n_groups, force_as_categorical=False):
    """
    Population Stability Index (PSI) between two samples of one variable.
    (Not the best implementation, but at least something.)

    Parameters
    ----------
    dts1 : pandas.Series
        First sample, referred to as "(T)" in the returned comments. In numeric
        mode its quantiles define the bins.
    dts2 : pandas.Series
        Second sample, referred to as "(V)" in the returned comments.
    n_groups : int
        Number of quantile groups requested from ``pd.qcut`` in numeric mode
        (duplicate bin edges are dropped, so fewer groups may result).
    force_as_categorical : bool, default False
        Compare raw values as categories even if both inputs are numeric.

    Returns
    -------
    tuple
        ``(psi, comment)``. ``comment`` is an empty string unless NaNs or category
        sets do not match between the samples. If any ordinary exception occurs
        the result is ``(np.nan, "Error")``; ``KeyboardInterrupt`` and
        ``SystemExit`` are not swallowed.

    Notes
    -----
    Groups present in only one of the samples yield NaN terms, which the final
    sum skips.
    """
    try:
        # Work on copies so the caller's data is never modified.
        dts1 = dts1.copy()
        dts2 = dts2.copy()

        # Numeric mode: both inputs numeric, not forced to categorical, and the
        # first sample has more than two distinct values.
        as_numeric = (
            pd.api.types.is_numeric_dtype(dts1)
            and pd.api.types.is_numeric_dtype(dts2)
            and not force_as_categorical
            and dts1.nunique() > 2
        )

        psi_null = 0
        comment = ''
        if as_numeric:
            has_nan1 = dts1.isna().any()
            has_nan2 = dts2.isna().any()
            if has_nan1 and has_nan2:
                # NaN is treated as its own group; its PSI term is added at the end.
                rate1 = dts1.isna().sum() / len(dts1)
                rate2 = dts2.isna().sum() / len(dts2)
                psi_null = (rate1 - rate2) * np.log(rate1 / rate2)
            elif has_nan1:
                comment = 'Non-matching NaN is found. (T) PSI -> without NaN'
            elif has_nan2:
                comment = 'Non-matching NaN is found. (V) PSI -> without NaN'

            # Keep only finite, non-missing observations for binning.
            dts1 = dts1.dropna()
            dts2 = dts2.dropna()
            dts1 = dts1[np.isfinite(dts1)]
            dts2 = dts2[np.isfinite(dts2)]

            dts1, bins_ = pd.qcut(dts1, n_groups, labels=False, retbins=True, duplicates='drop')
            # Manually widen the outer edges (to avoid trouble when dts2 spans a
            # wider range than dts1).
            bins_[0] = min(bins_[0], dts2.min()) - 1
            bins_[-1] = max(bins_[-1], dts2.max()) + 1
            dts2 = pd.cut(dts2, bins=bins_, labels=False, include_lowest=True)
        else:
            # Categorical mode: missing values become the explicit "NaN" category.
            dts1 = dts1.fillna("NaN")
            dts2 = dts2.fillna("NaN")
            groups1, groups2 = set(dts1), set(dts2)
            if groups1 - groups2:
                comment = 'Non-matching groups only in (T)'
            if groups2 - groups1:
                comment += '{}Non-matching new groups in (V)'.format('. ' if (len(comment) >= 1) else '')

        rates1 = dts1.value_counts() / len(dts1)
        rates2 = dts2.value_counts() / len(dts2)
        psi = np.sum((rates1 - rates2) * np.log(rates1 / rates2)) + psi_null

    except Exception:
        # Best effort: a failure on one variable must not stop a scan over many
        # variables, but interrupts/system exits still propagate.
        psi = np.nan
        comment = "Error"

    return psi, comment

In [9]:
# Available dates
# Available dates: map every distinct record date to its parsed timestamp, order the
# dates chronologically and take the first half as the "first part".
dates = {
    dt: pd.to_datetime(dt)
    for dt in df_train_val.billing_record_msk_date.unique()
}

sorted_dates = pd.Series(dates).sort_values().index.tolist()
# Floor of half the number of dates (the extra date goes to the second part if odd).
first_part_num = len(sorted_dates) // 2
print('Total:     ', len(sorted_dates))
print('First part:', first_part_num)
sorted_dates

Total:      56
First part: 28


['2021-04-01',
 '2021-04-02',
 '2021-04-03',
 '2021-04-04',
 '2021-04-05',
 '2021-04-06',
 '2021-04-07',
 '2021-04-08',
 '2021-04-09',
 '2021-04-10',
 '2021-04-11',
 '2021-04-12',
 '2021-04-13',
 '2021-04-14',
 '2021-04-15',
 '2021-04-16',
 '2021-04-17',
 '2021-04-18',
 '2021-04-19',
 '2021-04-20',
 '2021-04-21',
 '2021-04-22',
 '2021-04-23',
 '2021-04-24',
 '2021-04-25',
 '2021-04-26',
 '2021-04-27',
 '2021-04-28',
 '2021-04-29',
 '2021-04-30',
 '2021-05-01',
 '2021-05-02',
 '2021-05-03',
 '2021-05-04',
 '2021-05-05',
 '2021-05-06',
 '2021-05-07',
 '2021-05-08',
 '2021-05-09',
 '2021-05-10',
 '2021-05-11',
 '2021-05-12',
 '2021-05-13',
 '2021-05-14',
 '2021-05-15',
 '2021-05-16',
 '2021-05-17',
 '2021-05-18',
 '2021-05-19',
 '2021-05-20',
 '2021-05-21',
 '2021-05-22',
 '2021-05-23',
 '2021-05-24',
 '2021-05-25',
 '2021-05-26']

In [10]:
# Splitting in time (50/50)
# Boolean row masks for the time split (50/50): index1 selects rows dated within the
# first `first_part_num` dates, index2 selects all remaining rows.
first_part_dates = sorted_dates[:first_part_num]
index1 = df_train_val['billing_record_msk_date'].isin(first_part_dates)
index2 = ~index1

In [11]:
# PSI of every feature between the first and the second time part.
# Rows are collected in a list and the frame is built once: DataFrame.append inside a
# loop copies the whole frame on each iteration and is no longer available in pandas 2.x.
psi_records = []
for col in tqdm(cols_all_features):
    as_cat = col in cols_cat_features
    psi_, comment_ = usr_pv_psi(df_train_val.loc[index1, col],
                                df_train_val.loc[index2, col],
                                n_groups=8,
                                force_as_categorical=as_cat)
    psi_records.append({'variable': col,
                        'PSI': psi_,
                        'comment': comment_})

df_PSI = pd.DataFrame(psi_records, columns=['variable', 'PSI', 'comment'])

  0%|          | 0/124 [00:00<?, ?it/s]

In [12]:
# Non-stable in time
# Features considered non-stable in time: PSI above 0.15.
is_unstable = df_PSI['PSI'] > 0.15
A = set(df_PSI.loc[is_unstable, 'variable'])
drop_list = [*A]

drop_list

['days_not_used']

We do not apply `drop_list` (its features are kept), because the data are not uniformly distributed. The features in `drop_list` may have a significant effect on the model.

#### Mean target encoder

In [13]:
class MyEncoder:
    """Per-column target encoder: fits one ``TargetEncoder`` for each listed column.

    Attributes are kept as plain public fields:
    ``name`` (label given at construction), ``columns`` (columns passed to the last
    ``fit`` call, ``None`` before fitting) and ``dict`` (column name -> fitted encoder).
    """

    def __init__(self, name):
        self.name = name
        self.columns = None
        self.dict = {}

    def fit(self, X, y, list_to_encode):
        """Fit a separate ``TargetEncoder`` on ``X[column]`` (cast to str) for each column.

        Parameters
        ----------
        X : frame-like indexed by column name
        y : target passed unchanged to ``TargetEncoder.fit``
        list_to_encode : iterable of column names to encode
        """
        self.columns = list_to_encode
        for column in list_to_encode:
            self.dict[column] = TargetEncoder().fit(X[column].astype(str), y)

    def transform(self, X):
        """Encode the fitted columns of ``X`` IN PLACE; returns ``None``.

        A column whose encoding fails is reported (name and error) and left
        unchanged, so the remaining columns are still processed. Only ordinary
        exceptions are handled this way: ``KeyboardInterrupt`` / ``SystemExit``
        propagate instead of being swallowed.
        """
        for column in self.columns:
            try:
                X[column] = self.dict[column].transform(X[column].astype(str))
            except Exception as exc:
                # Best effort per column, but say why it failed instead of hiding it.
                print(f"{column}: {exc!r}")

    def fit_transform(self, X, y, list_to_encode: list):
        """Fit on ``X``/``y`` and then encode ``X`` in place; returns ``None``."""
        self.fit(X, y, list_to_encode)
        self.transform(X)

In [14]:
# Fit the per-column target encoders on the train/validation sample only, then encode
# the categorical columns of every sample in place with those fitted encoders.
enc = MyEncoder('MyTargetEncoder')
enc.fit(df_train_val[cols_all_features], df_train_val[cols_target], cols_cat_features)
for _frame in (df_train_val, df_test, df_oot):
    enc.transform(_frame)

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


In [15]:
def _split_frame(frame):
    """Return (features, target, ids) selected from ``frame`` by the column lists."""
    features = frame.loc[:, cols_all_features]
    target = frame.loc[:, cols_target[0]]
    ids = frame.loc[:, cols_id]
    return features, target, ids


# Same split for each of the three samples.
X_train_val, y_train_val, id_train_val = _split_frame(df_train_val)
X_test, y_test, id_test = _split_frame(df_test)
X_oot, y_oot, id_oot = _split_frame(df_oot)

In [16]:
# Persist features, target and ids of every sample as pickles under Train_sample/.
for _obj, _path in (
    (X_train_val, 'Train_sample/X_train_val.pkl'),
    (y_train_val, 'Train_sample/y_train_val.pkl'),
    (id_train_val, 'Train_sample/id_train_val.pkl'),
    (X_test, 'Train_sample/X_test.pkl'),
    (y_test, 'Train_sample/y_test.pkl'),
    (id_test, 'Train_sample/id_test.pkl'),
    (X_oot, 'Train_sample/X_oot.pkl'),
    (y_oot, 'Train_sample/y_oot.pkl'),
    (id_oot, 'Train_sample/id_oot.pkl'),
):
    _obj.to_pickle(_path)