In [1]:
import os
import spyt
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import pyspark.sql.functions as F 
import pyspark.sql.types as T 
from pyspark.sql.functions import col, lit, broadcast
from pyspark.sql.window import Window
from clan_tools.secrets.Vault import Vault
from clan_tools.data_adapters.YTAdapter import YTAdapter
# Limit the number of threads numexpr is allowed to use.
os.environ['NUMEXPR_MAX_THREADS'] = '16'

# Widen pandas display limits so the very wide feature frames stay readable.
# The canonical option key is used here: 'display.max.rows' only worked because
# pandas resolves option names by regex search, where '.' happens to match '_'.
pd.set_option('display.max_rows', 1000)
pd.set_option("display.max_columns", 250)

# Spark settings handed to spyt.connect() whenever a session is (re)created.
DEFAULT_SPARK_CONF = {
    "spark.executor.memory": "6G",
    "spark.executor.cores": 2,
    "spark.sql.session.timeZone": "UTC",
    "spark.dynamicAllocation.maxExecutors": 6,
    "spark.dynamicAllocation.enabled":True,
    # -1 disables automatic broadcast joins.
    "spark.sql.autoBroadcastJoinThreshold":-1,
    "spark.cores.min":12,
    "spark.driver.memory": "4G",
    "spark.executor.instances":6,
}

In [2]:
# Presumably loads the credentials needed by the YT / SPYT clients — TODO confirm
# what Vault.get_secrets() actually sets.
Vault().get_secrets()
yt_adapter = YTAdapter()
# Handle exposed by the adapter's `yt` attribute.
yt = yt_adapter.yt

# Open a SPYT Spark session with the notebook-wide configuration and print its info.
spark = spyt.connect(spark_conf_args=DEFAULT_SPARK_CONF)
spyt.info(spark)

2021-08-13 17:46:34,511 - INFO - spyt.client - SPYT Cluster version: 3.0.1-1.12.0+yandex
2021-08-13 17:46:34,513 - INFO - spyt.client - SPYT library version: 1.3.5


# Dataset

### Features

In [3]:
# period to consider (including bounds)

# Bounds of the daily date grid; period_start is also day 0 for br_week_num.
period_start = '2021-04-01'
period_end = '2021-06-30'
# Horizons (in days) for the prev_*d_cons / next_*d_cons consumption sums.
numbers_of_days_to_consider = [1, 7, 14, 21, 30, 45]
# When True, the cells that write tables overwrite their outputs on YT.
rewrite_flag = True

In [4]:
# YT table paths written by the "Dataset" section and read back by "Samples".
TARGET_PATH = "//home/cloud_analytics/ml/scoring/trial_to_paid/tables/target"
FEATURES_YC_CONS = "//home/cloud_analytics/ml/scoring/trial_to_paid/tables/features_yc_cons"
FEATURES_VM_CUBE = "//home/cloud_analytics/ml/scoring/trial_to_paid/tables/features_vm_cube"

In [5]:
def max_by(x, y):
    """Build the Spark SQL aggregate ``max_by(x, y)`` as a Column.

    `x` and `y` are spliced verbatim into the SQL text, so they must be
    valid SQL column references / expressions.
    """
    sql_call = 'max_by({}, {})'.format(x, y)
    return F.expr(sql_call)
def min_by(x, y):
    """Build the Spark SQL aggregate ``min_by(x, y)`` as a Column.

    `x` and `y` are spliced verbatim into the SQL text, so they must be
    valid SQL column references / expressions.
    """
    sql_call = 'min_by({}, {})'.format(x, y)
    return F.expr(sql_call)

#### yc_consumption

In [6]:
# Columns taken as-is from the consumption data mart, in output order.
_billing_cols = [
    "billing_account_id",
    "billing_record_msk_date",
    "billing_record_cost_rub",
    "billing_record_credit_rub",
    "billing_record_total_rub",
    "billing_account_usage_status",
    "billing_account_person_type",
    "billing_account_currency",
    "billing_account_state",
    "billing_account_is_fraud",
    "billing_account_is_isv",
    "billing_account_is_var",
]
_crm_and_sku_cols = [
    "crm_partner_manager",
    "crm_segment",
    "sku_lazy",
    "sku_name",
    "sku_service_group",
    "sku_service_name",
    "sku_subservice_name",
]
# Flag: the record carries a non-null crm_account_id.
_has_crm_account = (
    col("crm_account_id").isNotNull().alias("billing_account_is_crm_account")
)

# Record-level consumption rows narrowed to the columns used for features.
yc_cons_pre = (
    spark.read
    .yt("//home/cloud-dwh/data/prod/cdm/dm_yc_consumption")
    .select(*_billing_cols, _has_crm_account, *_crm_and_sku_cols)
)

yc_cons_pre.limit(5).toPandas()

Unnamed: 0,billing_account_id,billing_record_msk_date,billing_record_cost_rub,billing_record_credit_rub,billing_record_total_rub,billing_account_usage_status,billing_account_person_type,billing_account_currency,billing_account_state,billing_account_is_fraud,billing_account_is_isv,billing_account_is_var,billing_account_is_crm_account,crm_partner_manager,crm_segment,sku_lazy,sku_name,sku_service_group,sku_service_name,sku_subservice_name
0,dn2cij3745utdqcsq8mp,2019-08-03,4.752,-4.752,0.0,paid,individual,RUB,active,False,False,False,False,No Partner Manager,Mass,0.0,compute.vm.ram.v2,Infrastructure,compute,ram
1,dn2cij72btgcpv77nqir,2019-08-03,39.701875,-39.701875,0.0,trial,individual,RUB,suspended,False,False,False,False,No Partner Manager,Mass,1.0,nbs.network-nvme.allocated,Infrastructure,compute,nbs_ssd
2,dn2cijklad9hqpedobc3,2019-08-03,0.20847,-0.20847,0.0,trial,individual,RUB,suspended,False,False,False,False,No Partner Manager,Mass,1.0,nbs.network-hdd.allocated,Infrastructure,compute,nbs_hdd
3,dn2cils6rhuhtjqiv17o,2019-08-03,3.6576,0.0,3.6576,paid,individual,RUB,active,False,False,False,False,No Partner Manager,Mass,1.0,network.public_fips,Infrastructure,cloud_network,fips
4,dn2cils6rhuhtjqiv17o,2019-08-03,4.752,0.0,4.752,paid,individual,RUB,active,False,False,False,False,No Partner Manager,Mass,1.0,network.public_fips.deallocated,Infrastructure,cloud_network,fips


In [7]:
# Row count per sku_name with its share of all rows (percent), sorted by share,
# plus the cumulative share covered by the most frequent names.
sku_name_cnts = (
    yc_cons_pre
    .select('sku_name')
    .groupby('sku_name')
    .count()
    .toPandas()
    .assign(ratio=lambda d: d['count'] / d['count'].sum() * 100)
    .sort_values('ratio', ascending=False)
    .reset_index(drop=True)
    .assign(mostfreq_coverage=lambda d: d['ratio'].cumsum())
)

# 41 most frequent sku names cover 90% of all rows
N_mf_sku_nms = 41
names_mf_sku_nms = sku_name_cnts['sku_name'].head(N_mf_sku_nms).tolist()

# Spaces, dots and dashes become underscores in the generated column names.
_sku_name_chars = str.maketrans({' ': '_', '.': '_', '-': '_'})

# One aggregate per frequent sku name: total spend attributed to that name.
sku_name_cols = [
    F.sum(
        F.when(col("sku_name") == sku, col("billing_record_total_rub"))
        .otherwise(lit(0))
    ).alias(f"sku_name_is_{sku.translate(_sku_name_chars)}")
    for sku in names_mf_sku_nms
]

# Spend on every sku name outside the frequent list goes into one bucket.
_other_sku_spend = F.when(
    ~col("sku_name").isin(names_mf_sku_nms), col("billing_record_total_rub")
).otherwise(lit(0))
sku_name_cols.append(F.sum(_other_sku_spend).alias("sku_name_is_other"))

In [8]:
# Row count per sku_service_group with its share (percent) and cumulative share.
sku_group_cnts = (
    yc_cons_pre
    .select('sku_service_group')
    .groupby('sku_service_group')
    .count()
    .toPandas()
    .assign(ratio=lambda d: d['count'] / d['count'].sum() * 100)
    .sort_values('ratio', ascending=False)
    .reset_index(drop=True)
    .assign(mostfreq_coverage=lambda d: d['ratio'].cumsum())
)

# we have only 8 sku groups so encode them all
names_all_sku_sgps = sku_group_cnts['sku_service_group'].tolist()

# Spaces, dots and dashes become underscores in the generated column names.
_sku_group_chars = str.maketrans({' ': '_', '.': '_', '-': '_'})

# One aggregate per service group: total spend attributed to that group.
sku_group_cols = [
    F.sum(
        F.when(col("sku_service_group") == group, col("billing_record_total_rub"))
        .otherwise(lit(0))
    ).alias(f"sku_group_is_{group.translate(_sku_group_chars)}")
    for group in names_all_sku_sgps
]

In [9]:
# Row count per sku_service_name with its share (percent) and cumulative share.
sku_service_cnts = (
    yc_cons_pre
    .select('sku_service_name')
    .groupby('sku_service_name')
    .count()
    .toPandas()
    .assign(ratio=lambda d: d['count'] / d['count'].sum() * 100)
    .sort_values('ratio', ascending=False)
    .reset_index(drop=True)
    .assign(mostfreq_coverage=lambda d: d['ratio'].cumsum())
)

# we have only 20 sku services so encode them all
names_all_sku_srvs = sku_service_cnts['sku_service_name'].tolist()

# Spaces, dots and dashes become underscores in the generated column names.
_sku_service_chars = str.maketrans({' ': '_', '.': '_', '-': '_'})

# One aggregate per service: total spend attributed to that service.
sku_service_cols = [
    F.sum(
        F.when(col("sku_service_name") == service, col("billing_record_total_rub"))
        .otherwise(lit(0))
    ).alias(f"sku_service_is_{service.translate(_sku_service_chars)}")
    for service in names_all_sku_srvs
]

In [10]:
# Row count per sku_subservice_name with its share (percent), sorted by share,
# plus the cumulative share covered by the most frequent subservices.
sku_subservice_cnts = yc_cons_pre[['sku_subservice_name']].groupby('sku_subservice_name').count().toPandas()
sku_subservice_cnts['ratio'] = sku_subservice_cnts['count']/sku_subservice_cnts['count'].sum()*100
sku_subservice_cnts = sku_subservice_cnts.sort_values('ratio', ascending=False).reset_index(drop=True)
sku_subservice_cnts['mostfreq_coverage'] = sku_subservice_cnts['ratio'].cumsum()

# 24 most frequent sku subservices cover 95% of all rows
N_mf_sku_sbsrvs = 24
# Slice by the parameter only: the former extra hard-coded `[:24]` silently
# capped the list whenever N_mf_sku_sbsrvs was raised above 24.
names_mf_sku_sbsrvs = sku_subservice_cnts['sku_subservice_name'][:N_mf_sku_sbsrvs].tolist()
sku_subservice_cols = []

# One aggregate per frequent subservice: total spend attributed to it.
for name in names_mf_sku_sbsrvs:
    # Spaces, dots and dashes become underscores in the generated column name.
    col_name = name.replace(' ', '_').replace('.', '_').replace('-', '_')
    sku_subservice_cols.append(
        F.sum(
            F.when(col("sku_subservice_name")==name, col("billing_record_total_rub"))
            .otherwise(lit(0))
        )
        .alias(f"sku_subservice_is_{col_name}")
    )

# Spend on every subservice outside the frequent list goes into one bucket.
# NOTE(review): this alias uses the prefix "sku_subservice_name_is_" while the
# columns above use "sku_subservice_is_"; kept as-is because the name is part
# of the written table schema.
sku_subservice_cols.append(
    F.sum(
        F.when(~col("sku_subservice_name").isin(names_mf_sku_sbsrvs), col("billing_record_total_rub"))
        .otherwise(lit(0))
    )
    .alias("sku_subservice_name_is_other")
)

In [11]:
# yc consumption features: one row per (billing account, date)

def _latest(colname):
    """max_by(colname, billing_record_msk_date), aliased back to `colname`.

    The frame is grouped by date as well, so the ordering key is constant
    inside each group and this simply picks one of the group's values.
    """
    return max_by(colname, "billing_record_msk_date").alias(colname)


def _any_true(colname):
    """True when at least one row of the group has `colname` set."""
    return (F.sum(col(colname).cast("int")) > 0).alias(colname)


yc_cons = (
    yc_cons_pre
    .groupby("billing_account_id", "billing_record_msk_date")
    .agg(
        # Daily money totals.
        F.sum("billing_record_cost_rub").alias("billing_record_cost_rub"),
        F.sum("billing_record_credit_rub").alias("billing_record_credit_rub"),
        F.sum("billing_record_total_rub").alias("billing_record_total_rub"),
        # Account attributes.
        _latest("billing_account_usage_status"),
        _latest("billing_account_person_type"),
        _latest("billing_account_currency"),
        _latest("billing_account_state"),
        _any_true("billing_account_is_fraud"),
        _any_true("billing_account_is_isv"),
        _any_true("billing_account_is_var"),
        _any_true("billing_account_is_crm_account"),
        _latest("crm_partner_manager"),
        _latest("crm_segment"),
        (F.sum(col("sku_lazy")) > 0).alias("sku_lazy"),
        # Spend broken down by sku name / group / service / subservice.
        *sku_name_cols,
        *sku_group_cols,
        *sku_service_cols,
        *sku_subservice_cols,
    )
)

yc_cons.limit(5).toPandas()

Unnamed: 0,billing_account_id,billing_record_msk_date,billing_record_cost_rub,billing_record_credit_rub,billing_record_total_rub,billing_account_usage_status,billing_account_person_type,billing_account_currency,billing_account_state,billing_account_is_fraud,billing_account_is_isv,billing_account_is_var,billing_account_is_crm_account,crm_partner_manager,crm_segment,sku_lazy,sku_name_is_nbs_network_hdd_allocated,sku_name_is_network_public_fips,sku_name_is_network_egress_inet,sku_name_is_network_ingress_inet,sku_name_is_nbs_network_nvme_allocated,sku_name_is_compute_vm_ram_v2,sku_name_is_network_public_fips_deallocated,sku_name_is_storage_bucket_used_space_standard,sku_name_is_compute_snapshot,sku_name_is_compute_vm_cpu_c05_v2,sku_name_is_compute_vm_cpu_c100_v2,sku_name_is_compute_vm_ram,sku_name_is_storage_api_network_inet_egress,sku_name_is_storage_bucket_used_space_cold,sku_name_is_storage_api_get_standard,sku_name_is_compute_vm_cpu_50_v2,sku_name_is_compute_vm_cpu_c20_v2,sku_name_is_storage_api_network_inet_ingress,sku_name_is_compute_vm_cpu_c100,sku_name_is_storage_api_put_standard,sku_name_is_compute_vm_cpu_c05,sku_name_is_storage_api_network_cloud_egress,sku_name_is_marketplace_windows_cpu_c100,sku_name_is_storage_api_head_standard,sku_name_is_mdb_cluster_network_nvme_pg,sku_name_is_cr_bucket_used_space_standard,sku_name_is_compute_image,sku_name_is_storage_api_delete,sku_name_is_storage_api_network_cloud_ingress,sku_name_is_storage_api_put_cold,sku_name_is_support_standard_fixed_consumption_v1,sku_name_is_marketplace_windows_cpu_c05,sku_name_is_network_ingress_inet_antiddos_qrator,sku_name_is_mdb_cluster_pg_v2_ram,sku_name_is_nlb_balancer_active,sku_name_is_nlb_vip_bytes_ingress,sku_name_is_compute_vm_ram_preemptible_v2,sku_name_is_network_public_fips_lb,sku_name_is_ai_speech_tts,sku_name_is_mdb_cluster_network_nvme_mysql,sku_name_is_mdb_cluster_network_hdd_pg,sku_name_is_other,sku_group_is_Infrastructure,sku_group_is_Data_Storage_and_Analytics,sku_group_is_Mar
ketplace,sku_group_is_ML_and_AI,sku_group_is_Support,sku_group_is_Cloud_Native,sku_group_is_Adjustments,sku_group_is_Tracker,sku_service_is_compute,sku_service_is_cloud_network,sku_service_is_storage,sku_service_is_mdb,sku_service_is_marketplace,sku_service_is_cloud_ai,sku_service_is_cr,sku_service_is_nlb,sku_service_is_support,sku_service_is_serverless,sku_service_is_mk8s,sku_service_is_kms,sku_service_is_ymq,sku_service_is_ydb,sku_service_is_monitoring,sku_service_is_api_gateway,sku_service_is_iot,sku_service_is_adjustments,sku_service_is_datalens,sku_service_is_tracker,sku_subservice_is_nbs_hdd,sku_subservice_is_fips,sku_subservice_is_cpu,sku_subservice_is_ram,sku_subservice_is_egress_inet,sku_subservice_is_ingress_inet,sku_subservice_is_nbs_ssd,sku_subservice_is_used_space,sku_subservice_is_storage,sku_subservice_is_snapshot,sku_subservice_is_postgres,sku_subservice_is_get,sku_subservice_is_windows,sku_subservice_is_put,sku_subservice_is_mysql,sku_subservice_is_head,sku_subservice_is_mongo,sku_subservice_is_speech,sku_subservice_is_clickhouse,sku_subservice_is_standard,sku_subservice_is_image,sku_subservice_is_delete,sku_subservice_is_redis,sku_subservice_is_ingress_qrator,sku_subservice_name_is_other
0,dn2000o2g9kv69el386u,2019-07-25,39.699807,-39.699807,0.0,trial,individual,RUB,suspended,False,False,False,False,No Partner Manager,Mass,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,dn2000o2g9kv69el386u,2019-09-02,39.691594,-39.691594,0.0,trial,individual,RUB,suspended,False,False,False,False,No Partner Manager,Mass,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,dn2000o2g9kv69el386u,2019-10-11,39.701875,-39.701875,0.0,trial,individual,RUB,suspended,False,False,False,False,No Partner Manager,Mass,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,dn2000pf31bpfdhm83co,2018-12-08,2.664998,0.0,2.664998,paid,individual,RUB,active,False,False,False,False,No Partner Manager,Mass,True,2.664998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.664998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.664998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.664998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,dn2000pf31bpfdhm83co,2018-12-12,2.664998,0.0,2.664998,paid,individual,RUB,active,False,False,False,False,No Partner Manager,Mass,True,2.664998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.664998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.664998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.664998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### grid

In [12]:
# Grid of (billing_account_id, date): every date of the considered period that
# is not earlier than the account's first consumption record.

# All calendar days of the period as a single `event_date` column.
_period_days = pd.DataFrame({"event_date": pd.date_range(period_start, period_end)})
date_grid = (
    spark
    .createDataFrame(_period_days)
    .select(col("event_date").cast("date"))
)

# Per account: first record date, number of fraud-flagged days and the usage
# status on the earliest date.
_ba_summary = (
    yc_cons
    .groupby("billing_account_id")
    .agg(
        F.min("billing_record_msk_date").alias("start_date"),
        F.sum(col("billing_account_is_fraud").cast("int")).alias("is_fraud"),
        min_by("billing_account_usage_status", "billing_record_msk_date").alias("ba_usage_status"),
    )
)

# Keep accounts that were never flagged as fraud and did not start as "service".
_eligible_bas = _ba_summary.filter(
    (col("is_fraud") == 0) & (col("ba_usage_status") != "service")
)

all_ba_grid = (
    _eligible_bas
    .join(date_grid, on=(col("start_date") <= col("event_date")), how="cross")
    .select("billing_account_id", col("event_date").alias("billing_record_msk_date"))
)

#### yc_cons_full

In [13]:
# Column names come straight from the schema; no Spark job or pandas
# conversion is needed just to list them.
colnames_yc_cons = yc_cons.columns

def sparkNVL(colname, replace_val):
    """Return column `colname` with NULLs replaced by the literal `replace_val`.

    The resulting Column keeps the name `colname`.
    """
    source = col(colname)
    filled = F.when(source.isNotNull(), source).otherwise(lit(replace_val))
    return filled.alias(colname)

# The per-SKU spend encodings are the columns that follow `sku_lazy` in
# yc_cons. Anchor on that column name instead of a hard-coded position so the
# slice stays right if the leading attribute columns ever change.
_first_encoded_idx = colnames_yc_cons.index("sku_lazy") + 1

# Daily consumption features on the full (account, date) grid: days without
# any consumption record get zero money / spend values.
# NOTE(review): the account attribute columns and sku_lazy are not filled, so
# they stay NULL on grid days that have no consumption record.
yc_cons_full = (
    all_ba_grid
    .join(yc_cons, on=["billing_account_id", "billing_record_msk_date"], how="left")
    .select(
        "billing_account_id",
        F.date_format(col("billing_record_msk_date"),"yyyy-MM-dd").alias("billing_record_msk_date"),
        sparkNVL("billing_record_cost_rub", 0),
        sparkNVL("billing_record_credit_rub", 0),
        sparkNVL("billing_record_total_rub", 0),
        "billing_account_usage_status",
        "billing_account_person_type",
        "billing_account_currency",
        "billing_account_state",
        "billing_account_is_fraud",
        "billing_account_is_isv",
        "billing_account_is_var",
        "billing_account_is_crm_account",
        "crm_partner_manager",
        "crm_segment",
        col("sku_lazy").cast("int"),
        *[sparkNVL(colname, 0) for colname in colnames_yc_cons[_first_encoded_idx:]]
    )
)

yc_cons_full.limit(5).toPandas()

Unnamed: 0,billing_account_id,billing_record_msk_date,billing_record_cost_rub,billing_record_credit_rub,billing_record_total_rub,billing_account_usage_status,billing_account_person_type,billing_account_currency,billing_account_state,billing_account_is_fraud,billing_account_is_isv,billing_account_is_var,billing_account_is_crm_account,crm_partner_manager,crm_segment,sku_lazy,sku_name_is_nbs_network_hdd_allocated,sku_name_is_network_public_fips,sku_name_is_network_egress_inet,sku_name_is_network_ingress_inet,sku_name_is_nbs_network_nvme_allocated,sku_name_is_compute_vm_ram_v2,sku_name_is_network_public_fips_deallocated,sku_name_is_storage_bucket_used_space_standard,sku_name_is_compute_snapshot,sku_name_is_compute_vm_cpu_c05_v2,sku_name_is_compute_vm_cpu_c100_v2,sku_name_is_compute_vm_ram,sku_name_is_storage_api_network_inet_egress,sku_name_is_storage_bucket_used_space_cold,sku_name_is_storage_api_get_standard,sku_name_is_compute_vm_cpu_50_v2,sku_name_is_compute_vm_cpu_c20_v2,sku_name_is_storage_api_network_inet_ingress,sku_name_is_compute_vm_cpu_c100,sku_name_is_storage_api_put_standard,sku_name_is_compute_vm_cpu_c05,sku_name_is_storage_api_network_cloud_egress,sku_name_is_marketplace_windows_cpu_c100,sku_name_is_storage_api_head_standard,sku_name_is_mdb_cluster_network_nvme_pg,sku_name_is_cr_bucket_used_space_standard,sku_name_is_compute_image,sku_name_is_storage_api_delete,sku_name_is_storage_api_network_cloud_ingress,sku_name_is_storage_api_put_cold,sku_name_is_support_standard_fixed_consumption_v1,sku_name_is_marketplace_windows_cpu_c05,sku_name_is_network_ingress_inet_antiddos_qrator,sku_name_is_mdb_cluster_pg_v2_ram,sku_name_is_nlb_balancer_active,sku_name_is_nlb_vip_bytes_ingress,sku_name_is_compute_vm_ram_preemptible_v2,sku_name_is_network_public_fips_lb,sku_name_is_ai_speech_tts,sku_name_is_mdb_cluster_network_nvme_mysql,sku_name_is_mdb_cluster_network_hdd_pg,sku_name_is_other,sku_group_is_Infrastructure,sku_group_is_Data_Storage_and_Analytics,sku_group_is_Mar
ketplace,sku_group_is_ML_and_AI,sku_group_is_Support,sku_group_is_Cloud_Native,sku_group_is_Adjustments,sku_group_is_Tracker,sku_service_is_compute,sku_service_is_cloud_network,sku_service_is_storage,sku_service_is_mdb,sku_service_is_marketplace,sku_service_is_cloud_ai,sku_service_is_cr,sku_service_is_nlb,sku_service_is_support,sku_service_is_serverless,sku_service_is_mk8s,sku_service_is_kms,sku_service_is_ymq,sku_service_is_ydb,sku_service_is_monitoring,sku_service_is_api_gateway,sku_service_is_iot,sku_service_is_adjustments,sku_service_is_datalens,sku_service_is_tracker,sku_subservice_is_nbs_hdd,sku_subservice_is_fips,sku_subservice_is_cpu,sku_subservice_is_ram,sku_subservice_is_egress_inet,sku_subservice_is_ingress_inet,sku_subservice_is_nbs_ssd,sku_subservice_is_used_space,sku_subservice_is_storage,sku_subservice_is_snapshot,sku_subservice_is_postgres,sku_subservice_is_get,sku_subservice_is_windows,sku_subservice_is_put,sku_subservice_is_mysql,sku_subservice_is_head,sku_subservice_is_mongo,sku_subservice_is_speech,sku_subservice_is_clickhouse,sku_subservice_is_standard,sku_subservice_is_image,sku_subservice_is_delete,sku_subservice_is_redis,sku_subservice_is_ingress_qrator,sku_subservice_name_is_other
0,dn2000o2g9kv69el386u,2021-04-18,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,dn2000o2g9kv69el386u,2021-05-07,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,dn2000o2g9kv69el386u,2021-06-05,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,dn2000o2g9kv69el386u,2021-06-18,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,dn2000pf31bpfdhm83co,2021-04-21,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Overwrite the consumption feature table, but only when rewriting is enabled.
if rewrite_flag:
    _ = yc_cons_full.write.yt(FEATURES_YC_CONS, mode="overwrite")

#### vm_cube

In [15]:
# (source column, output prefix) pairs and the statistics computed for each;
# the nesting order below fixes the output column order.
_vm_metrics = [
    ("cores", "vm_cores"),
    ("gpus", "vm_gpus"),
    ("memory", "vm_memory"),
    ("vm_age_days", "vm_age_days"),
]
_vm_stats = [(F.sum, "sum"), (F.mean, "avg"), (F.min, "min"), (F.max, "max")]
_vm_account_aggs = [
    stat(source).alias(f"{prefix}_{suffix}")
    for source, prefix in _vm_metrics
    for stat, suffix in _vm_stats
]

# Step 1: one row per (account, day, VM) holding that VM's daily maxima.
# NOTE(review): the day is derived from `slice_time` in the session time zone
# (set to UTC in DEFAULT_SPARK_CONF) although the column is named *_msk_date —
# confirm this matches the MSK dates it is later joined with.
_vm_daily = (
    spark.read
    .yt("//home/cloud_analytics/compute_logs/vm_cube/vm_cube")
    .groupby(
        "ba_id",
        F.to_date(F.from_unixtime("slice_time")).alias("billing_record_msk_date"),
        "vm_id",
    )
    .agg(
        F.max("vm_cores").alias("cores"),
        F.max("vm_gpus").alias("gpus"),
        F.max("vm_memory").alias("memory"),
        F.max("vm_age_days").alias("vm_age_days"),
    )
    .select(
        "billing_record_msk_date", "vm_id", "cores", "gpus", "memory",
        "vm_age_days", col("ba_id").alias("billing_account_id"),
    )
)

# Step 2: collapse the VMs of each (account, day) into summary statistics.
vm_cube = (
    _vm_daily
    .groupby("billing_account_id", "billing_record_msk_date")
    .agg(*_vm_account_aggs, F.count("vm_id").alias("vm_count"))
)

vm_cube.limit(5).toPandas()

Unnamed: 0,billing_account_id,billing_record_msk_date,vm_cores_sum,vm_cores_avg,vm_cores_min,vm_cores_max,vm_gpus_sum,vm_gpus_avg,vm_gpus_min,vm_gpus_max,vm_memory_sum,vm_memory_avg,vm_memory_min,vm_memory_max,vm_age_days_sum,vm_age_days_avg,vm_age_days_min,vm_age_days_max,vm_count
0,dn2t9japes0heqaov4dd,2021-07-23,32583.0,2.148707,1.0,224.0,19,0.001253,0,8,25483.0,1.680493,0.5,952.0,96996.0,6.396465,0.0,20.0,15164
1,dn2t9japes0heqaov4dd,2020-08-17,20093.0,2.024076,1.0,64.0,11,0.001108,0,4,14822.5,1.49315,0.5,384.0,210403.0,21.195024,0.0,182.0,9927
2,dn2char50jflpgthkm57,2021-04-12,56561.0,3.92376,1.0,80.0,524,0.036351,0,4,230236.5,15.972008,0.5,640.0,356506.0,24.731599,0.0,420.0,14415
3,dn26ipesh7lofl1nchrh,2019-06-02,1.0,1.0,1.0,1.0,0,0.0,0,0,8.0,8.0,8.0,8.0,12.0,12.0,12.0,12.0,1
4,dn2k2sb3esiic6r5one4,2021-07-21,60112.0,30.451874,2.0,48.0,0,0.0,0,0,257824.0,130.609929,2.0,144.0,138.0,0.069909,0.0,18.0,1974


#### vm_cube_full

In [16]:
# Column names come straight from the schema; no Spark job is needed for that.
colnames_vm_cube = vm_cube.columns

# Keys shared by the account/date grid and vm_cube.
_vm_join_keys = ["billing_account_id", "billing_record_msk_date"]

# VM features on the full (account, date) grid: days without VM data get zeros.
# Feature columns are picked by excluding the join keys by name rather than by
# a positional slice, so a column reorder cannot silently drop or add one.
vm_cube_full = (
    all_ba_grid
    .join(vm_cube, on=_vm_join_keys, how="left")
    .select(
        "billing_account_id",
        F.date_format(col("billing_record_msk_date"),"yyyy-MM-dd").alias("billing_record_msk_date"),
        *[sparkNVL(colname, 0) for colname in colnames_vm_cube if colname not in _vm_join_keys]
    )
)

vm_cube_full.limit(5).toPandas()

Unnamed: 0,billing_account_id,billing_record_msk_date,vm_cores_sum,vm_cores_avg,vm_cores_min,vm_cores_max,vm_gpus_sum,vm_gpus_avg,vm_gpus_min,vm_gpus_max,vm_memory_sum,vm_memory_avg,vm_memory_min,vm_memory_max,vm_age_days_sum,vm_age_days_avg,vm_age_days_min,vm_age_days_max,vm_count
0,dn2000o2g9kv69el386u,2021-04-09,0.0,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,dn2000o2g9kv69el386u,2021-04-12,0.0,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,dn2000o2g9kv69el386u,2021-04-23,0.0,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,dn2000o2g9kv69el386u,2021-06-02,0.0,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,dn2000pf31bpfdhm83co,2021-04-26,0.0,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [17]:
# Overwrite the VM feature table, but only when rewriting is enabled.
if rewrite_flag:
    _ = vm_cube_full.write.yt(FEATURES_VM_CUBE, mode="overwrite")

#### target

In [18]:
# Daily cost / total consumption per account, read back from the consumption
# feature table. The path comes from FEATURES_YC_CONS so the reader and the
# writer of that table cannot drift apart.
target_pre = (
    spark
    .read
    .yt(FEATURES_YC_CONS)
    .select(
        "billing_account_id",
        # Stored as a 'yyyy-MM-dd' string; parse it back into a date.
        F.to_date("billing_record_msk_date").alias("billing_record_msk_date"),
        "billing_record_cost_rub",
        "billing_record_total_rub"
    )
)

In [19]:
# calculate targets
def date_to_bigint(dt_col):
    """Express a date column as the number of days since 1900-01-01 (bigint).

    Gives range-based windows a numeric ordering key measured in days.
    """
    epoch = lit("1900-01-01")
    days_since_epoch = F.datediff(dt_col, epoch)
    return days_since_epoch.cast("bigint")

def make_window(N):
    """Per-account range window over day numbers, excluding the current day.

    N > 0 covers the following days ``[+1, +N]``; N < 0 covers the preceding
    days ``[N, -1]``.

    Raises:
        ValueError: if N is 0.
    """
    if N == 0:
        raise ValueError("N must be greater or less than 0")

    # Offsets are relative to the current row's day number.
    lower, upper = (1, N) if N > 0 else (N, -1)
    return (
        Window
        .partitionBy("billing_account_id")
        .orderBy(date_to_bigint("billing_record_msk_date"))
        .rangeBetween(lower, upper)
    )

def next_N_days_cons(N):
    """Total consumption over the next N days (N > 0) or previous |N| days (N < 0).

    The result column is named ``next_{N}d_cons`` or ``prev_{|N|}d_cons``. The
    sum is emitted only when the account's last available date lies at least
    N days after the current row; otherwise the value is NULL.

    NOTE(review): for negative N that guard always holds (the day difference
    to the partition's max date is never negative), so prev_* sums are not
    masked when less than |N| days of history exist — confirm this is intended.
    """
    windowed_sum = F.sum("billing_record_total_rub").over(make_window(N))

    per_account = Window.partitionBy("billing_account_id")
    last_date = F.max("billing_record_msk_date").over(per_account)
    has_full_horizon = F.datediff(last_date, col("billing_record_msk_date")) >= N

    if N < 0:
        label = f"prev_{abs(N)}d_cons"
    else:
        label = f"next_{N}d_cons"
    return F.when(has_full_horizon, windowed_sum).otherwise(lit(None)).alias(label)

# Number of earlier days (within the account) whose cost was exactly zero.
_zero_cost_day = F.when(col("billing_record_cost_rub") == 0, lit(1)).otherwise(lit(0))
_days_not_used = (
    F.sum(_zero_cost_day)
    .over(make_window(Window.unboundedPreceding))
    .alias("days_not_used")
)

# Backward-looking and forward-looking consumption sums for every horizon.
_prev_cons_cols = [next_N_days_cons(-N) for N in numbers_of_days_to_consider]
_next_cons_cols = [next_N_days_cons(N) for N in numbers_of_days_to_consider]

target = target_pre.select(
    "billing_account_id",
    "billing_record_msk_date",
    _days_not_used,
    *_prev_cons_cols,
    *_next_cons_cols,
)

target.limit(5).toPandas()

Unnamed: 0,billing_account_id,billing_record_msk_date,days_not_used,prev_1d_cons,prev_7d_cons,prev_14d_cons,prev_21d_cons,prev_30d_cons,prev_45d_cons,next_1d_cons,next_7d_cons,next_14d_cons,next_21d_cons,next_30d_cons,next_45d_cons
0,dn2000o2g9kv69el386u,2021-04-01,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0
1,dn2000o2g9kv69el386u,2021-04-02,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,dn2000o2g9kv69el386u,2021-04-03,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,dn2000o2g9kv69el386u,2021-04-04,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,dn2000o2g9kv69el386u,2021-04-05,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Overwrite the target table (dates stored as 'yyyy-MM-dd' strings), but only
# when rewriting is enabled.
if rewrite_flag:
    _target_out = target.withColumn(
        "billing_record_msk_date",
        F.date_format(col("billing_record_msk_date"), "yyyy-MM-dd"),
    )
    _ = _target_out.write.yt(TARGET_PATH, mode="overwrite")

#### Reset spark

In [21]:
# Close the session used to build the feature/target tables ...
spyt.stop(spark)

# ... and open a fresh one with the same configuration for the sampling part.
spark = spyt.connect(spark_conf_args=DEFAULT_SPARK_CONF)
spyt.info(spark)

2021-08-13 18:09:02,686 - INFO - spyt.client - SPYT Cluster version: 3.0.1-1.12.0+yandex
2021-08-13 18:09:02,688 - INFO - spyt.client - SPYT library version: 1.3.5


# Samples

#### general dataset

In [22]:
# Read back the three tables produced by the "Dataset" section.
target_spdf = spark.read.yt(TARGET_PATH)
yc_cons_spdf = spark.read.yt(FEATURES_YC_CONS)
vm_cube_spdf = spark.read.yt(FEATURES_VM_CUBE)

_dts_keys = ["billing_account_id", "billing_record_msk_date"]

# Zero-based index of the 7-day block counted from period_start.
_br_week_num = (
    F.floor(F.datediff(col("billing_record_msk_date"), lit(period_start)) / 7)
    .cast("int")
    .alias("br_week_num")
)
# Spark's dayofweek: 1 = Sunday ... 7 = Saturday.
_br_week_day = F.dayofweek("billing_record_msk_date").alias("br_week_day")

# Targets + consumption features + VM features per (account, date).
general_dts = (
    target_spdf
    .join(yc_cons_spdf, on=_dts_keys)
    .join(vm_cube_spdf, on=_dts_keys)
    .withColumn("billing_record_msk_date", F.to_date("billing_record_msk_date"))
    .select("*", _br_week_num, _br_week_day)
)

general_dts.printSchema()

root
 |-- billing_account_id: string (nullable = true)
 |-- billing_record_msk_date: date (nullable = true)
 |-- days_not_used: long (nullable = true)
 |-- prev_1d_cons: double (nullable = true)
 |-- prev_7d_cons: double (nullable = true)
 |-- prev_14d_cons: double (nullable = true)
 |-- prev_21d_cons: double (nullable = true)
 |-- prev_30d_cons: double (nullable = true)
 |-- prev_45d_cons: double (nullable = true)
 |-- next_1d_cons: double (nullable = true)
 |-- next_7d_cons: double (nullable = true)
 |-- next_14d_cons: double (nullable = true)
 |-- next_21d_cons: double (nullable = true)
 |-- next_30d_cons: double (nullable = true)
 |-- next_45d_cons: double (nullable = true)
 |-- billing_record_cost_rub: double (nullable = true)
 |-- billing_record_credit_rub: double (nullable = true)
 |-- billing_record_total_rub: double (nullable = true)
 |-- billing_account_usage_status: string (nullable = true)
 |-- billing_account_person_type: string (nullable = true)
 |-- billing_account_curre

#### test2_dts

In [23]:
# Latest date for which the 14-day-ahead consumption target is still defined.
test2_dt = (
    general_dts
    .where(col("next_14d_cons").isNotNull())
    .agg(F.max("billing_record_msk_date"))
    .first()[0]
)

test2_dt

datetime.date(2021, 6, 16)

In [24]:
# Hold-out sample: all rows of the single date `test2_dt`.
test2_dts = general_dts.where(col("billing_record_msk_date") == test2_dt)

# Sanity check: row count on that date.
test2_dts.groupby("billing_record_msk_date").count().show()

+-----------------------+-----+
|billing_record_msk_date|count|
+-----------------------+-----+
|             2021-06-16|87107|
+-----------------------+-----+



In [25]:
# Overwrite the test2 sample (dates stored as 'yyyy-MM-dd' strings), but only
# when rewriting is enabled.
if rewrite_flag:
    _test2_out = test2_dts.withColumn(
        "billing_record_msk_date",
        F.date_format(col("billing_record_msk_date"), "yyyy-MM-dd"),
    )
    _ = _test2_out.write.yt(
        "//home/cloud_analytics/ml/scoring/trial_to_paid/tables/samples/test2",
        mode="overwrite",
    )

#### main_dts

In [26]:
# Everything strictly before the test2 date is available for train/test/OOT.
main_dts = general_dts.where(col("billing_record_msk_date") < test2_dt)

def max_by(x, y):
    """Build the Spark SQL aggregate ``max_by(x, y)`` as a Column.

    NOTE(review): an identical `max_by` is defined earlier in the notebook;
    this redefinition looks redundant — confirm whether it is kept on purpose
    (e.g. so this section can run without the earlier cell).
    """
    sql_call = 'max_by({}, {})'.format(x, y)
    return F.expr(sql_call)

# For every (account, week) keep the weekday whose row drew the largest random
# number, i.e. one randomly chosen weekday per account-week.
# NOTE(review): Spark documents rand() as non-deterministic in the general
# case, so the chosen days may differ between evaluations — confirm whether
# that matters here.
shuffling_rule = (
    main_dts
    .select(
        "billing_account_id",
        "br_week_num",
        "br_week_day",
        F.rand(seed=42).alias("random_col")
    )
    .groupby("billing_account_id", "br_week_num")
    .agg(
        max_by("br_week_day", "random_col").alias("br_week_day")
    )
)

# Rows of main_dts that fall on the chosen weekday of each account-week.
filtered_main_dts = (
    shuffling_rule
    .join(main_dts, on=["billing_account_id", "br_week_num", "br_week_day"], how="left")
    .cache()
)

# Distribution of the chosen weekdays.
shuffling_rule.groupby("br_week_day").agg(F.count("*").alias("cnt")).sort("br_week_day").show()

+-----------+------+
|br_week_day|   cnt|
+-----------+------+
|          1|137374|
|          2|137501|
|          3|137883|
|          4|124396|
|          5|136782|
|          6|137259|
|          7|137066|
+-----------+------+



In [27]:
# One random number per billing account, used for the account-level
# train / non-train split.
# The frame is cached so that tr_bas and ntr_bas (and every later action that
# uses them) read the SAME random draw. Without it each action re-evaluates
# F.rand after the distinct() shuffle, and Spark documents rand() as
# non-deterministic in the general case, so an account could land in both
# splits or in neither.
all_bas = (
    filtered_main_dts
    .select("billing_account_id")
    .distinct()
    .select(
        "billing_account_id",
        F.rand(seed=42).alias("random_col")
    )
    .cache()
)

# ~70% of accounts go to the train side, the rest to the non-train side.
tr_bas = all_bas.filter(col("random_col")<=0.7)
ntr_bas = all_bas.filter(col("random_col")>0.7)

print("tr", tr_bas.count())

print("ntr", ntr_bas.count())

tr 61253
ntr 25854


In [28]:
# Largest br_week_num in main_dts. Week numbers are zero-based, so this is the
# index of the last week rather than the number of weeks.
num_weeks = main_dts.agg(F.max("br_week_num")).first()[0]
# Weeks up to and including this index are in-time; later weeks are OOT.
oot_border = int(0.7 * num_weeks)

print('Averall weeks count:', num_weeks)
print('OOT-border start from week num:', oot_border)

Averall weeks count: 10
OOT-border start from week num: 7


In [29]:
# Row count per week number in main_dts.
main_dts.groupby("br_week_num").count().sort("br_week_num").show(50)

+-----------+------+
|br_week_num| count|
+-----------+------+
|          0|576838|
|          1|583273|
|          2|592606|
|          3|602342|
|          4|609558|
|          5|609749|
|          6|609749|
|          7|609749|
|          8|609749|
|          9|609749|
|         10|522642|
+-----------+------+



#### sampling

In [30]:
def _rows_of(bas):
    """All main_dts rows of the billing accounts listed in `bas`."""
    return main_dts.join(
        bas.select("billing_account_id"), on=["billing_account_id"], how="right"
    )


# Week-based periods: in-time is up to and including oot_border; out-of-time
# is strictly after it and strictly before week `num_weeks` (the last week
# index is left out).
_in_time = col("br_week_num") <= oot_border
_out_of_time = (col("br_week_num") > oot_border) & (col("br_week_num") < num_weeks)

# NOTE(review): the samples are built from main_dts (every day of every
# account), not from filtered_main_dts — confirm the weekly day sampling is
# meant to be used only for picking the account list.
train_base = _rows_of(tr_bas)
test_base = _rows_of(ntr_bas)

train = train_base.filter(_in_time)
oot_tr = train_base.filter(_out_of_time)

test = test_base.filter(_in_time)
oot_te = test_base.filter(_out_of_time)

In [31]:
# Overwrite the four sample tables (dates stored as 'yyyy-MM-dd' strings), but
# only when rewriting is enabled. Note that `oot_te` goes to ".../samples/oot".
if rewrite_flag:
    _sample_outputs = [
        (train, "//home/cloud_analytics/ml/scoring/trial_to_paid/tables/samples/train"),
        (test, "//home/cloud_analytics/ml/scoring/trial_to_paid/tables/samples/test"),
        (oot_te, "//home/cloud_analytics/ml/scoring/trial_to_paid/tables/samples/oot"),
        (oot_tr, "//home/cloud_analytics/ml/scoring/trial_to_paid/tables/samples/oot_tr"),
    ]
    for _sample, _sample_path in _sample_outputs:
        _ = (
            _sample
            .withColumn(
                "billing_record_msk_date",
                F.date_format(col("billing_record_msk_date"), "yyyy-MM-dd"),
            )
            .write
            .yt(_sample_path, mode="overwrite")
        )

In [32]:
# Shut down the Spark session now that all samples have been written.
spyt.stop(spark)