In [1]:
import pandas as pd, datetime, ast, os,sys, pymysql
module_path = os.path.abspath(os.path.join('/home/ktereshin/yandex/arcadia/cloud/analytics/python/work'))
if module_path not in sys.path:
    sys.path.append(module_path)
from data_loader import clickhouse
from global_variables import (
    metrika_clickhouse_param_dict,
    cloud_clickhouse_param_dict
)
from nile.api.v1 import (
    clusters,
    aggregators as na,
    extractors as ne,
    filters as nf,
    Record
)
from vault_client import instances

In [2]:
def works_with_emails(mail_):
    """Normalize an e-mail address so equal mailboxes compare equal.

    The value is converted with ``str`` and lower-cased. If the part after
    the first ``@`` is a Yandex domain (``yandex.<tld>``, ``ya.<tld>`` or any
    sub-domain of those, e.g. ``mail.yandex.com``), dots in the login are
    replaced with dashes and the domain is rewritten to ``yandex.ru``.
    Anything else (including values without ``@``) is returned lower-cased
    and otherwise untouched.

    :param mail_: e-mail address; any object is accepted and stringified.
    :return: normalized, lower-cased address as ``str``.
    """
    mail_parts = str(mail_).split('@')
    if len(mail_parts) < 2:
        return str(mail_).lower()

    login = mail_parts[0].lower()
    domain = mail_parts[1].lower()

    # Compare whole domain labels instead of raw substrings: a plain
    # "'ya.' in domain" check also matches unrelated domains such as
    # "himalaya.org" or "notyandex.ru". The last label is excluded because
    # the original check required the name to be followed by a dot.
    labels = domain.split('.')[:-1]
    if 'yandex' in labels or 'ya' in labels:
        return login.replace('.', '-') + '@' + 'yandex.ru'

    return str(mail_).lower()

def get_last_not_empty_table(folder_path):
    """Return the path of the table in ``folder_path`` with the most rows.

    Tables are listed through ``job.driver`` and visited in reverse-sorted
    path order; a table replaces the current pick only when its
    ``row_count`` is strictly greater, so on ties the table that sorts last
    wins. Tables that cannot be read are skipped. If no table has a
    positive row count, the first entry of the reverse-sorted listing is
    returned.

    NOTE(review): relies on a global ``job`` object that is not defined in
    the visible part of this file -- confirm it exists before calling.

    :param folder_path: folder whose tables are inspected.
    :return: full path (``folder_path + '/' + name``) of the chosen table.
    :raises IndexError: if the folder listing is empty.
    """
    tables_list = sorted(
        [folder_path + '/' + name for name in job.driver.list(folder_path)],
        reverse=True,
    )
    last_table = ''
    last_table_rows = 0
    for table in tables_list:
        try:
            table_ = job.driver.read(table)
        except Exception:
            # Best effort: an unreadable table is skipped. Unlike a bare
            # ``except:``, this lets KeyboardInterrupt/SystemExit through.
            continue

        if table_.row_count > last_table_rows:
            last_table_rows = table_.row_count
            last_table = table

    if last_table:
        return last_table
    return tables_list[0]

def apply_types_in_project(schema_):
    """Build per-column ``ne.custom`` extractors that cast values by schema.

    For every column whose schema type equals ``str``, ``int`` or ``float``
    an extractor is created that converts the column value with that type,
    mapping ``''`` and ``None`` to ``None``. Columns of any other type are
    left out of the result.

    :param schema_: mapping of column name to Python type.
    :return: dict of column name to ``ne.custom`` extractor.
    """
    # (type, converter) pairs, checked in the same order as the schema types
    # are compared: str, then int, then float.
    converters = (
        (str, lambda x: str(x) if x not in ['', None] else None),
        (int, lambda x: int(x) if x not in ['', None] else None),
        (float, lambda x: float(x) if x not in ['', None] else None),
    )

    extractors = {}
    for column in schema_:
        for type_, convert in converters:
            if schema_[column] == type_:
                extractors[column] = ne.custom(convert, column)
                break
    return extractors


# Fetch the secret versions used below through the vault client.
client = instances.Production()
yt_creds = client.get_version('ver-01d33pgv8pzc7t99s3egm24x47')
crm_sql_creds = client.get_version('ver-01d3ktedjm6ptsvwf1xq161hwk')
metrika_creds = client.get_version('ver-01d2z36msatt9mp9pcfptezksp')
yc_ch_creds = client.get_version('ver-01d2z39xj02xw7gqvv9wq757ne')

# YT cluster handle configured with the token and pool from the YT secret.
cluster = clusters.yt.Hahn(
    token=yt_creds['value']['token'],
    pool=yt_creds['value']['pool'],
)

# Inject the login/password pairs into the shared ClickHouse parameter dicts.
metrika_clickhouse_param_dict.update(
    user=metrika_creds['value']['login'],
    password=metrika_creds['value']['pass'],
)
cloud_clickhouse_param_dict.update(
    user=yc_ch_creds['value']['login'],
    password=yc_ch_creds['value']['pass'],
)

In [3]:
# Rebuild cloud_analytics_testing.calls_cohorts_weekly.
#
# The new data is first materialized into a *_temp table and only then
# swapped in. The live table is dropped only AFTER the temp table has been
# built successfully, so a failing CREATE leaves the previous data in place.
#
# What the SELECT produces: for every lead with a reached call
# (event = 'call', call_status = 'reachible' -- spelling as stored in the
# data), one row per week from the week of the call up to the current week.
# first_paid_consumption turns to 1 from the week a 'first_paid_consumption'
# lead_state event is seen onwards (cumulative sum > 0), and delta is the
# number of weeks since the call week.
# NOTE(review): picking the "first" value via groupArray(...)[1] and
# dates[1] presumably relies on row order surviving the sub-query ORDER BY
# and the GROUP BY -- confirm against ClickHouse ordering guarantees.

# Remove a leftover temp table from an earlier failed run; otherwise the
# CREATE TABLE below would fail with "table already exists".
query = 'DROP TABLE IF EXISTS cloud_analytics_testing.calls_cohorts_weekly_temp'
cloud_clickhouse_param_dict['query'] = query

clickhouse.get_clickhouse_data(**cloud_clickhouse_param_dict)

query = '''
CREATE TABLE cloud_analytics_testing.calls_cohorts_weekly_temp 
ENGINE = MergeTree()
ORDER BY(call_date, delta_date) PARTITION BY toYYYYMM(call_date)
AS
SELECT
  lead_id,
  billing_account_id,
  ba_payment_cycle_type,
  ba_state,
  ba_person_type,
  ba_usage_status,
  call_tag,
  channel,
  lead_source,
  lead_source_description,
  sales_name,
  segment,
  block_reason,
  call_date,
  date AS delta_date,
  first_paid_consumption,
  toRelativeWeekNum(delta_date) - toRelativeWeekNum(call_date) AS delta
FROM
  (
    SELECT
      lead_id,
      billing_account_id,
      ba_payment_cycle_type,
      ba_state,
      ba_person_type,
      ba_usage_status,
      call_tag,
      channel,
      lead_source,
      lead_source_description,
      sales_name,
      segment,
      block_reason,
      groupArray(date) AS dates,
      arrayMap(
        x -> (x > 0),
        arrayCumSum(groupArray(first_paid_consumption))
      ) AS first_paid_consumptions,
      dates [1] AS call_date
    FROM
      (
        SELECT
          t0.*,
          t1.first_paid_consumption
        FROM
          (
            SELECT
              lead_id,
              billing_account_id,
              ba_payment_cycle_type,
              ba_state,
              ba_person_type,
              ba_usage_status,
              call_tag,
              channel,
              lead_source,
              lead_source_description,
              sales_name,
              segment,
              block_reason,
              date
            FROM
              (
                SELECT
                  lead_id,
                  billing_account_id,
                  ba_payment_cycle_type,
                  ba_state,
                  ba_person_type,
                  ba_usage_status,
                  call_tag,
                  channel,
                  lead_source,
                  lead_source_description,
                  sales_name,
                  segment,
                  block_reason,
                  arrayMap(
                    x -> addWeeks(toMonday(toDate(event_time)), x),
                    range(
                      toUInt32(
                        (
                          toRelativeWeekNum(toMonday(toDate(now()))) - toRelativeWeekNum(toMonday(toDate(event_time)))
                        ) + 1
                      )
                    )
                  ) AS date_range
                FROM
                  (
                    SELECT
                      lead_id,
                      groupArray(ba_payment_cycle_type) [1] AS ba_payment_cycle_type,
                      groupArray(billing_account_id) [1] AS billing_account_id,
                      groupArray(ba_state) [1] AS ba_state,
                      groupArray(ba_person_type) [1] AS ba_person_type,
                      groupArray(ba_usage_status) [1] AS ba_usage_status,
                      groupArray(call_tag) [1] AS call_tag,
                      groupArray(channel) [1] AS channel,
                      groupArray(lead_source) [1] AS lead_source,
                      groupArray(lead_source_description) [1] AS lead_source_description,
                      groupArray(sales_name) [1] AS sales_name,
                      groupArray(segment) [1] AS segment,
                      groupArray(block_reason) [1] AS block_reason,
                      groupArray(event_time) [1] AS event_time
                    FROM
                      (
                        SELECT
                          *
                        FROM
                          cloud_analytics_testing.crm_lead_cube_test
                        ORDER BY
                          event_time ASC
                      )
                    WHERE
                      event = 'call' AND call_status = 'reachible'
                    GROUP BY
                      lead_id
                  )
              ) ARRAY
              JOIN date_range AS date
          ) AS t0 ANY
          LEFT JOIN (
            SELECT
              lead_id,
              toMonday(toDate(event_time)) AS date,
              1 AS first_paid_consumption
            FROM
              cloud_analytics_testing.crm_lead_cube_test
            WHERE
              lead_state = 'first_paid_consumption'
          ) AS t1 ON (t1.lead_id = t0.lead_id)
          AND (t0.date = t1.date)
      )
    GROUP BY
      lead_id,
      billing_account_id,
      ba_payment_cycle_type,
      ba_state,
      ba_person_type,
      ba_usage_status,
      call_tag,
      channel,
      lead_source,
      lead_source_description,
      sales_name,
      segment,
      block_reason
  ) ARRAY
  JOIN dates AS date,
  first_paid_consumptions AS first_paid_consumption
'''
cloud_clickhouse_param_dict['query'] = query

clickhouse.get_clickhouse_data(**cloud_clickhouse_param_dict)

# The replacement is ready: drop the live table right before the rename so
# the window in which it does not exist is as short as possible.
query = 'DROP TABLE IF EXISTS cloud_analytics_testing.calls_cohorts_weekly'
cloud_clickhouse_param_dict['query'] = query

clickhouse.get_clickhouse_data(**cloud_clickhouse_param_dict)

query = 'RENAME TABLE cloud_analytics_testing.calls_cohorts_weekly_temp TO cloud_analytics_testing.calls_cohorts_weekly'
cloud_clickhouse_param_dict['query'] = query

clickhouse.get_clickhouse_data(**cloud_clickhouse_param_dict)

Empty Result
Empty Result
Empty Result
Empty Result


In [4]:
# Rebuild cloud_analytics_testing.calls_cohorts_monthly.
#
# The new data is first materialized into a *_temp table and only then
# swapped in. The live table is dropped only AFTER the temp table has been
# built successfully, so a failing CREATE leaves the previous data in place.
#
# What the SELECT produces: for every lead with a reached call
# (event = 'call', call_status = 'reachible' -- spelling as stored in the
# data), one row per month from the month of the call up to the current
# month. first_paid_consumption turns to 1 from the month a
# 'first_paid_consumption' lead_state event is seen onwards (cumulative
# sum > 0), and delta is the number of months since the call month.
# NOTE(review): picking the "first" value via groupArray(...)[1] and
# dates[1] presumably relies on row order surviving the sub-query ORDER BY
# and the GROUP BY -- confirm against ClickHouse ordering guarantees.

# Remove a leftover temp table from an earlier failed run; otherwise the
# CREATE TABLE below would fail with "table already exists".
query = 'DROP TABLE IF EXISTS cloud_analytics_testing.calls_cohorts_monthly_temp'
cloud_clickhouse_param_dict['query'] = query

clickhouse.get_clickhouse_data(**cloud_clickhouse_param_dict)

query = '''
CREATE TABLE cloud_analytics_testing.calls_cohorts_monthly_temp 
ENGINE = MergeTree()
ORDER BY(call_date, delta_date) PARTITION BY toYYYYMM(call_date)
AS
SELECT
  lead_id,
  billing_account_id,
  ba_payment_cycle_type,
  ba_state,
  ba_person_type,
  ba_usage_status,
  call_tag,
  channel,
  lead_source,
  lead_source_description,
  sales_name,
  segment,
  block_reason,
  call_date,
  date AS delta_date,
  first_paid_consumption,
  toRelativeMonthNum(delta_date) - toRelativeMonthNum(call_date) AS delta
FROM
  (
    SELECT
      lead_id,
      billing_account_id,
      ba_payment_cycle_type,
      ba_state,
      ba_person_type,
      ba_usage_status,
      call_tag,
      channel,
      lead_source,
      lead_source_description,
      sales_name,
      segment,
      block_reason,
      groupArray(date) AS dates,
      arrayMap(
        x -> (x > 0),
        arrayCumSum(groupArray(first_paid_consumption))
      ) AS first_paid_consumptions,
      dates[1] AS call_date
    FROM
      (
        SELECT
          t0.*,
          t1.first_paid_consumption
        FROM
          (
            SELECT
              lead_id,
              billing_account_id,
              ba_payment_cycle_type,
              ba_state,
              ba_person_type,
              ba_usage_status,
              call_tag,
              channel,
              lead_source,
              lead_source_description,
              sales_name,
              segment,
              block_reason,
              date
            FROM
              (
                SELECT
                  lead_id,
                  billing_account_id,
                  ba_payment_cycle_type,
                  ba_state,
                  ba_person_type,
                  ba_usage_status,
                  call_tag,
                  channel,
                  lead_source,
                  lead_source_description,
                  sales_name,
                  segment,
                  block_reason,
                arrayMap(
                    x -> addMonths(toStartOfMonth(toDate(event_time)), x),
                    range(
                      toUInt32(
                        (
                          toRelativeMonthNum(toStartOfMonth(toDate(now()))) - toRelativeMonthNum(toStartOfMonth(toDate(event_time)))
                        ) + 1
                      )
                    )
                ) AS date_range
                FROM
                  (
                    SELECT
                      lead_id,
                      groupArray(ba_payment_cycle_type) [1] AS ba_payment_cycle_type,
                      groupArray(billing_account_id) [1] AS billing_account_id,
                      groupArray(ba_state) [1] AS ba_state,
                      groupArray(ba_person_type) [1] AS ba_person_type,
                      groupArray(ba_usage_status) [1] AS ba_usage_status,
                      groupArray(call_tag) [1] AS call_tag,
                      groupArray(channel) [1] AS channel,
                      groupArray(lead_source) [1] AS lead_source,
                      groupArray(lead_source_description) [1] AS lead_source_description,
                      groupArray(sales_name) [1] AS sales_name,
                      groupArray(segment) [1] AS segment,
                      groupArray(block_reason) [1] AS block_reason,
                      groupArray(event_time) [1] AS event_time
                    FROM
                      (
                        SELECT
                          *
                        FROM
                          cloud_analytics_testing.crm_lead_cube_test
                        ORDER BY
                          event_time ASC
                      )
                    WHERE
                      event = 'call' AND call_status = 'reachible'
                    GROUP BY
                      lead_id
                  )
              ) ARRAY
              JOIN date_range AS date
          ) AS t0 ANY
          LEFT JOIN (
            SELECT
              lead_id,
              toStartOfMonth(toDate(event_time)) AS date,
              1 AS first_paid_consumption
            FROM
              cloud_analytics_testing.crm_lead_cube_test
            WHERE
              lead_state = 'first_paid_consumption'
          ) AS t1 ON (t1.lead_id = t0.lead_id)
          AND (t0.date = t1.date)
      )
    GROUP BY
      lead_id,
      billing_account_id,
      ba_payment_cycle_type,
      ba_state,
      ba_person_type,
      ba_usage_status,
      call_tag,
      channel,
      lead_source,
      lead_source_description,
      sales_name,
      segment,
      block_reason
  ) ARRAY
  JOIN dates AS date,
  first_paid_consumptions AS first_paid_consumption
'''
cloud_clickhouse_param_dict['query'] = query

clickhouse.get_clickhouse_data(**cloud_clickhouse_param_dict)

# The replacement is ready: drop the live table right before the rename so
# the window in which it does not exist is as short as possible.
query = 'DROP TABLE IF EXISTS cloud_analytics_testing.calls_cohorts_monthly'
cloud_clickhouse_param_dict['query'] = query

clickhouse.get_clickhouse_data(**cloud_clickhouse_param_dict)

query = 'RENAME TABLE cloud_analytics_testing.calls_cohorts_monthly_temp TO cloud_analytics_testing.calls_cohorts_monthly'
cloud_clickhouse_param_dict['query'] = query

clickhouse.get_clickhouse_data(**cloud_clickhouse_param_dict)

Empty Result
Empty Result
Empty Result
Empty Result
