In [1]:
import numpy as np
import scipy as sp
import scipy.stats as sps
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
import gc
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from dateutil.parser import parse
from datetime import datetime, timedelta
from statsmodels.stats.power import tt_ind_solve_power
from statsmodels.stats import multitest
os.chdir('/Users/lunin-dv/Desktop/Library/')
import importlib
import robot_lib as lib
importlib.reload(lib)
os.chdir('/Users/lunin-dv/Desktop/Upsell/analysis')
from kostyas_script import make_kostya_req
os.chdir('/Users/lunin-dv/Desktop/data/Upsell')
from collections import defaultdict
import statsmodels.stats.api as sms
import time
sns.set()
%matplotlib inline

In [2]:
leads_table = lib.find_tables_in_hahn_folder(
    "//home/cloud_analytics/dwh/raw/crm/leads")[-1]
oppotunities_table = lib.find_tables_in_hahn_folder(
    "//home/cloud_analytics/dwh/raw/crm/opportunities")[-1]
tag_table = lib.find_tables_in_hahn_folder(
    "//home/cloud_analytics/dwh/raw/crm/tags")[-1]
tag_lead_table = lib.find_tables_in_hahn_folder(
    "//home/cloud_analytics/dwh/raw/crm/tag_bean_rel")[-1]

In [3]:
def create_core_upsell_test_df_info(days_to_add_to_experiment=0):
    """
    Создание таблицы с основной информацией по аккаунту, попавшему в upsell 
    (или в sales с последующей привязкой к тегам 'upsell_test')
    
    Параметры
    ----------
    days_to_add_to_experiment : int
        Количество дней, на которые надо сместить начало эксперимента 
        (наример, чтобы впоследствии смотерть относительно новой даты разницу в чеке)
    
    Возвращает
    -------
    df: pandas DataFrame
        Таблица со столбцами:
        - `lead_id`
        - `billing_account_id`
        - `last_state` (значения: ['Disqualified', 'Converted'])
        - `end_date_before_consumption` - дата попадания к телесейлзу, 
        - `lead_source_description` (['upsell', 
                                      'contact more then 70 days', 
                                      'upsell_test' (если lead_source == 'sales')])
        смещенная на days_to_add_to_experiment  
    """
    ##############################################################################
    
    upsell_billings_req = f"""
    SELECT
        lead_id,
        billing_account_id,
        if(states[-1] == 'Recycled', 'Disqualified',
                states[-1]) as last_state,
        addDays(toDate(event_time), 
                {days_to_add_to_experiment}) as end_date_before_consumption,
        multiIf (
lead_source_description == 'landing page' and sales_name in ('dmtroe', 'gingerkote'), 'landing page',
        lead_source_description == 'new from upsell', 'new from upsell',
        lead_source_description == 'big-3', 'big-3',
        lead_source_description == 'upsell', 'upsell',
        lead_source_description == 'contact more then 70 days', 'contact more then 70 days',
        lead_source == 'sales', 'sales_type', 
        '') as lead_source_description
    FROM cloud_analytics_testing.crm_lead_cube_test as a
    INNER JOIN (
        SELECT
            lead_id,
            arraySort((x, y) -> y, groupArray(lead_state), 
            groupArray(event_time)) as states
        FROM cloud_analytics_testing.crm_lead_cube_test
        WHERE lead_state IN ('New', 'Assigned', 'In Process', 'Recycled', 'Converted')
        GROUP BY lead_id
    ) as b
    ON a.lead_id == b.lead_id
    WHERE ((lead_state == 'Assigned' and lead_source_description != 'sales_type') OR
    (lead_state == 'New' and lead_source_description == 'sales_type'))
    AND lead_source_description != ''
    AND last_state in ('Disqualified', 'Converted')
    AND isNotNull(billing_account_id)
    FORMAT TabSeparatedWithNames
    """
    upsell_billings_df = lib.grafana_execute_query(upsell_billings_req)
    
    assert set(upsell_billings_df.columns) == set(['lead_id', 'billing_account_id',
                                                   'last_state', 
                                                   'end_date_before_consumption',
                                                   "lead_source_description"])
    assert set(upsell_billings_df['last_state']) == set(['Disqualified', 'Converted'])
    return upsell_billings_df

In [4]:
def create_tag_df_for_upsell():
    req = f"""
    SELECT
        bean_id as lead_id,
        name_lower
    FROM "{tag_lead_table}" as a
    INNER JOIN (
        SELECT
            id,
            name_lower
        FROM "{tag_table}"
        WHERE 
            name_lower == 'upsell_test'
        OR 
            name_lower == 'гипотеза_upsell'
    ) as b
    ON a.tag_id == b.id
    FORMAT TabSeparatedWithNames
    """.encode('utf-8')
    tag_df = lib.execute_query(req)
    return tag_df

In [5]:
def create_oppotunity_df_for_upsell():
    """
    Создание таблицы с информацией о закрытых oppotunity, для всех клиентов
    
    Параметры
    ----------
    
    Возвращает
    -------
    opp_df: pandas DataFrame 
            Таблица со столбцами:
            - `lead_id`
            - `start_date_after_consumption` - дата завершения oppotunity
            - `status` (значение 'win')
            В таблице только те, у кого есть статус 'Closed Won' для данного lead_id
    """
    ##############################################################################
    
    oppotunity_req = f"""
        SELECT
            id as lead_id,
            date_closed as start_date_after_consumption,
            'win' as status
        FROM (
            SELECT
                id,
                opportunity_id,
                toDate(date_entered) as date_entered
            FROM "{leads_table}"
        ) as a
        INNER JOIN (
            SELECT
                id,
                'Closed Won' as sales_status,
                toDate(date_closed_timestamp) as date_closed
            FROM "{oppotunities_table}"
            WHERE sales_stage == 'win'
        ) as b
        ON a.opportunity_id == b.id
        ORDER BY id, date_closed
        FORMAT TabSeparatedWithNames
    """
    #print(oppotunity_req)
    opp_df = lib.execute_query(oppotunity_req)
    assert set(opp_df.columns) == set(['lead_id', 'start_date_after_consumption', 
                                       'status'])
    assert set(opp_df['status']) == set(['win'])
    return opp_df

In [6]:
def create_call_df_for_upsell():
    """
    Создание таблицы с информацие, дозвонились ли до клиента и когда, по всем клиентам
    
    Параметры
    ----------
    
    Возвращает
    -------
    call_df: pandas DataFrame
             Таблица со столбцами:
             - `lead_id` - id тех, кому НЕ дозвонились
             - `start_date_after_consumption` - 
                дата первого недозвона по этому lead_id, если вообще не дозвонились,
                иначе дата первого успешного звонка
             - `status` (значение 'unreachible', 'reachible')
    """
    ##############################################################################
    
    call_req = f"""
    SELECT
        id as lead_id,
        start_date_after_consumption,
        status
    FROM (
        SELECT
            id
        FROM "{leads_table}"
    ) as a
    INNER JOIN (
        SELECT
            lead_id,
            any(billing_account_id) as billing_account_id, 
            groupUniqArray(call_status) as call_statuses,
            min(toDate(event_time)) as min_call_date,
            if (hasAll(['unreachible', 'unreachible,unreachible'], call_statuses) == 1, 
                'unreachible', 'reachible') as status,
            min(if (call_status != 'unreachible' and call_status != 'unreachible,unreachible',
                    toDate(event_time), null)) as good_call_date,
            if (status == 'unreachible', min_call_date, good_call_date) 
            as start_date_after_consumption
        FROM "//home/cloud_analytics_test/cubes/crm_leads/cube"
        WHERE event == 'call'
        GROUP BY lead_id
    ) as b
    ON a.id == b.lead_id
    FORMAT TabSeparatedWithNames
    """
    call_df = lib.execute_query(call_req)
    assert set(call_df.columns) == set(['lead_id', 'start_date_after_consumption', 
                                       'status'])
    assert set(call_df['status']) == set(['unreachible', 'reachible'])
    return call_df

In [7]:
def create_upsell_df(days_to_add_to_experiment=0, use_one_date=False):
    """
    Создание таблицы с основной информацией по upsell клиентам
    
    Параметры
    ----------
    days_to_add_to_experiment : int
        Количество дней, на которые надо сместить начало эксперимента 
        (наример, чтобы впоследствии смотреть относительно новой даты разницу в чеке)
    use_one_date: bool
         Если True, то `end_date_before_consumption` = `start_date_after_consumption`
         И разница в чеке смотрится относительно одной и той же даты 
         (`start_date_after_consumption` приравнивается к `end_date_before_consumption`)
         Иначе этого не происходит
    Возвращает
    -------
    upsell_table: pandas DataFrame
                  Таблица со столбцами:
                  - `lead_id`
                  - `billing_account_id`
                  - `lead_source_description`
                  - `end_date_before_consumption`
                  - `start_date_after_consumption` 
                    - дата завершения oppotunity (status='win')
                    - дата первого недозвона по этому lead_id 
                      (если до лида ни разу не дозвонились, status='unreachible') 
                    - дата певого звонка, когда дозвонились
                    - `end_date_before_consumption` если нет записи в таблице звонков
                  - `last_state` - ['disqualified', 'converted'] (только маленькие буквы)
                  - `status` (значение 'unreachible', 'win', 'converted', 'disqualified',
                              где 2 последних эквивалетнны `last_state`, 
                              если не попали в 2 других статуса)
                  В таблице все те, кто попал в upsell 
                  (lead_source_description == 'upsell', 
                  'upsell_test', 'contact more then 70 days')
    """
    ##############################################################################
    
    # 1. Создание основной таблицы со всеми upsell
    core_upsell_table = create_core_upsell_test_df_info()
    core_upsell_table['last_state'] = core_upsell_table['last_state'].apply(
        lambda x: x.lower())
    tag_df = create_tag_df_for_upsell()


    upsell_hypotesis = core_upsell_table[
        core_upsell_table["lead_id"].isin(
            tag_df[tag_df['name_lower'] == 'гипотеза_upsell']["lead_id"])]
    upsell_hypotesis['lead_source_description'] = ['гипотеза_upsell'] * len(upsell_hypotesis)


    upsell_test = core_upsell_table[
        core_upsell_table["lead_id"].isin(
            tag_df[tag_df['name_lower'] == 'upsell_test']["lead_id"])]
    upsell_test['lead_source_description'] = ['upsell_test'] * len(upsell_test)

    upsell_core = core_upsell_table[
        (~core_upsell_table["lead_id"].isin(upsell_hypotesis["lead_id"])) &
        (~core_upsell_table["lead_id"].isin(upsell_test["lead_id"])) &
        (core_upsell_table['lead_source_description'] != 'sales_type')
    ]

    
    upsell_test = upsell_test[
        (upsell_test["end_date_before_consumption"] > '2019-11-01')   
    ]

    core_upsell_table = lib.concatenate_tables([upsell_core, upsell_test, upsell_hypotesis])
    # 2. oppotunity и call таблицы
    oppotunity_df = create_oppotunity_df_for_upsell()
    call_df = create_call_df_for_upsell()
    
    # 3. склейка
    upsell_call = pd.merge(core_upsell_table, call_df, on='lead_id', how='inner')
    
    upsell_call["status"] = upsell_call[["status", 'lead_id']].apply(
        lambda row: 
        'win' if row['lead_id'] in set(oppotunity_df['lead_id']) else row['status'],
    axis=1)

    upsell_call['status'] =\
    upsell_call[['last_state', 'status']].apply(
        lambda row: row['status'] if row['status'] != 'reachible' else row['last_state'],
        axis=1
    )
    
    # 4. формирование итоговой таблицы
    upsell_table = upsell_call
    
    # 4.a те, кто не попал в таблицу звонков и в опти
    remaining_upsell =\
    core_upsell_table[~core_upsell_table['lead_id'].isin(upsell_table['lead_id'])]
    

    remaining_upsell.loc[:, 'start_date_after_consumption'] =\
    remaining_upsell.loc[:, 'end_date_before_consumption']
    remaining_upsell.loc[:, 'status'] = remaining_upsell.loc[:, 'last_state']
    
    # 5 Финальное склеивание
    upsell_table = lib.concatenate_tables([upsell_call,
                                           remaining_upsell])
    
    if use_one_date:
        upsell_table.loc[:, 'start_date_after_consumption'] =\
        upsell_table.loc[:, 'end_date_before_consumption']
        
    # Check
    assert set(upsell_table.columns) == set(['lead_id', 'billing_account_id',
                                             'lead_source_description',
                                            'end_date_before_consumption',
                                            'start_date_after_consumption',
                                            'last_state', 'status'])
    assert set(upsell_table["last_state"]) == set(['disqualified', 'converted'])
    assert set(upsell_table["status"]) == set(['unreachible', 'win', 
                                               'converted', 'disqualified'])
    assert set(upsell_table["lead_id"]) == set(core_upsell_table["lead_id"])
    assert upsell_table["lead_id"].shape[0] == core_upsell_table.shape[0]
    assert not use_one_date or\
           np.array_equal(upsell_table['start_date_after_consumption'],
                          upsell_table['end_date_before_consumption'])
    return upsell_table

In [8]:
def _make_columns_for_difference_days(day):
    req = f"""
    SUM(if (toDate(event_time) < toDate(end_date_before_consumption)
            AND toDate(event_time) >= addDays(toDate(end_date_before_consumption), 
                                              -{day}),
            real_consumption + trial_consumption, 0)
        ) as before_consumption_{day},

    SUM(if (toDate(event_time) >= toDate(start_date_after_consumption)
            AND toDate(event_time) < addDays(toDate(start_date_after_consumption), 
                                             {day}),
            real_consumption + trial_consumption, 0)
        ) as after_consumption_{day},

    after_consumption_{day} - before_consumption_{day} as consumption_difference_period_{day},
    
    """
    return req

In [9]:
def _make_columns_for_after_consumption_reversed(day):
    req = f"""
    SUM(if (toDate(event_time) >= toDate(start_date_after_consumption)
            AND toDate(event_time) < addDays(toDate(NOW()), 
                                             -{day}),
            real_consumption + trial_consumption, 0)
        ) as after_reversed_consumption_{day},
    """
    return req

In [10]:
def _make_columns_for_consumption_to_day(day):
    req = f"""
    SUM(if (toDate(event_time) == addDays(toDate(end_date_before_consumption), -{day}),
            real_consumption + trial_consumption, 0)
        ) as before_consumption_in_{day},

    SUM(if (toDate(event_time) == addDays(toDate(start_date_after_consumption), 
                                                 {day}),
            real_consumption + trial_consumption, 0)
        ) as after_consumption_in_{day},
    """
    return req

In [11]:
def _make_current_consumption_req(columns_in_answer, column_maker_func, periods_in_days):
    """
    Создание запроса с нужной информацией про потребление
    Параметры
    ----------
    columns_in_answer : list of strings
                        Столбцы для возврата
    column_maker_func : function
                        Функция для создания частичного подзапроса
    period_in_days: list
                    массив в котором находятся числа
                    Числа отвечают за количество дней, в течение которого 
                    считается потребление 
                    до `end_date_before_consumption` и 
                    после `start_date_after_consumption`
    """
    ##############################################################################

    part_request = ""
    for day in periods_in_days:
        part_request += column_maker_func(day)

    columns_in_answer_str = ", ".join(columns_in_answer) # чтобы прописать внутри запроса

    consumption_req = f"""
    SELECT
        {columns_in_answer_str},
        last_active_billing,
        lead_source_description
    FROM (
        SELECT
            {part_request}
            last_active_billing,
            lead_source_description
        FROM "//home/cloud_analytics/cubes/acquisition_cube/cube" as a
        INNER JOIN (
            SELECT
                billing_account_id,
                last_active_billing,
                upsell_info.*
            FROM "//home/cloud_analytics/lunin-dv/meta_cube/meta_id_information_cube" as a
            inner JOIN (
                SELECT
                    *
                FROM "//home/cloud_analytics/lunin-dv/tmp/upsell_tmp"
            ) as upsell_info
            ON a.last_active_billing == upsell_info.last_active_billing
        ) as b
        ON a.billing_account_id == b.billing_account_id
        GROUP BY last_active_billing, lead_source_description
    )
    FORMAT TabSeparatedWithNames
    """
    return consumption_req

In [12]:
def add_to_dataframe_difference_in_checks(df_raw, periods_in_days, 
                                          specific_columns=None):
    """
    Добавление в таблицу разницы в чеке для каждого пользователя до начала некоторой даты
    и после некоторой другой даты
    Параметры
    ----------
    df_raw : pandas DataFrame
        Таблица, в которой есть столбцы
        - `billing_account_id`,
        - `end_date_before_consumption`
        - `start_date_after_consumption`
    period_in_days: list
        массив в котором находятся числа
        Числа отвечают за количество дней, в течение которого считается потребление 
        до `end_date_before_consumption` и 
        после `start_date_after_consumption`
    specific_columns: list
        Какие столбцы вернуть
    Возвращает
    -------
    df: pandas DataFrame
        Таблица, в которой к исходным столбцам прибавляются новые столбцы
        `consumption_difference_period_{day}` по всем days в period_in_days
        или specific_columns
    """
    ##############################################################################
    
    df = df_raw.copy()
    # Проверим, что есть нужные столбцы
    assert "billing_account_id" in df.columns
    assert "end_date_before_consumption" in df.columns
    assert "start_date_after_consumption" in df.columns
    
    # Столбцы для добавления
    new_columns = [f"consumption_difference_period_{i}" for i in periods_in_days]
    if specific_columns is not None:
        new_columns += specific_columns
    
    lib.save_table("upsell_tmp", "//home/cloud_analytics/lunin-dv/tmp", df)
    time.sleep(20)
    consumption_req = _make_current_consumption_req(new_columns, 
                                                    _make_columns_for_difference_days, 
                                                    periods_in_days)
    #print(consumption_req)
    consumption_df = lib.execute_query(consumption_req)
    df = pd.merge(df, consumption_df, on=["last_active_billing",
                                          'lead_source_description'], how='inner')
    assert set(df.columns) == set(df_raw.columns.tolist() +\
                                  new_columns)
    return df

In [13]:
def add_to_dataframe_consumption_by_days(df_raw, periods_in_days):
    """
    Добавление в таблицу потребления по дням для каждого пользователя до начала некоторой даты
    и после некоторой другой даты
    Параметры
    ----------
    df_raw : pandas DataFrame
        Таблица, в которой есть столбцы
        - `billing_account_id`,
        - `end_date_before_consumption`
        - `start_date_after_consumption`
    period_in_days: list
        массив в котором находятся числа
        Числа отвечают за дни, в течение которых считается потребление 
        до `end_date_before_consumption` и 
        после `start_date_after_consumption`
    Возвращает
    -------
    df: pandas DataFrame
        Таблица, в которой к исходным столбцам прибавляются столбцы
        `before_consumption_in_{day}`,
        `after_consumption_in_{day}`по всем days в period_in_days
    """
    ##############################################################################
    
    df = df_raw.copy()
    # Проверим, что есть нужные столбцы
    assert "billing_account_id" in df.columns
    assert "end_date_before_consumption" in df.columns
    assert "start_date_after_consumption" in df.columns
    
    # Столбцы для добавления
    new_columns = [f"before_consumption_in_{i}" for i in periods_in_days] + \
                  [f"after_consumption_in_{i}" for i in periods_in_days]

    lib.save_table("upsell_tmp", "//home/cloud_analytics/lunin-dv/tmp", df)
    time.sleep(20)
    consumption_req = _make_current_consumption_req(new_columns, 
                                                    _make_columns_for_consumption_to_day, 
                                                    periods_in_days)
    consumption_df = lib.execute_query(consumption_req)
    df = pd.merge(df, consumption_df, on=["last_active_billing",
                                          'lead_source_description'], how='inner')
    assert set(df.columns) == set(df_raw.columns.tolist() +\
                                  new_columns)
    return df

In [14]:
def add_to_dataframe_consumption_after_experiment_reversed(df_raw, periods_in_days):
    ##############################################################################
    
    df = df_raw.copy()
    # Проверим, что есть нужные столбцы
    assert "billing_account_id" in df.columns
    assert "end_date_before_consumption" in df.columns
    assert "start_date_after_consumption" in df.columns
    
    # Столбцы для добавления
    new_columns = [f"after_reversed_consumption_{i}" for i in periods_in_days]

    lib.save_table("upsell_tmp", "//home/cloud_analytics/lunin-dv/tmp", df)
    time.sleep(20)
    consumption_req = _make_current_consumption_req(new_columns, 
                                                _make_columns_for_after_consumption_reversed, 
                                                    periods_in_days)
    consumption_df = lib.execute_query(consumption_req)
    df = pd.merge(df, consumption_df, on=["last_active_billing",
                                          'lead_source_description'], how='inner')
    assert set(df.columns) == set(df_raw.columns.tolist() +\
                                  new_columns)
    return df

In [15]:
meta_info = lib.MetaInformationClass(interested_columns=[])
meta_info.create_users_id()
res_df = meta_info.get_dataframe_with_grouped_information()

In [16]:
upsell_df = create_upsell_df(days_to_add_to_experiment=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pa

In [24]:
upsell_df =\
upsell_df.sort_values(by='end_date_before_consumption').groupby('billing_account_id').first().reset_index()

In [17]:
upsell_df = pd.merge(upsell_df, res_df, on='billing_account_id', how='left')

In [18]:
upsell_df['last_active_billing'] = upsell_df[['last_active_billing', 'billing_account_id']].apply(
    lambda row: row['last_active_billing'] if not pd.isnull(row['last_active_billing']) else
                row['billing_account_id'], axis=1
)

In [19]:
upsell_df = upsell_df[['lead_id', 
                       'billing_account_id', 
                       'last_state',
                       'end_date_before_consumption',
                       'lead_source_description',
                       'start_date_after_consumption',
                       'status',
                       'last_active_billing']]

In [20]:
upsell_grouped = upsell_df.groupby('last_active_billing')
row_array_new  = []
for bill, curr_df in upsell_grouped:
    for status in ["win", 'converted', 'disqualified', 'unreachible']:
        if len(curr_df[curr_df['status'] == status]) > 0:
            curr_df = curr_df[curr_df['status'] == status]
            curr_df = curr_df.sort_values(by=['end_date_before_consumption'])
            row_array_new.append(curr_df.iloc[0])

upsell_df = pd.DataFrame(row_array_new)

In [21]:
periods_in_days = [i for i in range(1, 121, 1)]
df = add_to_dataframe_consumption_by_days(upsell_df, periods_in_days)

In [22]:
periods_in_days_small = [i for i in range(1, 121, 1)]

In [23]:
periods_in_days_large = [i for i in range(1, 365, 1)]

In [24]:
df = add_to_dataframe_consumption_after_experiment_reversed(df, periods_in_days_large)

In [25]:
df = add_to_dataframe_difference_in_checks(
    df, periods_in_days_small,
    specific_columns=['before_consumption_14',
                      'after_consumption_30', 
                      'after_consumption_60', 
                      'after_consumption_90'])

In [26]:
def make_array(row, indexes, start_part):
    arr = []
    for ind in indexes:
        if (datetime.now() - pd.to_datetime(row["start_date_after_consumption"])
                      ).days > ind:
            arr.append(row[start_part + f"{ind}"])
        else:
            arr.append(None)
    return arr

In [27]:
df["consumption_difference_array"] = df.apply(lambda row: 
                        make_array(row,
                                   periods_in_days_small,
                                   "consumption_difference_period_"), axis=1)

In [28]:
def make_all_consumption_array(row, indexes):
    arr = []
    indexes = range(min(indexes), max(indexes))
    for ind in indexes:
        if (datetime.now() - pd.to_datetime(row["start_date_after_consumption"])
           ).days > ind and ind in indexes:
            mul_factor = (datetime.now() -
                          pd.to_datetime(row["start_date_after_consumption"])).days - ind
            arr.append(row[f"after_reversed_consumption_{ind}"] - 
                       (row["before_consumption_14"] * mul_factor / 14))
        else:
            arr.append(None)
    return arr

In [29]:
df["consumption_difference_array_all_days"] = df.apply(lambda row: 
                        make_all_consumption_array(row, periods_in_days_large), axis=1)

In [30]:
df["after_consumption_array"] = df.apply(lambda row: 
                        make_array(row, periods_in_days_small, 
                                   "after_consumption_in_"), axis=1)

In [31]:
df["before_consumption_array"] = df.apply(lambda row: 
                        make_array(row, periods_in_days_small, 
                                   "before_consumption_in_"), axis=1)

In [32]:
def concat(row):
    return list(reversed(row["before_consumption_array"])) +\
    [row["after_consumption_array"][0]] + row["after_consumption_array"]

In [33]:
df["consumption_array"] = df.apply(lambda row: concat(row), axis=1)

In [34]:
df["days_30_revenue"] = df["after_consumption_30"] - (df["before_consumption_14"] * 30 / 14)

df["days_60_revenue"] = df["after_consumption_60"] - (df["before_consumption_14"] * 60 / 14)

df["days_90_revenue"] = df["after_consumption_90"] - (df["before_consumption_14"] * 90 / 14)

In [35]:
res = df[['lead_id', 
       'billing_account_id', 
       'last_state',
       'end_date_before_consumption',
       'lead_source_description',
       'start_date_after_consumption',
       'status',
       'last_active_billing',
       'consumption_array',
       'consumption_difference_array_all_days',
       'consumption_difference_array',
       "days_30_revenue",
       'days_60_revenue',
       'days_90_revenue']]

In [36]:
lib.save_table("upsell_call",
               "//home/cloud_analytics/lunin-dv/dashboard_tables", res,
               schema={"consumption_difference_array":"list:double",
                       "consumption_array":"list:double",
                       "consumption_difference_array_all_days":"list:double"})

In [37]:
lib.save_table_from_yt_to_grafana(
    "//home/cloud_analytics/lunin-dv/dashboard_tables/upsell_call", 
    "cloud_analytics.upsell_call", sort_col="billing_account_id")

2020-10-05 12:46:31,237	INFO	Transfer task started: https://transfer-manager.yt.yandex-team.ru/task?id=25cdd219-faf0cd70-fc735f18-c5cd951b&tab=details&backend=production
