In [1]:
# Enable the autoreload extension in mode 2 so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from ta_lib.core.api import load_yml,create_context
from ta_lib.data_utils import offline_constants
from ta_lib.data_utils.data_processing.offline.baseprice import get_base_price
import os



In [3]:
import pandas as pd
import numpy as np
import os
import gc
import datetime
import time
from ta_lib.core.api import _change_permissions_recursive

In [4]:
# Build the context object (`cfg`) from the YAML config file. The path is
# relative, so it resolves against the notebook's working directory.
config_path = "../conf/config.yml"
cfg = create_context(config_path)

In [5]:
# Input locations: each template stored in the `offline` config section is
# formatted with the values of that same section.
offline_input_path = cfg.offline["OFFLINE_RAW"].format(**cfg.offline)
digital_input_path = cfg.offline["DIGITAL_TRNS"].format(**cfg.offline)

# Every entry found directly under the raw offline input directory, in
# descending name order.
file_list = sorted(os.listdir(offline_input_path), reverse=True)

# Columns the revenue is aggregated by and later merged with the units on.
grouper_cols = ["pos_busn_dt"]

In [7]:
def _month_revenue(month_dir):
    """Aggregate one raw monthly partition to revenue per group and channel.

    Reads ``offline_input_path + month_dir + '/data.parquet'``, rolls combo
    amounts up onto the combo parent row, flags orders that also appear in the
    digital transactions of the same partition, and sums ``pos_itm_grss_am``
    by ``grouper_cols`` + ``digital_flag``.

    Relies on the notebook globals ``offline_input_path``,
    ``digital_input_path`` and ``grouper_cols``.

    Parameters
    ----------
    month_dir : str
        Name of the partition directory (an entry of ``file_list``).

    Returns
    -------
    pandas.DataFrame
        Columns: ``grouper_cols``, ``digital_flag`` and ``pos_itm_grss_am``.
    """
    trn = pd.read_parquet(offline_input_path + month_dir + '/data.parquet')
    trn['sld_menu_itm_id'] = trn['sld_menu_itm_id'].astype('int')

    # Rows whose combo-parent id is -1 are kept as they are; every other row is
    # treated as part of a combo.
    standalone = trn['cmbo_pren_sld_menu_itm_id'] == -1
    alc_df = trn[standalone]
    cmbo_df = trn[~standalone].copy()

    # Every combo row receives the summed amount of its
    # (date, location, order, parent item) group; afterwards only the rows
    # whose own item id equals the parent id are kept.
    combo_keys = ['pos_busn_dt',
                  'mcd_gbal_lcat_id_nu',
                  'pos_ord_nu',
                  'cmbo_pren_sld_menu_itm_id']
    cmbo_df['pos_itm_grss_am'] = (
        cmbo_df.groupby(combo_keys)['pos_itm_grss_am'].transform('sum')
    )
    cmbo_df = cmbo_df[cmbo_df['cmbo_pren_sld_menu_itm_id'] == cmbo_df['sld_menu_itm_id']]
    trn = pd.concat([alc_df, cmbo_df], axis=0).reset_index(drop=True)

    # Flag the orders that are present in the digital transaction data, when a
    # digital partition with the same name exists.
    digital_dir = digital_input_path + month_dir + '/'
    if os.path.exists(digital_dir):
        # NOTE(review): only part-0.parquet is read - confirm the digital dump
        # always consists of a single part file.
        digital_orders = (
            pd.read_parquet(digital_dir + 'part-0.parquet')[['pos_ord_nu', 'mcd_gbal_lcat_id_nu']]
            # One row per (order, location): duplicated keys would multiply
            # the transaction rows in the left merge and inflate revenue.
            .drop_duplicates()
            .assign(digital_flag=1)
        )
        trn = trn.merge(digital_orders, on=['mcd_gbal_lcat_id_nu', 'pos_ord_nu'], how='left')
        # Unmatched orders are non-digital. Assign the result instead of
        # filling in place on an attribute-accessed column.
        trn['digital_flag'] = trn['digital_flag'].fillna(0)
    else:
        # No digital data for this partition: everything is non-digital.
        trn['digital_flag'] = 0.0

    return trn.groupby(grouper_cols + ['digital_flag']).pos_itm_grss_am.sum().reset_index()


def _write_channel_revenue(cfg, df_rev, type_, flag_value):
    """Merge one channel's revenue with its units and write it as parquet.

    Parameters
    ----------
    cfg : context object
        Must expose ``cfg.offline`` with the ``UNITS`` and ``REV`` templates.
    df_rev : pandas.DataFrame
        Aggregated revenue holding ``digital_flag`` and ``pos_itm_grss_am``.
    type_ : str
        Value substituted for ``type_`` in both path templates.
    flag_value : int
        ``digital_flag`` value that selects the rows of this channel.

    Raises
    ------
    KeyError
        If the merged frame has neither a ``quantity`` nor a
        ``pos_itm_tot_qt`` column.
    """
    units_path = cfg.offline["UNITS"].format(**cfg.offline, type_=type_)
    output_path = cfg.offline["REV"].format(**cfg.offline, type_=type_)

    df = df_rev[df_rev.digital_flag == flag_value]
    df_units = pd.read_parquet(units_path)
    df = df.merge(df_units, on=grouper_cols, how="left")

    # The units data seen in this notebook names its quantity column either
    # "quantity" or "pos_itm_tot_qt"; accept both.
    qty_col = next((c for c in ("quantity", "pos_itm_tot_qt") if c in df.columns), None)
    if qty_col is None:
        raise KeyError(
            "units data at %r has neither a 'quantity' nor a 'pos_itm_tot_qt' column" % units_path
        )
    # The revenue column produced by the aggregation is pos_itm_grss_am
    # (there is no pos_itm_grss_rev column in the aggregated frame).
    df["check_value"] = df["pos_itm_grss_am"] / df[qty_col]

    # create dir if doesn't exist
    os.makedirs(output_path, exist_ok=True)
    df.to_parquet(output_path, partition_cols=['sld_menu_itm_id', 'monthid'])
    _change_permissions_recursive(output_path, 0o777)


def get_rev_from_trn(cfg, grouping=True, result_filter='offline'):
    """Compute revenue from the raw transactions and write it per channel.

    Every entry of the notebook-global ``file_list`` is aggregated to revenue
    per ``grouper_cols`` and ``digital_flag``. The combined result is then
    split by channel, merged with the matching units data, extended with a
    ``check_value`` column (revenue divided by quantity) and written as a
    partitioned parquet dataset.

    Parameters
    ----------
    cfg : context object
        Must expose ``cfg.offline`` with the ``UNITS`` and ``REV`` templates.
    grouping : bool, optional
        Currently unused; kept so existing calls keep working.
    result_filter : str or container of str, optional
        Selects the outputs to write. It is tested with ``in``, so
        ``'offline'``, ``'digital'`` or a value containing both are accepted.

    Returns
    -------
    None
        The results are written to the configured ``REV`` locations.

    Raises
    ------
    ValueError
        If ``file_list`` is empty, i.e. there is nothing to aggregate.
    """
    # Collect the monthly frames and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the accumulated frame every time.
    monthly_frames = []
    for file in file_list:
        print(file)
        st = time.time()
        monthly_frames.append(_month_revenue(file))
        print(time.time() - st)

    if not monthly_frames:
        raise ValueError("no raw partitions found under %r" % offline_input_path)
    df_rev = pd.concat(monthly_frames, ignore_index=True)

    if "offline" in result_filter:
        _write_channel_revenue(cfg, df_rev, "offline", 0)
    if "digital" in result_filter:
        _write_channel_revenue(cfg, df_rev, "digital", 1)

In [29]:
# Scratch setup with hard-coded locations: selects the digital_flag == 0 rows
# of the aggregated revenue and loads the day-level units data. The merge and
# write steps that used to be commented out here are carried out by later
# cells of the notebook.
# NOTE(review): both paths point at the "digital" data while the filter keeps
# digital_flag == 0 rows - confirm which channel is intended. Consider taking
# the paths from cfg instead of absolute literals.
units_path = "/opt/sasdata/dev/PricingEngines/DiscountEngines/Russia/data/processed/digital/day_level/"
output_path = "/opt/sasdata/dev/PricingEngines/DiscountEngines/Russia/data/processed/digital/grouped/day_level_rev/"
df = df_rev[df_rev.digital_flag == 0]
df_units = pd.read_parquet(units_path)

In [6]:
# Aggregate every raw monthly partition to revenue per grouper_cols and
# digital_flag. The monthly frames are collected in a list and concatenated
# once at the end: DataFrame.append was removed in pandas 2.0 and re-copied the
# accumulated frame on every iteration.
monthly_rev = []
for file in file_list:
    print(file)
    # Loading dumped GDW data
    st = time.time()
    df = pd.read_parquet(offline_input_path + file + '/data.parquet')
    df['sld_menu_itm_id'] = df['sld_menu_itm_id'].astype('int')

    # compute Rev
    # Rows whose combo-parent id is -1 are kept as they are; every other row is
    # treated as part of a combo.
    standalone = df['cmbo_pren_sld_menu_itm_id'] == -1
    alc_df = df[standalone]
    cmbo_df = df[~standalone].copy()
    # Every combo row receives the summed amount of its
    # (date, location, order, parent item) group; afterwards only the rows
    # whose own item id equals the parent id are kept.
    combo_keys = ['pos_busn_dt',
                  'mcd_gbal_lcat_id_nu',
                  'pos_ord_nu',
                  'cmbo_pren_sld_menu_itm_id']
    cmbo_df['pos_itm_grss_am'] = cmbo_df.groupby(combo_keys)['pos_itm_grss_am'].transform('sum')
    cmbo_df = cmbo_df[cmbo_df['cmbo_pren_sld_menu_itm_id'] == cmbo_df['sld_menu_itm_id']]
    df = pd.concat([alc_df, cmbo_df], axis=0).reset_index(drop=True)

    # Loading digital trn data if exist and flagging the matching orders
    digital_dir = digital_input_path + file + '/'
    if os.path.exists(digital_dir):
        # NOTE(review): only part-0.parquet is read - confirm the digital dump
        # always consists of a single part file.
        dig_df = (
            pd.read_parquet(digital_dir + 'part-0.parquet')[['pos_ord_nu', 'mcd_gbal_lcat_id_nu']]
            # One row per (order, location): duplicated keys would multiply
            # the transaction rows in the left merge and inflate revenue.
            .drop_duplicates()
            .assign(digital_flag=1)
        )

        # Merging digital data; orders without a match are non-digital
        df = df.merge(dig_df, on=['mcd_gbal_lcat_id_nu', 'pos_ord_nu'], how='left')
        df['digital_flag'] = df['digital_flag'].fillna(0)

        # Clearing space
        del dig_df
    else:
        # No digital data for this partition: everything is non-digital.
        # (np.NaN followed by an in-place fillna is avoided: the alias was
        # removed in NumPy 2.0 and the in-place fill acted on a column accessor.)
        df['digital_flag'] = 0.0

    # Sum the gross amount per grouping columns and digital flag
    df = df.groupby(grouper_cols + ["digital_flag"]).pos_itm_grss_am.sum().reset_index()
    monthly_rev.append(df)
    print(time.time() - st)

# An empty file_list still yields an empty frame, as before.
df_rev = pd.concat(monthly_rev, ignore_index=True) if monthly_rev else pd.DataFrame()

month_id=20210801
181.89301133155823
month_id=20210701
166.51798748970032
month_id=20210601
162.0581967830658
month_id=20210501
170.3772087097168
month_id=20210401
154.07703304290771
month_id=20210301
146.6512393951416
month_id=20210201
118.29081273078918
month_id=20210101
125.8896255493164
month_id=20201201
124.26907205581665
month_id=20201101
112.75923180580139
month_id=20201001
218.2705602645874
month_id=20200901
135.26359605789185
month_id=20200801
129.9920370578766
month_id=20200701
116.09629845619202
month_id=20200601
95.25340986251831
month_id=20200501
67.5096127986908
month_id=20200401
48.61569356918335
month_id=20200301
100.65328454971313
month_id=20200201
60.59633541107178
month_id=20200101
63.32029581069946
month_id=20191201
76.04257607460022
month_id=20191101
68.09688448905945
month_id=20191001
73.1204514503479
month_id=20190901
77.45056676864624
month_id=20190801
81.33086442947388
month_id=20190701
75.85873699188232
month_id=20190601
76.14026856422424
month_id=20190501
73.

In [22]:
# Keep only the day-level unit columns and derive a "YYYYMM" string month id
# from the business date. Selecting and assigning in a single expression
# produces a fresh frame, so no column is set on a slice of the original
# (which can raise SettingWithCopyWarning).
df_units = df_units[["pos_busn_dt", "quantity", "num_trans"]].assign(
    monthid=lambda units: units["pos_busn_dt"].dt.strftime("%Y%m")
)

In [30]:
# Preview the first rows of the units frame.
df_units.head()

Unnamed: 0,pos_busn_dt,quantity,num_trans,monthid
0,2020-03-12,4,1,202003
1,2020-03-18,4,1,202003
2,2020-03-19,4,1,202003
3,2020-03-20,40436,11035,202003
4,2020-03-21,147642,39812,202003


In [31]:
# Show the path the revenue output is written to.
output_path

'/opt/sasdata/dev/PricingEngines/DiscountEngines/Russia/data/processed/digital/grouped/day_level_rev/'

In [32]:
# Keep every aggregated revenue row whose digital_flag is non-zero.
df = df_rev.loc[df_rev["digital_flag"] != 0]

In [33]:
# Left-join the units onto the revenue rows by the grouping columns, then
# derive revenue per unit as a sanity-check value.
# NOTE(review): this cell reassigns `df`, so running it twice merges the units
# a second time - re-run the filtering cell first.
df = df.merge(df_units, how="left", on=grouper_cols)
df = df.assign(check_value=df["pos_itm_grss_am"] / df["quantity"])

# Ensure the target directory exists, write the frame partitioned by monthid,
# then apply mode 0o777 through the project permission helper.
os.makedirs(output_path, exist_ok=True)
df.to_parquet(output_path, partition_cols=["monthid"])
_change_permissions_recursive(output_path, 0o777)

In [28]:
# Resolve the digital units / revenue locations from the config.
units_path = cfg.offline["UNITS"].format(**cfg.offline, type_="digital")
output_path = cfg.offline["REV"].format(**cfg.offline, type_="digital")
# Orders found in the digital transactions carry digital_flag == 1. The
# previous `== 0` filter selected the non-digital rows, which would then have
# been written under the digital output path.
df = df_rev[df_rev.digital_flag == 1]
df_units = pd.read_parquet(units_path)

In [29]:
# Preview the first rows of the units frame that was just loaded.
df_units.head()

Unnamed: 0_level_0,pos_busn_dt,pos_itm_tot_qt,gc,sld_menu_itm_id,monthid
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
12471,2020-08-14,1,1,10000106,202008
12472,2020-08-15,2,2,10000106,202008
12473,2020-08-16,1,1,10000106,202008
12474,2020-08-20,2,1,10000106,202008
12699,2020-09-12,1,1,10000106,202009


In [30]:
# Show the path the revenue output is written to.
output_path

'/opt/sasdata/dev/PricingEngines/DiscountEngines/Russia/data/processed/digital/ungrouped_data/item_level_rev/'

In [31]:
# Discard the units frame's existing index, then left-join the units onto the
# revenue rows by the grouping columns.
# NOTE(review): this cell reassigns `df`, so running it twice merges the units
# a second time.
df_units = df_units.reset_index(drop=True)
df = df.merge(df_units, how="left", on=grouper_cols)

In [32]:
# Revenue divided by the total quantity, kept as a sanity-check value.
df = df.assign(check_value=df["pos_itm_grss_am"] / df["pos_itm_tot_qt"])

# Ensure the target directory exists, write the frame partitioned by item id
# and monthid, then apply mode 0o777 through the project permission helper.
os.makedirs(output_path, exist_ok=True)
df.to_parquet(output_path, partition_cols=["sld_menu_itm_id", "monthid"])
_change_permissions_recursive(output_path, 0o777)