In [12]:
import pandas as pd
import os

In [13]:
from process_data import MarketShareProcess, MarketEventProcess
from connect_data import DataFrameValidator, read_json_file, read_csv_file

## Settings: data paths, schemas and window weights

In [14]:
# All inputs and outputs hang off one relative data root.
data_root = "../data"

# Landing zone: a folder of raw sales files and a single CRM export.
sales_data_path = f"{data_root}/landing/sales/"
crm_data_path = f"{data_root}/landing/crm/crm_data.csv"

# Destination of the merged market-share / event table.
sales_crm_data_path = f"{data_root}/production/sales_crm/market_share_event_sum.csv"

In [15]:
# Canonical product name -> raw spellings that should be folded into it.
# The canonical names (the keys) are also the only accepted choices.
# NOTE(review): how "value_mapping" is applied lives in DataFrameValidator,
# which is not visible here -- confirm against connect_data.
product_name_aliases = {
    "Globberin": ["Globbrin", " Globberin"],
    "Vorbulon": ["vorbulon."],
    "Snaffleflax": ["Snafulopromazide-b (Snaffleflax)"],
    "Beeblizox": ["Beebliz%C3%B6x"],
}

# Event types accepted in the CRM export.
crm_event_types = ["f2f", "group call", "workplace event"]

# Column name -> validation rule for the sales files.
sales_data_schema = {
    "acct_id": {"type": str},
    "product_name": {
        "type": str,
        "choices": list(product_name_aliases),
        "value_mapping": product_name_aliases,
    },
    "date": {"type": "date"},
    "unit_sales": {"type": int},
    "created_at": {"type": "date"},
}

# Column name -> validation rule for the CRM export.
crm_data_schema = {
    "acct_id": {"type": str},
    "event_type": {"type": str, "choices": crm_event_types},
    # NOTE(review): typed as str here while the sales "date" column uses the
    # "date" type -- confirm this difference is intentional.
    "date": {"type": str},
}

In [16]:
# Rolling-window size -> per-month weights, one mapping per process.
# Sales windows carry no weights; the market-share table rendered later in this
# notebook has only *_avg_* columns.
sales_window_size_to_weights = {window_size: [] for window_size in (2, 3)}

# Judging by the rendered event table, the weights run oldest month first
# (window 2 for 2019-02: 0.3 * 113 + 0.7 * 102 = 105.3).
crm_window_size_to_weights = {
    2: [0.3, 0.7],
    3: [0.25, 0.25, 0.5],
}

## Read data and validate against schema

In [17]:
validator = DataFrameValidator()

# os.listdir returns entries in arbitrary order and also lists sub-directories
# and dotfiles (e.g. .ipynb_checkpoints, .DS_Store). Sort the names so the
# concatenated row order is reproducible, and keep only visible regular files.
sales_file_paths = [
    os.path.join(sales_data_path, file_name)
    for file_name in sorted(os.listdir(sales_data_path))
    if not file_name.startswith(".")
]
sales_file_paths = [path for path in sales_file_paths if os.path.isfile(path)]

# Read and validate each file against the sales schema, one frame per file.
sales_frames = [
    validator.validate(sales_data_schema, read_json_file(path))
    for path in sales_file_paths
]

# pd.concat on an empty list raises a generic ValueError; fail with the folder
# name instead so a wrong path is obvious.
if not sales_frames:
    raise ValueError(f"no sales files found in {sales_data_path!r}")

# Stack the per-file frames row-wise; original row labels are kept as before.
sales_data = pd.concat(sales_frames, axis=0)

In [18]:
# Sanity check on the concatenated sales frame: True if any row is an exact
# duplicate of an earlier row (all columns compared).
sales_data.duplicated().any()

False

In [19]:
# Load the CRM export, then check it against the CRM schema with the same
# validator instance used for the sales files.
crm_raw = read_csv_file(crm_data_path)
crm_data = validator.validate(crm_data_schema, crm_raw)

## market share

In [20]:
# Configure the processor with the sales window settings, then derive the
# market-share table from the validated sales rows.
market_share_process = MarketShareProcess(sales_window_size_to_weights)
market_share = market_share_process.process(sales_data)
# Bare last expression so the notebook renders the resulting table.
market_share

Unnamed: 0_level_0,market_share,lagged_1_month_avg_market_share,lagged_2_month_avg_market_share
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-01-01,0.23,,
2019-02-01,0.3,0.26,
2019-03-01,0.23,0.26,0.25
2019-04-01,0.34,0.28,0.29
2019-05-01,0.25,0.3,0.27
2019-06-01,0.22,0.23,0.27
2019-07-01,0.19,0.2,0.22
2019-08-01,0.2,0.19,0.2
2019-09-01,0.22,0.21,0.2
2019-10-01,0.18,0.2,0.2


## sum of events

In [21]:
# Configure the processor with the CRM window weights, then derive the
# per-month event table from the validated CRM rows.
market_event_process = MarketEventProcess(crm_window_size_to_weights)
event_data = market_event_process.process(crm_data)
# Bare last expression so the notebook renders the resulting table.
event_data

Unnamed: 0_level_0,f2f,group call,workplace event,event_count,lagged_1_month_sum_events,lagged_2_month_sum_events,lagged_1_month_weighted_sum_events,lagged_2_month_weighted_sum_events
event_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-01-01,36,36,41,113,,,,
2019-02-01,47,29,26,102,107.5,,105.3,
2019-03-01,42,32,51,125,113.5,113.33,118.1,116.25
2019-04-01,41,35,42,118,121.5,115.0,120.1,115.75
2019-05-01,45,45,43,133,125.5,125.33,128.5,127.25
2019-06-01,34,37,32,103,118.0,118.0,112.0,114.25
2019-07-01,46,49,39,134,118.5,123.33,124.7,126.0
2019-08-01,40,42,36,118,126.0,118.33,122.8,118.25
2019-09-01,44,39,38,121,119.5,124.33,120.1,123.5
2019-10-01,41,34,38,113,117.0,117.33,115.4,116.25


## merge data

In [22]:
# Align the market-share and event tables on their index; the outer join keeps
# rows whose index value appears in only one of the two tables.
data = pd.merge(
    market_share, event_data, how="outer", left_index=True, right_index=True
)
# Move the joined index into an ordinary column.
data = data.reset_index(level=0)

# to_csv does not create missing parent directories, so make sure the output
# folder exists first (no-op when it is already there). The guard covers a
# bare file name, for which dirname is "" and makedirs would fail.
output_dir = os.path.dirname(sales_crm_data_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# NOTE(review): the default index=True also writes the fresh 0..n-1 row index
# as an unnamed first column -- confirm downstream readers expect it.
data.to_csv(sales_crm_data_path)