In [1]:
import sys
sys.path.append('..')
from src.utilities import *

from src.features.sales_feature_extraction import extract_sales_features
from src.features.region_feature_extraction import extract_region_features
from src.features.activity_feature_extraction import extract_activity_features
from src.features.covid_feature_extraction import extract_covid_features

## 2. Master creation

### 2.1 Read necessary raw tables

In [2]:
# Load the input tables used to build the master table.
# The first five files are read from `raw_path`, in the order listed.
(regions_data, regions_hcps, hcps, sales_train, activity_data) = (
    pd.read_csv(os.path.join(raw_path, csv_name))
    for csv_name in (
        'regions.csv',
        'regions_hcps.csv',
        'hcps.csv',
        'sales_train.csv',
        'activity.csv',
    )
)

# Prophet features are read from `interim_path` rather than `raw_path`.
prophet_features = pd.read_csv(os.path.join(interim_path, 'prophet_features.csv'))

# COVID data source: https://github.com/owid/covid-19-data/tree/master/public/data
covid_data = pd.read_csv(os.path.join(raw_path, 'owid-covid-data.csv'))

### 2.2 Define cross join of month - region - brand

In [3]:
# Key values for the master grid: months and regions are the distinct values
# observed in sales_train; the two brands are listed explicitly.
months = pd.DataFrame({'month': sales_train.month.unique()})
regions = pd.DataFrame({'region': sales_train.region.unique()})
brands = pd.DataFrame({'brand': ['brand_1', 'brand_2']})

# Cartesian product of month x region x brand. from_product varies the last
# level fastest, so rows are ordered month -> region -> brand, and no
# throwaway join column has to be added to (and later dropped from) the
# helper frames.
key_columns = ['month', 'region', 'brand']
master = pd.MultiIndex.from_product(
    [months['month'], regions['region'], brands['brand']],
    names = key_columns,
).to_frame(index = False)

# Left join keeps every grid row; combinations absent from sales_train end up
# with NaN in the columns that come from sales_train.
master = master.merge(sales_train, how = 'left', on = key_columns)
master.head()

Unnamed: 0,month,region,brand,sales
0,2020-01,region_0,brand_1,0.0
1,2020-01,region_0,brand_2,0.0
2,2020-01,region_1,brand_1,0.0
3,2020-01,region_1,brand_2,0.0
4,2020-01,region_2,brand_1,0.0


### 2.3 Append features

In [4]:
# Build one feature table per raw source.
sales_features = extract_sales_features(sales_train)
region_features = extract_region_features(regions_data, regions_hcps, hcps)
# Missing values in the activity features are replaced with 0.
activity_features = extract_activity_features(activity_data).fillna(0)
covid_features = extract_covid_features(covid_data)

# Add the integer month indicator (first four characters of `month` * 100 plus
# its last two characters), then left-join every feature table onto the grid.
# NOTE(review): the left joins assume each feature table has at most one row
# per join key; duplicated keys would multiply master rows -- TODO confirm.
master = (
    master
    .assign(
        month_indicator = lambda frame: (
            frame.month.str[:4].astype('int64') * 100
            + frame.month.str[-2:].astype('int64')
        )
    )
    .merge(sales_features, how = 'left', on = ['month', 'region', 'brand'])
    .merge(region_features, how = 'left', on = 'region')
    .merge(covid_features, how = 'left', on = 'month')
    .merge(activity_features, how = 'left', on = ['month', 'region', 'brand'])
    .merge(prophet_features, how = 'left', on = ['month', 'region'])
)

### 2.4 Save master table

In [6]:
# Write the assembled master table to `processed_path` as CSV, leaving the
# DataFrame index out of the file.
master_csv_path = os.path.join(processed_path, 'master.csv')
master.to_csv(master_csv_path, index = False)