# Purpose 

This notebook creates the final dataset by filtering and merging the previously cleaned datasets. The final dataset is also split into train and test sets for modelling.

# Imports

In [9]:
from pprint import pprint
import os
import os.path as op
import shutil

# standard third party imports
import numpy as np
import pandas as pd
import random
import datetime

import matplotlib.pyplot as plt 
%matplotlib inline

import seaborn as sns
# Plot styling applied to the figures drawn with seaborn/matplotlib in this notebook.
sns.set_style('darkgrid')
sns.set_palette('viridis')
from sklearn.model_selection import train_test_split
# Make pandas treat +/-inf as missing values in its NA handling (isna, dropna, ...).
# NOTE(review): this option is deprecated in recent pandas releases — confirm against the pinned version.
pd.options.mode.use_inf_as_na = True

In [10]:
# Reload imported modules automatically before each cell runs, so edits to project code are picked up.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
import warnings

warnings.filterwarnings('ignore', message="The default value of regex will change from True to False in a future version.", 
                        category=FutureWarning)

In [12]:
# standard code-template imports
from ta_lib.core.api import (
    create_context, get_dataframe, get_feature_names_from_column_transformer, get_package_path,
    display_as_tabs, string_cleaning, merge_info, initialize_environment,
    list_datasets, load_dataset, save_dataset
)
import ta_lib.eda.api as eda

# Initialisation

In [13]:
# Build the project context from the YAML configuration and list the
# datasets registered in it.
config_path = op.join('conf', 'config.yml')
context = create_context(config_path)

available_datasets = list_datasets(context)
pprint(available_datasets)

['/raw/google_search_data',
 '/raw/product_manufacturer_list',
 '/raw/sales_data',
 '/raw/social_media_data',
 '/raw/theme_list',
 '/raw/theme_product_list',
 '/cleaned/google_search_data',
 '/cleaned/product_manufacturer_list',
 '/cleaned/sales_data',
 '/cleaned/social_media_data',
 '/cleaned/theme_list',
 '/cleaned/theme_product_list',
 '/train/features',
 '/train/target',
 '/test/features',
 '/test/target']


In [14]:
# Load the six cleaned datasets registered in the context.
g_search = load_dataset(context, 'cleaned/google_search_data')  # google search volume per theme and date
product = load_dataset(context, 'cleaned/product_manufacturer_list')  # product -> vendor lookup
sales = load_dataset(context, 'cleaned/sales_data')  # sales per product and date
media = load_dataset(context, 'cleaned/social_media_data')  # social media posts per theme and date
themes = load_dataset(context, 'cleaned/theme_list')  # claim_id -> claim_name lookup
theme_product = load_dataset(context, 'cleaned/theme_product_list')  # product -> claim_id mapping

# Dataset prep

All the datasets need to be filtered by the following criteria:

* Filter by 30 common themes
* Filter excess time data from google
* Filter by vendor

**To-do list**

* fix the date columns format in social media(✅) and google search(✅)
* filter by 30 common themes ✅
* filter by date (remove 2014 data from google search) ✅
* vendor

In [15]:
# first and last publication date covered by the social media data
media['published_date'].agg(['min', 'max'])

min   2015-05-21
max   2019-10-31
Name: published_date, dtype: datetime64[ns]

In [16]:
# first and last date covered by the google search data
g_search['date'].agg(['min', 'max'])

min   2014-01-01
max   2019-10-01
Name: date, dtype: datetime64[ns]

In [17]:
# first and last date covered by the sales data
sales['calender_key'].agg(['min', 'max'])

min   2016-01-09
max   2019-10-05
Name: calender_key, dtype: datetime64[ns]

## Get Common themes list

In [18]:
# Attach the theme name to every social media row. The left join keeps rows
# whose theme_id has no entry in the theme list (their claim columns are NaN).
media_merged = media.merge(
    themes,
    how='left',
    left_on='theme_id',
    right_on='claim_id',
)

print('Social Media data merged with themes:')
media_merged.head()

Social Media data merged with themes:


Unnamed: 0,theme_id,published_date,total_post,claim_id,claim_name
0,8,2015-05-21,104,8.0,low carb
1,8,2015-05-22,92,8.0,low carb
2,8,2015-05-23,111,8.0,low carb
3,8,2015-05-24,105,8.0,low carb
4,8,2015-05-25,106,8.0,low carb


In [19]:
# Attach the theme name to every google search row (left join on claim_id,
# so rows without a matching theme are kept).
g_search_merged = g_search.merge(themes, how='left', on='claim_id')

print('Google Search data merged with themes:')
g_search_merged.head()

Google Search data merged with themes:


Unnamed: 0,date,claim_id,platform,year_new,week_number,search_volume,claim_name
0,2014-01-01,8,google,2014,1,6613,low carb
1,2014-01-01,39,google,2014,1,181,highsource of protein
2,2014-01-01,75,google,2014,1,135,french bisque
3,2014-01-01,81,google,2014,1,1257,gmo free
4,2014-01-01,100,google,2014,1,2636,low calorie


In [20]:
# Map each sale to its theme(s): product -> claim_id via the theme/product
# list, then claim_id -> claim_name via the theme list. Both are left joins.
sales_merged = (
    sales
    .merge(theme_product, how='left', on='product_id')
    .merge(themes, how='left', on='claim_id')
)

print('Sales data merged with themes:')
sales_merged.head()

Sales data merged with themes:


Unnamed: 0,calender_key,product_id,sales_dollars_value,sales_units_value,sales_lbs_value,claim_id,claim_name
0,2016-01-09,1,13927,934,18680,0,No Claim
1,2016-01-09,3,10289,1592,28646,0,No Claim
2,2016-01-09,4,357,22,440,0,No Claim
3,2016-01-09,6,23113,2027,81088,0,No Claim
4,2016-01-09,7,23177,3231,58164,0,No Claim


In [21]:
# Themes that appear in all three sources.
# The merges that built these frames are left joins, so unmatched rows carry a
# NaN claim_name. Drop those first: a NaN shared by all three columns would
# otherwise end up in the intersection and let unmatched rows pass `.isin`.
theme_columns = (
    media_merged['claim_name'],
    g_search_merged['claim_name'],
    sales_merged['claim_name'],
)
common_themes = set.intersection(*(set(names.dropna()) for names in theme_columns))

print('Common themes across all datasets:')
common_themes

Common themes across all datasets:


{'beef hamburger',
 'blueberry',
 'bone health',
 'chicken',
 'convenience easytoprepare',
 'crab',
 'energyalertness',
 'ethical not specific',
 'ethical packaging',
 'ethnic & exotic',
 'french bisque',
 'gingerbread',
 'gmo free',
 'halal',
 'highsource of protein',
 'low calorie',
 'low carb',
 'low sodium',
 'low sugar',
 'mackerel',
 'no additivespreservatives',
 'nuts',
 'peach',
 'poultry',
 'prebiotic',
 'salmon',
 'sea salt',
 'soy foods',
 'tuna',
 'vegetarian'}

## Data filtering by common themes

In [22]:
# Keep only the social media rows whose theme is shared by all three sources.
media_in_common = media_merged['claim_name'].isin(common_themes)
media_filtered = media_merged.loc[media_in_common]

print('Social media data filtered by common themes:')
media_filtered.head()

Social media data filtered by common themes:


Unnamed: 0,theme_id,published_date,total_post,claim_id,claim_name
0,8,2015-05-21,104,8.0,low carb
1,8,2015-05-22,92,8.0,low carb
2,8,2015-05-23,111,8.0,low carb
3,8,2015-05-24,105,8.0,low carb
4,8,2015-05-25,106,8.0,low carb


In [23]:
# (rows, columns) of the social media data after the common-theme filter
media_filtered.shape

(48399, 5)

In [24]:
# Two filters on the google search data:
#   1. drop everything dated before the first social media post
#   2. keep only the themes shared by all three sources
first_media_date = media_filtered['published_date'].min()
g_search_filtered = g_search_merged[g_search_merged['date'] >= first_media_date]

g_search_in_common = g_search_filtered['claim_name'].isin(common_themes)
g_search_filtered = g_search_filtered[g_search_in_common]

print('Google search data filtered by common themes:')
g_search_filtered.head()

Google search data filtered by common themes:


Unnamed: 0,date,claim_id,platform,year_new,week_number,search_volume,claim_name
28531,2015-05-21,8,google,2015,21,10046,low carb
28532,2015-05-21,39,google,2015,21,181,highsource of protein
28533,2015-05-21,75,google,2015,21,1213,french bisque
28534,2015-05-21,81,google,2015,21,539,gmo free
28535,2015-05-21,100,google,2015,21,659,low calorie


In [25]:
# (rows, columns) of the google search data after the date and common-theme filters
g_search_filtered.shape

(33195, 7)

In [26]:
# Keep only the sales rows whose theme is shared by all three sources.
sales_in_common = sales_merged['claim_name'].isin(common_themes)
sales_filtered = sales_merged.loc[sales_in_common]

print('Sales data filtered by common themes:')
sales_filtered.head()

Sales data filtered by common themes:


Unnamed: 0,calender_key,product_id,sales_dollars_value,sales_units_value,sales_lbs_value,claim_id,claim_name
22,2016-01-09,37,1101,159,105,158,chicken
24,2016-01-09,75,362,58,29,227,salmon
91,2016-01-09,11323,16668,516,8252,430,soy foods
96,2016-01-09,11324,6926,575,2013,187,blueberry
103,2016-01-09,11325,17710,537,8592,187,blueberry


In [27]:
# (rows, columns) of the sales data after the common-theme filter
sales_filtered.shape

(3648793, 7)

In [28]:
# for comparison: (rows, columns) of the sales data after the theme joins, before filtering
sales_merged.shape

(7767420, 7)

In [29]:
# for comparison: (rows, columns) of the cleaned sales data before any join
# (the joined frame is larger — presumably because a product can map to several themes)
sales.shape

(4526182, 5)

## Resample the datasets by weekly granularity 

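Derive year and week columns from the dates of each dataset and aggregate to weekly granularity. To account for latency, social media dates are shifted forward by 3 weeks and google search dates by 1 week; sales dates are left unshifted.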

### 1. Social Media

In [30]:
# add 3 weeks to social media date
# The shifted dates are written to a derived frame rather than back into
# `media_filtered`, so re-running this cell does not add the offset again and
# no SettingWithCopyWarning is raised on the filtered slice.
media_dates = media_filtered['published_date'] + datetime.timedelta(days=21)

# generate year and week features with delayed dates
# Year and week both come from the ISO calendar. Pairing the ISO week with the
# calendar year (`dt.year`) mislabels days around New Year: 2018-12-31 is ISO
# week 1 of 2019, so it would be summed into (2018, 1) with early January 2018.
# Assumes there are no missing dates (the int cast fails on NaT) — TODO confirm.
media_iso = media_dates.dt.isocalendar()
media_shifted = media_filtered.assign(
    published_date=media_dates,
    week=media_iso['week'].astype('int64'),
    year=media_iso['year'].astype('int64'),
)

# groupby to obtain weekly granularity
media_weekly = media_shifted.groupby(['theme_id', 'year', 'week'], as_index=False).agg(
    total_post=('total_post', 'sum'))

media_weekly.head()

Unnamed: 0,theme_id,year,week,total_post
0,8,2015,24,412
1,8,2015,25,974
2,8,2015,26,1710
3,8,2015,27,1095
4,8,2015,28,899


### 2. Google search

In [31]:
# add 1 week to google search dates
# The shifted dates are written to a derived frame rather than back into
# `g_search_filtered`, so re-running this cell does not add the offset again
# and no SettingWithCopyWarning is raised on the filtered slice.
g_search_dates = g_search_filtered['date'] + datetime.timedelta(days=7)

# generate year and week features with delayed dates
# Year and week both come from the ISO calendar. Pairing the ISO week with the
# calendar year (`dt.year`) mislabels days around New Year: 2018-12-31 is ISO
# week 1 of 2019, so it would be summed into (2018, 1) with early January 2018.
# Assumes there are no missing dates (the int cast fails on NaT) — TODO confirm.
g_search_iso = g_search_dates.dt.isocalendar()
g_search_shifted = g_search_filtered.assign(
    date=g_search_dates,
    week=g_search_iso['week'].astype('int64'),
    year=g_search_iso['year'].astype('int64'),
)

# groupby to obtain weekly granularity
g_search_weekly = g_search_shifted.groupby(['claim_id', 'year', 'week'], as_index=False).agg(
    search_volume=('search_volume', 'sum'))

g_search_weekly.head()

Unnamed: 0,claim_id,year,week,search_volume
0,8,2015,22,29630
1,8,2015,23,41710
2,8,2015,24,55698
3,8,2015,25,43872
4,8,2015,26,56971


### 3. Sales

This will need extra filtering as it is required to create a dependent variable by aggregating sales of our client (vendor A) 

In [32]:
# merge with product dataset to get the vendor column
# The result gets its own name: rebinding `sales_filtered` to its own merge
# made the cell fail on a second run, because the repeated merge suffixes the
# column to vendor_x / vendor_y and `['vendor']` no longer exists.
sales_with_vendor = pd.merge(sales_filtered, product, on='product_id', how='inner', validate='m:1')

# filter to obtain only the client data (vendor A)
# Explicit copy so columns can be added to this subset without
# SettingWithCopyWarning.
sales_filtered_A = sales_with_vendor[sales_with_vendor['vendor'] == 'A'].copy()
sales_filtered_A.head()

Unnamed: 0,calender_key,product_id,sales_dollars_value,sales_units_value,sales_lbs_value,claim_id,claim_name,vendor
209,2016-01-09,11323,16668,516,8252,430,soy foods,A
210,2016-01-16,11323,16313,510,8160,430,soy foods,A
211,2016-01-23,11323,15627,488,7808,430,soy foods,A
212,2016-01-30,11323,16522,518,8288,430,soy foods,A
213,2016-02-06,11323,13164,349,5581,430,soy foods,A


In [33]:
# generate year and week features
# Year and week both come from the ISO calendar, so the pair always identifies
# one real week; pairing the ISO week with the calendar year (`dt.year`)
# mislabels dates around New Year (a late-December date in ISO week 1 would be
# filed under week 1 of the old year).
# `assign` returns a new frame, which avoids writing into a filtered slice.
# Assumes there are no missing dates (the int cast fails on NaT) — TODO confirm.
sales_iso = sales_filtered_A['calender_key'].dt.isocalendar()
sales_filtered_A = sales_filtered_A.assign(
    week=sales_iso['week'].astype('int64'),
    year=sales_iso['year'].astype('int64'),
)

In [34]:
# Sum the three sales measures per theme and week.
sales_value_columns = ['sales_dollars_value', 'sales_units_value', 'sales_lbs_value']
sales_weekly = (
    sales_filtered_A
    .groupby(['claim_id', 'year', 'week'], as_index=False)
    .agg(**{column: (column, 'sum') for column in sales_value_columns})
)

sales_weekly.head()

Unnamed: 0,claim_id,year,week,sales_dollars_value,sales_units_value,sales_lbs_value
0,8,2016,1,8853853,1952575,2947044
1,8,2016,2,8705721,1921171,2901409
2,8,2016,3,8839233,1968251,2954048
3,8,2016,4,8872568,1974980,2941180
4,8,2016,5,9166248,2092314,3012709


# Data Merging

In [35]:
# Join weekly social media and google search rows on theme, year and week.
# `validate='1:1'` makes pandas raise if either side has duplicate keys.
# theme_id duplicates claim_id after the join, so it is dropped.
social_google = (
    media_weekly
    .merge(
        g_search_weekly,
        how='inner',
        left_on=['theme_id', 'year', 'week'],
        right_on=['claim_id', 'year', 'week'],
        validate='1:1',
    )
    .drop(columns=['theme_id'])
)
social_google.head()

Unnamed: 0,year,week,total_post,claim_id,search_volume
0,2015,24,412,8,55698
1,2015,25,974,8,43872
2,2015,26,1710,8,56971
3,2015,27,1095,8,45272
4,2015,28,899,8,47687


In [36]:
# Add the weekly sales figures; the inner join keeps only theme/week pairs
# present on both sides, and `validate='1:1'` guards against duplicate keys.
social_google_sales = social_google.merge(
    sales_weekly,
    how='inner',
    on=['claim_id', 'year', 'week'],
    validate='1:1',
)

In [37]:
# preview of the merged weekly data (social media + google search + sales)
print('Final Data after merging:')
social_google_sales.head()

Final Data after merging:


Unnamed: 0,year,week,total_post,claim_id,search_volume,sales_dollars_value,sales_units_value,sales_lbs_value
0,2016,1,983,8,51883,8853853,1952575,2947044
1,2016,2,726,8,51122,8705721,1921171,2901409
2,2016,3,873,8,46415,8839233,1968251,2954048
3,2016,4,1123,8,53664,8872568,1974980,2941180
4,2016,5,1509,8,47560,9166248,2092314,3012709


# Feature Engineering

In [38]:
def weight_per_unit(df):
    """Replace the unit and weight totals with an average weight per unit.

    Adds ``weight_per_unit`` (``sales_lbs_value / sales_units_value``, rounded
    to 2 decimals) and drops the two source columns.

    Rows with zero units would produce +/-inf; these are stored as NaN so the
    result does not depend on the global ``use_inf_as_na`` pandas option.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``sales_lbs_value`` and ``sales_units_value``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; it is modified in place.

    Raises
    ------
    KeyError
        If either source column is missing (e.g. on a second call with the
        same frame).
    """
    ratio = df['sales_lbs_value'] / df['sales_units_value']
    # division by zero yields +/-inf; treat it as missing
    ratio = ratio.replace([np.inf, -np.inf], np.nan)
    df['weight_per_unit'] = ratio.round(2)
    df.drop(['sales_units_value', 'sales_lbs_value'], axis = 1, inplace = True)
    return df

# weight_per_unit modifies social_google_sales in place (it drops the unit and
# lbs columns), so re-running this cell without rebuilding the frame raises KeyError.
weight_per_unit(social_google_sales).head()

Unnamed: 0,year,week,total_post,claim_id,search_volume,sales_dollars_value,weight_per_unit
0,2016,1,983,8,51883,8853853,1.51
1,2016,2,726,8,51122,8705721,1.51
2,2016,3,873,8,46415,8839233,1.5
3,2016,4,1123,8,53664,8872568,1.49
4,2016,5,1509,8,47560,9166248,1.44


# Generate Train and Test sets

In [39]:
def binned_selling_price(df):
    """Return the decile bin of ``sales_dollars_value`` for every row.

    The column is cut into 10 quantile-based intervals with ``pd.qcut``; the
    result is a categorical Series aligned with ``df``.
    """
    sales_dollars = df["sales_dollars_value"]
    decile_bins = pd.qcut(sales_dollars, q=10)
    return decile_bins

In [40]:
from sklearn.model_selection import StratifiedShuffleSplit
from ta_lib.core.api import custom_train_test_split  # helper function to customize splitting

# Single shuffled 80/20 split, seeded from the project context for reproducibility.
# `binned_selling_price` is handed over as the `by` argument — presumably the
# helper stratifies on the bins it returns; verify against ta_lib.
splitter = StratifiedShuffleSplit(
    n_splits=1,
    test_size=0.2,
    random_state=context.random_seed,
)
train, test = custom_train_test_split(
    social_google_sales,
    splitter,
    by=binned_selling_price,
)

In [41]:
target_col = "sales_dollars_value"

# Separate the target column from the features of the training partition and
# persist both parts.
# NOTE(review): `get_features_targets` is not a stock pandas method — presumably
# an accessor registered by ta_lib; confirm.
train_X, train_y = train.get_features_targets(target_column_names=target_col)
save_dataset(context, train_X, 'train/features')
save_dataset(context, train_y, 'train/target')

# Same for the test partition.
test_X, test_y = test.get_features_targets(target_column_names=target_col)
save_dataset(context, test_X, 'test/features')
save_dataset(context, test_y, 'test/target')