# Imports

In [32]:
from pprint import pprint
import os
import os.path as op
import shutil

# standard third party imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# Global pandas option: treat +/-inf as missing values in NA checks and NA-handling methods.
# NOTE(review): this option is deprecated in newer pandas releases — confirm against the pinned pandas version.
pd.options.mode.use_inf_as_na = True

In [33]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [34]:
import warnings

warnings.filterwarnings('ignore', message="The default value of regex will change from True to False in a future version.", 
                        category=FutureWarning)

In [35]:
# standard code-template imports
from ta_lib.core.api import (
    create_context, get_dataframe, get_feature_names_from_column_transformer, get_package_path,
    display_as_tabs, string_cleaning, merge_info, initialize_environment,
    list_datasets, load_dataset, save_dataset
)
import ta_lib.eda.api as eda

In [36]:
# Set up the notebook environment through the ta_lib helper, with debug mode off
# and warnings hidden (per the keyword arguments passed here).
initialize_environment(debug=False, hide_warnings=True)

# Initialisation

In [37]:
# Build the project context from the YAML config.
# The path is relative, so it resolves against the current working directory.
config_path = op.join('conf', 'config.yml')
context = create_context(config_path)
# Show the dataset keys that list_datasets reports for this context.
pprint(list_datasets(context))

['/raw/google_search_data',
 '/raw/product_manufacturer_list',
 '/raw/sales_data',
 '/raw/social_media_data',
 '/raw/theme_list',
 '/raw/theme_product_list',
 '/cleaned/google_search_data',
 '/cleaned/product_manufacturer_list',
 '/cleaned/sales_data',
 '/cleaned/social_media_data',
 '/cleaned/theme_list',
 '/cleaned/theme_product_list']


In [38]:
# Load the six raw tables that the cleaning sections below work on.
# NOTE(review): these keys omit the leading '/' that list_datasets prints —
# presumably both forms resolve to the same dataset; confirm.
g_search = load_dataset(context, 'raw/google_search_data')
product = load_dataset(context, 'raw/product_manufacturer_list')
sales = load_dataset(context, 'raw/sales_data')
media = load_dataset(context, 'raw/social_media_data')
themes = load_dataset(context, 'raw/theme_list')
theme_product = load_dataset(context, 'raw/theme_product_list')

# Data Cleaning

### Google search

From discovery, it is known that:
* key columns: None
* integer columns: searchVolume, Claim_ID, week_number, year_new
* datetime columns: date

In [39]:
# Preview the raw Google search rows (dates appear as dd-mm-YYYY strings).
g_search.head()

Unnamed: 0,date,platform,searchVolume,Claim_ID,week_number,year_new
0,05-01-2014,google,349,916,1,2014
1,06-01-2014,google,349,916,2,2014
2,07-01-2014,google,697,916,2,2014
3,10-01-2014,google,349,916,2,2014
4,20-01-2014,google,697,916,4,2014


In [40]:
# Check dtypes and null counts; `date` is still an object column at this point.
g_search.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181565 entries, 0 to 181564
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   date          181565 non-null  object
 1   platform      181565 non-null  object
 2   searchVolume  181565 non-null  int64 
 3   Claim_ID      181565 non-null  int64 
 4   week_number   181565 non-null  int64 
 5   year_new      181565 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 8.3+ MB


In [41]:
# Clean the raw Google search table:
#   1. enforce int64 on the numeric columns,
#   2. run string_cleaning on the remaining object columns,
#   3. snake_case the column names,
#   4. parse `date` into datetime64.

# Object-typed columns that get string cleaning. `date` is excluded on purpose:
# it is parsed into a datetime below rather than cleaned as free text.
# A list comprehension keeps the column order deterministic.
str_cols = [
    col
    for col in g_search.select_dtypes('object').columns
    if col not in {'searchVolume', 'Claim_ID', 'week_number', 'year_new', 'date'}
]

g_search_clean = (
    g_search
    .copy()

    # set dtypes
    .change_type(['searchVolume', 'Claim_ID', 'week_number', 'year_new'], np.int64)

    # clean string columns (`date` is not in str_cols, see above)
    .transform_columns(str_cols, string_cleaning, elementwise=False)

    # clean column names
    .clean_names(case_type='snake')
)

# Raw dates are day-first strings (dd-mm-YYYY). Parse them once, directly to
# datetime64; formatting back to a string and re-parsing yields the same values.
g_search_clean['date'] = pd.to_datetime(g_search_clean['date'], format='%d-%m-%Y')
g_search_clean.head()


Unnamed: 0,date,platform,search_volume,claim_id,week_number,year_new
0,2014-01-05,google,349,916,1,2014
1,2014-01-06,google,349,916,2,2014
2,2014-01-07,google,697,916,2,2014
3,2014-01-10,google,349,916,2,2014
4,2014-01-20,google,697,916,4,2014


In [42]:
# Collapse to one row per (date, claim_id, platform, year_new, week_number)
# and total the search volume within each group.
g_search_clean = (
    g_search_clean
    .groupby(
        ['date', 'claim_id', 'platform', 'year_new', 'week_number'],
        as_index=False,
    )['search_volume']
    .sum()
)

In [43]:
# Persist the aggregated Google search table under the 'cleaned/google_search_data' key.
save_dataset(context, g_search_clean, 'cleaned/google_search_data')

### Product Manufacturer list

From discovery, it is known that:
* key columns: PRODUCT_ID
* integer columns: PRODUCT_ID
* object columns: Vendor

In [45]:
# Check dtypes and null counts; the five `Unnamed: N` columns have 0 non-null values.
product.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67175 entries, 0 to 67174
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   PRODUCT_ID  67175 non-null  int64  
 1   Vendor      67175 non-null  object 
 2   Unnamed: 2  0 non-null      float64
 3   Unnamed: 3  0 non-null      float64
 4   Unnamed: 4  0 non-null      float64
 5   Unnamed: 5  0 non-null      float64
 6   Unnamed: 6  0 non-null      float64
dtypes: float64(5), int64(1), object(1)
memory usage: 3.6+ MB


In [46]:
# Preview the raw product/vendor rows.
product.head()

Unnamed: 0,PRODUCT_ID,Vendor,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,1,Others,,,,,
1,2,Others,,,,,
2,3,Others,,,,,
3,4,Others,,,,,
4,5,Others,,,,,


In [47]:
# Clean the raw product/manufacturer table:
#   1. drop the empty `Unnamed: N` columns,
#   2. enforce int64 on PRODUCT_ID,
#   3. run string_cleaning on the object columns and turn blank strings into NaN,
#   4. keep one row per PRODUCT_ID,
#   5. snake_case the column names.

# Object-typed columns of the raw table; these are passed to string_cleaning.
str_cols = product.select_dtypes('object').columns.to_list()

product_clean = (
    product
    .copy()

    # NOTE(review): placeholder step kept from the original chain — presumably a
    # no-op in ta_lib; confirm before removing.
    .passthrough()

    # drop unnecessary columns (all-null in product.info())
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6'])

    # set dtypes
    .change_type(['PRODUCT_ID'], np.int64)

    # clean string columns, then treat empty strings as missing.
    # `np.nan` is used because the `np.NaN` alias no longer exists in NumPy 2.0.
    .transform_columns(str_cols, string_cleaning, elementwise=False)
    .replace({'': np.nan})

    # ensure that the key column does not have duplicate records
    .remove_duplicate_rows(col_names=['PRODUCT_ID'], keep_first=True)

    # clean column names
    .clean_names(case_type='snake')
)
product_clean.head().T


Unnamed: 0,0,1,2,3,4
product_id,1,2,3,4,5
vendor,Others,Others,Others,Others,Others


In [48]:
# Persist the cleaned product table under the 'cleaned/product_manufacturer_list' key.
save_dataset(context, product_clean, 'cleaned/product_manufacturer_list')

### Sales

From discovery, it is known that:
* key columns: None
* integer columns: 'product_id', 'sales_dollars_value', 'sales_units_value', 'sales_lbs_value'
* date columns: system_calendar_key_N

In [49]:
# Check dtypes: `sales_dollars_value` is float64 and `system_calendar_key_N` is an int64 date key.
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4526182 entries, 0 to 4526181
Data columns (total 5 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   system_calendar_key_N  int64  
 1   product_id             int64  
 2   sales_dollars_value    float64
 3   sales_units_value      int64  
 4   sales_lbs_value        int64  
dtypes: float64(1), int64(4)
memory usage: 172.7 MB


In [50]:
# Clean the raw sales table:
#   1. drop exact duplicate rows,
#   2. cast sales_dollars_value to int64,
#   3. snake_case the column names and rename the calendar key,
#   4. parse the integer YYYYMMDD calendar key into datetime64.
sales_clean = (
    sales
    .copy()

    # Drop duplicates BEFORE the lossy cast below, while values are still at raw
    # precision; de-duplicating afterwards could discard rows that differ only in
    # the fractional dollar amount.
    .drop_duplicates()

    # set dtypes
    # NOTE(review): casting float dollars to int64 discards the fractional part
    # (assuming change_type wraps astype) — confirm this is intended.
    .change_type(['sales_dollars_value'], np.int64)

    # clean column names
    # NOTE(review): 'calender_key' is misspelled but left unchanged here in case
    # other code reads the column by this name.
    .clean_names(case_type='snake')
    .rename_columns({'system_calendar_key_n': 'calender_key'})
)

# The calendar key is an integer in YYYYMMDD form.
sales_clean['calender_key'] = pd.to_datetime(sales_clean['calender_key'], format='%Y%m%d')
sales_clean.head()


Unnamed: 0,calender_key,product_id,sales_dollars_value,sales_units_value,sales_lbs_value
0,2016-01-09,1,13927,934,18680
1,2016-01-09,3,10289,1592,28646
2,2016-01-09,4,357,22,440
3,2016-01-09,6,23113,2027,81088
4,2016-01-09,7,23177,3231,58164


In [51]:
# Verify the cleaned dtypes; compare the entry count with sales.info() to see whether any rows were dropped.
sales_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4526182 entries, 0 to 4526181
Data columns (total 5 columns):
 #   Column               Dtype         
---  ------               -----         
 0   calender_key         datetime64[ns]
 1   product_id           int64         
 2   sales_dollars_value  int64         
 3   sales_units_value    int64         
 4   sales_lbs_value      int64         
dtypes: datetime64[ns](1), int64(4)
memory usage: 207.2 MB


In [52]:
# Persist the cleaned sales table under the 'cleaned/sales_data' key.
save_dataset(context, sales_clean, 'cleaned/sales_data')

### Social Media

From discovery, it is known that:
* key columns: None
* integer columns: total_post, Theme Id
* date columns:  published_date

In [53]:
# Check dtypes and null counts; `Theme Id` is float64 with only 314879 of 533390 values present.
media.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 533390 entries, 0 to 533389
Data columns (total 3 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Theme Id        314879 non-null  float64
 1   published_date  533390 non-null  object 
 2   total_post      533390 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 12.2+ MB


In [54]:
# Preview the raw rows; note that `published_date` mixes '-' and '/' separators.
media.head()

Unnamed: 0,Theme Id,published_date,total_post
0,148.0,10-01-2015,76
1,148.0,10-10-2015,31
2,148.0,10-11-2015,65
3,148.0,10-12-2015,88
4,148.0,10/13/2015,85


In [55]:
# Clean the raw social media table:
#   1. drop rows with no Theme Id,
#   2. cast Theme Id to int64 (possible once the nulls are gone),
#   3. snake_case the column names,
#   4. parse published_date into datetime64,
#   5. total the posts per (theme_id, published_date).
media_clean = (
    media
    .copy()

    # drop nulls: rows without a theme id are discarded
    .dropna(subset=['Theme Id'], how='any')

    # set dtypes
    .change_type(['Theme Id'], np.int64)

    # clean column names
    .clean_names(case_type='snake')
)

# published_date is month-first but mixes separators (e.g. '10-12-2015' and
# '10/13/2015'). Normalise the separator and parse with an explicit format so
# the result does not depend on per-element format inference. A value in any
# other layout raises here instead of being guessed silently.
media_clean['published_date'] = pd.to_datetime(
    media_clean['published_date'].str.strip().str.replace('-', '/', regex=False),
    format='%m/%d/%Y',
)

# One row per theme and day, with the post counts summed.
media_clean = (
    media_clean
    .groupby(['theme_id', 'published_date'], as_index=False)['total_post']
    .sum()
)
media_clean.head()

Unnamed: 0,theme_id,published_date,total_post
0,8,2015-05-21,104
1,8,2015-05-22,92
2,8,2015-05-23,111
3,8,2015-05-24,105
4,8,2015-05-25,106


In [56]:
# Persist the aggregated social media table under the 'cleaned/social_media_data' key.
save_dataset(context, media_clean, 'cleaned/social_media_data')

### Themes list 
From discovery, it is known that:
* key columns: CLAIM_ID, Claim Name
* integer columns: CLAIM_ID
* str columns:  Claim Name

In [57]:
# Check dtypes and null counts of the raw theme list (208 rows, no nulls).
themes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208 entries, 0 to 207
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   CLAIM_ID    208 non-null    int64 
 1   Claim Name  208 non-null    object
dtypes: int64(1), object(1)
memory usage: 3.4+ KB


In [58]:
# Clean the raw theme list: enforce the id dtype, clean the claim names,
# de-duplicate on each key column, then snake_case the column names.

# The only string column in this table; passed to string_cleaning below.
str_cols = ['Claim Name']
themes_clean = (
    themes
    
    .copy()

    # set dtypes
    .change_type(['CLAIM_ID'], np.int64)

    # clean string columns
    .transform_columns(str_cols, string_cleaning, elementwise=False)

    # ensure that neither key column has duplicate records:
    # first de-duplicate on CLAIM_ID, then on the (already cleaned) Claim Name
    .remove_duplicate_rows(col_names=['CLAIM_ID'], keep_first=True)
    .remove_duplicate_rows(col_names=['Claim Name'], keep_first=True) 
    
    # clean column names
    .clean_names(case_type='snake')
)
themes_clean.head()

Unnamed: 0,claim_id,claim_name
0,0,No Claim
1,8,low carb
2,15,beans
3,16,cocoa
4,26,vanilla


In [59]:
# Persist the cleaned theme list under the 'cleaned/theme_list' key.
save_dataset(context, themes_clean, 'cleaned/theme_list')

### Themes Product list

From discovery, it is known that:
* key columns: None
* integer columns: CLAIM_ID, PRODUCT_ID

In [60]:
# Check dtypes and null counts; both columns are already int64 with no nulls.
theme_product.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91485 entries, 0 to 91484
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   PRODUCT_ID  91485 non-null  int64
 1   CLAIM_ID    91485 non-null  int64
dtypes: int64(2)
memory usage: 1.4 MB


In [61]:
# Clean the raw theme-to-product mapping: enforce int64 ids and snake_case the
# column names. No rows are dropped or de-duplicated in this cell.
theme_product_clean = (
    theme_product
    
    .copy()

    # set dtypes (theme_product.info() already reports int64 for both columns)
    .change_type(['CLAIM_ID', 'PRODUCT_ID'], np.int64)

    # clean column names
    .clean_names(case_type='snake')
)
theme_product_clean.head()

Unnamed: 0,product_id,claim_id
0,26,8
1,29,8
2,48,81
3,50,81
4,74,227


In [62]:
# Persist the cleaned theme-to-product mapping under the 'cleaned/theme_product_list' key.
save_dataset(context, theme_product_clean, 'cleaned/theme_product_list')