# Imports

In [110]:
from pprint import pprint
import os
import os.path as op
import shutil

# standard third party imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
pd.options.mode.use_inf_as_na = True

In [111]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [112]:
import warnings

warnings.filterwarnings('ignore', message="The default value of regex will change from True to False in a future version.", 
                        category=FutureWarning)

In [113]:
# standard code-template imports
from ta_lib.core.api import (
    create_context, get_dataframe, get_feature_names_from_column_transformer, get_package_path,
    display_as_tabs, string_cleaning, merge_info, initialize_environment,
    list_datasets, load_dataset, save_dataset
)
import ta_lib.eda.api as eda

In [114]:
initialize_environment(debug=False, hide_warnings=True)

# Initialisation

In [115]:
config_path = op.join('conf', 'config.yml')
context = create_context(config_path)
pprint(list_datasets(context))

['/raw/google_search_data',
 '/raw/product_manufacturer_list',
 '/raw/sales_data',
 '/raw/social_media_data',
 '/raw/theme_list',
 '/raw/theme_product_list',
 '/cleaned/google_search_data',
 '/cleaned/product_manufacturer_list',
 '/cleaned/sales_data',
 '/cleaned/social_media_data',
 '/cleaned/theme_list',
 '/cleaned/theme_product_list',
 '/train/google_search_data',
 '/train/product_manufacturer_list',
 '/train/sales_data',
 '/train/social_media_data',
 '/train/theme_list',
 '/train/theme_product_list',
 '/test/sales/features',
 '/test/sales/target',
 '/score/sales/output']


In [116]:
g_search = load_dataset(context, 'raw/google_search_data')
product = load_dataset(context, 'raw/product_manufacturer_list')
sales = load_dataset(context, 'raw/sales_data')
media = load_dataset(context, 'raw/social_media_data')
themes = load_dataset(context, 'raw/theme_list')
theme_product = load_dataset(context, 'raw/theme_product_list')

# Data Cleaning

### Google search

From discovery, it is known that:
* key columns: None
* integer columns: searchVolume, Claim_ID, week_number,yyear_ne'
* datetime columns: date

In [117]:
g_search.head()

Unnamed: 0,date,platform,searchVolume,Claim_ID,week_number,year_new
0,05-01-2014,google,349,916,1,2014
1,06-01-2014,google,349,916,2,2014
2,07-01-2014,google,697,916,2,2014
3,10-01-2014,google,349,916,2,2014
4,20-01-2014,google,697,916,4,2014


In [118]:
g_search.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181565 entries, 0 to 181564
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   date          181565 non-null  object
 1   platform      181565 non-null  object
 2   searchVolume  181565 non-null  int64 
 3   Claim_ID      181565 non-null  int64 
 4   week_number   181565 non-null  int64 
 5   year_new      181565 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 8.3+ MB


In [119]:
# column names after cleaning

str_cols = list(
    set(g_search.select_dtypes('object').columns.to_list()) 
    - set(['searchVolume', 'Claim_ID', 'week_number', 'year_new', 'date'])
)
g_search_clean = (
    g_search
    
    .copy()
    #.sample(frac=1, resample=False)

    # set dtypes
    .change_type(['searchVolume', 'Claim_ID', 'week_number', 'year_new'], np.int64)
    
    # clean string columns (NOTE: only after setting datetime)
    .transform_columns(str_cols, string_cleaning, elementwise=False)

    # clean column names                                                                                                                                   
    .clean_names(case_type='snake')
)
g_search_clean['date'] = pd.to_datetime(g_search_clean["date"], format='%d-%m-%Y').dt.strftime('%Y-%m-%d')
g_search_clean['date'] = pd.to_datetime(g_search_clean['date'])
g_search_clean.head()


Unnamed: 0,date,platform,search_volume,claim_id,week_number,year_new
0,2014-01-05,google,349,916,1,2014
1,2014-01-06,google,349,916,2,2014
2,2014-01-07,google,697,916,2,2014
3,2014-01-10,google,349,916,2,2014
4,2014-01-20,google,697,916,4,2014


In [120]:

g_search_clean = g_search_clean.groupby(['date','claim_id','platform','year_new','week_number'], as_index = False)['search_volume'].sum()

In [121]:
save_dataset(context, g_search_clean, 'cleaned/google_search_data')

### Product Manufacturer list

From discovery, it is known that:
* key columns: PRODUCT_ID
* integer columns: PRODUCT_ID
* object columns: Vendor

In [122]:
product.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67175 entries, 0 to 67174
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   PRODUCT_ID  67175 non-null  int64 
 1   Vendor      67175 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.0+ MB


In [123]:


product.head()

Unnamed: 0,PRODUCT_ID,Vendor
0,1,Others
1,2,Others
2,3,Others
3,4,Others
4,5,Others


In [124]:
# column names after cleaning

str_cols = list(
    set(product.select_dtypes('object').columns.to_list())
)
product_clean = (
    product
    
    .copy()
    #.sample(frac=1, resample=False)

    # set dtypes : nothing to do here
    .passthrough()
    
    # drop unnecessary columns
    #.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6'], axis = 1)

    # set dtypes
    .change_type(['PRODUCT_ID'], np.int64)
    
    # clean string columns (NOTE: only after setting datetime)
    .transform_columns(str_cols, string_cleaning, elementwise=False)
    .replace({'': np.NaN})

    # ensure that the key column does not have duplicate records
    .remove_duplicate_rows(col_names=['PRODUCT_ID'], keep_first=True)

    # clean column names                                                                                                                                   
    .clean_names(case_type='snake')
)
product_clean.head()

Unnamed: 0,product_id,vendor
0,1,Others
1,2,Others
2,3,Others
3,4,Others
4,5,Others


In [125]:
save_dataset(context, product_clean, 'cleaned/product_manufacturer_list')

### Sales

From discovery, it is known that:
* key columns: None
* integer columns: 'product_id', 'sales_dollars_value',  'sales_units_value', 'sales_lbs_value
* datet columns: system_calendar_key_Nr

In [126]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4526182 entries, 0 to 4526181
Data columns (total 5 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   system_calendar_key_N  int64  
 1   product_id             int64  
 2   sales_dollars_value    float64
 3   sales_units_value      int64  
 4   sales_lbs_value        int64  
dtypes: float64(1), int64(4)
memory usage: 172.7 MB


In [127]:
sales_clean = (
    sales
    
    .copy()
    #.sample(frac=1, resample=False)

    # drop unnecessary columns
    #.drop(['Unnamed: 0'], axis = 1)

    # set dtypes
    .change_type(['sales_dollars_value'], np.int64)

    # drop duplicates
    .drop_duplicates()

    # clean column names                                                                                                                                   
    .clean_names(case_type='snake')
    .rename_columns({'system_calendar_key_n': 'calender_key'})
)
sales_clean['calender_key'] = pd.to_datetime(sales_clean['calender_key'], format='%Y%m%d')
sales_clean.head()


Unnamed: 0,calender_key,product_id,sales_dollars_value,sales_units_value,sales_lbs_value
0,2016-01-09,1,13927,934,18680
1,2016-01-09,3,10289,1592,28646
2,2016-01-09,4,357,22,440
3,2016-01-09,6,23113,2027,81088
4,2016-01-09,7,23177,3231,58164


In [128]:
sales_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4526182 entries, 0 to 4526181
Data columns (total 5 columns):
 #   Column               Dtype         
---  ------               -----         
 0   calender_key         datetime64[ns]
 1   product_id           int64         
 2   sales_dollars_value  int64         
 3   sales_units_value    int64         
 4   sales_lbs_value      int64         
dtypes: datetime64[ns](1), int64(4)
memory usage: 207.2 MB


In [129]:
save_dataset(context, sales_clean, 'cleaned/sales_data')

### Social Media

From discovery, it is known that:
* key columns: None
* integer columns: total_post, Theme Id
* date columns:  published_date

In [130]:
media.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 533390 entries, 0 to 533389
Data columns (total 3 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Theme Id        314879 non-null  float64
 1   published_date  533390 non-null  object 
 2   total_post      533390 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 12.2+ MB


In [131]:
media.head()

Unnamed: 0,Theme Id,published_date,total_post
0,148.0,10-01-2015,76
1,148.0,10-10-2015,31
2,148.0,10-11-2015,65
3,148.0,10-12-2015,88
4,148.0,10/13/2015,85


In [132]:
str_cols = list(
    set(media.select_dtypes('object').columns.to_list())) 

media_clean = (
    media
    
    .copy()
    #.sample(frac=1, resample=False)
    
    # drop nulls
    .dropna(subset=['Theme Id'], how='any')

    # set dtypes
    .change_type(['Theme Id'], np.int64)
    
    # clean column names                                                                                                                                   
    .clean_names(case_type='snake')
)

media_clean['published_date'] = pd.to_datetime(media_clean['published_date'])
media_clean = media_clean.groupby(['theme_id','published_date'], as_index = False)['total_post'].sum()
media_clean.head()

Unnamed: 0,theme_id,published_date,total_post
0,8,2015-05-21,104
1,8,2015-05-22,92
2,8,2015-05-23,111
3,8,2015-05-24,105
4,8,2015-05-25,106


In [133]:
save_dataset(context, media_clean, 'cleaned/social_media_data')

### Themes list 
From discovery, it is known that:
* key columns: CLAIM_ID, Claim Name
* integer columns: CLAIM_ID
* str columns:  Claim Name

In [134]:
themes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208 entries, 0 to 207
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   CLAIM_ID    208 non-null    int64 
 1   Claim Name  208 non-null    object
dtypes: int64(1), object(1)
memory usage: 3.4+ KB


In [135]:
str_cols = ['Claim Name']
themes_clean = (
    themes
    
    .copy()
    #.sample(frac=1, resample=False)

    # set dtypes
    .change_type(['CLAIM_ID'], np.int64)

    # clean string columns (NOTE: only after setting datetime)
    .transform_columns(str_cols, string_cleaning, elementwise=False)

    # ensure that the key column does not have duplicate records
    .remove_duplicate_rows(col_names=['CLAIM_ID'], keep_first=True)
    .remove_duplicate_rows(col_names=['Claim Name'], keep_first=True) 
    
    # clean column names                                                                                                                                   
    .clean_names(case_type='snake')
)
themes_clean.head()

Unnamed: 0,claim_id,claim_name
0,0,No Claim
1,8,low carb
2,15,beans
3,16,cocoa
4,26,vanilla


In [136]:
save_dataset(context, themes_clean, 'cleaned/theme_list')

### Themes Product list

From discovery, it is known that:
* key columns: None
* integer columns: CLAIM_ID, PRODUCT_ID

In [137]:
theme_product.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91485 entries, 0 to 91484
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   PRODUCT_ID  91485 non-null  int64
 1   CLAIM_ID    91485 non-null  int64
dtypes: int64(2)
memory usage: 1.4 MB


In [138]:
theme_product_clean = (
    theme_product
    
    .copy()
    #.sample(frac=1, resample=False)

    # set dtypes
    .change_type(['CLAIM_ID', 'PRODUCT_ID'], np.int64)

    # clean column names                                                                                                                                   
    .clean_names(case_type='snake')
)
theme_product_clean.head()

Unnamed: 0,product_id,claim_id
0,26,8
1,29,8
2,48,81
3,50,81
4,74,227


In [139]:
save_dataset(context, theme_product_clean, 'cleaned/theme_product_list')

In [140]:
g_search_clean

Unnamed: 0,date,claim_id,platform,year_new,week_number,search_volume
0,2014-01-01,8,google,2014,1,6613
1,2014-01-01,39,google,2014,1,181
2,2014-01-01,75,google,2014,1,135
3,2014-01-01,81,google,2014,1,1257
4,2014-01-01,100,google,2014,1,2636
...,...,...,...,...,...,...
179077,2019-10-01,979,google,2019,40,373
179078,2019-10-01,980,amazon,2019,40,165
179079,2019-10-01,980,chewy,2019,40,124
179080,2019-10-01,980,google,2019,40,33863


# Now understanding consumer prefrences on google search by the common themes 

In [173]:
g_search_clean_mer = pd.merge(g_search_clean,themes_clean,left_on='claim_id', right_on='claim_id')


In [174]:
g_search_clean_mer

Unnamed: 0,date,claim_id,platform,year_new,week_number,search_volume,claim_name
0,2014-01-01,8,google,2014,1,6613,low carb
1,2014-01-02,8,google,2014,1,6867,low carb
2,2014-01-03,8,google,2014,1,4451,low carb
3,2014-01-04,8,google,2014,1,6994,low carb
4,2014-01-05,8,google,2014,1,5341,low carb
...,...,...,...,...,...,...,...
175068,2019-09-29,843,google,2019,39,84,snickerdoodle
175069,2018-06-04,606,amazon,2018,23,135,hemp seeds
175070,2018-11-21,514,google,2018,47,3,white cheddar cheese
175071,2019-01-31,433,amazon,2019,5,39,gingerbread


In [175]:
import matplotlib.pyplot as plt
plt.switch_backend('TkAgg')

In [176]:
common_themes = ['convenience - easy-to-prepare',
 'nuts',
 'prebiotic',
 'peach',
 'ethical - not specific',
 'soy foods',
 'french bisque',
 'no additives/preservatives',
 'sea salt',
 'vegetarian',
 'crab',
 'ethnic & exotic',
 'gmo free',
 'low sodium',
 'high/source of protein',
 'tuna',
 'bone health',
 'low calorie',
 'gingerbread',
 'blueberry',
 'mackerel',
 'poultry',
 'chicken',
 'ethical - packaging',
 'beef hamburger',
 'energy/alertness',
 'low carb',
 'halal',
 'salmon',
 'low sugar']

In [292]:

filtered_data_google = g_search_clean_mer[g_search_clean_mer['claim_name'].isin(common_themes)]

theme_counts = filtered_data_google.groupby('claim_name')['search_volume'].sum()

theme_counts=theme_counts.sort_values(ascending=False)
theme_counts

claim_name
chicken            33480524
ethnic & exotic    24798675
low carb           14911004
low sodium          8708056
blueberry           8467294
vegetarian          6267768
low sugar           5349287
french bisque       3235011
gmo free            2102603
beef hamburger      2081539
low calorie         1638926
salmon              1531221
soy foods           1043191
bone health          654823
tuna                 571063
prebiotic            280497
crab                 266546
poultry               25488
halal                 23996
mackerel               2316
sea salt                110
gingerbread              80
peach                    29
nuts                     17
Name: search_volume, dtype: int64

In [294]:
top_10 = theme_counts.head(10)
top_10

claim_name
chicken            33480524
ethnic & exotic    24798675
low carb           14911004
low sodium          8708056
blueberry           8467294
vegetarian          6267768
low sugar           5349287
french bisque       3235011
gmo free            2102603
beef hamburger      2081539
Name: search_volume, dtype: int64

In [296]:
other_count = theme_counts[10:].sum()
other_count

6038303

In [298]:
theme_counts=pd.concat([top_10, pd.Series({'Other': other_count})])
theme_counts

chicken            33480524
ethnic & exotic    24798675
low carb           14911004
low sodium          8708056
blueberry           8467294
vegetarian          6267768
low sugar           5349287
french bisque       3235011
gmo free            2102603
beef hamburger      2081539
Other               6038303
dtype: int64

In [None]:





# # Calculate the count for 'Other' category


# # Combine the top 10 products with 'Other' as a new Series
# top_products_with_other = pd.concat([top_10_products, pd.Series({'Other': other_count})])

# # Plotting the pie chart
# plt.figure(figsize=(8, 8))
# plt.pie(top_products_with_other, labels=top_products_with_other.index, autopct='%1.1f%%', startangle=140)
# plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

# # Add a title to the pie chart
# plt.title('Top 10 Products with Other')

# # Display the plot
# plt.show()

In [311]:
labels = theme_counts.index.tolist()

sizes = theme_counts.values
colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue', 'lightgreen']  
explode = [0.05] * len(labels) # To explode all slices 

# Plotting the pie chart
plt.figure(figsize=(12, 10))
plt.pie(sizes, explode=explode, labels=labels, colors=colors, autopct='%1.1f%%', shadow=True, startangle=140)
plt.axis('equal') 

plt.title('Consumer Preferences (Themes) for Google Search')

plt.show()

In [156]:
#Now understanding consumer prefrences on SOCIAL MEDIA by the common themes.

In [180]:
media_clean

Unnamed: 0,theme_id,published_date,total_post
0,8,2015-05-21,104
1,8,2015-05-22,92
2,8,2015-05-23,111
3,8,2015-05-24,105
4,8,2015-05-25,106
...,...,...,...
311678,999,2019-10-27,39
311679,999,2019-10-28,9
311680,999,2019-10-29,11
311681,999,2019-10-30,14


In [312]:
media_clean_mer = pd.merge(media_clean,themes_clean, left_on='theme_id', right_on='claim_id')

In [313]:
media_clean_mer

Unnamed: 0,theme_id,published_date,total_post,claim_id,claim_name
0,8,2015-05-21,104,8,low carb
1,8,2015-05-22,92,8,low carb
2,8,2015-05-23,111,8,low carb
3,8,2015-05-24,105,8,low carb
4,8,2015-05-25,106,8,low carb
...,...,...,...,...,...
310167,207,2019-10-27,13,207,energyalertness
310168,207,2019-10-28,20,207,energyalertness
310169,207,2019-10-29,22,207,energyalertness
310170,207,2019-10-30,15,207,energyalertness


In [315]:

filtered_data_media= media_clean_mer[media_clean_mer['claim_name'].isin(common_themes)]

theme_counts = filtered_data_media.groupby('claim_name')['total_post'].sum()

theme_counts=theme_counts.sort_values(ascending=False)
theme_counts

claim_name
bone health        695432
low carb           535903
ethnic & exotic    527730
vegetarian         348499
salmon             231258
nuts               160113
french bisque      147712
chicken            147315
gingerbread        133417
low sodium         121895
gmo free            90840
blueberry           82573
sea salt            73178
beef hamburger      40419
mackerel            37641
crab                26983
low sugar           26236
low calorie         16557
soy foods           14752
prebiotic           10958
tuna                 8901
halal                4331
poultry              2494
peach                 935
Name: total_post, dtype: int64

In [316]:
top_10 = theme_counts.head(10)
other_count = theme_counts[10:].sum()
theme_counts=pd.concat([top_10, pd.Series({'Other': other_count})])
theme_counts

bone health        695432
low carb           535903
ethnic & exotic    527730
vegetarian         348499
salmon             231258
nuts               160113
french bisque      147712
chicken            147315
gingerbread        133417
low sodium         121895
Other              436798
dtype: int64

In [319]:
labels = theme_counts.index.tolist()

sizes = theme_counts.values
colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue', 'lightgreen']  
explode = [0.05] * len(labels)  # To explode all slices 

# Plotting the pie chart
plt.figure(figsize=(12, 10))
plt.pie(sizes, explode=explode, labels=labels, colors=colors, autopct='%1.1f%%', shadow=True, startangle=140)
plt.axis('equal') 

plt.title('Consumer Preferences (Themes) for Social Media')

plt.show()
plt.savefig('Social_media', bbox_inches='tight')

##Now understanding consumer prefrences on SALES Dataset by the common themes.

In [320]:
sales_clean

Unnamed: 0,calender_key,product_id,sales_dollars_value,sales_units_value,sales_lbs_value
0,2016-01-09,1,13927,934,18680
1,2016-01-09,3,10289,1592,28646
2,2016-01-09,4,357,22,440
3,2016-01-09,6,23113,2027,81088
4,2016-01-09,7,23177,3231,58164
...,...,...,...,...,...
4526177,2018-10-27,47536,8,2,3
4526178,2018-10-27,47539,391,39,68
4526179,2018-10-27,47543,105,59,48
4526180,2018-10-27,47544,3720,1246,4361


In [189]:
themes_clean

Unnamed: 0,claim_id,claim_name
0,0,No Claim
1,8,low carb
2,15,beans
3,16,cocoa
4,26,vanilla
...,...,...
203,508,cola
204,769,shortbread
205,949,passion fruit
206,521,blood orange


In [167]:
theme_product_clean

Unnamed: 0,product_id,claim_id
0,26,8
1,29,8
2,48,81
3,50,81
4,74,227
...,...,...
91480,8158,0
91481,45183,0
91482,25690,0
91483,46085,0


In [321]:
sales_clean_mer = pd.merge(sales_clean,theme_product_clean, left_on='product_id', right_on='product_id')
sales_clean_mer = pd.merge(sales_clean_mer,themes_clean, left_on='claim_id', right_on='claim_id')

In [322]:
sales_clean_mer

Unnamed: 0,calender_key,product_id,sales_dollars_value,sales_units_value,sales_lbs_value,claim_id,claim_name
0,2016-01-09,1,13927,934,18680,0,No Claim
1,2016-01-23,1,12628,878,17564,0,No Claim
2,2016-02-06,1,11379,810,16200,0,No Claim
3,2016-01-30,1,11568,821,16424,0,No Claim
4,2016-02-13,1,10959,784,15682,0,No Claim
...,...,...,...,...,...,...,...
7767415,2019-09-07,18405,2313,91,273,65,ethical not specific
7767416,2019-09-14,18405,2313,91,273,65,ethical not specific
7767417,2019-09-21,18405,2313,91,273,65,ethical not specific
7767418,2019-09-28,18405,2313,91,273,65,ethical not specific


low carb           18625656906
gmo free            3859459172
blueberry           3137484698
ethnic & exotic     2806206814
salmon              2400091924
soy foods           2377885566
crab                 855911394
low calorie          738984415
low sodium           590302820
chicken              321846191
Other                716229661
dtype: int64

In [325]:

filtered_data_sales= sales_clean_mer[sales_clean_mer['claim_name'].isin(common_themes)]


theme_counts = filtered_data_sales.groupby('claim_name')['sales_dollars_value'].sum()

theme_counts=theme_counts.sort_values(ascending=False)
theme_counts

claim_name
low carb           18625656906
gmo free            3859459172
blueberry           3137484698
ethnic & exotic     2806206814
salmon              2400091924
soy foods           2377885566
crab                 855911394
low calorie          738984415
low sodium           590302820
chicken              321846191
french bisque        292475739
gingerbread          127583032
vegetarian           104302236
poultry               75529517
nuts                  47005459
low sugar             40884656
prebiotic             14701854
beef hamburger         5338946
peach                  5294008
bone health            1731056
halal                   598513
mackerel                567388
sea salt                217048
tuna                       209
Name: sales_dollars_value, dtype: int64

In [326]:
top_10 = theme_counts.head(10)
other_count = theme_counts[10:].sum()
theme_counts=pd.concat([top_10, pd.Series({'Other': other_count})])
theme_counts

low carb           18625656906
gmo free            3859459172
blueberry           3137484698
ethnic & exotic     2806206814
salmon              2400091924
soy foods           2377885566
crab                 855911394
low calorie          738984415
low sodium           590302820
chicken              321846191
Other                716229661
dtype: int64

In [331]:
labels = theme_counts.index.tolist()

sizes = theme_counts.values
colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue', 'lightgreen']  
explode = [0.1] * len(labels)  # To explode all slices 

# Plotting the pie chart
plt.figure(figsize=(12, 10))
plt.pie(sizes,  labels=labels, colors=colors, autopct='%1.1f%%', shadow=True, startangle=140)
plt.axis('equal') 

plt.title('Consumer Preferences (Themes) for Sales')

plt.show()
plt.savefig('Sales', bbox_inches='tight')

# Time Granularity

In [195]:
g_search_clean.head(5)

Unnamed: 0,date,claim_id,platform,year_new,week_number,search_volume
0,2014-01-01,8,google,2014,1,6613
1,2014-01-01,39,google,2014,1,181
2,2014-01-01,75,google,2014,1,135
3,2014-01-01,81,google,2014,1,1257
4,2014-01-01,100,google,2014,1,2636


In [282]:
media_clean_mer

Unnamed: 0,theme_id,published_date,total_post,claim_id,claim_name
0,8,2015-05-21,104,8,low carb
1,8,2015-05-22,92,8,low carb
2,8,2015-05-23,111,8,low carb
3,8,2015-05-24,105,8,low carb
4,8,2015-05-25,106,8,low carb
...,...,...,...,...,...
310167,999,2019-10-27,39,999,oral health
310168,999,2019-10-28,9,999,oral health
310169,999,2019-10-29,11,999,oral health
310170,999,2019-10-30,14,999,oral health


In [285]:
filtered_data_media= media_clean_mer[media_clean_mer['claim_name'].isin(common_themes)]
filtered_data_media

Unnamed: 0,theme_id,published_date,total_post,claim_id,claim_name
0,8,2015-05-21,104,8,low carb
1,8,2015-05-22,92,8,low carb
2,8,2015-05-23,111,8,low carb
3,8,2015-05-24,105,8,low carb
4,8,2015-05-25,106,8,low carb
...,...,...,...,...,...
77428,438,2019-10-27,165,438,ethnic & exotic
77429,438,2019-10-28,158,438,ethnic & exotic
77430,438,2019-10-29,121,438,ethnic & exotic
77431,438,2019-10-30,135,438,ethnic & exotic


In [289]:

df=filtered_data_media
grouped_df = df.groupby('published_date')['total_post'].sum().reset_index()

# Set 'date' as the DataFrame index
grouped_df.set_index('published_date', inplace=True)

# Resample the data at a weekly frequency and sum the 'total post' for each week
weekly_df = grouped_df.resample('W').sum()


plt.figure(figsize=(12, 6))
plt.plot(weekly_df.index, weekly_df['total_post'], marker='o', linestyle='-', color='b')

plt.xlabel('Date')
plt.ylabel('Total Posts')
plt.title('Total Posts Over Weeks (Weekly Granularity) for Social_media Dataset')
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()

plt.show()

In [197]:
time_diff = g_search_clean['date'].diff()
time_diff

0           NaT
1        0 days
2        0 days
3        0 days
4        0 days
          ...  
179077   0 days
179078   0 days
179079   0 days
179080   0 days
179081   0 days
Name: date, Length: 179082, dtype: timedelta64[ns]

In [198]:
# # Assuming your g_search_clean DataFrame contains a column named 'date' representing the date of analysis
# g_search_clean['date'] = pd.to_datetime(g_search_clean['date'])

# # Calculate the time difference between consecutive dates
# time_diff = g_search_clean['date'].diff()

# # Find the most frequent time difference
# #time_granularity = time_diff.mode().iloc[0]

# # Determine the time granularity
# if time_granularity.days == 1:
#     print("Time Granularity: Daily")
# elif time_granularity.days == 7:
#     print("Time Granularity: Weekly")
# elif time_granularity.days == 30:
#     print("Time Granularity: Monthly")
# elif time_granularity.days == 90:
#     print("Time Granularity: Quarterly")
# elif time_granularity.days == 365:
#     print("Time Granularity: Yearly")
# else:
#     print("Time Granularity: Unknown")

Time Granularity: Unknown


Timedelta('0 days 00:00:00')

In [216]:
# # Assuming your g_search_clean DataFrame contains a column named 'date' representing the date of analysis
# # g_search_clean['date'] = pd.to_datetime(g_search_clean['date'])

# # Sort the DataFrame by date (if not already sorted)
# Sorted=g_search_clean.sort_values(by='date', inplace=False)

# # Calculate the date difference between consecutive dates
# Sorted['date_difference'] = g_search_clean['date'].diff()/ np.timedelta64(1, 'M')

# # Print the DataFrame to see the date difference column
# print(Sorted)

             date  claim_id platform  year_new  week_number  search_volume  \
0      2014-01-01         8   google      2014            1           6613   
25     2014-01-01       713   google      2014            1           2637   
26     2014-01-01       732   google      2014            1          18498   
27     2014-01-01       747   google      2014            1          14388   
28     2014-01-01       770   google      2014            1          20285   
...           ...       ...      ...       ...          ...            ...   
178988 2019-10-01       543   google      2019           40           1344   
178987 2019-10-01       541   google      2019           40           2296   
178986 2019-10-01       536   google      2019           40           1824   
178984 2019-10-01       519   google      2019           40            642   
179081 2019-10-01       999   google      2019           40            254   

        date_difference  
0                   NaN  
25         

In [217]:
# import pandas as pd
# import matplotlib.pyplot as plt



# # Calculate the time difference between consecutive dates
# time_diff = Sorted['date'].diff()

# # Plot the time difference as a line plot
# plt.figure(figsize=(10, 6))
# plt.plot(Sorted['date'], time_diff, marker='o', linestyle='--', color='b')
# plt.xlabel('Date')
# plt.ylabel('Time Difference (Days)')
# plt.title('Time Granularity Analysis')
# plt.grid(True)
# plt.tight_layout()

# # Display the plot
# plt.show()
# plt.savefig('TG', bbox_inches='tight')

In [218]:
g_search_clean

Unnamed: 0,date,claim_id,platform,year_new,week_number,search_volume
0,2014-01-01,8,google,2014,1,6613
1,2014-01-01,39,google,2014,1,181
2,2014-01-01,75,google,2014,1,135
3,2014-01-01,81,google,2014,1,1257
4,2014-01-01,100,google,2014,1,2636
...,...,...,...,...,...,...
179077,2019-10-01,979,google,2019,40,373
179078,2019-10-01,980,amazon,2019,40,165
179079,2019-10-01,980,chewy,2019,40,124
179080,2019-10-01,980,google,2019,40,33863


Unnamed: 0,theme_id,published_date,total_post
0,8,2015-05-21,104
1,8,2015-05-22,92
2,8,2015-05-23,111
3,8,2015-05-24,105
4,8,2015-05-25,106
...,...,...,...
311678,999,2019-10-27,39
311679,999,2019-10-28,9
311680,999,2019-10-29,11
311681,999,2019-10-30,14


In [227]:
# # Group the data by 'date' and sum the 'total post' for each date
# grouped_df = df.groupby('published_date')['total_post'].sum().reset_index()

# # Plot the line plot for each date
# plt.figure(figsize=(12, 6))
# for date, total_posts in zip(grouped_df['published_date'], grouped_df['total_post']):
#     plt.plot([date], [total_posts], marker='o', linestyle='-', label=date.date())

# plt.xlabel('Date')
# plt.ylabel('Total Posts')
# plt.title('Total Posts Over Time')
# plt.xticks(rotation=45)
# plt.grid(True)
# plt.legend(loc='best')
# plt.tight_layout()

# # Display the plot
# plt.show()

Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations.
Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations.


In [228]:
df=media_clean
grouped_df = df.groupby('published_date')['total_post'].sum().reset_index()

# Set 'date' as the DataFrame index
grouped_df.set_index('published_date', inplace=True)

# Resample the data at a weekly frequency and sum the 'total post' for each week
weekly_df = grouped_df.resample('W').sum()


plt.figure(figsize=(12, 6))
plt.plot(weekly_df.index, weekly_df['total_post'], marker='o', linestyle='-', color='b')

plt.xlabel('Date')
plt.ylabel('Total Posts')
plt.title('Total Posts Over Weeks (Weekly Granularity)')
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()

plt.show()

In [230]:

# Group the data by 'date' and sum the 'total post' for each date
grouped_df = df.groupby('published_date')['total_post'].sum().reset_index()

# Set 'date' as the DataFrame index
grouped_df.set_index('published_date', inplace=True)

# Resample the data at a weekly frequency and sum the 'total post' for each week
weekly_df = grouped_df.resample('M').sum()

# Plot the line plot for weekly data
plt.figure(figsize=(12, 6))
plt.plot(weekly_df.index, weekly_df['total_post'], marker='o', linestyle='-', color='b')

plt.xlabel('Date')
plt.ylabel('Total Posts')
plt.title('Total Posts Over MONTH (MONTHy Granularity)')
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()

# Display the plot
plt.show()

In [232]:

# Group the data by 'date' and sum the 'total post' for each date
grouped_df = df.groupby('published_date')['total_post'].sum().reset_index()



# Plot the line plot for weekly data
plt.figure(figsize=(12, 6))
plt.plot(grouped_df['published_date'], grouped_df['total_post'], marker='o', linestyle='-', color='b')

plt.xlabel('Date')
plt.ylabel('Total Posts')
plt.title('Total Posts Over Daily (Daily Granularity)')
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()

# Display the plot
plt.show()

In [333]:
df=g_search_clean
df

Unnamed: 0,date,claim_id,platform,year_new,week_number,search_volume
0,2014-01-01,8,google,2014,1,6613
1,2014-01-01,39,google,2014,1,181
2,2014-01-01,75,google,2014,1,135
3,2014-01-01,81,google,2014,1,1257
4,2014-01-01,100,google,2014,1,2636
...,...,...,...,...,...,...
179077,2019-10-01,979,google,2019,40,373
179078,2019-10-01,980,amazon,2019,40,165
179079,2019-10-01,980,chewy,2019,40,124
179080,2019-10-01,980,google,2019,40,33863


In [336]:

grouped_df = df.groupby('date')['search_volume'].sum().reset_index()

# Set 'date' as the DataFrame index
grouped_df.set_index('date', inplace=True)

# Resample the data at a weekly frequency and sum the 'total post' for each week
#weekly_df = grouped_df.resample('W').sum()


plt.figure(figsize=(12, 6))
plt.plot(weekly_df.index, weekly_df['search_volume'], marker='o', linestyle='-', color='b')

plt.xlabel('Date')
plt.ylabel('Total Posts')
plt.title('Total Posts Over Weeks (Weekly Granularity)')
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()

plt.show()

In [335]:

grouped_df = df.groupby('date')['search_volume'].sum().reset_index()

# Set 'date' as the DataFrame index
grouped_df.set_index('date', inplace=True)

# Resample the data at a weekly frequency and sum the 'total post' for each week
weekly_df = grouped_df.resample('M').sum()


plt.figure(figsize=(12, 6))
plt.plot(weekly_df.index, weekly_df['search_volume'], marker='o', linestyle='-', color='b')

plt.xlabel('Date')
plt.ylabel('Total Posts')
plt.title('Total Posts Over mONTH (Month Granularity)')
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()

plt.show()

In [338]:
df=sales_clean
df

Unnamed: 0,calender_key,product_id,sales_dollars_value,sales_units_value,sales_lbs_value
0,2016-01-09,1,13927,934,18680
1,2016-01-09,3,10289,1592,28646
2,2016-01-09,4,357,22,440
3,2016-01-09,6,23113,2027,81088
4,2016-01-09,7,23177,3231,58164
...,...,...,...,...,...
4526177,2018-10-27,47536,8,2,3
4526178,2018-10-27,47539,391,39,68
4526179,2018-10-27,47543,105,59,48
4526180,2018-10-27,47544,3720,1246,4361


In [339]:

grouped_df = df.groupby('calender_key')['sales_dollars_value'].sum().reset_index()

# Set 'date' as the DataFrame index
grouped_df.set_index('calender_key', inplace=True)

# Resample the data at a weekly frequency and sum the 'total post' for each week
weekly_df = grouped_df.resample('W').sum()


plt.figure(figsize=(12, 6))
plt.plot(weekly_df.index, weekly_df['sales_dollars_value'], marker='o', linestyle='-', color='b')

plt.xlabel('Date')
plt.ylabel('Total Posts')
plt.title('Total Posts Over Week (Week Granularity)')
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()

plt.show()







Matplotlib is currently using agg, which is a non-GUI backend, so cannot show the figure.



system_calendar_key_N       196
product_id                42616
sales_dollars_value      254341
sales_units_value        114070
sales_lbs_value          206368
PRODUCT_ID                42616
CLAIM_ID                     49
Claim Name                   49
dtype: int64

array(['No Claim', 'chicken', 'salmon', 'apple cinnamon', 'pollock',
       'pizza', 'cookie', 'soy foods', 'blueberry',
       'american southwest style', 'herbs', 'poultry', 'ethnic & exotic',
       'stroganoff', 'no additives/preservatives', 'buckwheat',
       'low carb', 'gmo free', 'low sodium', 'french bisque',
       'ethical - packaging', 'american gumbo', 'crab',
       'energy/alertness', 'low sugar', 'beef hamburger',
       'high/source of protein', 'brown ale', 'gingerbread', 'beans',
       'vegetarian', 'prebiotic', 'tilapia', 'peanut', 'nuts', 'cocoa',
       'red raspberry', 'mackerel', 'scallop', 'low calorie',
       'convenience - easy-to-prepare', 'peach', 'cherry', 'sea salt',
       'bone health', 'cumin', 'tuna', 'halal', 'ethical - not specific'],
      dtype=object)

date             2100
platform            4
searchVolume    19129
Claim_ID          160
week_number        53
year_new            6
CLAIM_ID          160
Claim Name        160
dtype: int64

array(['gluten free', 'low carb', 'natural', 'digestive/gut health',
       'high/source of protein', 'no additives/preservatives', 'allergy',
       'convenience - packaging', 'oral health', 'dry', 'traditional',
       'antioxidant', 'skin health', 'immune health',
       'convenience - ready prepared', 'indulgent & premium', 'organic',
       'low fat', 'vitamin/mineral fortified', 'low calorie',
       'high/source of fiber', 'low sodium', 'no added sugar', 'omega-3',
       'bone health', 'heart health', 'weight management', 'prebiotic',
       'ethical - packaging', 'eye health', 'economy',
       'ethical - animal/fish & bird', 'low sugar', 'joint health', 'dha',
       'brain health', 'no trans fats', 'sugar free', 'diabetic',
       'hfcs free', 'low cholesterol', 'low gi', 'lactose free',
       'wholegrain', 'added calcium', 'added fiber', 'added protein',
       'added iron', 'anti-aging/aging-well', 'energy/alertness',
       'sports & recovery', 'functional, not specified

Theme Id           193
published_date    1625
total_post        4270
CLAIM_ID           193
Claim Name         193
dtype: int64

array(['tuna', 'allergy', 'convenience - packaging', 'oral health',
       'mint', 'dry', 'traditional', 'antioxidant', 'peppermint',
       'skin health', 'banana', 'crab', 'mango', 'peanut butter',
       'immune health', 'broccoli', 'spinach',
       'convenience - ready prepared', 'toffee', 'indulgent & premium',
       'organic', 'low fat', 'vitamin/mineral fortified', 'trout',
       'low calorie', 'bacon', 'coconut', 'red apple',
       'high/source of fiber', 'low sodium', 'no added sugar',
       'green bean', 'omega-3', 'bone health', 'green tea', 'cranberry',
       'heart health', 'weight management', 'celery', 'strawberry',
       'dandelion greens', 'prebiotic', 'ethical - packaging',
       'eye health', 'poultry', 'parsley', 'economy', 'cheddar cheese',
       'venison', 'ethical - animal/fish & bird', 'low sugar',
       'ricotta cheese', 'joint health', 'brain health',
       'roquefort cheese', 'no trans fats', 'sugar free', 'diabetic',
       'hfcs free', 'low chole

30

['convenience - easy-to-prepare',
 'nuts',
 'prebiotic',
 'peach',
 'ethical - not specific',
 'soy foods',
 'french bisque',
 'no additives/preservatives',
 'sea salt',
 'vegetarian',
 'crab',
 'ethnic & exotic',
 'gmo free',
 'low sodium',
 'high/source of protein',
 'tuna',
 'bone health',
 'low calorie',
 'gingerbread',
 'blueberry',
 'mackerel',
 'poultry',
 'chicken',
 'ethical - packaging',
 'beef hamburger',
 'energy/alertness',
 'low carb',
 'halal',
 'salmon',
 'low sugar']

Rule 1 passed


Rule 2 failed


Rule 3 failed


{'A-B': 5, 'B-A': 0, 'AuB': 13549, 'A^B': 13544}

{'APL AWS3AL(G) 42MM GLD CST',
 'APL AWS3AL(G+C) 38MM GRY CST',
 'SP3 SMS N920 NOTE5 32G GLD DST',
 'SP3 SMS S8+ G955 BLK BER SPT',
 'UNLKD SONY XPERIA XZS BLK 32GB'}

Records affected due to missing keys are 402 accounting to 0.23% of orders


Found 1 duplicates in master. Sample duplicates are:
                              SKU  Frequency
0  UNLKD SONY XPERIA XZS BLUE 32G          2


13545

No. of rows before dropping duplicate SKUs: 13545
No. of rows after dropping duplicate SKUs: 13544












tzname GB identified but not understood.  Pass `tzinfos` argument in order to correctly return a timezone-aware datetime.  In a future version, this will raise an exception.





this method is deprecated in favour of `Styler.to_html()`

tzname GB identified but not understood.  Pass `tzinfos` argument in order to correctly return a timezone-aware datetime.  In a future version, this will raise an exception.

this method is deprecated in favour of `Styler.to_html()`

