# Purpose


This notebook demonstrates the data pipeline from raw tables to analytical datasets. At the end of this activity, train & test data sets are created from raw data.



## Imports

In [36]:
from pprint import pprint
import os
import os.path as op
import shutil

# standard third party imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
pd.options.mode.use_inf_as_na = True


In [37]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [38]:
# standard code-template imports
from ta_lib.core.api import (
    create_context, get_dataframe, get_feature_names_from_column_transformer, get_package_path,
    display_as_tabs, string_cleaning, merge_info, initialize_environment,
    list_datasets, load_dataset, save_dataset
)
import ta_lib.eda.api as eda

In [39]:
import warnings

warnings.filterwarnings('ignore', message="The default value of regex will change from True to False in a future version.", 
                        category=FutureWarning)

In [40]:
initialize_environment(debug=False, hide_warnings=True)

## Utility functions

# 1. Initialization

In [41]:
config_path = op.join('conf', 'config.yml')
context = create_context(config_path)
pprint(list_datasets(context))

google_search_data_df = load_dataset(context, 'raw/google_search_data')
product_manufacturer_list_df = load_dataset(context, 'raw/product_manufacturer_list')
sales_data_df = load_dataset(context, 'raw/sales_data')
theme_list_df = load_dataset(context, 'raw/theme_list')
theme_product_list_df = load_dataset(context, 'raw/theme_product_list')
social_media_data_df = load_dataset(context, 'raw/social_media_data')

# orders_df = load_dataset(context,'raw/orders')

['/raw/google_search_data',
 '/raw/product_manufacturer_list',
 '/raw/sales_data',
 '/raw/theme_list',
 '/raw/theme_product_list',
 '/raw/social_media_data',
 '/raw/orders',
 '/cleaned/prod_df_clean',
 '/cleaned/google_search_data_df_clean',
 '/cleaned/sales_data_df_clean',
 '/cleaned/theme_list_df_clean',
 '/cleaned/theme_product_list_df_clean',
 '/cleaned/social_media_data_df_clean',
 '/cleaned/orders',
 '/processed/sales',
 '/train/sales/features',
 '/train/sales/target',
 '/test/sales/features',
 '/test/sales/target',
 '/score/sales/output']


In [42]:
# to standardize the date format in every dataset
from datetime import datetime
def standardize_date(date_str):
    # List of possible date formats
    date_formats = ['%m/%d/%Y', '%m-%d-%Y', '%d/%m/%Y', '%Y-%m-%d']

    # Iterate through the formats and try to parse the date
    for fmt in date_formats:
        try:
            return datetime.strptime(date_str, fmt).strftime('%d-%m-%Y')
        except ValueError:
            pass

    # If no valid format is found, return None or the original string, depending on your preference
    return None

In [43]:
google_search_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181565 entries, 0 to 181564
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   date          181565 non-null  object
 1   platform      181565 non-null  object
 2   searchVolume  181565 non-null  int64 
 3   Claim_ID      181565 non-null  int64 
 4   week_number   181565 non-null  int64 
 5   year_new      181565 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 8.3+ MB


In [44]:
# removed some columns which have all null values
# product_manufacturer_list_df=product_manufacturer_list_df[['PRODUCT_ID','Vendor']]
product_manufacturer_list_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67175 entries, 0 to 67174
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   PRODUCT_ID  67175 non-null  int64  
 1   Vendor      67175 non-null  object 
 2   Unnamed: 2  0 non-null      float64
 3   Unnamed: 3  0 non-null      float64
 4   Unnamed: 4  0 non-null      float64
 5   Unnamed: 5  0 non-null      float64
 6   Unnamed: 6  0 non-null      float64
dtypes: float64(5), int64(1), object(1)
memory usage: 3.6+ MB


In [45]:
sales_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4526182 entries, 0 to 4526181
Data columns (total 5 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   system_calendar_key_N  int64  
 1   product_id             int64  
 2   sales_dollars_value    float64
 3   sales_units_value      int64  
 4   sales_lbs_value        int64  
dtypes: float64(1), int64(4)
memory usage: 172.7 MB


In [46]:
theme_list_df.head()

Unnamed: 0,CLAIM_ID,Claim Name
0,0,No Claim
1,8,low carb
2,15,beans
3,16,cocoa
4,26,vanilla


In [47]:
theme_product_list_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91485 entries, 0 to 91484
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   PRODUCT_ID  91485 non-null  int64
 1   CLAIM_ID    91485 non-null  int64
dtypes: int64(2)
memory usage: 1.4 MB


In [48]:
social_media_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 533390 entries, 0 to 533389
Data columns (total 3 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Theme Id        314879 non-null  float64
 1   published_date  533390 non-null  object 
 2   total_post      533390 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 12.2+ MB


# 2. Data cleaning and consolidation

**<u>NOTES</u>**

The focus here is to create a cleaned dataset that is appropriate for solving the DS problem at hand from the raw data.

**1. Do**
* clean dataframe column names
* ensure dtypes are set properly
* join with other tables etc to create features
* transform, if appropriate, datetime like columns to generate additional features (weekday etc)
* transform, if appropriate, string columns to generate additional features
* discard cols that are not useful for training the model (IDs, constant cols, duplicate cols etc)
* additional features generated from existing columns


**2. Don't**
* handle missing values or outliers here. mark them and leave them for processing downstream.


## 2.1 Clean individual tables 

### product manufacturer list 


In [49]:
product_manufacturer_list_df=product_manufacturer_list_df[['PRODUCT_ID','Vendor']]
prod_df_clean = (

    product_manufacturer_list_df
    # while iterating on testing, it's good to copy the dataset(or a subset)
    # as the following steps will mutate the input dataframe. The copy should be
    # removed in the production code to avoid introducing perf. bottlenecks.
    .copy()

    # set dtypes : nothing to do here
    .passthrough()

    .transform_columns(['Vendor'], string_cleaning, elementwise=False)
    
    .replace({'': np.NaN})
    
    # ensure that the key column does not have duplicate records
    .remove_duplicate_rows(col_names=['PRODUCT_ID'], keep_first=True)
    
    # clean column names (comment out this line while cleaning data above)
    .clean_names(case_type='snake')
)
prod_df_clean.head()

Unnamed: 0,product_id,vendor
0,1,Others
1,2,Others
2,3,Others
3,4,Others
4,5,Others


In [50]:
prod_df_clean['vendor'].value_counts()

Others           28335
Private Label    21539
A                 5584
B                 5371
D                 2780
H                 1733
F                 1379
G                  230
E                  224
Name: vendor, dtype: int64

### NOTE

It's always a good idea to save cleaned tabular data using a storage format that supports the following 

1. preserves the type information
2. language agnostic storage format
3. Supports compression
4. Supports customizing storage to optimize different data access patterns

For larger datasets, the last two points become crucial.

`Parquet` is one such file format that is very popular for storing tabular data. It has some nice properties:
- Similar to pickles & RDS datasets, but compatible with all languages
- Preserves the datatypes
- Compresses the data and reduces the filesize
- Good library support in Python and other languages
- As a columnar storage we can efficiently read fewer columns
- It also supports chunking data by groups of columns (for instance, by dates or a particular value of a key column) that makes loading subsets of the data fast.

In [51]:
save_dataset(context, prod_df_clean, 'cleaned/prod_df_clean')

# Sales data


In [52]:
sales_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4526182 entries, 0 to 4526181
Data columns (total 5 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   system_calendar_key_N  int64  
 1   product_id             int64  
 2   sales_dollars_value    float64
 3   sales_units_value      int64  
 4   sales_lbs_value        int64  
dtypes: float64(1), int64(4)
memory usage: 172.7 MB


In [53]:
# product_manufacturer_list_df=product_manufacturer_list_df[['PRODUCT_ID','Vendor']]
sales_data_df_clean = (

    sales_data_df
    # while iterating on testing, it's good to copy the dataset(or a subset)
    # as the following steps will mutate the input dataframe. The copy should be
    # removed in the production code to avoid introducing perf. bottlenecks.
    .copy()

    # set dtypes : nothing to do here
    .passthrough()
    .drop_duplicates()
    .change_type(['sales_dollars_value'], np.int64)
    .clean_names(case_type='snake')
)
sales_data_df_clean['system_calendar_key_n'] = pd.to_datetime(sales_data_df_clean['system_calendar_key_n'], format='%Y%m%d')

sales_data_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4526182 entries, 0 to 4526181
Data columns (total 5 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   system_calendar_key_n  datetime64[ns]
 1   product_id             int64         
 2   sales_dollars_value    int64         
 3   sales_units_value      int64         
 4   sales_lbs_value        int64         
dtypes: datetime64[ns](1), int64(4)
memory usage: 207.2 MB


In [54]:
sales_data_df_clean['system_calendar_key_n'].min()

Timestamp('2016-01-09 00:00:00')

In [55]:
save_dataset(context, sales_data_df_clean, 'cleaned/sales_data_df_clean')

# Theme list


In [56]:
theme_list_df_clean = (

    theme_list_df

    .copy()

    # set dtypes : nothing to do here
    .passthrough()
    .drop_duplicates()
    .clean_names(case_type='snake')
)

theme_list_df_clean.head()

Unnamed: 0,claim_id,claim_name
0,0,No Claim
1,8,low carb
2,15,beans
3,16,cocoa
4,26,vanilla


In [57]:
save_dataset(context, theme_list_df_clean, 'cleaned/theme_list_df_clean')

In [154]:
theme_list_df_clean[theme_list_df_clean['claim_name'] == 'bone health']

Unnamed: 0,claim_id,claim_name
41,191,bone health


# Theme product list

In [59]:
theme_product_list_df_clean = (

    theme_product_list_df

    .copy()

    # set dtypes : nothing to do here
    .passthrough()
    .drop_duplicates()
    .clean_names(case_type='snake')
)

theme_product_list_df_clean.head()

Unnamed: 0,product_id,claim_id
0,26,8
1,29,8
2,48,81
3,50,81
4,74,227


In [60]:
theme_product_list_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 91485 entries, 0 to 91484
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   product_id  91485 non-null  int64
 1   claim_id    91485 non-null  int64
dtypes: int64(2)
memory usage: 2.1 MB


In [61]:
save_dataset(context, theme_product_list_df_clean, 'cleaned/theme_product_list_df_clean')

# Google search

In [62]:
google_search_data_df_clean = (

    google_search_data_df

    .copy()

    # set dtypes : nothing to do here
    .passthrough()
    .drop_duplicates()
    .clean_names(case_type='snake')
)
google_search_data_df_clean = google_search_data_df_clean.groupby(['date','claim_id','platform','year_new','week_number'], as_index = False)['search_volume'].sum()
# year_new to year in data comparison
google_search_data_df_clean['date'] = pd.to_datetime(google_search_data_df_clean['date'])
google_search_data_df_clean['year'] = google_search_data_df_clean['date'].dt.year
google_search_data_df_clean.head()

Parsing '13-01-2014' in DD/MM/YYYY format. Provide format or specify infer_datetime_format=True for consistent parsing.
Parsing '13-01-2015' in DD/MM/YYYY format. Provide format or specify infer_datetime_format=True for consistent parsing.
Parsing '13-01-2016' in DD/MM/YYYY format. Provide format or specify infer_datetime_format=True for consistent parsing.
Parsing '13-01-2017' in DD/MM/YYYY format. Provide format or specify infer_datetime_format=True for consistent parsing.
Parsing '13-01-2018' in DD/MM/YYYY format. Provide format or specify infer_datetime_format=True for consistent parsing.
Parsing '13-01-2019' in DD/MM/YYYY format. Provide format or specify infer_datetime_format=True for consistent parsing.
Parsing '13-02-2014' in DD/MM/YYYY format. Provide format or specify infer_datetime_format=True for consistent parsing.
Parsing '13-02-2015' in DD/MM/YYYY format. Provide format or specify infer_datetime_format=True for consistent parsing.
Parsing '13-02-2016' in DD/MM/YYYY forma

Unnamed: 0,date,claim_id,platform,year_new,week_number,search_volume,year
0,2014-01-01,8,google,2014,1,6613,2014
1,2014-01-01,39,google,2014,1,181,2014
2,2014-01-01,75,google,2014,1,135,2014
3,2014-01-01,81,google,2014,1,1257,2014
4,2014-01-01,100,google,2014,1,2636,2014


In [63]:
# for year 2016 and 2017
google_search_data_df_clean_1617 = google_search_data_df_clean[google_search_data_df_clean['year'].isin([2016,2017])]

In [64]:
# year_new to year in data comparison
google_search_data_df_clean['year'].max()


2019

In [65]:
save_dataset(context, google_search_data_df_clean, 'cleaned/google_search_data_df_clean')

# Social media

In [66]:
social_media_data_df_clean = (

    social_media_data_df

    .copy()

    # set dtypes : nothing to do here
    .passthrough()
    .drop_duplicates()

    .clean_names(case_type='snake')
    .dropna(subset=['theme_id'], how='any')        # there are some nan values , those are removed
    .change_type(['theme_id'], np.int64)
    
)
social_media_data_df_clean.dropna(subset=['theme_id'], inplace=True)
social_media_data_df_clean['published_date'] = social_media_data_df_clean['published_date'].apply(standardize_date)
# for a theme_id and a published id there are 2 rows in many cases 
social_media_data_df_clean = social_media_data_df_clean.groupby(['theme_id','published_date'], as_index = False)['total_post'].sum()
social_media_data_df_clean['published_date'] = pd.to_datetime(social_media_data_df_clean['published_date'])
social_media_data_df_clean['year'] = social_media_data_df_clean['published_date'].dt.year
social_media_data_df_clean.head()

Parsing '13-01-2016' in DD/MM/YYYY format. Provide format or specify infer_datetime_format=True for consistent parsing.
Parsing '13-01-2017' in DD/MM/YYYY format. Provide format or specify infer_datetime_format=True for consistent parsing.
Parsing '13-01-2018' in DD/MM/YYYY format. Provide format or specify infer_datetime_format=True for consistent parsing.
Parsing '13-01-2019' in DD/MM/YYYY format. Provide format or specify infer_datetime_format=True for consistent parsing.
Parsing '13-02-2016' in DD/MM/YYYY format. Provide format or specify infer_datetime_format=True for consistent parsing.
Parsing '13-02-2017' in DD/MM/YYYY format. Provide format or specify infer_datetime_format=True for consistent parsing.
Parsing '13-02-2018' in DD/MM/YYYY format. Provide format or specify infer_datetime_format=True for consistent parsing.
Parsing '13-02-2019' in DD/MM/YYYY format. Provide format or specify infer_datetime_format=True for consistent parsing.
Parsing '13-03-2016' in DD/MM/YYYY forma

Unnamed: 0,theme_id,published_date,total_post,year
0,8,2016-01-01,115,2016
1,8,2017-01-01,278,2017
2,8,2018-01-01,506,2018
3,8,2019-01-01,617,2019
4,8,2016-01-02,243,2016


In [67]:
social_media_data_df_clean['published_date'].max()
social_media_data_df_clean_16 = social_media_data_df_clean[social_media_data_df_clean['year'].isin([2016])]
social_media_data_df_clean_16.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 70638 entries, 0 to 311680
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   theme_id        70638 non-null  int64         
 1   published_date  70638 non-null  datetime64[ns]
 2   total_post      70638 non-null  int64         
 3   year            70638 non-null  int64         
dtypes: datetime64[ns](1), int64(3)
memory usage: 2.7 MB


In [68]:
save_dataset(context, social_media_data_df_clean, 'cleaned/social_media_data_df_clean')

In [69]:
social_set = set(social_media_data_df_clean['theme_id'].unique().tolist())

In [70]:
theme_set = set(theme_list_df_clean['claim_id'].unique().tolist())

In [71]:
social_set - theme_set

set()

In [72]:
# sales_data_df_clean['product_id'].nunique()
# sales_data_df_clean.tail(100)

# ['system_calendar_key_N','product_id']    are PK for sales data
sales_data_df_clean[['system_calendar_key_n','product_id']].value_counts()
sales_data_df_clean[['system_calendar_key_n','product_id']].nunique()

system_calendar_key_n      196
product_id               42616
dtype: int64

In [73]:
# CLAIM_ID   is PK for theme list 
theme_list_df_clean['claim_id'].value_counts()
theme_list_df_clean['claim_id'].info()

<class 'pandas.core.series.Series'>
Int64Index: 208 entries, 0 to 207
Series name: claim_id
Non-Null Count  Dtype
--------------  -----
208 non-null    int64
dtypes: int64(1)
memory usage: 3.2 KB


In [74]:
theme_product_list_df_clean['product_id'].value_counts()
# theme_product_list_df_clean['PRODUCT_ID'].nunique()

28136    14
18693    14
27053    14
27051    14
27076    14
         ..
44591     1
2476      1
9221      1
50985     1
34907     1
Name: product_id, Length: 57317, dtype: int64

In [75]:
prod_df_clean['vendor'].nunique()
prod_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67175 entries, 0 to 67174
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   product_id  67175 non-null  int64 
 1   vendor      67175 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.5+ MB


In [76]:
col = sales_data_df_clean['sales_dollars_value']/sales_data_df_clean['sales_units_value']
col.head(30)

0     14.911135
1      6.462940
2     16.227273
3     11.402565
4      7.173321
5     12.194430
6     15.533411
7     11.497024
8     17.078524
9     18.019097
10    18.079526
11    16.823923
12    27.923107
13    16.285304
14    16.535211
15    18.000000
16    17.902439
17    15.392344
18     6.956897
19     6.810345
20     6.736842
21     6.609195
22     6.924528
23     0.625000
24     6.241379
25     8.362069
26     1.000000
27     9.199377
28    33.488636
29     3.454545
dtype: float64

## 2.2 Create consolidated features table

Here we take the cleaned dataframes and merge them to form the consolidated table.



In [77]:
prod_sales = pd.merge(prod_df_clean, sales_data_df_clean, how='inner', on='product_id', validate='1:m')
merge_info(prod_df_clean, sales_data_df_clean, prod_sales)

Unnamed: 0,n_cols,n_rows
left_df,2,67175
right_df,5,4526182
merged_df,6,4526182


# Understand the overall market share of our client

In [78]:
# prod_sales.info()
vendor_sales_sum = prod_sales.groupby('vendor')['sales_dollars_value'].sum()
total_sales = vendor_sales_sum.sum()
vendor_sales_percentage = (vendor_sales_sum / total_sales) * 100
vendor_sales_percentage

vendor
A                31.647006
B                15.736254
D                12.243428
E                 1.984087
F                 5.111352
G                 1.996254
H                 2.330302
Others           18.972815
Private Label     9.978502
Name: sales_dollars_value, dtype: float64

In [101]:
import matplotlib.pyplot as plt
plt.figure(figsize=(8, 8))
plt.pie(vendor_sales_percentage, labels=vendor_sales_percentage.index, autopct='%1.1f%%', startangle=140,textprops={'fontsize': 20})
plt.title('Percentage of Sales by Vendor',fontsize= 20)
plt.axis('equal')  # Equal aspect ratio ensures that the pie chart is drawn as a circle.

plt.savefig('overall market share.png', bbox_inches='tight')


# potential competitors for our client in each theme

In [80]:
# for low carb theme
product_id_for_low_carb = list(set(theme_product_list_df_clean[theme_product_list_df_clean['claim_id'] == 8]['product_id'].tolist()))
prod_sales_low_carb = prod_sales[prod_sales['product_id'].isin(product_id_for_low_carb)]
vendor_sales_low_carb = prod_sales_low_carb.groupby('vendor')['sales_dollars_value'].sum().sort_values(ascending=False)


In [113]:

plt.figure(figsize=(8, 8))
plt.pie(vendor_sales_low_carb, labels=vendor_sales_low_carb.index, autopct='%1.1f%%', startangle=140,textprops={'fontsize': 20})
# plt.title('Total Sales in Dollars by Vendor (Low Carb Products)',fontsize= 20)
plt.axis('equal')  # Equal aspect ratio ensures that the pie chart is drawn as a circle.

plt.savefig('vendor_sales_low_carb.png', bbox_inches='tight')

In [103]:
# plt.figure(figsize=(10, 6))
# vendor_sales_low_carb.plot(kind='bar', color = ['blue', 'green', 'orange', 'red', 'purple'])
# plt.title('Total Sales in Dollars by Vendor (Low Carb Products)',fontsize = 20)
# plt.xlabel('Vendor',fontsize = 20)
# plt.ylabel('Total Sales Dollars',fontsize = 20)
# plt.xticks(rotation=45, ha='right')

# plt.tight_layout()  # To ensure labels and ticks are visible properly.
# plt.savefig('vendor_sales_low_carb.png', bbox_inches='tight')

### 30 common themes preset in all 3 datasets 

In [82]:
common_themes = ['convenience - easy-to-prepare','nuts','low carb','french bisque',
 'prebiotic',
 'peach',
 'ethical - not specific',
 'soy foods',
 'no additives/preservatives',
 'sea salt',
 'vegetarian',
 'crab',
 'ethnic & exotic',
 'gmo free',
 'low sodium',
 'high/source of protein',
 'tuna',
 'bone health',
 'low calorie',
 'gingerbread',
 'blueberry',
 'mackerel',
 'poultry',
 'chicken',
 'ethical - packaging',
 'beef hamburger',
 'energy/alertness',
 'low carb',
 'halal',
 'salmon',
 'low sugar']
theme_neighbours =[]
for theme in common_themes:
    theme_id = theme_list_df_clean[theme_list_df_clean['claim_name'] == theme]['claim_id'].iloc[0]
    # for each  theme
    product_ids = list(set(theme_product_list_df_clean[theme_product_list_df_clean['claim_id'] == theme_id]['product_id'].tolist()))
    prod_sales_theme = prod_sales[prod_sales['product_id'].isin(product_ids)]

    p_s_t_group = prod_sales_theme.groupby('vendor',as_index = False)['sales_dollars_value'].sum()
    total = p_s_t_group['sales_dollars_value'].sum()
    p_s_t_group['percentage_sales'] = (p_s_t_group['sales_dollars_value'] / int(total)) * 100
    p_s_t_group =p_s_t_group.sort_values('percentage_sales',ascending=False).reset_index().drop(columns = ['index'],axis =1)
    # print(p_s_t_group)
    # vendor_sales_theme = prod_sales_theme.groupby('vendor',as_index = False)['sales_dollars_value'].sum().sort_values('sales_dollars_value',ascending=False).reset_index().drop(columns = ['index'],axis =1)
    # print(vendor_sales_theme)

    vendor_A = 'A'
    if vendor_A not in p_s_t_group['vendor'].values:
        continue
    # Step 4: Find the total sales dollars for vendor A
    percentage_sales_vendor_A = p_s_t_group.loc[p_s_t_group['vendor'] == vendor_A, 'percentage_sales'].values[0]

    # Step 5: Calculate the threshold for 10% difference
    threshold =  10
    # Step 6: Find the vendors within the +- 10% range of vendor A's sales
    neighbors = p_s_t_group[
        (p_s_t_group['percentage_sales'] >= percentage_sales_vendor_A - threshold) &
        (p_s_t_group['percentage_sales'] <= percentage_sales_vendor_A + threshold) &
        (p_s_t_group['vendor'] != vendor_A)
    ]
    # print(neighbors)
    # position_vendor_A = vendor_sales_theme.index[vendor_sales_theme['vendor'] == vendor_A].tolist()[0]
    # # # Find the neighboring vendors based on their positions
    # num_neighbors = 1  # You can change this value to get more or fewer neighbors
    # neighbors = vendor_sales_theme.iloc[position_vendor_A - num_neighbors:position_vendor_A + 1]
    vendor_list = neighbors['vendor'].tolist()
    percentage_list = neighbors['percentage_sales'].tolist()
    percentage_list =  list(map(lambda x: round(x, 2), percentage_list))
    # print(neighbors)
    theme_neighbour = {'theme':theme, 'neighbours':vendor_list, 'percentages':percentage_list}
    theme_neighbours.append(theme_neighbour)


theme_neighbours_df = pd.DataFrame(theme_neighbours)
theme_neighbours_df


# percentage show

Unnamed: 0,theme,neighbours,percentages
0,low carb,"[F, D, Private Label, B]","[13.19, 11.24, 8.41, 3.72]"
1,french bisque,"[B, F, Private Label]","[9.46, 0.16, 0.08]"
2,soy foods,[],[]
3,no additives/preservatives,"[B, D]","[16.54, 9.78]"
4,vegetarian,[H],[0.03]
5,crab,"[F, Private Label]","[14.16, 0.07]"
6,ethnic & exotic,[B],[18.12]
7,gmo free,"[B, Private Label, H]","[0.42, 0.03, 0.02]"
8,low sodium,"[B, F]","[6.9, 3.9]"
9,high/source of protein,[],[]


### salmon theme

In [83]:
# for beans theme
product_id_for_slamon = list(set(theme_product_list_df_clean[theme_product_list_df_clean['claim_id'] == 227]['product_id'].tolist()))
prod_sales_salmon = prod_sales[prod_sales['product_id'].isin(product_id_for_slamon)]
vendor_sales_salmon = prod_sales_salmon.groupby('vendor')['sales_dollars_value'].sum().sort_values(ascending=False)

In [114]:
plt.figure(figsize=(8, 8))
plt.pie(vendor_sales_salmon, labels=vendor_sales_salmon.index, autopct='%1.1f%%', startangle=140,textprops={'fontsize': 20})
# plt.title('Total Sales in Dollars by Vendor (salmon Products)',fontsize= 20)
plt.axis('equal')  # Equal aspect ratio ensures that the pie chart is drawn as a circle.

plt.savefig('vendor_sales_salmon.png', bbox_inches='tight')

In [110]:
# plt.figure(figsize=(10, 6))
# vendor_sales_salmon.plot(kind='pie',fontsize = 18)
# plt.title('Total Sales in Dollars by Vendor (Beans Products)',fontsize = 10)
# # plt.xlabel('Vendor',fontsize = 20)
# # plt.ylabel('Total Sales Dollars',fontsize = 20)
# plt.xticks(rotation=45, ha='right')

# plt.tight_layout()  # To ensure labels and ticks are visible properly.
# plt.savefig('vendor_sales_salmon.png', bbox_inches='tight')

### What are the themes which are emerging in social media, Google Search & Sales?

### emerging percentage bw one year to another

In [85]:
# social_media_data_df_clean.info()
grouped_data = social_media_data_df_clean[social_media_data_df_clean['year']> 2017].groupby(['theme_id','year'])['total_post'].sum().reset_index()


# Create a separate line graph for each 'theme_id'
unique_themes = grouped_data['theme_id'].unique()

plt.figure(figsize=(10, 6))  # Optional: Adjust the size of the figure

# Plotting a line for each 'theme_id'
for theme_id in unique_themes:
    data_for_theme = grouped_data[grouped_data['theme_id'] == theme_id]
    plt.plot(data_for_theme['year'], data_for_theme['total_post'], label=f'Theme {theme_id}')

plt.xlabel('Year')
plt.ylabel('Total Posts')
plt.title('Total Number of Posts by Theme and Year')
plt.legend(loc='best')
plt.grid(True)
plt.tight_layout()  # Optional: Adjust the layout for better visualization
# plt.savefig('emerging theme', bbox_inches='tight')

Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations.
Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations.


In [86]:
# Find the starting and ending years for each theme
starting_year = grouped_data.groupby('theme_id')['year'].min()
ending_year = grouped_data.groupby('theme_id')['year'].max()
# print(starting_year)
# Calculate the total posts for the starting and ending years for each theme
starting_total_posts = grouped_data[grouped_data['year'] == starting_year.iloc[0]]['total_post']
ending_total_posts = grouped_data[grouped_data['year'] == ending_year.iloc[0]]['total_post']

# Calculate the percentage increase for each theme
percentage_increase = ((ending_total_posts.values - starting_total_posts.values) / starting_total_posts.values) * 100

# Create a new DataFrame to store the results
result_df = pd.DataFrame({
    'theme_id': starting_year.index,
    'starting_year': starting_year.values,
    'ending_year': ending_year.values,
    'starting_total_posts': starting_total_posts.values,
    'ending_total_posts': ending_total_posts.values,
    'percentage_increase': percentage_increase
})

result_df.sort_values('percentage_increase',ascending = False, inplace = True)
result_df = result_df.merge(theme_list_df_clean, left_on= 'theme_id', right_on = 'claim_id', how  = 'inner')
result_df#[['claim_name','percentage_increase']]

invalid value encountered in true_divide


Unnamed: 0,theme_id,starting_year,ending_year,starting_total_posts,ending_total_posts,percentage_increase,claim_id,claim_name
0,704,2018,2019,100,4266,4166.000000,704,peanut butter
1,138,2018,2019,90,369,310.000000,138,venison
2,919,2018,2019,3,12,300.000000,919,butternut squash
3,842,2018,2019,71,238,235.211268,842,cranberry
4,385,2018,2019,123,358,191.056911,385,peanut
...,...,...,...,...,...,...,...,...
188,393,2018,2019,48858,9066,-81.444185,393,sea salt
189,982,2018,2019,38771,5919,-84.733435,982,kale
190,727,2018,2019,14,2,-85.714286,727,seaweed
191,139,2018,2019,1,0,-100.000000,139,fruit


### themes which are emerging in social media for year 2018

In [117]:
social_18 = social_media_data_df_clean[social_media_data_df_clean['year']== 2018].sort_values('published_date')
# grouped_data


# Filter data for the beginning of the year (January 1, 2017)
start_of_year = social_18[social_18['published_date'] == '2018-01-01']
# Filter data for the end of the year (December 31, 2017)
end_of_year = social_18[social_18['published_date'] == '2018-12-31']

# Group the DataFrame by 'theme_id' and calculate the total posts for each theme
total_posts_start = start_of_year.groupby('theme_id')['total_post'].sum()
total_posts_end = end_of_year.groupby('theme_id')['total_post'].sum()

# Calculate the percentage increase for each theme
percentage_increase = ((total_posts_end - total_posts_start) / total_posts_start) * 100
# print(percentage_increase)

# Create a new DataFrame to store the results
result_df = pd.DataFrame({
    'theme_id': percentage_increase.index,
    'percentage_increase': percentage_increase.values
})
result_df.sort_values('percentage_increase',ascending = False, inplace = True)
result_df = result_df.merge(theme_list_df_clean, left_on= 'theme_id', right_on = 'claim_id', how  = 'inner')
result_df['percentage_increase'] =  list(map(lambda x: round(x, 2), result_df['percentage_increase']))
result_df[['claim_name','percentage_increase']].head(15)

Unnamed: 0,claim_name,percentage_increase
0,seeds,1300.0
1,salmon,706.25
2,kale,624.24
3,sticks,570.0
4,prebiotic,550.0
5,high/source of protein,400.0
6,gingerbread,300.0
7,cola,211.76
8,convenience - easy-to-prepare,197.26
9,beef,142.38


In [132]:
social_18_seeds = social_18[social_18['theme_id'] == 227]
# social_18_seeds['week'] = social_18_seeds['published_date'].apply(lambda x : x.isocalendar()[1])
social_18_seeds_week = social_18_seeds.groupby('published_date')['total_post'].sum()


plt.figure(figsize=(10, 6))  

# Plot the data as a line graph
plt.plot(social_18_seeds_week.index, social_18_seeds_week.values, marker='o', linestyle='-')

# Set labels and title
plt.xlabel('Week')
plt.ylabel('Total Posts')
plt.title('Total Posts per Week')
plt.savefig('Total posts for seeds theme in 2018', bbox_inches='tight')

In [129]:
social_18_seeds[social_18_seeds['published_date'] == '2018-01-01']

Unnamed: 0,theme_id,published_date,total_post,year,week
308489,985,2018-01-01,1,2018,1


### themes which are emerging in Google Search  for 2018

In [119]:
# google_search_data_df_clean.info()
google_18 = google_search_data_df_clean[google_search_data_df_clean['year'] == 2018].sort_values('date')


# Filter data for the beginning of the year (January 1, 2017)
start_of_year = google_18[google_18['date'] == '2018-01-01']
# Filter data for the end of the year (December 31, 2017)
end_of_year = google_18[google_18['date'] == '2018-12-31']

# Group the DataFrame by 'theme_id' and calculate the total posts for each theme
total_posts_start = start_of_year.groupby('claim_id')['search_volume'].sum()
total_posts_end = end_of_year.groupby('claim_id')['search_volume'].sum()

# Calculate the percentage increase for each theme
percentage_increase = ((total_posts_end - total_posts_start) / total_posts_start) * 100
# print(percentage_increase)

# Create a new DataFrame to store the results
result_df = pd.DataFrame({
    'theme_id': percentage_increase.index,
    'percentage_increase': percentage_increase.values
})
result_df.sort_values('percentage_increase',ascending = False, inplace = True)
result_df = result_df.merge(theme_list_df_clean, left_on= 'theme_id', right_on = 'claim_id', how  = 'inner')
result_df['percentage_increase'] =  list(map(lambda x: round(x, 2), result_df['percentage_increase']))
result_df[['claim_name','percentage_increase']].head(15)

Unnamed: 0,claim_name,percentage_increase
0,bone health,2471.6
1,sports & recovery,1179.12
2,no added sugar,988.68
3,eye health,411.95
4,traditional,379.05
5,dry,332.29
6,soy foods,317.72
7,allergy,283.88
8,french bisque,272.09
9,vegetarian,258.45


In [156]:
from scipy.signal import savgol_filter
google_1819 = google_search_data_df_clean[google_search_data_df_clean['year'].isin([2018,2019])].sort_values('date')

google_1819_bone = google_1819[google_1819['claim_id'] == 191]
google_1819_bone['week'] = google_1819_bone['date'].apply(lambda x : x.isocalendar()[1])
google_1819_bone_week = google_1819_bone.groupby('date')['search_volume'].sum()

google_1819_bone_week_smooth = savgol_filter(google_1819_bone_week.values, window_length=30, polyorder=1)

plt.figure(figsize=(10, 6))  

# Plot the data as a line graph
plt.plot(google_1819_bone_week.index, google_1819_bone_week_smooth, linestyle='-')

# Set labels and title
plt.xlabel('Week')
plt.ylabel('Total Posts')
plt.title('Total Posts per Week')
plt.savefig('Total posts for seeds theme in 2018', bbox_inches='tight')

### themes which are emerging in sales  for 2018

In [89]:
sales_theme_product = pd.merge(sales_data_df_clean[sales_data_df_clean['system_calendar_key_n'].dt.year == 2018], theme_product_list_df_clean, on = 'product_id',  how='inner', validate='m:m')
merge_info(sales_data_df_clean[sales_data_df_clean['system_calendar_key_n'].dt.year == 2018],  theme_product_list_df_clean,sales_theme_product)
# sales_data_df_clean.info()

Unnamed: 0,n_cols,n_rows
left_df,5,1248368
right_df,2,91485
merged_df,6,2188399


In [120]:

sales_18 = sales_theme_product[sales_theme_product['system_calendar_key_n'].dt.year == 2018].sort_values('system_calendar_key_n')

# Filter data for the beginning of the year (January 1, 2017)
start_of_year = sales_18[sales_18['system_calendar_key_n'] == '2018-01-06']
# Filter data for the end of the year (December 31, 2017)
end_of_year = sales_18[sales_18['system_calendar_key_n'] == '2018-12-29']
# print(start_of_year)

# Group the DataFrame by 'theme_id' and calculate the total posts for each theme
total_posts_start = start_of_year.groupby('claim_id')['sales_dollars_value'].sum()
total_posts_end = end_of_year.groupby('claim_id')['sales_dollars_value'].sum()

# Calculate the percentage increase for each theme
percentage_increase = ((total_posts_end - total_posts_start) / total_posts_start) * 100
# print(percentage_increase)

# Create a new DataFrame to store the results
result_df = pd.DataFrame({
    'theme_id': percentage_increase.index,
    'percentage_increase': percentage_increase.values
})
result_df.sort_values('percentage_increase',ascending = False,inplace = True)
result_df = result_df.merge(theme_list_df_clean, left_on= 'theme_id', right_on = 'claim_id', how  = 'inner')
result_df['percentage_increase'] =  list(map(lambda x: round(x, 2), result_df['percentage_increase']))
result_df[['claim_name','percentage_increase']].head(15)

Unnamed: 0,claim_name,percentage_increase
0,bone health,1278.83
1,beef hamburger,308.77
2,energy/alertness,282.62
3,low sugar,180.94
4,vegetarian,151.85
5,convenience - easy-to-prepare,89.79
6,beans,88.42
7,peach,60.24
8,nuts,41.78
9,prebiotic,39.17


#  aggregation

### sales data

In [191]:
# only themes which are common in 3 data sets are considered
common_theme_ids = theme_list_df_clean[theme_list_df_clean['claim_name'].isin(common_themes)]['claim_id'].tolist()

# merge bw sales and prod manufacturer list --> adds vendor column
sales_manufacture = pd.merge(sales_data_df_clean, prod_df_clean, on = 'product_id',  how='inner', validate='m:1')
sales_manufacture_A = sales_manufacture[sales_manufacture['vendor'] == 'A']

# theme product list only for common theme ids
theme_product_list_df_clean_common = theme_product_list_df_clean[theme_product_list_df_clean['claim_id'].isin(common_theme_ids)]

# merged the data bw sales with vendor and common theme product list to include claim id
sales_manufacture_theme = pd.merge(sales_manufacture_A, theme_product_list_df_clean_common, on = 'product_id',  how='inner', validate='m:m')
merge_info(sales_manufacture,  theme_product_list_df_clean_common, sales_manufacture_theme)

Unnamed: 0,n_cols,n_rows
left_df,6,4526182
right_df,2,39331
merged_df,7,338918


In [92]:
sales_manufacture_theme['system_calendar_key_n'].dt.year.max()

2019

In [192]:
# aggrigate the sales  data in weekly basis
sales_manufacture_theme['week'] = sales_manufacture_theme['system_calendar_key_n'].apply(lambda x : x.isocalendar()[1])
sales_manufacture_theme['year'] = sales_manufacture_theme['system_calendar_key_n'].dt.year
sales_manufacture_theme_week = sales_manufacture_theme.drop(['system_calendar_key_n','product_id'],axis = 1)
sales_manufacture_theme_week = sales_manufacture_theme_week.groupby(['claim_id','year','week'],as_index = False).agg(
    sales_dollars_value = ('sales_dollars_value','sum'),
    sales_units_value = ('sales_units_value','sum'),
    sales_lbs_value = ('sales_lbs_value','sum') )

sales_manufacture_theme_week.head()

Unnamed: 0,claim_id,year,week,sales_dollars_value,sales_units_value,sales_lbs_value
0,8,2016,1,8853853,1952575,2947044
1,8,2016,2,8705721,1921171,2901409
2,8,2016,3,8839233,1968251,2954048
3,8,2016,4,8872568,1974980,2941180
4,8,2016,5,9166248,2092314,3012709


In [193]:
# sales_manufacture_theme_week['claim_id'].value_counts()

### social media data

In [167]:
social_media_data_df_clean_common = social_media_data_df_clean[social_media_data_df_clean['theme_id'].isin(common_theme_ids)]
import datetime
# aggrigate the social media  data in weekly basis
social_media_data_df_clean_common['published_date'] = social_media_data_df_clean_common['published_date'] + datetime.timedelta(days=14)
social_media_data_df_clean_common['week'] = social_media_data_df_clean_common['published_date'].apply(lambda x : x.isocalendar()[1])
social_media_data_df_clean_common['year'] = social_media_data_df_clean_common['published_date'].dt.year
social_media_data_df_clean_common_week = social_media_data_df_clean_common.drop(['published_date'],axis = 1)
social_media_data_df_clean_common_week = social_media_data_df_clean_common_week.groupby(['theme_id','year','week'],as_index = False).agg(
    total_post = ('total_post','sum') )

social_media_data_df_clean_common_week.head()

Unnamed: 0,theme_id,year,week,total_post
0,8,2015,4,1041
1,8,2015,5,123
2,8,2015,8,406
3,8,2015,9,1042
4,8,2015,12,429


In [196]:
social_media_data_df_clean_common_week.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7397 entries, 0 to 7396
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   theme_id    7397 non-null   int64
 1   year        7397 non-null   int64
 2   week        7397 non-null   int64
 3   total_post  7397 non-null   int64
dtypes: int64(4)
memory usage: 231.3 KB


### google search

In [187]:
google_search_data_df_clean_common = google_search_data_df_clean[google_search_data_df_clean['claim_id'].isin(common_theme_ids)]

# aggrigate the social media  data in weekly basis
google_search_data_df_clean_common['date'] = google_search_data_df_clean_common['date']  + datetime.timedelta(days=7)
google_search_data_df_clean_common['week'] = google_search_data_df_clean_common['date'].apply(lambda x : x.isocalendar()[1])
google_search_data_df_clean_common['year'] = google_search_data_df_clean_common['date'].dt.year
google_search_data_df_clean_common_week = google_search_data_df_clean_common.drop(['date','platform','year_new','week_number'],axis = 1)
google_search_data_df_clean_common_week = google_search_data_df_clean_common_week.groupby(['claim_id','year','week'],as_index = False).agg(
    search_volume = ('search_volume','sum') )

google_search_data_df_clean_common_week.head()

Unnamed: 0,claim_id,year,week,search_volume
0,8,2014,1,16659
1,8,2014,2,26833
2,8,2014,3,40695
3,8,2014,4,54171
4,8,2014,5,39676


In [157]:
google_search_data_df_clean_common_week.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6390 entries, 0 to 6389
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   claim_id       6390 non-null   int64
 1   year           6390 non-null   int64
 2   week           6390 non-null   int64
 3   search_volume  6390 non-null   int64
dtypes: int64(4)
memory usage: 199.8 KB


## merge

### merge bw social and google data

In [199]:

social_google = pd.merge(social_media_data_df_clean_common_week,google_search_data_df_clean_common_week, how='inner', 
left_on=['theme_id','year','week'],right_on=['claim_id','year','week'], validate='1:1')

merge_info(social_media_data_df_clean_common_week, google_search_data_df_clean_common_week, social_google)

Unnamed: 0,n_cols,n_rows
left_df,4,7397
right_df,4,6408
merged_df,6,5177


In [200]:
social_google.drop(['theme_id'],axis = 1, inplace  = True)
social_google.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5177 entries, 0 to 5176
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   year           5177 non-null   int64
 1   week           5177 non-null   int64
 2   total_post     5177 non-null   int64
 3   claim_id       5177 non-null   int64
 4   search_volume  5177 non-null   int64
dtypes: int64(5)
memory usage: 242.7 KB


### merge bw social_google and sales data

In [201]:
social_google_sales = pd.merge(social_google,sales_manufacture_theme_week, how='inner', 
on=['claim_id','year','week'], validate='1:1')
merge_info(social_google, sales_manufacture_theme_week, social_google_sales)

Unnamed: 0,n_cols,n_rows
left_df,5,5177
right_df,6,3231
merged_df,8,2630


# final data after merge

In [205]:
social_google_sales.head()

Unnamed: 0,year,week,total_post,claim_id,search_volume,sales_dollars_value,sales_units_value,sales_lbs_value
0,2016,1,726,8,49214,8853853,1952575,2947044
1,2016,2,1180,8,46543,8705721,1921171,2901409
2,2016,3,2014,8,44716,8839233,1968251,2954048
3,2016,4,1586,8,53664,8872568,1974980,2941180
4,2016,5,1357,8,47560,9166248,2092314,3012709


In [98]:
# week_number and week are different
google_search_data_df_clean_common[google_search_data_df_clean_common['week_number'] != google_search_data_df_clean_common['week']].head()

Unnamed: 0,date,claim_id,platform,year_new,week_number,search_volume,year,week
493,2014-01-02,8,google,2014,5,6104,2014,1
494,2014-01-02,75,google,2014,5,809,2014,1
495,2014-01-02,81,google,2014,5,898,2014,1
496,2014-01-02,100,google,2014,5,659,2014,1
497,2014-01-02,148,google,2014,5,1368,2014,1


In [354]:
sales_manufacture_theme_week.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1511 entries, 0 to 1510
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   claim_id             1511 non-null   int64  
 1   week                 1511 non-null   int64  
 2   sales_dollars_value  1511 non-null   float64
 3   sales_units_value    1511 non-null   int64  
 4   sales_lbs_value      1511 non-null   int64  
dtypes: float64(1), int64(4)
memory usage: 59.1 KB


In [45]:
prod_sales.head()

Unnamed: 0,product_id,vendor,system_calendar_key_n,sales_dollars_value,sales_units_value,sales_lbs_value
0,1,Others,09-01-2016,13927.0,934,18680
1,1,Others,23-01-2016,12628.0,878,17564
2,1,Others,06-02-2016,11379.0,810,16200
3,1,Others,30-01-2016,11568.0,821,16424
4,1,Others,13-02-2016,10959.0,784,15682


In [46]:
social_media_data_df_clean[['theme_id','published_date']].value_counts()
social_media_data_df_clean[(social_media_data_df_clean['theme_id'] == 788) & (social_media_data_df_clean['published_date'] == '30-07-2019')]
# social_media_data_df_clean_t[(social_media_data_df_clean_t['theme_id'] == 788) & (social_media_data_df_clean_t['published_date'] == '30-07-2019')]
social_media_data_df_clean['theme_id'].nunique()

193

In [94]:
# popular theme in social media
df1 = social_media_data_df_clean_16.groupby(['theme_id'], as_index = False)['total_post'].sum().sort_values('total_post', ascending = False)
popular_theme_id_list_social = df1[df1['total_post']> 400000]['theme_id'].tolist()
popular_theme_id_list_social
# df1.head(10)
social_media_data_df_clean_popular_16 = social_media_data_df_clean_16[social_media_data_df_clean_16['theme_id'].isin(popular_theme_id_list_social)]

In [92]:
# popular theme in social to google
google_search_data_df_clean_popular_1617 = google_search_data_df_clean_1617[google_search_data_df_clean_1617['claim_id'].isin(popular_theme_id_list_social)]


In [49]:
google_search_data_df_clean[['date','claim_id','platform','year_new','week_number']].value_counts()
# google_search_data_df_clean.groupby(['date','claim_id','platform','year_new','week_number'])['search_volume'].sum()

date        claim_id  platform  year_new  week_number
01-01-2014  8         google    2014      1              1
21-03-2019  152       amazon    2019      12             1
            65        google    2019      12             1
            75        amazon    2019      12             1
                      google    2019      12             1
                                                        ..
11-02-2016  158       google    2016      6              1
            163       google    2016      6              1
            177       google    2016      6              1
            187       google    2016      6              1
31-12-2018  980       google    2019      1              1
Length: 179082, dtype: int64

In [95]:
social_and_google = pd.merge(social_media_data_df_clean_popular_16, google_search_data_df_clean_popular_1617, left_on = 'theme_id', right_on = 'claim_id',  how='inner', validate='m:m')
merge_info(social_media_data_df_clean_popular_16,  google_search_data_df_clean_popular_1617,social_and_google)

Unnamed: 0,n_cols,n_rows
left_df,4,1464
right_df,7,2924
merged_df,11,1070184


In [96]:
social_and_google[social_and_google['published_date'] < social_and_google['date']]
social_and_google.drop('claim_id',axis = 1)

Unnamed: 0,theme_id,published_date,total_post,year_x,date,claim_id,platform,year_new,week_number,search_volume,year_y
1,576,2016-01-01,2188,2016,2017-01-01,576,google,2017,52,22382,2017
2,576,2016-01-01,2188,2016,2016-01-02,576,google,2016,5,19076,2016
3,576,2016-01-01,2188,2016,2017-01-02,576,google,2017,5,28533,2017
4,576,2016-01-01,2188,2016,2016-01-03,576,google,2016,9,15300,2016
5,576,2016-01-01,2188,2016,2017-01-03,576,google,2017,9,27953,2017
...,...,...,...,...,...,...,...,...,...,...,...
1070175,770,2016-12-31,968,2016,2017-05-31,770,google,2017,22,20423,2017
1070177,770,2016-12-31,968,2016,2017-07-31,770,google,2017,31,12963,2017
1070179,770,2016-12-31,968,2016,2017-08-31,770,google,2017,35,17694,2017
1070181,770,2016-12-31,968,2016,2017-10-31,770,google,2017,44,15774,2017


In [58]:
google_search_data_df_clean[google_search_data_df_clean['claim_id'] == 916].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 372 entries, 0 to 177423
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   date           372 non-null    object
 1   platform       372 non-null    object
 2   search_volume  372 non-null    int64 
 3   claim_id       372 non-null    int64 
 4   week_number    372 non-null    int64 
 5   year_new       372 non-null    int64 
dtypes: int64(4), object(2)
memory usage: 20.3+ KB


In [32]:
theme_list_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208 entries, 0 to 207
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   CLAIM_ID    208 non-null    int64 
 1   Claim Name  208 non-null    object
dtypes: int64(1), object(1)
memory usage: 3.4+ KB


In [33]:
theme_theme_product = pd.merge(theme_list_df_clean, google_search_data_df_clean, how='inner', left_on='CLAIM_ID',right_on = 'Claim_ID', validate='1:m')
# merge_info(prod_df_clean, sales_data_df_clean, prod_sales)

In [40]:
theme_theme_product['Claim_ID'].nunique()

160

In [39]:
theme_theme_product.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 181565 entries, 0 to 181564
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   CLAIM_ID      181565 non-null  int64 
 1   Claim Name    181565 non-null  object
 2   date          181565 non-null  object
 3   platform      181565 non-null  object
 4   searchVolume  181565 non-null  int64 
 5   Claim_ID      181565 non-null  int64 
 6   week_number   181565 non-null  int64 
 7   year_new      181565 non-null  int64 
dtypes: int64(5), object(3)
memory usage: 12.5+ MB


In [34]:
theme_product_list_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91485 entries, 0 to 91484
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   PRODUCT_ID  91485 non-null  int64
 1   CLAIM_ID    91485 non-null  int64
dtypes: int64(2)
memory usage: 1.4 MB


In [36]:
theme_theme_product_gsearch = pd.merge(theme_theme_product, theme_product_list_df_clean, how='inner', on = 'CLAIM_ID', validate='m:m')
# merge_info(prod_df_clean, sales_data_df_clean, prod_sales)

: 

: 

## 2.3 Business intuition features

This section can go into production code if these features are used in final model

#### First Time Customer.
- A binary feature that tells if the customer is in business for the first time or not.

In [12]:
# first time customer
cust_details = sales_df.groupby(['customername']).agg({'ledger_date':'min'}).reset_index()
cust_details.columns = ['customername','ledger_date']
cust_details['first_time_customer'] = 1
sales_df = sales_df.merge(cust_details, on=['customername','ledger_date'], how='left')
sales_df['first_time_customer'].fillna(0, inplace=True)

#### Days Since Last Purchase of a customer
- Feature representing the number of days from the last purchase of a customer. 
- Quantifies the Gaps customers take b/w purchases

In [13]:
#### days since last purchase
sales_df.sort_values('ledger_date',inplace=True)
sales_df['days_since_last_purchase'] = (
    sales_df
       .groupby('customername')['ledger_date']
       .diff()
       .dt.days
       .fillna(0, downcast='infer'))

In [14]:
# create a sample dataframe with minimal processing

sales_df_processed = (
    sales_df
    
    # tweak to test pipeline quickly or profile performance
    #.sample(frac=1, replace=False)
    
    # any additional processing/cleaning
)

# Any verifications on the data
from ta_lib.eda.api import get_variable_summary
display_as_tabs([
    ("Summary", f"Length: {len(sales_df_processed)}, Columns: {len(sales_df_processed.columns)}"),
    ("Variable summary", get_variable_summary(sales_df_processed)),
    ("head", sales_df.head(5).T),
    ("tail", sales_df.tail(5).T),
])

tzname GB identified but not understood.  Pass `tzinfos` argument in order to correctly return a timezone-aware datetime.  In a future version, this will raise an exception.


In [15]:
save_dataset(context, sales_df_processed, 'cleaned/sales')

# 3. Generate Train, Validation and Test datasets



- We split the data into train, test (optionally, also a validation dataset)
- In this example, we are binning the target into 10 quantiles and then use a Stratified Shuffle to split the data.
- See sklearn documentation on the various available splitters
- https://scikit-learn.org/stable/modules/classes.html#splitter-classes
- This will go into production code (training only)

In [16]:
from sklearn.model_selection import StratifiedShuffleSplit
from ta_lib.core.api import custom_train_test_split  # helper function to customize splitting
from scripts import *

splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=context.random_seed)
sales_df_train, sales_df_test = custom_train_test_split(sales_df_processed, splitter, by=binned_selling_price)

In [17]:
target_col = "unit_price"

train_X, train_y = (
    sales_df_train
    
    # split the dataset to train and test
    .get_features_targets(target_column_names=target_col)
)
save_dataset(context, train_X, 'train/sales/features')
save_dataset(context, train_y, 'train/sales/target')


test_X, test_y = (
    sales_df_test
    
    # split the dataset to train and test
    .get_features_targets(target_column_names=target_col)
)
save_dataset(context, test_X, 'test/sales/features')
save_dataset(context, test_y, 'test/sales/target')