# Modeing Rating and Reviews data

**PACKAGES**

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import datetime

import os
import pickle
import time
from tqdm.notebook import tqdm

from fuzzywuzzy import fuzz
import textdistance

import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score



In [2]:
tqdm.pandas()
code_start = time.time()

  from pandas import Panel


**CODE PARAMETERS**

In [3]:
# PANDAS DISPLAY PARAMETERS
# Setting the number of maximum columns and rows to display
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
# pd.set_option('display.width', None)
# pd.set_option('display.max_colwidth', 1)

In [4]:
# PATHS FOR RATINGS AND REVIEWS
input_reviews_dir_path = r'../data/raw_data/Rating and Reviews/'
output_reviews_dir_path = r'../data/clean_data/'

# PATHS FOR PRODUCT CATALOGUE
input_product_dir_path = r'../data/raw_data/Product Catalogue/'
output_product_dir_path = r'../data/clean_data/'

# PATHS FOR DEMAND
input_demand_dir_path = r'../data/raw_data/Demand Data/'
output_demand_dir_path = r'../data/clean_data/'

# PATHS FOR MAPPINGS
input_mappings_dir_path = r'../data/clean_data/'
output_mappings_dir_path = r'../data/clean_data/'

## Data Wrangling

**Quick explanation:**

The data is broken into *Product Catalogues* and *Ratings and Reviews*. *Product Catalogues* contain data about the products. Ratings and Reviews contain data about reviews (EZ). Each of the two are broken into *skincare* and *cosmetics*. The granularity of the data is *Brand* and *Sub Category* also called *ELC Solution Type* (lipstick, gloss etc.). The *Rating and Reviews* data contains the *ELC Solution Type* column but does not contain the *Brand* column. The *Brand* can be retrived from the *Product Catalogue*.

Therefore, we need to join the *Rating and Reviews* data with the *Product Catalogue* then join that to the demand data and finally apply our models.

NB: The *Product Catalogues* and *Ratings and Reviews* files get uploaded to a shared folder once in a while. The *demand* data is downloaded from a certain server.

### Wrangling Demand Data

Before this code, I manually deleted the first row of the xlsx file and saved it as a .csv file. A lot of formating needs to be done to transform the data into a Pandas friendly DataFrame.

#### Read data

In [5]:
# Read data
demand = pd.read_csv(os.path.join(input_demand_dir_path,'full_demand.csv'), encoding="ISO-8859-1")

#### Data description

The demand data is a bit messy, all the number are in text format. The brands are represented using abbreviations but we have a mapping between brands and there abbreviation. 

In [6]:
print("DATA SAMPLE:")
display(demand.sample(3))
print("DATA DESCRIPTION:")
display(demand.describe(include='all'))

DATA SAMPLE:


Unnamed: 0,Brand,ItemID 9,Item Description,Affiliate,Major Category ID,Major Category,Application ID,Application,Category ID,Category,Sub Category ID,Sub Category,Product Line ID,Product Line,Sub Product Line ID,Sub Product Line,Major Inventory,Inventory,7/1/2013,8/1/2013,9/1/2013,10/1/2013,11/1/2013,12/1/2013,1/1/2014,2/1/2014,3/1/2014,4/1/2014,5/1/2014,6/1/2014,7/1/2014,8/1/2014,9/1/2014,10/1/2014,11/1/2014,12/1/2014,1/1/2015,2/1/2015,3/1/2015,4/1/2015,5/1/2015,6/1/2015,7/1/2015,8/1/2015,9/1/2015,10/1/2015,11/1/2015,12/1/2015,1/1/2016,2/1/2016,3/1/2016,4/1/2016,5/1/2016,6/1/2016,7/1/2016,8/1/2016,9/1/2016,10/1/2016,11/1/2016,12/1/2016,1/1/2017,2/1/2017,3/1/2017,4/1/2017,5/1/2017,6/1/2017,7/1/2017,8/1/2017,9/1/2017,10/1/2017,11/1/2017,12/1/2017,1/1/2018,2/1/2018,3/1/2018,4/1/2018,5/1/2018,6/1/2018,7/1/2018,8/1/2018,9/1/2018,10/1/2018,11/1/2018,12/1/2018,1/1/2019,2/1/2019,3/1/2019,4/1/2019,5/1/2019,6/1/2019,7/1/2019,8/1/2019,9/1/2019,10/1/2019,11/1/2019,12/1/2019,1/1/2020,2/1/2020,3/1/2020,4/1/2020,5/1/2020,6/1/2020
22694,OR,0J9J400000,YOUTHTOPIA CREAM REFORM 5ML,US,520,Skincare,25,Face,31,Moisturizers,80,Moisturizers,349,(D) Anti Aging,152,(D) Youthtopia,Promotional,Samples,-,-,-,-,-,1,1,-,5,-,-,-,-,-,3,-,-,1,-,-,1,2,1,771,2,-,-,1,-,-,245,1,3,1,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-
28677,BB,ECT0050990,FACE TOUCH UP PALETTE-WN,US,510,Makeup,25,Face,25,Concealers & Correct,M0,Palette,53,Bobbi Brown,775,Bobbi Brown,Promotional,Samples,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,84,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-
1960,CL,7JGK020000,PORE INSTANT PERFECTING MU,US,510,Makeup,25,Face,28,Foundation,26,Liquid Foundation,286,Pore Refining Soluti,231,All Pore Refining Solutions Products,Saleable,Standard Saleable,942,2979,2262,1467,1443,1800,1521,3009,1899,1560,2413,1503,426,2098,1581,1163,1913,1102,1787,1915,2160,1181,1579,1293,1291,1494,1011,867,913,550,153,159,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-


DATA DESCRIPTION:


Unnamed: 0,Brand,ItemID 9,Item Description,Affiliate,Major Category ID,Major Category,Application ID,Application,Category ID,Category,Sub Category ID,Sub Category,Product Line ID,Product Line,Sub Product Line ID,Sub Product Line,Major Inventory,Inventory,7/1/2013,8/1/2013,9/1/2013,10/1/2013,11/1/2013,12/1/2013,1/1/2014,2/1/2014,3/1/2014,4/1/2014,5/1/2014,6/1/2014,7/1/2014,8/1/2014,9/1/2014,10/1/2014,11/1/2014,12/1/2014,1/1/2015,2/1/2015,3/1/2015,4/1/2015,5/1/2015,6/1/2015,7/1/2015,8/1/2015,9/1/2015,10/1/2015,11/1/2015,12/1/2015,1/1/2016,2/1/2016,3/1/2016,4/1/2016,5/1/2016,6/1/2016,7/1/2016,8/1/2016,9/1/2016,10/1/2016,11/1/2016,12/1/2016,1/1/2017,2/1/2017,3/1/2017,4/1/2017,5/1/2017,6/1/2017,7/1/2017,8/1/2017,9/1/2017,10/1/2017,11/1/2017,12/1/2017,1/1/2018,2/1/2018,3/1/2018,4/1/2018,5/1/2018,6/1/2018,7/1/2018,8/1/2018,9/1/2018,10/1/2018,11/1/2018,12/1/2018,1/1/2019,2/1/2019,3/1/2019,4/1/2019,5/1/2019,6/1/2019,7/1/2019,8/1/2019,9/1/2019,10/1/2019,11/1/2019,12/1/2019,1/1/2020,2/1/2020,3/1/2020,4/1/2020,5/1/2020,6/1/2020
count,37119,37119,37119,37119,37119.0,37119,37119.0,37119,37119.0,37119,37119.0,37119,37119.0,37119,37119.0,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119,37119
unique,21,37119,17712,1,,5,23.0,23,94.0,93,212.0,208,418.0,418,1120.0,1118,2,6,3534,3816,3981,3804,3971,3913,3336,3783,3940,3459,3941,4017,2900,3941,4112,3861,3861,3923,3482,3900,3969,3750,3692,4084,3605,3892,4172,3880,3919,4078,3536,4046,4160,3613,3725,4398,3515,3974,4146,3823,3991,3994,3441,3976,4196,3480,3806,4387,2931,3620,3563,3620,3427,3203,2754,3277,3693,2970,3098,3464,2503,3412,3669,3352,3374,3180,2777,3112,3612,3178,3349,3621,2956,3484,3391,3504,3475,3308,3090,3195,2819,1854,2184,2327
top,MC,AM0L010000,LIP COLOR,US,,Makeup,25.0,Face,11.0,Lipsticks,17.0,All Lipsticks,57.0,MAC,700.0,MAC,Saleable,Standard Saleable,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-
freq,7083,1,330,37119,,21117,11253.0,11253,4112.0,4112,4080.0,4080,3445.0,3445,3161.0,3161,29244,28755,28085,28019,27776,27664,27747,27758,27977,27913,27849,28118,27931,28408,28764,27949,27862,27654,27925,27740,27976,27862,27878,27866,27878,27677,27877,27626,27412,27324,27265,27072,27260,27202,27132,27234,27046,26946,27120,26868,26659,26580,26562,26507,26716,26690,26306,26570,26401,26220,28050,27704,27692,27566,27595,27847,28273,28144,27952,28376,28422,28046,28904,28246,28189,28246,28166,28478,28778,28721,27878,28096,28107,27735,28378,27945,28101,27595,27913,28108,28195,28190,28214,29947,29652,28838
mean,,,,,518.567041,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
std,,,,,14.394473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
min,,,,,510.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
25%,,,,,510.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
50%,,,,,510.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
75%,,,,,520.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


#### Data wrangling

In [7]:
# Pandas friendly column names
demand.columns = [col.lower().strip().replace(' ','_') for col in demand.columns]

In [8]:
# We extract the columns that contain demand data. They have the format (dd/mm/yyy)
# TODO: use regex. Bleh
dates_columns = demand.columns[demand.columns.str.contains('/')].tolist()

In [9]:
# In xlsx 0 is marked as '-' which cause the demand columns to be read as of type object (str).
# We need to replace the '-' by 0 and transform the type to int so that we can aggregate using sum().
temp = demand[dates_columns]
temp[temp == '-'] = 0
demand[dates_columns] = temp
del temp

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._where(-key, value, inplace=True)


In [10]:
demand[demand=='-'] = np.nan

  res_values = method(rvalues)


In [11]:
groupbycols = [col for col in demand.columns if not col in dates_columns]

In [12]:
# We aggregate the data
demand = demand.set_index(groupbycols)

In [13]:
# Finally we use stack to create a columns date, and have a row for each seperate month
demand.columns.name = 'date'
demand = demand.stack().to_frame('demand').reset_index()
demand['date'] = pd.to_datetime(demand['date'], errors='coerce').dt.to_period('m')
#.values.astype('datetime64[M]')

In [14]:
demand['demand'] = demand['demand'].str.replace(',','').fillna(0).astype(int)

In [15]:
demand['itemid_4'] = demand['itemid_9'].str[0:4] 

In [16]:
demand = demand.groupby([
    'brand', 
    'itemid_4', 
    'item_description',
    'date',])['demand'].sum().reset_index()

In [17]:
demand = demand.sort_values(['brand', 
    'itemid_4', 
    'item_description',
    'date'])

In [18]:
demand

Unnamed: 0,brand,itemid_4,item_description,date,demand
0,AR,2008,AR EAU DE TOILETTE,2013-07,219
1,AR,2008,AR EAU DE TOILETTE,2013-08,924
2,AR,2008,AR EAU DE TOILETTE,2013-09,690
3,AR,2008,AR EAU DE TOILETTE,2013-10,843
4,AR,2008,AR EAU DE TOILETTE,2013-11,1728
...,...,...,...,...,...
1738879,TF,T9A4,CCS SCOREMORE TF4A,2020-02,0
1738880,TF,T9A4,CCS SCOREMORE TF4A,2020-03,0
1738881,TF,T9A4,CCS SCOREMORE TF4A,2020-04,0
1738882,TF,T9A4,CCS SCOREMORE TF4A,2020-05,0


In [19]:
demand.to_csv(os.path.join(output_demand_dir_path,'demand.csv'), index=False)

In [20]:
# If the wrangled files already exist
demand = pd.read_csv(os.path.join(output_demand_dir_path,'demand.csv'))
demand.sample(10)

Unnamed: 0,brand,itemid_4,item_description,date,demand
1059931,GG,G03T,POWERMUD DUALCLEANSE TREATMENT,2015-02,0
950780,EL,RXE5,PURE COLOR ENVY SINGLE LONG NAV,2019-03,0
1244861,LL,J0T3,N36 EDP CP,2018-12,0
133,AR,2014,AR AFTER SHAVE,2017-08,0
1537961,OR,0N15,MAD HAND CREAM KEY CLIP,2013-12,0
1214535,KL,N38Y,STRAIGHT TO HEAVEN BY KL WH CRI,2018-10,0
246555,AV,AMJP,BEAUTIFYING PUREFUME MIST,2014-10,0
1163964,JM,L8TE,TRAVEL COLLECTION,2018-07,0
624674,CL,ZCTM,TA REVITALIZING INSTANT FACIAL,2017-09,2331
542278,CL,KAJ2,MS HYDRATING SUPERCHARGED CNCNT,2018-05,0


### Wrangling Rating and Reviews

#### Read and concatenate data

First we need to read all the files and concatenate them.

In [21]:
# List of available files
os.listdir(input_reviews_dir_path)

['cosmetics_reviews_20191130_final.csv',
 'cosmetics_reviews_20200101-20200131.csv',
 'cosmetics_reviews_20200229.csv',
 'cosmetics_reviews_20200331.csv',
 'old_files',
 'pwds',
 'skincare_reviews_20150201-20200131.csv',
 'skincare_reviews_20200229.csv',
 'skincare_reviews_20200331.csv',
 'template']

In [None]:
start = time.time()
reviews_skincare = pd.DataFrame()
reviews_cosmetics = pd.DataFrame()
for file in os.listdir(input_reviews_dir_path):
    if '.csv' in file.lower():
        if 'skincare' in file.lower():
            interm = time.time()
            print('\nReading', file)
            temp = pd.read_csv(os.path.join(input_reviews_dir_path,file), low_memory=False)
            temp = temp.loc[:, ~temp.columns.str.contains('^Unnamed')]
            temp.columns = [colname.lower().replace(' ','_') for colname in temp.columns]
            print('Concatenating', file)
            reviews_skincare=pd.concat([reviews_skincare, temp], ignore_index=True)
            print('Time for this dataset: %.0f seconds' % (time.time()-interm))
            print('Total time: %.0f seconds' % (time.time()-start))
        elif 'cosmetics' in file.lower():
            interm = time.time()
            print('\nReading', file)
            temp = pd.read_csv(os.path.join(input_reviews_dir_path,file), low_memory=False)
            temp = temp.loc[:, ~temp.columns.str.contains('^Unnamed')]
            temp.columns = [colname.lower().replace(' ','_') for colname in temp.columns]
            print('Concatenating', file)
            reviews_cosmetics=pd.concat([reviews_cosmetics, temp], ignore_index=True)
            print('Time for this dataset: %.0f seconds' % (time.time()-interm))
            print('Total time: %.0f seconds' % (time.time()-start))
del temp

In [None]:
reviews_cosmetics = reviews_cosmetics.drop_duplicates()
reviews_skincare = reviews_skincare.drop_duplicates()

In [22]:
reviews_cosmetics.to_csv(os.path.join(output_reviews_dir_path,'reviews_cosmetics.csv'), index=False)
reviews_skincare.to_csv(os.path.join(output_reviews_dir_path,'reviews_skincare.csv'), index=False)

NameError: name 'reviews_cosmetics' is not defined

In [23]:
# If concatenated files already exist
reviews_cosmetics = pd.read_csv(os.path.join(output_reviews_dir_path,'reviews_cosmetics.csv'), low_memory=False)
reviews_skincare = pd.read_csv(os.path.join(output_reviews_dir_path,'reviews_skincare.csv'), low_memory=False)

KeyboardInterrupt: 

#### Data description

Each review/post has a unique *OnlinePost_ID*. Each *OnlinePost_ID* is broken into different *Statements*. Each statement has a unique *OnlineStatement_ID* which is the concatenation of the *OnlineStatement_ID* and the rank of the statement. A statement is usually a sentence or a comment. The actual text is contained in the *Description* column.

The Reviews data and the Products data must be joined using *Source Product Identifier* and *Channel* as a compounded key.

**COSMETICS**

In [None]:
print("DATA SAMPLE:")
display(reviews_cosmetics.sample(3))
print("DATA DESCRIPTION:")
display(reviews_cosmetics.describe(include='all'))
print("% OF MISSING DATA:")
display((reviews_cosmetics.isna().sum()/len(reviews_cosmetics))*100)

**REVIEWS**

In [None]:
print("DATA SAMPLE:")
display(reviews_skincare.sample(3))
print("DATA DESCRIPTION:")
display(reviews_skincare.describe(include='all'))
print("% OF MISSING DATA:")
display((reviews_skincare.isna().sum()/len(reviews_skincare))*100)

#### Data wrangling

Here are some ideas about the aggregationg of R&R data :
1. Filter on Geography to only keep USA data
2. create date columns (year, month) 
3. change sentiment to numerical data : negative:-1, neurtal:0, positive:1
4. create sentiment one hot encodings 
5. groupby OnlineStatement_ID and aggregate by {sentiment:mean, sentiments_hot_one:average_count)
5. create rating one hot encodings
5. Groupby year, month, Channel, Source Product Identifier
6. Aggregate by {num_sentiment:mean, num_rating:mean, sentiments_hot_one:count, rating_hot_one:count}
7. Count the number of reviews

Then we concatenate cosmetic and skincare, and add a columns *type* to keep track of the source of the data.

**NB**: The columns are not exactly the same between the two types. By concatenting, we will have the union of the features of the two datasets and missing values will be filled with NaNs.

In [None]:
reviews_cosmetics['type'] = 'Cosmetics'
reviews_skincare['type'] = 'Skincare'
reviews = pd.concat([reviews_cosmetics, reviews_skincare], axis=0)

In [None]:
del reviews_cosmetics
del reviews_skincare

In [None]:
reviews.head(2)

In [None]:
# Filtering geographies on US
reviews = reviews[reviews['geography']=='USA']

In [None]:
# Creating date columns in the right dtype and dropping the day of the month: 2019-02-24 => 2019-02-01
reviews.loc[:,'clean_date'] = pd.to_datetime(reviews['date'], errors='coerce')
if reviews['clean_date'].isna().sum() > 0:
    print('{} rows have been dropped because the date format is wrong.'.format(reviews['clean_date'].isna().sum()))
    display(reviews.loc[reviews['clean_date'].isna(), 'date'])
    reviews = reviews.dropna(subset='date')
reviews['date'] = reviews['clean_date']  
reviews = reviews.drop('clean_date', axis=1)
reviews['date'] = reviews['date'].dt.to_period('m')

In [None]:
# Checking for missing data (NA => -1)
if reviews['rating'].isna().sum()>0:
    print('{} rows are missing ratings'.format(reviews['rating'].isna().sum()))
    reviews.loc[:,'rating'] = reviews['rating'].fillna(-1).astype(int)

if reviews['sentiment'].isna().sum()>0:
    print('{} rows are missing sentiments'.format(reviews['sentiment'].isna().sum()))
    reviews.loc[:,'sentiment'] = reviews['sentiment'].fillna(-1).astype(int)

In [None]:
# Transforming rating and sentiment to dummy variables (one-hot encoding)
reviews.loc[:,'sentiment'] = reviews['sentiment'].str.lower()
reviews.loc[:,'rating'] = reviews['rating'].astype(int)
reviews = pd.concat([reviews, pd.get_dummies(data=reviews[['rating','sentiment']], columns=['rating','sentiment'], dtype=int)], axis=1)

In [None]:
# Readding NAs data to ratings
reviews.loc[reviews['rating']==-1,'rating'] = np.nan
reviews.loc[reviews['sentiment']==-1,'sentiment'] = np.nan

In [None]:
# Transforming sentiment to integer data (positive:1; netural:0, negative:-1)
reviews.loc[:,'sentiment'] = reviews['sentiment_positive'] - reviews['sentiment_negative']

In [None]:
# Aggregating RR data by OnlinePost_ID

# Creating a column to count the number of statements by review once aggreagtion happens
reviews['nb_statements'] = reviews['sentiment']

reviews = reviews.groupby(['type',
                             'channel',
                             'source_product_identifier',  
                             'date',
                             'onlinepost_id']).agg({
    'nb_statements':'count',
    'rating':'first',
    'rating_1':'first',
    'rating_2':'first',
    'rating_3':'first',
    'rating_4':'first',
    'rating_5':'first',
    'sentiment_negative':'sum',
    'sentiment_neutral':'sum',
    'sentiment_positive':'sum',
    'sentiment':'mean'
}).reset_index()

# Normalize the one hot sentiment encoding counts (sentiment_negative, sentiment_neutral, sentiment_positive) by the nb_statement.
reviews[['sentiment_negative', 'sentiment_neutral', 'sentiment_positive']] = reviews[['sentiment_negative', 'sentiment_neutral', 'sentiment_positive']].div(reviews['nb_statements'], axis=0)

In [None]:
reviews.head(4)

In [None]:
# Aggregating RR data by channel + source_product_identifier

# Creating a column to count the number of reviews once aggreagtion happens
reviews['nb_reviews'] = reviews['rating']
reviews['avg_nb_statements'] = reviews['nb_statements']

reviews = reviews.groupby(['type',
                           'channel',
                           'source_product_identifier',
                           'date']).agg({
    'avg_nb_statements':'mean',
    'nb_reviews':'count',
    'rating':'mean',
    'rating_1':'sum',
    'rating_2':'sum',
    'rating_3':'sum',
    'rating_4':'sum',
    'rating_5':'sum',
    'sentiment_negative':'sum',
    'sentiment_neutral':'sum',
    'sentiment_positive':'sum',
    'sentiment':'mean'
}).reset_index()

In [None]:
reviews.head(4)

In [None]:
# Saving dataset
reviews.to_csv(os.path.join(output_reviews_dir_path,'reviews_wrangled.csv'), index=False)

In [25]:
# If the wrangled files already exist
reviews = pd.read_csv(os.path.join(output_reviews_dir_path,'reviews_wrangled.csv'), low_memory=False)
reviews

Unnamed: 0,type,channel,source_product_identifier,date,avg_nb_statements,nb_reviews,rating,rating_1,rating_2,rating_3,rating_4,rating_5,sentiment_negative,sentiment_neutral,sentiment_positive,sentiment
0,Cosmetics,Amazon USA,0001842420,2015-03,2.0,1,5.0,0,0,0,0,1,0.000000,0.000000,1.000000,1.000000
1,Cosmetics,Amazon USA,0001842420,2015-04,1.0,1,5.0,0,0,0,0,1,0.000000,0.000000,1.000000,1.000000
2,Cosmetics,Amazon USA,0001842420,2015-05,2.0,1,5.0,0,0,0,0,1,0.000000,0.000000,1.000000,1.000000
3,Cosmetics,Amazon USA,0001842420,2015-06,3.5,2,5.0,0,0,0,0,2,0.166667,0.166667,1.666667,0.750000
4,Cosmetics,Amazon USA,0001842420,2015-09,3.0,2,4.0,0,0,0,2,0,0.000000,0.500000,1.500000,0.750000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
696311,Skincare,Ulta,xlsImpprod6400280,2020-03,4.0,3,5.0,0,0,0,0,3,0.000000,0.285714,2.714286,0.904762
696312,Skincare,Ulta,xlsImpprod6430317,2020-03,5.0,1,2.0,0,1,0,0,0,0.000000,0.000000,1.000000,1.000000
696313,Skincare,Ulta,xlsImpprod6430359,2020-01,4.0,2,5.0,0,0,0,0,2,0.000000,0.200000,1.800000,0.900000
696314,Skincare,Ulta,xlsImpprod6470101,2020-03,2.0,2,5.0,0,0,0,0,2,0.000000,0.000000,2.000000,1.000000


The Rating and Reviews data is now wrangled. We now need to wrangle the Product Catalogue data before merging the two datasets.

### Wrangling Product Catalogue

#### Read and concatenate data

First we need to read all the files and concatenate them.

In [26]:
# List of available files
os.listdir(input_product_dir_path)

['cosmetics_products_20200331.csv',
 'Cosmetics_Product_20190831.csv',
 'Cosmetics_Product_20190930.csv',
 'Cosmetics_Product_20191031.csv',
 'cosmetics_product_2019Q4.csv',
 'Cosmetics_Product_20200116.csv',
 'ELC_historic_catalog_cosmetics.b.csv',
 'ELC_historic_catalog_cosmetics.csv',
 'ELC_historic_catalog_skincare.b.csv',
 'ELC_historic_catalog_skincare.csv',
 'Hair Care.zip',
 'old_files',
 'skincare_products_20200331.csv',
 'skincare_productupc_20200331.csv',
 'SkinCare_Product_20190630_ST MV2SV.csv',
 'SkinCare_Product_20190930.csv',
 'SkinCare_Product_20190930_Update.csv',
 'SkinCare_Product_20191031.csv',
 'skincare_product_20191231.csv',
 'skincare_product_2019Q3.csv',
 'skincare_product_2019Q3_final.csv',
 'SkinCare_Product_20200116.csv']

In [None]:
start = time.time()
products_skincare = pd.DataFrame()
products_cosmetics = pd.DataFrame()
for file in os.listdir(input_product_dir_path):
    if '.csv' in file.lower():
        if 'skincare' in file.lower():
            interm = time.time()
            print('\nReading', file)
            temp = pd.read_csv(os.path.join(input_product_dir_path,file), low_memory=False)
            temp = temp.loc[:, ~temp.columns.str.contains('^Unnamed')]
            temp.columns = [colname.lower().replace(' ','_') for colname in temp.columns]
            print('Concatenating', file)
            products_skincare = pd.concat([products_skincare, temp], ignore_index=True)
            print('Time for this dataset: %.0f seconds' % (time.time()-interm))
            print('Total time: %.0f seconds' % (time.time()-start))
        elif 'cosmetics' in file.lower():
            interm = time.time()
            print('\nReading', file)
            temp = pd.read_csv(os.path.join(input_product_dir_path,file), low_memory=False)
            temp = temp.loc[:, ~temp.columns.str.contains('^Unnamed')]
            temp.columns = [colname.lower().replace(' ','_') for colname in temp.columns]
            print('Concatenating', file)
            products_cosmetics = pd.concat([products_cosmetics, temp], ignore_index=True)
            print('Time for this dataset: %.0f seconds' % (time.time()-interm))
            print('Total time: %.0f seconds' % (time.time()-start))
del temp

Then we drop duplicated rows (if any exist).

In [None]:
products_cosmetics = products_cosmetics.drop_duplicates()
products_skincare = products_skincare.drop_duplicates()

Finally we save the two files so that we don't have to run everything from scract everytime.

In [None]:
products_cosmetics.to_csv(os.path.join(output_product_dir_path,'products_cosmetics.csv'), index=False)
products_skincare.to_csv(os.path.join(output_product_dir_path,'products_skincare.csv'), index=False)

In [27]:
# If concatenated files already exist
products_cosmetics = pd.read_csv(os.path.join(output_product_dir_path,'products_cosmetics.csv'), low_memory=False)
products_skincare = pd.read_csv(os.path.join(output_product_dir_path,'products_skincare.csv'), low_memory=False)

#### Data description

**COSMETICS**

In [28]:
print("DATA SAMPLE:")
display(products_cosmetics.sample(3))
print("DATA DESCRIPTION:")
display(products_cosmetics.describe(include='all'))
print("% OF MISSING DATA:")
display((products_cosmetics.isna().sum()/len(products_cosmetics))*100)

DATA SAMPLE:


Unnamed: 0,product_id,source_product_identifier,product,description,channel,brand,feature,benefit,ingredient,additional_ingredients_(no_rulebase),product_form,elc_solution_type,rating,number_of_reviews,geography,collection_date,normalized_product_title,productcluster_id,finish,looks,other,trends,cluster_size,title,solution_type
23004,Product_20200413_7009230,B079SXSQ9Z,Physicians Formula - Healthy Lip Velvet Liquid...,"What It Is: Long-wearing, decadent mousse-like...",Amazon UK,PHYSICIANS FORMULA,Gentle;Lightweight Formula,Anti Aging;Moisturizing,Hyaluronic Acid;Peptides;Vitamin A;Vitamin E,Avocado Oil,Cream;Liquid;Stick,Liquid Lipcolor,4.1,42.0,UK,2019-12-16,PHYSICIANS FORMULA LIQUID LIPCOLOR THE HEALTHY...,PHYSICIANS FORMULA_Liquid Lipcolor_1,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,,,
435577,Product_20190722_4826377,B00CL5ZB1U,"freshMinerals Mineral Loose Powder Foundation,...",Color: Natural Mineral powder foundation by fr...,Amazon USA,freshMinerals,Natural;SPF Protection,Contains Natural Ingredients;Refreshing,Not Mentioned,Not Mentioned,Powder,Foundation,3.4,23.0,USA,7/11/2019,,,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,,,
182042,Product_20191016_4975804,B01FEXGCLI,TONYMOLY Panda's Dream Smudge out mascara # 01...,Colour: #01 Volume About the Product No more s...,Amazon UK,TONYMOLY,Not Mentioned,Long Lasting;Refreshing;Smudgeproof;Volumizing,Not Mentioned,Not Mentioned,Not Mentioned,Mascara,3.8,7.0,UK,2019-09-15,,,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,,,


DATA DESCRIPTION:


Unnamed: 0,product_id,source_product_identifier,product,description,channel,brand,feature,benefit,ingredient,additional_ingredients_(no_rulebase),product_form,elc_solution_type,rating,number_of_reviews,geography,collection_date,normalized_product_title,productcluster_id,finish,looks,other,trends,cluster_size,title,solution_type
count,525535,525535.0,525535,509455,525535,525535,525535,525533,525535,503592,525535,514033,463326.0,507847.0,525535,525395,195076,195076,515389,515389,515389,515389,31405.0,54829,72230
unique,294817,120512.0,127291,143785,14,2417,9389,38781,755,2837,597,785,,,2,293,52066,33289,557,123,36,144,,51417,661
top,Product_20190722_4026451,123443000000.0,Lip Gloss,Size: Pack of 2 For the most up to date inform...,Amazon USA,MAC,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Lipstick,,,USA,2019-07-02,MAC amplified lipstick,NARS_Eyeshadow_1,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,,Lip Gloss,Lipstick
freq,5,143.0,154,1451,188105,18483,237381,85740,412074,392078,171609,58979,,,310564,38457,156,164,199233,500571,468231,438648,,19,8302
mean,,,,,,,,,,,,,4.032949,137.810955,,,,,,,,,4.243401,,
std,,,,,,,,,,,,,0.834271,858.80431,,,,,,,,,7.510366,,
min,,,,,,,,,,,,,0.0,0.0,,,,,,,,,1.0,,
25%,,,,,,,,,,,,,3.7,3.0,,,,,,,,,1.0,,
50%,,,,,,,,,,,,,4.2,9.0,,,,,,,,,3.0,,
75%,,,,,,,,,,,,,4.6,51.0,,,,,,,,,4.0,,


% OF MISSING DATA:


product_id                               0.000000
source_product_identifier                0.000000
product                                  0.000000
description                              3.059739
channel                                  0.000000
brand                                    0.000000
feature                                  0.000000
benefit                                  0.000381
ingredient                               0.000000
additional_ingredients_(no_rulebase)     4.175364
product_form                             0.000000
elc_solution_type                        2.188627
rating                                  11.837271
number_of_reviews                        3.365713
geography                                0.000000
collection_date                          0.026640
normalized_product_title                62.880493
productcluster_id                       62.880493
finish                                   1.930604
looks                                    1.930604


**SKINCARE**

In [29]:
print("DATA SAMPLE:")
display(products_skincare.sample(3))
print("DATA DESCRIPTION:")
display(products_skincare.describe(include='all'))
print("% OF MISSING DATA:")
display((products_skincare.isna().sum()/len(products_cosmetics))*100)

DATA SAMPLE:


Unnamed: 0,additional_ingredients_(no_rulebase),benefit,brand,channel,collection_date,product,description,elc_solution_type,feature,geography,ingredient,normalized_product_title,number_of_reviews,packaging,product_form,productcluster_id,product_id,rating,skin_condition,skin_type,solution_type,source_product_identifier,treatment_area,use_case,cluster_title,caf1a03ebf00d4078bbb0a590ec19a089,upc,c8285216327864eb388e64216dead18b1,c5bdcdac51d9e41dca007d69f7cd66c58,c886f108693fa44d3b90e50714f925352,c247b9901626441798fd9f5b0e1da5b6b,c592ee2bea9194edfa01db5a04de3e8ed,c39ee8c98ff6e4d2394de70aba44d3b53,ce9ff836616d54454a16e6e453cd8c3a0,c60e432d687a14313af9a796a9d1aa662,cdb3deb1d22ff4b3b95f7cf93a716e37f,c221108b649a940bfb30f51761418e985,cf1972cff60c14e5c9a0360d29728f326,c0f3a87d761e14a9a9355fd6cbf5e6a23,c12155d43bdc349f09c79baafbd50920c,cdcbef08d2dab47dd848b0b95f0fedbe2,c0fcd4ae2146440218fabd77b2fd0b9d2,cc7c0e57649784f21b4b637f729d6caf9,c0881be2cfa9840f4a01414ab5ecf5ff3,cf6a2c815ba954cb39cc42b31b4fee2f0,cff79bc48a3904424abc514711b240877,c9a2e8d1d2d564ee89ef2dc6bb2c613d4,c4587ff7e968142b69f6e308c49df39d3,cd96842a0c3c04ec385214879a168dfaa,c2771dfbcbc0d4e778de96df220c34b06,c73106569fb4d4913b88ae2b231d39064,c0b87f5c8b4c449bbbefa78e44454686a,c9a2fcedcc6564367861ae58adf7f62d1,c085c055e8cd4447aad82a1b1c15a5750,title,cluster_size
590578,Not Mentioned,Brightening & Whitening;Firming;Lifting;Nouris...,Murad,Birchbox,2019-07-18,,Skincare Step 2: Treat\nThis eye serum is clin...,Eye Treatment,Not Mentioned,USA,Retinoid;Retinol,,0.0,Not Mentioned,Not Mentioned,,Product_20190807_10915047,,Dark Circle;Wrinkles and Sagging,Not Mentioned,,39870,Eyes;Feet,Not Mentioned,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Murad® Retinol Youth Renewal Eye Serum,
868721,Essential Oil;Ferment/Fermentation;Parabens;Ph...,Brightening & Whitening;Effective;Firming;Glow...,DRUNK ELEPHANT,Sephora,2019-09-26,C-FirmaTM Vitamin C Day Serum,Which skin type is it good for? ✔ Normal ✔ Oil...,Serum,Cruelty Free;Healing;Natural;Scented;Synthetic...,USA,Antioxidants;Beta Hydroxy Acid;Mineral;Vitamin C,DRUNK ELEPHANT C-FIRMA VITAMIN DAY SERUM SERUM,1831.0,Not Mentioned,Oil,DRUNK ELEPHANT_Serum_1,Product_20191007_12521045,3.9005,Dark Spots;Pigmentation;Wrinkles and Sagging,Not Mentioned,,P400259,Not Mentioned,Not Mentioned,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0
17320,,Cleansing & Cleaning;Moisturizing;Smoothing,Shiseido,Amazon UK,11/5/2018,Shiseido Ibuki Refining Moisturiser Enriched 5...,A multi-functional moisturiser that improves v...,Face Moisturizer,Not Mentioned,UK,Not Mentioned,,0.0,Not Mentioned,Not Mentioned,,Product_20181119_294354,,Acne;Dryness,All Skin Types;Dry,Eye Cream,B017M5DDOA,Face Area,Not Mentioned,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


DATA DESCRIPTION:


Unnamed: 0,additional_ingredients_(no_rulebase),benefit,brand,channel,collection_date,product,description,elc_solution_type,feature,geography,ingredient,normalized_product_title,number_of_reviews,packaging,product_form,productcluster_id,product_id,rating,skin_condition,skin_type,solution_type,source_product_identifier,treatment_area,use_case,cluster_title,caf1a03ebf00d4078bbb0a590ec19a089,upc,c8285216327864eb388e64216dead18b1,c5bdcdac51d9e41dca007d69f7cd66c58,c886f108693fa44d3b90e50714f925352,c247b9901626441798fd9f5b0e1da5b6b,c592ee2bea9194edfa01db5a04de3e8ed,c39ee8c98ff6e4d2394de70aba44d3b53,ce9ff836616d54454a16e6e453cd8c3a0,c60e432d687a14313af9a796a9d1aa662,cdb3deb1d22ff4b3b95f7cf93a716e37f,c221108b649a940bfb30f51761418e985,cf1972cff60c14e5c9a0360d29728f326,c0f3a87d761e14a9a9355fd6cbf5e6a23,c12155d43bdc349f09c79baafbd50920c,cdcbef08d2dab47dd848b0b95f0fedbe2,c0fcd4ae2146440218fabd77b2fd0b9d2,cc7c0e57649784f21b4b637f729d6caf9,c0881be2cfa9840f4a01414ab5ecf5ff3,cf6a2c815ba954cb39cc42b31b4fee2f0,cff79bc48a3904424abc514711b240877,c9a2e8d1d2d564ee89ef2dc6bb2c613d4,c4587ff7e968142b69f6e308c49df39d3,cd96842a0c3c04ec385214879a168dfaa,c2771dfbcbc0d4e778de96df220c34b06,c73106569fb4d4913b88ae2b231d39064,c0b87f5c8b4c449bbbefa78e44454686a,c9a2fcedcc6564367861ae58adf7f62d1,c085c055e8cd4447aad82a1b1c15a5750,title,cluster_size
count,1074346,1107894,1107894,1107894,1101412,998200,995236,1029818,1107894,1107165,1107894,362820,1081537.0,1107894,1107894,362820,1107895,646731.0,1107894,1107894,149891,1107895.0,1107894,1107894,99534,0.0,24874.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,109695,37415.0
unique,6416,93006,6332,17,330,219736,205600,806,27304,4,23233,149476,4555.0,20,2214,52510,483921,6329.0,3054,196,1380,202808.0,613,5,27300,,22414.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,102762,24.0
top,Not Mentioned,Not Mentioned,Not Mentioned,Amazon USA,2020-04-02,-,Description coming soon.,Cleanser,Not Mentioned,USA,Not Mentioned,lip balm,0.0,Not Mentioned,Not Mentioned,SUN BUM_In Sun_1,Product_20191007_12526454,5.0,Not Mentioned,Not Mentioned,Wash & Cleanser,123000000000.0,Not Mentioned,Not Mentioned,DR. HAUSCHKA FACE MOISTURIZER DAY CREAM LIGHT,,75486091859.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,Â®,1.0
freq,779063,155222,24869,552025,91892,222,920,168112,406345,741960,513491,202,449466.0,1032150,348388,145,5,120994.0,585385,681566,29236,568.0,437942,1073394,56,,13.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,44,10352.0
mean,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
std,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
min,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
25%,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
50%,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
75%,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


% OF MISSING DATA:


additional_ingredients_(no_rulebase)      6.383971
benefit                                   0.000381
brand                                     0.000381
channel                                   0.000381
collection_date                           1.233790
product                                  20.873205
description                              21.437202
elc_solution_type                        14.856860
feature                                   0.000381
geography                                 0.139096
ingredient                                0.000381
normalized_product_title                141.774763
number_of_reviews                         5.015651
packaging                                 0.000381
product_form                              0.000381
productcluster_id                       141.774763
product_id                                0.000190
rating                                   87.751529
skin_condition                            0.000381
skin_type                      

#### Data wrangling

Then we concatenate cosmetic and skincare, and add a columns *type* to keep track of the source of the data.

**NB**: The columns are not exactly the same between the two types. By concatenting, we will have the union of the features of the two datasets and missing values will be filled with NaNs.

In [30]:
products_cosmetics['type'] = 'Cosmetics'
products_skincare['type'] = 'Skincare'
products = pd.concat([products_cosmetics, products_skincare], axis=0)

In [31]:
products.sample(4)

Unnamed: 0,product_id,source_product_identifier,product,description,channel,brand,feature,benefit,ingredient,additional_ingredients_(no_rulebase),product_form,elc_solution_type,rating,number_of_reviews,geography,collection_date,normalized_product_title,productcluster_id,finish,looks,other,trends,cluster_size,title,solution_type,type,packaging,skin_condition,skin_type,treatment_area,use_case,cluster_title,caf1a03ebf00d4078bbb0a590ec19a089,upc,c8285216327864eb388e64216dead18b1,c5bdcdac51d9e41dca007d69f7cd66c58,c886f108693fa44d3b90e50714f925352,c247b9901626441798fd9f5b0e1da5b6b,c592ee2bea9194edfa01db5a04de3e8ed,c39ee8c98ff6e4d2394de70aba44d3b53,ce9ff836616d54454a16e6e453cd8c3a0,c60e432d687a14313af9a796a9d1aa662,cdb3deb1d22ff4b3b95f7cf93a716e37f,c221108b649a940bfb30f51761418e985,cf1972cff60c14e5c9a0360d29728f326,c0f3a87d761e14a9a9355fd6cbf5e6a23,c12155d43bdc349f09c79baafbd50920c,cdcbef08d2dab47dd848b0b95f0fedbe2,c0fcd4ae2146440218fabd77b2fd0b9d2,cc7c0e57649784f21b4b637f729d6caf9,c0881be2cfa9840f4a01414ab5ecf5ff3,cf6a2c815ba954cb39cc42b31b4fee2f0,cff79bc48a3904424abc514711b240877,c9a2e8d1d2d564ee89ef2dc6bb2c613d4,c4587ff7e968142b69f6e308c49df39d3,cd96842a0c3c04ec385214879a168dfaa,c2771dfbcbc0d4e778de96df220c34b06,c73106569fb4d4913b88ae2b231d39064,c0b87f5c8b4c449bbbefa78e44454686a,c9a2fcedcc6564367861ae58adf7f62d1,c085c055e8cd4447aad82a1b1c15a5750
98325,Product_20190807_10224389,B01D6VEGP4,Hera Signia Ampoule Set 10.2g x 4ea,Hera Signia Ampoule Set 10.2g x 4ea,Amazon USA,HERA,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Serum,,0.0,USA,6/4/2019,,,,,,,,,,Skincare,Ampoule,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
556857,Product_20200416_17687678,B00HKMVFRI,Dermalogica Precleanse Wipes PreCleanse Wipes,-A pack of gentle deep cleansing face wipes -H...,Amazon UK,DERMALOGICA,Water Resistant,Cleansing & Cleaning;Gentle,Not Mentioned,Not Mentioned,Wipe/Cloth,Cleanser,4.1,5.0,UK,2019-12-14,precleanse wipes precleanse wipes,DERMALOGICA_Cleanser_10,,,,,,,,Skincare,Not Mentioned,Not Mentioned,Not Mentioned,Eyes;Face Area,Not Mentioned,DERMALOGICA CLEANSER PRECLEANSE WIPES PRECLEAN...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
401669,Product_20200416_18549208,B07RTJZ37J,Topgee Soap Scented Flower Decorative Soaps fo...,☺☺☺☺☺☺☺☺☺☺☺☺☺☺☺☺☺☺☺☺☺☺☺☺☺☺☺☺☺☺☺☺☺☺☺☺☺☺☺☺☺☺☺☺☺☺...,Amazon USA,LANCOME,Balances;Scented,Easy to Use;Smoothing,Rose Extract,Essential Oil,Oil,Cleanser,,0.0,USA,2020-02-27,LANCOME CLEANSER TOPGEE SOAP SCENTED DECORATIV...,LANCOME_Cleanser_12,,,,,,,,Skincare,Not Mentioned,Not Mentioned,Not Mentioned,Body Area,Not Mentioned,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
688849,Product_20191007_12526247,2902414,Pro-Collagen Eye Renewal Cream,Free shipping and returns on Elemis Pro-Collag...,Nordstrom,ELEMIS,Cruelty Free;Natural;Organic;Sulfate Free,Anti Aging;Effective;Firming;Nourishing;Pore M...,Collagen;Antioxidants;Seaweed;Silicone;Vitamin A,DEA;Mineral Oil;Parabens;Petrochemicals;Phthal...,Cream,Eye Cream,4.4,7.0,USA,10/2/2019,ELEMIS PEPTIDE4 EYE RECOVERY CREAM EYE CREAM,ELEMIS_Eye Cream_0,,,,,3.0,,,Skincare,Pump,Aging;Wrinkles and Sagging,All Skin Types;Mature;Sensitive,Eyes,Day Use,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [32]:
product_columns = ['product_id', 'source_product_identifier', 'product', 'description',
       'channel', 'brand', 'feature', 'benefit', 'ingredient',
       'additional_ingredients_(no_rulebase)', 'product_form',
       'elc_solution_type', 'rating', 'number_of_reviews', 'geography',
       'collection_date', 'normalized_product_title', 'productcluster_id',
       'finish', 'looks', 'other', 'trends', 'cluster_size', 'title',
       'solution_type', 'type', 'packaging', 'skin_condition', 'skin_type',
       'treatment_area', 'use_case', 'cluster_title',]
products = products[product_columns]

We create Clean Collection Date (proper pandas date format) and fill the missing rows/bad format (if any exist) with the oldest date available.

In [33]:
products['clean_collection_date'] = pd.to_datetime(products['collection_date'], errors='coerce')
if products['clean_collection_date'].isna().sum()>0:
    print('{} rows where not processed properly. They will be assigned the oldest date available.'.format(products['clean_collection_date'].isna().sum()))
products.loc[products['clean_collection_date'].isna(), 'collection_date']=min(products['clean_collection_date'])
products['clean_collection_date'] = pd.to_datetime(products['collection_date'], errors='coerce')

6625 rows where not processed properly. They will be assigned the oldest date available.


We drop the duplicates while only keeping the row with the most recent Collection Date.

In [34]:
products = products.sort_values(by=['clean_collection_date'], ascending=False).drop_duplicates(keep='first')

In [35]:
products.sample(2)

Unnamed: 0,product_id,source_product_identifier,product,description,channel,brand,feature,benefit,ingredient,additional_ingredients_(no_rulebase),product_form,elc_solution_type,rating,number_of_reviews,geography,collection_date,normalized_product_title,productcluster_id,finish,looks,other,trends,cluster_size,title,solution_type,type,packaging,skin_condition,skin_type,treatment_area,use_case,cluster_title,clean_collection_date
413330,Product_20200416_18097195,B06Y23W4F5,Clean and Clear Deep Acting 60 Second Shower M...,Clean and Clear Deep Acting 60 Second Shower M...,Amazon USA,CLEAN & CLEAR,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Face Mask,,0.0,USA,2020-04-02,CLEAN & CLEAR FACE MASK BLACKHEAD ING 2 IN 1 W...,CLEAN & CLEAR_Face Mask_0,,,,,,,,Skincare,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,,2020-04-02
210933,Product_20190305_3314113,121860000199,'Mineral sunscreen' SPF 30 fluid for face 30ml,"Ultra lightweight, 100% 'Mineral Sunscreen' is...",Debenhams,Clinique,Lightweight Formula;SPF/UV Protection;Scent Free,Gentle,Not Mentioned,Not Mentioned,Not Mentioned,In Sun,,0.0,UK,2019-02-01,,,,,,,,,Sunscreen,Skincare,Not Mentioned,Not Mentioned,Sensitive,Eyes;Face Area,Not Mentioned,,2019-02-01


In [36]:
products[['type', 'channel', 'source_product_identifier', 'brand', 'product', 'elc_solution_type']].isna().sum()

type                              0
channel                           2
source_product_identifier         1
brand                             2
product                      109696
elc_solution_type             89580
dtype: int64

In [37]:
nb_products = len(products[['type', 'channel', 'source_product_identifier']].drop_duplicates().dropna())
nb_products_with_brand = len(products[['type', 'channel', 'source_product_identifier', 'brand']].dropna().drop_duplicates(subset=['type', 'channel', 'source_product_identifier',]))
nb_products_with_product = len(products[['type', 'channel', 'source_product_identifier', 'product']].dropna().drop_duplicates(subset=['type', 'channel', 'source_product_identifier',]))
nb_products_with_elc_solution_type = len(products[['type', 'channel', 'source_product_identifier', 'elc_solution_type']].dropna().drop_duplicates(subset=['type', 'channel', 'source_product_identifier',]))

print("Number of products IDs: {}".format(nb_products))
print("Number of products IDs with brand: {}".format(nb_products_with_brand))
print("Number of products IDs with product_name: {}".format(nb_products_with_product))
print("Number of products IDs with elc_solution_type: {}".format(nb_products_with_elc_solution_type))

Number of products IDs: 377209
Number of products IDs with brand: 377209
Number of products IDs with product_name: 377209
Number of products IDs with elc_solution_type: 353442


In [38]:
# Saving dataset
products.to_csv(os.path.join(output_product_dir_path,'products_wrangled.csv'), index=False)

In [39]:
# If the wrangled files already exist
products = pd.read_csv(os.path.join(output_product_dir_path,'products_wrangled.csv'), low_memory=False)
products.sample(10)

Unnamed: 0,product_id,source_product_identifier,product,description,channel,brand,feature,benefit,ingredient,additional_ingredients_(no_rulebase),product_form,elc_solution_type,rating,number_of_reviews,geography,collection_date,normalized_product_title,productcluster_id,finish,looks,other,trends,cluster_size,title,solution_type,type,packaging,skin_condition,skin_type,treatment_area,use_case,cluster_title,clean_collection_date
1604969,Product_20181119_197311,B07FLJ5W5D,Bath and Body Works 3 Pack Nourishing Shea But...,It's the skin solution you've been searching for!,Amazon US,Bath & Body Works,Not Mentioned,Nourishing,Shea Butter,,Butter;Sheet/Paper,Face Mask,,0.0,USA,11/5/2018,,,,,,,,,Mask;Sheet Mask,Skincare,Not Mentioned,Not Mentioned,Not Mentioned,Body Area;Face Area,Not Mentioned,,2018-11-05
1541925,Product_20190211_3205586,B00EOP1CU4,[HERA] White Program Radiance Emulsion / 120ml,[HERA] White Program Radiance Emulsion / 120ml,Amazon UK,HERA,Dermatologically Tested,Glowing Skin/Radiance,Mineral,Not Mentioned,Emulsion,,,0.0,UK,2/8/2019,,,,,,,,,Night Cream,Skincare,Not Mentioned,Dryness,Dry,Not Mentioned,Not Mentioned,,2019-02-08
1391058,Product_20190807_10571826,B075V1LS6J,AVON Anew Clinical Defend & Repair Advanced Hy...,Anew Clinical Defend & Repair Advanced Hydrati...,Amazon USA,Avon,Not Mentioned,Hydrating;Overnight Results;Repairing,Not Mentioned,Not Mentioned,Not Mentioned,Face Mask,,0.0,USA,2019-06-04,,,,,,,,,,Skincare,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,,2019-06-04
385320,Product_20200416_18729657,B00547F7KM,Suki Transformative Purifying Masque 7.5 ML (P...,Size: 7.5 mL SSS SALICYLIC ACID: Our SSS Techn...,Amazon USA,SUKI,Allergen Free;Healing;Natural;Synthetic Ingred...,Cleansing & Cleaning;Healthy,Beta Hydroxy Acid;Retinoid;Retinol;Salicylic A...,Parabens;Petrochemicals;Phthalates;Sulfates,Tablets,Face Mask,2.8,24.0,USA,2019-12-06,transformative purifying masque (purse friendly),SUKI_Face Mask_1,,,,,,,,Skincare,Not Mentioned,Blackheads;Blemish;Dryness;Redness,Dry;Sensitive,Not Mentioned,Not Mentioned,SUKI FACE MASK MOISTURE RICH MASQUE,2019-12-06
1346876,Product_20190807_11572161,B07D811M8D,MD Complete by Dr. Brian Zelickson 2% Hydroqui...,New Improved Formula! Delivers dermatologist-l...,Amazon USA,MD Complete,Not Mentioned,Effective;Glowing Skin/Radiance;Nourishing;Red...,Niacinamide;Retinoid;Retinol;Vitamin C,Borage Oil;Hydroquinone,Not Mentioned,Serum,3.4,27.0,USA,6/4/2019,,,,,,,,,,Skincare,Pump;Tube,Dark Spots;Irritation,Not Mentioned,Not Mentioned,Not Mentioned,,2019-06-04
314460,Product_20200413_6817375,B0017K9WZQ,"Revlon Make A Sheen Lustrous Shadow, Espresso'...",Color: Espresso'o Yourself Revlon make a sheen...,Amazon USA,REVLON,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Eyeshadow,4.0,7.0,USA,2019-12-13,REVLON EYESHADOW COLORSTAY LOOKS BOOK EYESHADO...,REVLON_Eyeshadow_15,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,,,,Cosmetics,,,,,,,2019-12-13
1219791,Product_20190807_11832196,B017RMIX3Y,Biotherm Wet Or Dry Solar with SPF 15 200 ml,Product Description To all types of necesities...,Amazon UK,Yves Saint Laurent,SPF/UV Protection,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,In Sun,5.0,1.0,UK,2019-06-30,,,,,,,,,,Skincare,Not Mentioned,Not Mentioned,Not Mentioned,Hands,Not Mentioned,,2019-06-30
1583305,Product_20181119_308915,B01M6D9IG6,EsteÌe Lauder Re-Nutriv Ultimate Lift Rejuven...,Rewind the clock with EsteÌe Lauder's Re-Nutr...,Amazon UK,Estee Lauder,Cellular Energy,Anti Aging;Lifting;Moisturizing;Nourishing;Por...,Not Mentioned,,Cream,Face Moisturizer,,0.0,UK,11/5/2018,,,,,,,,,Wash & Cleanser,Skincare,Not Mentioned,Not Mentioned,Mature,Not Mentioned,Not Mentioned,,2018-11-05
1574920,Product_20190211_3164399,B06XD579MJ,Nature Republic Soothing&Moisture Aloe Vera Fo...,Nature Republic Soothing&Moisture Aloe Vera Fo...,Amazon UK,Nature Republic,Not Mentioned,Cleansing & Cleaning;Soothing,Aloe Vera,Not Mentioned,Foam,Cleanser,,0.0,UK,12/26/2018,,,,,,,,,Wash & Cleanser,Skincare,Not Mentioned,Not Mentioned,All Skin Types,Not Mentioned,Not Mentioned,,2018-12-26
97945,Product_20200416_18805917,9208505,CollagenR8 Restorative COQ10 Facial Toner,Our CollagenR8 toner brings vitality back to y...,Macy's,HERBAL DYNAMICS BEAUTY,Balances;Cellular Energy;pH Balance,Brightening & Whitening;Firming;Hydrating;Impr...,Aloe Vera;Antioxidants;Chamomile;Collagen;Cucu...,Not Mentioned,Wipe/Cloth,Toner,4.9259,27.0,USA,2020-04-02,collagenr8 restorative coq10 facial toner,HERBAL DYNAMICS BEAUTY_Toner_0,,,,,,,,Skincare,Not Mentioned,Aging;Redness,Not Mentioned,Face Area,Not Mentioned,HERBAL DYNAMICS BEAUTY TONER COLLAGENR8 RESTOR...,2020-04-02


### Mapping Brands

The brands displayed in the Product Catalogue data are not the official Estée Lauder format. We have a mapping of Estée Lauder brands and brands abbreviation, which we will map to the Product Catalogue brands. To do so, we will calculate a similarity score between each possible pair (Product Catalogue brand, official brand), and select the most similar ELC Brand. The Product Calatogue contains brands that are not part of ELC. We don't care about that data for now, so we will just change the brand to a missing value (NaN).

In [42]:
elc_brands = pd.read_csv(os.path.join(input_mappings_dir_path,'elc_brands.csv'), encoding = "ISO-8859-1")

In [43]:
# Estée Lauder brands
# A better way to get these, is from the Demand data (get brand_abbrev then manually add elc_brand).
# OR ask Anurag to update the full list (I feel like some brands are missing...)
elc_brands

Unnamed: 0,brand_abbrev,elc_brand
0,AR,Aramis
1,AV,Aveda
2,BA,BECCA
3,BB,Bobbi Brown
4,BU,Bumble and bumble
5,CL,Clinique
6,CM,La Mer
7,DA,Darphin
8,EL,Estée Lauder
9,FM,Frédéric Malle


In [44]:
# Brands from Product Catalogue
brands_to_map = pd.DataFrame({'brand' : products['brand'].unique()})
brands_to_map

Unnamed: 0,brand
0,CIATE
1,L'OCCITANE
2,DIOR
3,KIEHL'S
4,LANCOME
...,...
7223,Babe;Lano
7224,Aveeno;Botanicals
7225,Phyto;Tarte Cosmetics;Ulta Beauty
7226,Anese;Artisan;Skinfood


We create all possible combinations between PC Brands and EL Brands.

In [45]:
brand_matching = brands_to_map.assign(key=0).merge(elc_brands.assign(key=0), on='key', how='left').drop('key', axis=1)
brand_matching

Unnamed: 0,brand,brand_abbrev,elc_brand
0,CIATE,AR,Aramis
1,CIATE,AV,Aveda
2,CIATE,BA,BECCA
3,CIATE,BB,Bobbi Brown
4,CIATE,BU,Bumble and bumble
...,...,...,...
144555,VINTNERS DAUGHTER,MC,M.A.C
144556,VINTNERS DAUGHTER,OR,Origins
144557,VINTNERS DAUGHTER,PR,Prescriptives
144558,VINTNERS DAUGHTER,SX,Smashbox


Then we define a function that will calculate the similarity of the pair.

In [46]:
def brands_custom_distance(row):
    """
    Calculates a distance score between two sentences. In this case elc_brand and brand. The score is between 0 and 1, 1 being a good match.
    """
#     jaccard = textdistance.jaccard(str(row['brand']).lower().replace('.','').replace('&','and'), str(row['elc_brand']).lower().replace('.','').replace('é','e'))
    jaro = textdistance.jaro_winkler(str(row['brand']).lower().replace('.','').replace('&','and'), str(row['elc_brand']).lower().replace('.','').replace('é','e'))
    fuzzi = fuzz.partial_ratio(str(row['brand']).lower().replace('.','').replace('&','and'), str(row['elc_brand']).lower().replace('.','').replace('é','e'))/100
    return np.average([fuzzi,jaro], weights=[0.4,0.6])

and we apply it to our dataset:

In [49]:
# Calculate the matching score between brand and elc_brand
brand_matching['brand_score'] = brand_matching.progress_apply(lambda row : brands_custom_distance(row), axis=1)

HBox(children=(FloatProgress(value=0.0, max=144560.0), HTML(value='')))




In [51]:
# Sorting values
brand_matching = brand_matching.sort_values('brand_score', ascending=False)

In [52]:
brand_matching

Unnamed: 0,brand,brand_abbrev,elc_brand,brand_score
54463,Bobbi Brown,BB,Bobbi Brown,1.0
2607,DARPHIN,DA,Darphin,1.0
7484,BUMBLE AND BUMBLE,BU,Bumble and bumble,1.0
53927,Darphin,DA,Darphin,1.0
58444,Bumble & Bumble,BU,Bumble and bumble,1.0
...,...,...,...,...
53742,Origins,BA,BECCA,0.0
141487,STMT,DA,Darphin,0.0
1495,BY TERRY,MC,M.A.C,0.0
141491,STMT,IM,Non-brand,0.0


In [53]:
# OPTIONAL SAVE
brand_matching.to_csv('../data/clean_data/brand_mapping_scores.csv', index=False)

Finally, we keep the most simialr ELC Brand.

In [54]:
# Keep the elc_brand that has the highest score
brand_matching = brand_matching.groupby('brand').apply(lambda x: x.nlargest(1,'brand_score')).reset_index(drop=True).sort_values('brand_score', ascending=False)
brand_matching

Unnamed: 0,brand,brand_abbrev,elc_brand,brand_score
3682,JO MALONE LONDON,JM,Jo Malone London,1.000000
1555,Clinique,CL,Clinique,1.000000
994,Becca,BA,BECCA,1.000000
5228,Origins,OR,Origins,1.000000
3268,Glamglow,GG,GlamGlow,1.000000
...,...,...,...,...
3392,HOKKUU,BU,Bumble and bumble,0.358196
1796,Ddfâ®,DA,Darphin,0.348571
13,0 100%,CM,La Mer,0.338667
1701,DL1961,EL,Estée Lauder,0.338000


All brands, including the non-ELC brands have been mapped to an ELC brand. If our similarity function did a great job, all non-ELC brands will have a low score compared to ELC brands. Therefore, we can set a score threshhold, and all brands below that threshhold will be mapped to NaN.

The way I do it is to start with a small threshhold, then select the threshhold for which all non-elc brand get filtered-out.

Some brands have a huge resemblance with ELC brands (PARAMISS/ARAMIS, BIORIGINS/ORIGINS). Because of that, we will keep a list of these non-ELC brands and they will get filtered out even if their threshhold is high.

In [55]:
brands_to_filter_out = [
    "MACY'S",
    'Original Mineral',
    'Original Source',
    'Original Sprout',
    'ORIGINAL SOURCE',
    'Bobbi Boss',
    'Bobbi',
    'PARAMISS',
    'REBECCA',
    'BIORIGINS',
    'LA PERLA',
    'LES LABORATOIRES DE BIARRITZ',
    'NATORIGIN',
    'ORVEDA',
    'YILIAN',
    'LAVERA',
    'PuraVeda Organics',
    'Rebecca Taylor',
    'MACRENE ACTIVES',
    "Macy's Beauty Collection Company",
    "Macy'S Beauty Collection",
    'Rebecca Minkoff',
    'REBECCA DANIELL',
    'No_Brand',
    'B.',
]
brands_to_filter_out = [brand.lower() for brand in brands_to_filter_out]

In [56]:
# Run this cell with different threshhold until satisfied with the result.
pd.set_option('display.max_rows', 306)
brand_thresh = 0.815
display(brand_matching.loc[(brand_matching['brand_score']>brand_thresh) & (~brand_matching['brand'].str.lower().isin(brands_to_filter_out))])
pd.set_option('display.max_rows', 100)

Unnamed: 0,brand,brand_abbrev,elc_brand,brand_score
3682,JO MALONE LONDON,JM,Jo Malone London,1.0
1555,Clinique,CL,Clinique,1.0
994,Becca,BA,BECCA,1.0
5228,Origins,OR,Origins,1.0
3268,Glamglow,GG,GlamGlow,1.0
2708,ESTEE LAUDER,EL,Estée Lauder,1.0
5997,SMASHBOX,SX,Smashbox,1.0
4441,MAC,MC,M.A.C,1.0
1631,DARPHIN,DA,Darphin,1.0
5369,PRESCRIPTIVES,PR,Prescriptives,1.0


In [57]:
# All matching that have a score below the threshhold will be unmatched
brand_matching.loc[(brand_matching['brand_score']<brand_thresh) | (brand_matching['brand'].str.lower().isin(brands_to_filter_out)), 'brand_abbrev'] = np.nan
brand_matching.loc[(brand_matching['brand_score']<brand_thresh) | (brand_matching['brand'].str.lower().isin(brands_to_filter_out)), 'elc_brand'] = np.nan
brand_matching.loc[(brand_matching['brand_score']<brand_thresh) | (brand_matching['brand'].str.lower().isin(brands_to_filter_out)), 'brand_score'] = 0

Finally, we need to manually check that we didn't miss anything. This can also be done on Excel.

In [63]:
# SAVING TO CSV
brand_matching.to_csv(os.path.join(output_mappings_dir_path,'brand_mapping_to_check.csv'), index=False)

In [64]:
# We manually check that we didn't miss anything
pd.set_option('display.max_rows', len(brand_matching)+1)
display(brand_matching)
pd.set_option('display.max_rows', 100)

Unnamed: 0,brand,brand_abbrev,elc_brand,brand_score
3682,JO MALONE LONDON,JM,Jo Malone London,1.0
1555,Clinique,CL,Clinique,1.0
994,Becca,BA,BECCA,1.0
5228,Origins,OR,Origins,1.0
3268,Glamglow,GG,GlamGlow,1.0
2708,ESTEE LAUDER,EL,Estée Lauder,1.0
5997,SMASHBOX,SX,Smashbox,1.0
4441,MAC,MC,M.A.C,1.0
1631,DARPHIN,DA,Darphin,1.0
5369,PRESCRIPTIVES,PR,Prescriptives,1.0


If everything is fine and you did not use Excel to manually check the values, we can save the table as the final result.

In [65]:
brand_matching.to_csv(os.path.join(output_brands_dir_path,'brand_mapping.csv'), index=False)

NameError: name 'output_brands_dir_path' is not defined

In [66]:
# If the wrangled files already exist
brand_matching = pd.read_csv(os.path.join(output_mappings_dir_path,'brand_mapping.csv'))
brand_matching.sample(10)

Unnamed: 0,brand,brand_abbrev,elc_brand,brand_score
245,Designer Tom Ford Beauty A187,TF,Tom Ford Beauty,0.850115
4970,Yipsophilia,,,0.0
1111,DERM INSTITUTE,,,0.0
891,BIOREGEN,,,0.0
605,J.Cat Beauty,,,0.0
1417,LE PETIT MARSEILIAIS,,,0.0
6667,AKT.THERAPY,,,0.0
6483,UNANI,,,0.0
2074,Botanicals;Meaningful Beauty,,,0.0
5547,Body Shop,,,0.0


Now that our brands are mapped, we join them to our product table and we drop the duplicated rows.

In [67]:
products = products.merge(brand_matching, how='left')

In [68]:
products.sample(5)

Unnamed: 0,product_id,source_product_identifier,product,description,channel,brand,feature,benefit,ingredient,additional_ingredients_(no_rulebase),product_form,elc_solution_type,rating,number_of_reviews,geography,collection_date,normalized_product_title,productcluster_id,finish,looks,other,trends,cluster_size,title,solution_type,type,packaging,skin_condition,skin_type,treatment_area,use_case,cluster_title,clean_collection_date,brand_abbrev,elc_brand,brand_score
1198847,Product_20190807_11425702,B008A5S2DO,L'Oreal Paris Elseve Total Repair 5 Mask 200ml.,ELSEVE Total Repair 5 Mask 200 ml- Details: - ...,Amazon UK,L'Oreal,Natural,Anti Aging;Repairing,Not Mentioned,Not Mentioned,Not Mentioned,Face Mask,,0.0,UK,2019-06-30,,,,,,,,,,Skincare,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,,2019-06-30,,,0.0
255274,Product_20200109_5661485,777353,Revolution Maxi Reloaded Palette Large It Up,Product Information Game on! Level up your loo...,Superdrug,REVOLUTION,Not Mentioned,Glowing & Brightening;Pigmented,Not Mentioned,Not Mentioned,Spray,Cross Application Sets,4.1,7.0,UK,2020-01-02,REVOLUTION MAXI RELOADED PALETTE IT UP CROSS A...,REVOLUTION_Cross Application Sets_4,Matte/Matte Glow;Shimmer/Glitter,Not Mentioned,Not Mentioned,Not Mentioned,,,,Cosmetics,,,,,,,2020-01-02,,,0.0
1153459,Product_20190702_3256750,B0039UTU18,"Revlon ColorStay Soft and Smooth, Cozy Coral, ...",,Amazon USA,Revlon,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Stick,,4.0,63.0,USA,7/1/2019,,,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,,,Lipstick,Cosmetics,,,,,,,2019-07-01,,,0.0
903591,Product_20190807_12185630,B07P6T3PLM,,The cream strengthens the basic condition of y...,Amazon USA,Su:m37,Cellular Energy,Firming;Pore Minimizing;Strengthens,Not Mentioned,Not Mentioned,Cream,All Moisturizers,,0.0,USA,2019-07-10,,,,,,,,SUM37 SU:M37 Time Energy Skin Resetting Moist ...,,Skincare,Not Mentioned,Not Mentioned,Not Mentioned,Face Area,Not Mentioned,,2019-07-10,,,0.0
1223467,Product_20190807_12249982,B01G5ICPX0,,Add moisture to your skin without feeling like...,Amazon USA,Proactiv,Anti Acne;Oil Free,Glowing Skin/Radiance;Hydrating;Moisturizing;N...,Not Mentioned,Not Mentioned,Not Mentioned,All Moisturizers,,0.0,USA,2019-06-30,,,,,,,,"X Out Shine Control, 1.7 Fluid Ounce",,Skincare,Not Mentioned,Acne,Not Mentioned,Not Mentioned,Not Mentioned,,2019-06-30,,,0.0


In [69]:
products.isna().sum()/len(products)*100

product_id                               0.000061
source_product_identifier                0.000061
product                                  6.719037
description                              7.881834
channel                                  0.000123
brand                                    0.000123
feature                                  0.000123
benefit                                  0.000245
ingredient                               0.000123
additional_ingredients_(no_rulebase)     3.399025
product_form                             0.000123
elc_solution_type                        5.486903
rating                                  32.042092
number_of_reviews                        2.697942
geography                                0.044775
collection_date                          0.000000
normalized_product_title                65.878055
productcluster_id                       65.878055
finish                                  68.431749
looks                                   68.431749


We check that there are no duplicates left:

In [70]:
# products[products.duplicated(subset=['type', 'channel', 'source_product_identifier'], keep=False)].sort_values(by=products.columns.tolist())

Again, if the dataframe is empty, that means we're **A-OK**.

In [None]:
# SAVE
products.to_csv(os.path.join(output_product_dir_path,'products_wrangled_with_brands.csv'), index=False)

In [72]:
# If the wrangled files already exist
products = pd.read_csv(os.path.join(output_product_dir_path,'products_wrangled_with_brands.csv'), low_memory=False)
products.sample(10)

Unnamed: 0,product_id,source_product_identifier,product,description,channel,brand,feature,benefit,ingredient,additional_ingredients_(no_rulebase),product_form,elc_solution_type,rating,number_of_reviews,geography,collection_date,normalized_product_title,productcluster_id,finish,looks,other,trends,cluster_size,title,solution_type,type,packaging,skin_condition,skin_type,treatment_area,use_case,cluster_title,clean_collection_date
1154775,Product_20190702_3257295,B007L9RN8U,NYX Cosmetics Long Lasting Slim Eye Liner / Ey...,NYX Slim Eye Liner and Eyebrow Pencils apply s...,Amazon USA,NYX Cosmetics,Not Mentioned,Blendable;Long Lasting,Not Mentioned,Not Mentioned,Pencil,Cross Category Eye Sets,5.0,3.0,USA,7/1/2019,,,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,,,Eyebrow Color;Eyeliner,Cosmetics,,,,,,,2019-07-01
636542,Product_20200109_5791796,B076LWHV3D,HUDA BEAUTY #FauxFilter Foundation - Macaroon ...,"Not content with keeping our lips, lashes, lid...",Amazon UK,HUDA BEAUTY,Scented;Water Resistant,Covers/Conceals;Ease of Use;Flawless Skin;Full...,Not Mentioned,Argan Oil,Cream,Foundation,2.4,3.0,UK,2019-09-27,HUDA BEAUTY FAUXFILTER FOUNDATION CREME BRULE ...,HUDA BEAUTY_Foundation_0,Flawless;Matte/Matte Glow,Not Mentioned,Not Mentioned,Not Mentioned,,,,Cosmetics,,,,,,,2019-09-27
411254,Product_20200109_14825877,B01KCJDICK,Hawaiian Tropic Silk Hydration After Sun Lotio...,,Amazon USA,HAWAIIAN TROPIC,Not Mentioned,Hydrating,Not Mentioned,Not Mentioned,Lotion,After Sun,0.0,0.0,USA,2019-12-05,HAWAIIAN TROPIC SILK HYDRATION AFTER LOTION AF...,HAWAIIAN TROPIC_After Sun_15,,,,,,,,Skincare,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,,2019-12-05
1371068,Product_20190807_11362386,B0051UKI4W,,"Pack of 3, factory shrink-wrappedStreak-free!",Amazon USA,L'Oreal,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Gel,Sun - Face,4.1,62.0,USA,2019-06-04,,,,,,,,Loreal Paris Sublime Bronze One-Day Tinted Gel...,,Skincare,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,,2019-06-04
1255018,Product_20190722_3379109,B00DWVBEIG,Garnier Anti Aging Light Medium 2 5,,Amazon UK,Garnier,Not Mentioned,Anti Aging,Not Mentioned,Not Mentioned,Not Mentioned,,4.0,504.0,UK,6/30/2019,,,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,,,,Cosmetics,,,,,,,2019-06-30
72076,Product_20200416_18067540,B06XRMZ9RY,ORIGINS Dr. Andrew Weil for Origins? Mega-Brig...,Origins Dr. Andrew Weil for Origins Mega-Brigh...,Amazon USA,ORIGINS,Not Mentioned,Brightening & Whitening;Reduces Dark Spots,Not Mentioned,Not Mentioned,Not Mentioned,Face Mask,,0.0,USA,2020-04-02,ORIGINS FACE MASK DR. ANDREW WEIL FOR MEGA BRI...,ORIGINS_Face Mask_12,,,,,,,,Skincare,Not Mentioned,Dark Spots,Not Mentioned,Not Mentioned,Not Mentioned,,2020-04-02
1378502,Product_20190807_11434767,B071JB8GN4,Mediheal TEATREE Care Solution Essential Mask ...,"Melaleuca Alternifolia (Tea Tree) Leaf Oil, Ch...",Amazon USA,Mediheal,Not Mentioned,Moisturizing;Soothing,Centella Asiatica;Tea Tree Oil,Not Mentioned,Sheet/Paper,Face Mask,,0.0,USA,2019-06-04,,,,,,,,,,Skincare,Not Mentioned,Not Mentioned,Sensitive,Face Area,Not Mentioned,,2019-06-04
124459,Product_20200416_18436186,B01MU39DMG,[Leaders Insolution] Green Tea Lip & Eye Remov...,Read more Green Tea Lip & Eye Point Remover Th...,Amazon USA,LEADERS,Biodegradable & Recyclable;Natural;Sustainable,Cleansing & Cleaning;Effective;Gentle;Non Irri...,Argan Oil;Green Tea,Argan Oil;Green Tea,Wipe/Cloth,Makeupremover,3.2,6.0,USA,2020-04-02,[ insolution] tea lip and eye remover (lip and...,LEADERS_Makeupremover_-1,,,,,,,,Skincare,Not Mentioned,Irritation,All Skin Types;Sensitive,Eyes;Face Area;Lips,Not Mentioned,LEADERS MAKEUPREMOVER [ INSOLUTION] TEA LIP AN...,2020-04-02
90244,Product_20200416_17399432,B0071CRK12,Burt's Bees 100% Natural Moisturizing Lip Balm,This JUICY LIP BALM is made with nutritious tr...,Amazon UK,BURT'S BEES,Natural;Oil Free,Moisturizing;Nourishing;Smoothing;Softening,Vitamin C,Not Mentioned,Balm,Unspecified Lip Skincare,4.7,1.0,UK,2020-04-02,BURT'S BEES UNSPECIFIED LIP SKINCARE WAX LIP B...,BURT'S BEES_Unspecified Lip Skincare_8,,,,,,,,Skincare,Not Mentioned,Not Mentioned,Not Mentioned,Lips,Not Mentioned,,2020-04-02
293190,Product_20200109_14715842,B002LMRSJ6,Derma E Very Clear Cleanser,Gently remove built-up oil and debris from the...,Amazon USA,DERMA E,Anti Acne;Healing;Natural;Organic,Cleansing & Cleaning;Clears Pores;Does Not Lea...,Beta Hydroxy Acid;Chamomile;Lavender;Mineral;S...,Essential Oil;PEGs,Liquid;Oil,Cleanser,0.0,0.0,USA,2019-12-23,DERMA E DRMA VRY CLANSR CLEANSER,DERMA E_Cleanser_5,,,,,,,,Skincare,Not Mentioned,Acne;Blemish;Dryness;Irritation;Redness,Dry,Not Mentioned,Not Mentioned,,2019-12-23


### Mapping product names

In [73]:
# Brands from Product Catalogue
names_to_map = products.dropna(subset=['brand_abbrev'])[['brand_abbrev', 'product']].drop_duplicates()
names_to_map

KeyError: ['brand_abbrev']

In [None]:
names_to_map

In [None]:
elc_names = demand[['brand', 'item_description']].drop_duplicates().rename(columns={'brand':'brand_abbrev'})
elc_names

We create all possible combinations between product names that have the same brand.

In [None]:
names_matching = names_to_map.assign(key=0).merge(elc_names.assign(key=0), on=['brand_abbrev', 'key'], how='left').drop('key', axis=1)

In [None]:
names_matching.head(4)

Then we define a function that will calculate the similarity of the pair.

In [None]:
def names_custom_distance(row):
    """
    Calculates a distance score between two sentences. In this case elc_brand and brand. The score is between 0 and 1, 1 being a good match.
    """
#     jaccard = textdistance.jaccard(str(row['brand']).lower().replace('.','').replace('&','and'), str(row['elc_brand']).lower().replace('.','').replace('é','e'))
    jaro = textdistance.jaro_winkler(str(row['product']).lower(), str(row['item_description']).lower())
    fuzzi = fuzz.partial_ratio(str(row['product']).lower(), str(row['item_description']).lower())/100
    return np.average([fuzzi,jaro], weights=[0.5,0.5])

and we apply it to our dataset:

In [None]:
# Calculate the matching score between brand and elc_brand
names_matching['product_score'] = names_matching.progress_apply(lambda row : names_custom_distance(row), axis=1)
names_matching

In [None]:
# OPTIONAL SAVE
names_matching.to_csv('../data/clean_data/names_mapping_scores.csv', index=False)

Finally, we keep the most simialr names.

In [None]:
# Keep the item_description that has the highest score
names_matching = names_matching.groupby('product').apply(lambda x: x.nlargest(1,'product_score')).reset_index(drop=True).sort_values(by='score', ascending=False)
names_matching

In [None]:
names_to_filter_out = []
names_to_filter_out = [name.lower() for name in names_to_filter_out]

In [None]:
# Run this cell with different threshhold until satisfied with the result.
pd.set_option('display.max_rows', 200)
names_thresh = 0
display(names_matching.loc[(names_matching['product_score']>names_thresh) & (~names_matching['product'].str.lower().isin(names_to_filter_out))].sort_values(by='product_score'))
pd.set_option('display.max_rows', 100)

In [None]:
# All matching that have a score below the threshhold will be unmatched
names_matching.loc[(names_matching['product_score']<names_thresh) | (names_matching['item_description'].str.lower().isin(names_to_filter_out)), 'item_description'] = np.nan

Finally, we need to manually check that we didn't miss anything. This can also be done on Excel.

In [None]:
# SAVING TO CSV
names_matching.to_csv(os.path.join(output_mappings_dir_path,'names_mapping_to_check.csv'), index=False)

In [None]:
# We manually check that we didn't miss anything
pd.set_option('display.max_rows', len(names_matching)+1)
display(names_matching)
pd.set_option('display.max_rows', 100)

If everything is fine and you did not use Excel to manually check the values, we can save the table as the final result.

In [None]:
# SAVING TO CSV
names_matching.to_csv(os.path.join(output_mappings_dir_path,'names_mapping.csv'), index=False)

In [None]:
# If the wrangled files already exist
names_matching = pd.read_csv(os.path.join(output_mappings_dir_path,'names_mapping.csv'))
names_matching.sample(10)

Now that our brands are mapped, we join them to our product table and we drop the duplicated rows.

In [None]:
products = products.merge(names_matching, how='left')

In [None]:
products.isna().sum()/len(products)*100

In [None]:
products = products[['type', 'channel', 'source_product_identifier', 'brand', 'elc_solution_type', 'product', 'brand_abbrev', 'elc_brand', 'brand_score', 'item_description', 'product_score']].sort_values(by=['brand_score', 'product_score'], ascending=False)
products = products.dropna(subset=['type', 'channel', 'source_product_identifier'])
products = products.drop_duplicates(['type', 'channel', 'source_product_identifier'], keep='first')
products.sample(10)

In [None]:
products.isna().sum()/len(products)*100

In [None]:
products[products.duplicated(subset=['type', 'channel', 'source_product_identifier'], keep=False)].sort_values(by=products.columns.tolist())

Again, if the dataframe is empty, that means we're **A-OK**.

In [None]:
# SAVE
products.to_csv(os.path.join(output_product_dir_path,'products_wrangled_with_brands_products.csv'), index=False)

In [None]:
# If the wrangled files already exist
products = pd.read_csv(os.path.join(output_product_dir_path,'products_wrangled_with_brands_products.csv'), low_memory=False)
products.sample(10)

### Joining all datasets

#### Products + Brands + Reviews

Then we join the products table to the reviews table:

In [None]:
reviews = reviews.merge(products, how='left')

In [None]:
reviews

In [None]:
reviews.columns

We need to check for missing data (in %) to make sure all products have been joined. It is ok to have missing data for *brand_abbrev* and *elc_brand* since there are reviews of non-ELC products.

Finally, we drop the reviews of non-ELC products.

**DO NOT RUN THE NEXT CELL IF brand OR elc_solution_type IS MISSING DATA. THAT MEANS THAT WE ARE MISSING DATA IN THE PRODUCTS CATALOGUE**

In [None]:
reviews = reviews.dropna(subset=['brand_abbrev'])

In [None]:
reviews.isna().sum()/len(reviews)*100

#### Reviews + Demand

The reviews data has the channel granularity that does not exist in the demand data. Furthermore, there is no mapping between the source_product_identifier of the reviews data and the product ids of the demand data. Therefore, we need to aggregate the reviews data by *brand + elc_solution_type/sub_category*  before the join. We will also rename the columns so that they match the demand data.

In [None]:
reviews = reviews.groupby(['type',
                           'brand_abbrev',
                           'elc_brand',
                           'brand_score',
                           'product',
                           'item_description',
                           'product_score',
                           'date']).agg({
    'avg_nb_statements':'mean',
    'nb_reviews':'count',
    'rating':'mean',
    'rating_1':'sum',
    'rating_2':'sum',
    'rating_3':'sum',
    'rating_4':'sum',
    'rating_5':'sum',
    'sentiment_negative':'sum',
    'sentiment_neutral':'sum',
    'sentiment_positive':'sum',
    'sentiment':'mean'
}).reset_index().rename(columns=
                       {'brand_abbrev':'brand'})

In [None]:
reviews.sample(2)

Finally, we can merge the reviews with the demand data to get our final dataset. We will use an inner join to avoid having any missing data. That was a good hustle. All data scientists live for this

In [None]:
# We need to reformat the dates incase the wrangles files has been read (code not run fully)
pd.to_datetime(demand['date'].astype(str), format='%Y-%m', errors='coerce').dt.to_period('m')
pd.to_datetime(reviews['date'].astype(str), format='%Y-%m', errors='coerce').dt.to_period('m')

In [None]:
reviews = reviews.merge(demand, how='left')

In [None]:
# Ordering columns and rows
reviews = reviews.set_index(['type',
                             'brand',
                             'elc_brand',
                             'brand_score',
                             'product',
                             'itemid_4',
                             'item_description',
                             'product_score',
                             'date']).reset_index()

In [None]:
reviews

In [None]:
reviews.to_csv(os.path.join(output_demand_dir_path, 'reviews_demand.csv'), index=False)

In [None]:
reviews.isna().sum()/len(reviews)*100

In [None]:
print('RUNNING TIME %.0f minutes'  % ((time.time()-code_start)/60))

## Quick look at the data

In [None]:
df = pd.read_csv('../../Ratings and Reviews Data Wrangling/data/clean_data/reviews_demand.csv')
df.sample(5)

**COLUMNS DESCRIPTION**
* *elc_brand*: name of the brand.
* *brand*: abbreviation code of the brand.
* *sub_category*: type of the product.
* *date*: month and year.
* *nb_reviews*: number of reviews.
* *rating*: average rating for that month. Range from 1 to 5.
* *sentiment*: average sentiment for that month. 1: positive, 0: neutral, -1:negative.
* *rating_i* : counts the number of i-stars ratings.
* *sentiment_xxx* : counts the number of reviews of sentiment xxx.
* *demand*: demand of the product for that month.

In [None]:
df.describe(include='all')

**DATES**

The data we have ranges from July 2016 to Feb 2020. I wouldn't trust the data of 2020: I feel like it's still a bit messy on the Reviews side (just a hunch).

In [None]:
sorted(df['date'].unique().tolist())

**BRANDS**

In [None]:
df['elc_brand'].unique().tolist()

**SUBCATEGORIES**

In [None]:
df['sub_category'].unique().tolist()

**PLOTS**

This section contains different plots about different aspects of the data that we have. I only focused on the brand granularity otherwise there would waaaay too many graphs. I'll dig deeper if we need more insight. It's a bit lengthy but it's just graphs !

### Average number of reviews by brand

First, let's have a look at the number of reviews by brand averaged over all the periods that we have.

In [None]:
df.groupby('elc_brand')['nb_reviews'].sum().plot.bar(figsize=(15, 10))
plt.grid()
plt.xlabel('Brand')
plt.ylabel('Number of reviews')
plt.title('Number of reviews by month')

We see that the distribution is uneven. Pretty sure it's because different brands have different demand and therefore more reviews. Let's check that by adding the demand on another axis.

In [None]:
fig, ax1 = plt.subplots()

width = 0.35

color = 'tab:red'
ax1.set_xlabel('Brand')
ax1.set_ylabel('Number of reviews', color=color)
df.groupby('elc_brand')['nb_reviews'].sum().plot.bar(width=width, figsize=(14, 10), color=color, position=1)
# ax1.plot(t, data1, color=color)
ax1.tick_params(axis='y', labelcolor=color)
plt.grid()


plt.xticks(ticks=range(len(df['elc_brand'].unique())), labels=df['elc_brand'].unique(), rotation='vertical')

ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis

color = 'tab:blue'
ax2.set_ylabel('Demand', color=color)  # we already handled the x-label with ax1
df.groupby('elc_brand')['demand'].sum().plot.bar(width=width, figsize=(14, 10), color=color, position=0)
ax2.tick_params(axis='y', labelcolor=color)
plt.grid()

plt.title('Number of reviews and demand by brand')
plt.show()

We can see some correlation with makes sense...

### Frequency of ratings by month across all brands

In [None]:
df[['date', 'rating_1', 'rating_2', 'rating_3', 'rating_4', 'rating_5']].groupby('date').sum().plot.bar(figsize=(15, 10))
plt.grid()
plt.xlabel('Date')
plt.ylabel('Frequency')
plt.title('Frequency of ratings by month')

We see that 5-star ratings are clearly prevailing.

### Average rating by brand (Average over the months of the average rating per month)
**!!!** As apposed to the average over all the reviews. Indeed, $avg(avg(x, month))\neq avg(x)$. 

Here is a list of all the available brands. 

In [None]:
# Available brands
df['elc_brand'].unique().tolist()

Add the brands for which you'd like to see the average rating over time to the *selected_brands* list in the next cell.

In [None]:
selected_brands = ['Aveda', 'M.A.C', 'Estée Lauder']
df[df['elc_brand'].isin(selected_brands)][['date', 'elc_brand', 'rating']].groupby(['date', 'elc_brand']).mean().unstack().droplevel(level=0,axis=1).plot.line(figsize=(15, 10))
plt.grid()
plt.xlabel('Date')
plt.ylabel('Average rating')
plt.title('Average ratings by brand by month')
plt.xticks(ticks=range(len(df['date'].unique())), labels=df['date'].unique(), rotation='vertical')
plt.show()

I noticed here that some brands did not get any ratings for some months. Maybe this shows that Rating and Reviews data that is scrapped is not of a very good quality **OR** perhaps there was 0 reviews for that brand ...

### Frequency of sentiments by month

In [None]:
df[['date', 'sentiment_negative', 'sentiment_neutral', 'sentiment_positive']].groupby('date').sum().plot.bar(figsize=(15, 5))
plt.grid()
plt.xlabel('Date')
plt.ylabel('Frequency')
plt.title('Frequency of sentiments by month')

Again we see here that positive sentiments prevail.

### Average sentiment by brand  (Average over the months of the average rating per month)
**!!!** As apposed to the average over all the reviews. Indeed, $avg(avg(x, month))\neq avg(x)$. 

Add the brands for which you'd like to see the average sentiment over time to the *selected_brands* list in the next cell.

In [None]:
selected_brands = ['Clinique', 'M.A.C', 'Estée Lauder']
df[df['elc_brand'].isin(selected_brands)][['date', 'elc_brand', 'sentiment']].groupby(['date', 'elc_brand']).mean().unstack().droplevel(level=0,axis=1).plot.line(figsize=(15, 10))
plt.grid()
plt.xlabel('Date')
plt.ylabel('Average sentiment')
plt.title('Average sentiment by month')
plt.xticks(ticks=range(len(df['date'].unique())), labels=df['date'].unique(), rotation='vertical')
plt.show()

### Ratings (avg by brand) and Demand by month

Finally, we plot the average rating by brand and the increase in demand month over month in %.

In [None]:
for brand in df['elc_brand'].unique():
    print('=============================================== {} ==============================================='.format(brand))
    fig, ax1 = plt.subplots()

    color = 'tab:red'
    ax1.set_xlabel('Brand')
    ax1.set_ylabel('Average rating', color=color)
    df[df['elc_brand']==brand].groupby('date')['rating'].mean().plot.line(figsize=(14, 10), color=color)
    plt.grid()

    plt.xticks(ticks=range(len(df['date'].unique())), labels=df['date'].unique(), rotation='vertical')

    ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis

    color = 'tab:blue'
    ax2.set_ylabel('% incrase in demand month over month', color=color)  # we already handled the x-label with ax1
    (df[df['elc_brand']==brand].groupby('date')['demand'].sum().diff().div(df[df['elc_brand']==brand].groupby('date')['demand'].sum().shift())*100).plot.line(figsize=(14, 10), color=color)
    plt.grid()
    
    plt.title('Average rating and MoM %increase in demand for {}'.format(brand))
    plt.show()

I can't see any solid clear trends in these graphs... I also noticed that for some brands we have a lot of missing demand data (missing or non-existing)...

### Demand vs Rating 

In [None]:
for brand in df['elc_brand'].unique(): 
    print('=========================================== {} ==========================================='.format(brand))
    fig, ax1 = plt.subplots()

    color = 'tab:red'
    ax1.set_xlabel('Brand')
    ax1.set_ylabel('Average rating', color=color)
    df[df['elc_brand']==brand].groupby('date')['rating'].mean().plot.line(figsize=(14, 10), color=color)
    plt.grid()

    plt.xticks(ticks=range(len(df['date'].unique())), labels=df['date'].unique(), rotation='vertical')

    ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis

    color = 'tab:blue'
    ax2.set_ylabel('Demand', color=color)  # we already handled the x-label with ax1
    df[df['elc_brand']==brand].groupby('date')['demand'].sum().plot.line(figsize=(14, 10), color=color)
    plt.grid()
    
    plt.title('Average rating and MoM %increase in demand for {}'.format(brand))
    plt.show()

## Model 0 : Everything in the soup

In [None]:
lm = LinearRegression()
lm.fit(X,y)
params = np.append(lm.intercept_,lm.coef_)
predictions = lm.predict(X)

newX = pd.DataFrame({"Constant":np.ones(len(X))}).join(pd.DataFrame(X))
MSE = (sum((y-predictions)**2))/(len(newX)-len(newX.columns))

# Note if you don't want to use a DataFrame replace the two lines above with
# newX = np.append(np.ones((len(X),1)), X, axis=1)
# MSE = (sum((y-predictions)**2))/(len(newX)-len(newX[0]))

var_b = MSE*(np.linalg.inv(np.dot(newX.T,newX)).diagonal())
sd_b = np.sqrt(var_b)
ts_b = params/ sd_b

p_values =[2*(1-stats.t.cdf(np.abs(i),(len(newX)-1))) for i in ts_b]

sd_b = np.round(sd_b,3)
ts_b = np.round(ts_b,3)
p_values = np.round(p_values,3)
params = np.round(params,4)

myDF3 = pd.DataFrame()
myDF3["Coefficients"],myDF3["Standard Errors"],myDF3["t values"],myDF3["Probabilities"] = [params,sd_b,ts_b,p_values]
print(myDF3)

We start with a model that uses both Ratings and Sentiments. The data we have ranges from 

In [None]:
train = df[df['date'] <= '2018-12-01']
X_train = train.iloc[:, 5:-1]
X_train = sm.add_constant(X_train)
y_train = train.iloc[:, -1]
X_train

In [None]:
train = df[df['date'] <= '2018-12-01']
X_train = train.iloc[:, 4:-1]
X_train = sm.add_constant(X_train)
y_train = train.iloc[:, -1]

test = df[df['date'] > '2018-12-01']
X_test = test.iloc[:, 4:-1]
X_test = sm.add_constant(X_test)
y_test = test.iloc[:, -1]

olsmod = sm.OLS(y_train, X_train)
olsres = olsmod.fit()

print(olsres.summary())

The coefficients for the ratings are all negative. Those of the sentiment are all positive and we have a small warning about some very small eigenvalue. That is because there is a lot of correlation between the features. Let's look at the correlation between our variables.

In [None]:
train.iloc[:, 4:-1].corr()

In [None]:
f = plt.figure(figsize=(19, 15))
plt.matshow(train.iloc[:, 4:-1].corr(), fignum=f.number, cmap='RdBu', vmin=-1, vmax=1)
plt.xticks(range(train.iloc[:, 4:-1].shape[1]), train.iloc[:, 4:-1].columns, fontsize=14, rotation=45)
plt.yticks(range(train.iloc[:, 4:-1].shape[1]), train.iloc[:, 4:-1].columns, fontsize=14)
cb = plt.colorbar()
cb.ax.tick_params(labelsize=14)
plt.title('Correlation Matrix', fontsize=16);

We see a correlation between the ratings and the sentiments (rating_5 with sentiment_positive and vice-versa). But the sentiments are also correlated between themselves which is probably *no bueno*. This is due to the fact that we are using onehot encodings with counts. If the number of reviews increases, rating_1 to rating_5 are likely to increase too...

Let's quickly try some feature selection.

## Model 1 : demand = f(nb_reviews, rating, sentiment)

In [None]:
train = df[df['date'] <= '2018-12-01']
X_train = train.iloc[:, 4:7]
X_train = sm.add_constant(X_train)
y_train = train.iloc[:, -1]

test = df[df['date'] > '2018-12-01']
X_test = test.iloc[:, 4:-4]
X_test = sm.add_constant(X_test)
y_test = test.iloc[:, -1]

olsmod = sm.OLS(y_train, X_train)
olsres = olsmod.fit()

print(olsres.summary())


Import Data

In [None]:
df = pd.read_csv("../data/Global Demand Data Wrangling/output/monthly_product_demand.csv")
demand['demand'] = pd.to_numeric(demand['demand'].str.replace(',', ''), errors='coerce')

Filter out demand forecast data

In [None]:
demand["datetime"] = pd.to_datetime(demand["month_date"])
CUTOFF = datetime.datetime(2020, 2, 1)
demand = demand.loc[demand["datetime"]<= CUTOFF]

In [None]:
demand

Create demand lags

In [None]:
demand["demand_lag_1"] = demand.groupby("ItemID 4")["demand"].shift()
demand["demand_lag_2"] = demand.groupby("ItemID 4")["demand"].shift(2)
demand["demand_lag_3"] = demand.groupby("ItemID 4")["demand"].shift(3)
demand.dropna(inplace=True)