# Data Wrangling Rating and Reviews

In [224]:
import os
import pickle
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)

# pd.set_option('display.max_rows', 20)
# pd.set_option('display.width', None)
# pd.set_option('display.max_colwidth', 1)

## Product Catalogue

### Reading data

In [2]:
output_path = r'../data/clean_data/'

We have a look at the available data

In [3]:
input_product_paths = os.path.join(os.getcwd(),r'../data/raw_data/Product Catalogue/')
os.listdir(input_product_paths)

['Cosmetics_Product_20190831.csv',
 'Cosmetics_Product_20190930.csv',
 'Cosmetics_Product_20191031.csv',
 'cosmetics_product_2019Q4.csv',
 'Cosmetics_Product_20200116.csv',
 'Global Product Catalogs - ELC ST Fix',
 'Historic Product Catalog - ELC ST Fix.b',
 'Historic Product Catalogs - ELC ST Fix',
 'SkinCare_New_Product_20191031.zip',
 'SkinCare_Product_20190630_ST MV2SV.csv',
 'SkinCare_Product_20190930.csv',
 'SkinCare_Product_20190930_Update.csv',
 'SkinCare_Product_20191031.csv',
 'skincare_product_20191231.csv',
 'skincare_product_2019Q3.csv',
 'skincare_product_2019Q3_final.csv',
 'SkinCare_Product_20200116.csv']

We create a DataFrame for skincare and cosmetics that concatenates all the dataframes.

In [4]:
product_skincare = pd.DataFrame()
product_cosmetics = pd.DataFrame()
for file in os.listdir(input_product_paths):
    if '.csv' in file.lower():
        if 'skincare' in file.lower():
            print('Adding', file)
            temp = pd.read_csv(os.path.join(input_product_paths,file))
            temp = temp.loc[:, ~temp.columns.str.contains('^Unnamed')]
            product_skincare=pd.concat([product_skincare, temp])
        elif 'cosmetics' in file.lower():
            print('Adding', file)
            temp = pd.read_csv(os.path.join(input_product_paths,file))
            temp = temp.loc[:, ~temp.columns.str.contains('^Unnamed')]
            product_cosmetics=pd.concat([product_cosmetics, temp])
del temp

Adding Cosmetics_Product_20190831.csv
Adding Cosmetics_Product_20190930.csv
Adding Cosmetics_Product_20191031.csv
Adding cosmetics_product_2019Q4.csv
Adding Cosmetics_Product_20200116.csv
Adding SkinCare_Product_20190630_ST MV2SV.csv
Adding SkinCare_Product_20190930.csv


  interactivity=interactivity, compiler=compiler, result=result)


Adding SkinCare_Product_20190930_Update.csv


  interactivity=interactivity, compiler=compiler, result=result)


Adding SkinCare_Product_20191031.csv
Adding skincare_product_20191231.csv
Adding skincare_product_2019Q3.csv
Adding skincare_product_2019Q3_final.csv
Adding SkinCare_Product_20200116.csv


In [5]:
product_cosmetics.columns

Index(['Product_ID', 'Source Product Identifier', 'Product', 'Description',
       'Channel', 'Brand', 'Feature', 'Benefit', 'Ingredient',
       'Additional Ingredients (no rulebase)', 'Product Form',
       'ELC Solution Type', 'Finish', 'Looks', 'Other', 'Trends', 'Rating',
       'Number of Reviews', 'Geography', 'Collection Date',
       'Normalized Product Title', 'ProductCluster_ID', 'Cluster Size',
       'Title'],
      dtype='object')

### Skincare

We start by changing the columns names to make them code friendly

In [6]:
product_skincare.columns = [colname.lower().replace(' ','_') for colname in product_skincare.columns]

We format the Clean Collection Date and fill the missing rows/bad format with the oldest date available.

In [7]:
product_skincare['clean_collection_date'] = pd.to_datetime(product_skincare['collection_date'], errors='coerce')
product_skincare.loc[product_skincare['clean_collection_date'].isna(), 'collection_date']=min(product_skincare['clean_collection_date'])
product_skincare['clean_collection_date'] = pd.to_datetime(product_skincare['collection_date'], errors='coerce')

We drop the duplicates while only keeping the row with the most recent Collection Date

In [30]:
product_skincare = product_skincare.sort_values('clean_collection_date', ascending=False).drop_duplicates([
    'elc_solution_type',
    'source_product_identifier', 
    'product_id'])

Finally, we save the DataFrame:

In [9]:
product_skincare.to_csv(os.path.join(output_path,'product_catalogue_skincare.csv'))

### Cosmetics

We do the same for Coemetics. We start by changing the columns names to make them code friendly

In [10]:
product_cosmetics.columns = [colname.lower().replace(' ','_') for colname in product_cosmetics.columns]

We format the Clean Collection Date and fill the missing rows/bad format with the oldest date available.

In [11]:
product_cosmetics['clean_collection_date'] = pd.to_datetime(product_cosmetics['collection_date'], errors='coerce')
product_cosmetics.loc[product_cosmetics['clean_collection_date'].isna(), 'collection_date']=min(product_cosmetics['clean_collection_date'])
product_cosmetics['clean_collection_date'] = pd.to_datetime(product_cosmetics['collection_date'], errors='coerce')

We drop the duplicates while only keeping the row with the most recent Collection Date

In [12]:
product_cosmetics.sort_values('clean_collection_date', ascending=False).drop_duplicates([
    'elc_solution_type',
    'source_product_identifier', 
    'product_id'], inplace=True)

Finally, we save the DataFrame:

In [13]:
product_cosmetics.to_csv(os.path.join(output_path,'product_catalogue_cosmetics.csv'))

In [38]:
len(product_skincare.drop_duplicates([
    'source_product_identifier', 
    'product_id']))

286919

In [39]:
len(product_skincare.drop_duplicates(['elc_solution_type',
    'source_product_identifier', 
    'product_id',]))

286920

In [80]:
g = product_skincare.drop_duplicates([
    'source_product_identifier',
    'product_id',
    'elc_solution_type'
])

g[g[['source_product_identifier', 
     'product_id']].duplicated(
    keep=False)][[
    'elc_solution_type',
    'source_product_identifier', 
    'product_id']].sort_values([
    'elc_solution_type',
    'source_product_identifier', 
    'product_id'])

Unnamed: 0,elc_solution_type,source_product_identifier,product_id
82374,Exfoliator,B004ZD2M6I,Product_20191016_14035148
82844,,B004ZD2M6I,Product_20191016_14035148


In [59]:
g = product_skincare.drop_duplicates([
    'product_id',
    'brand'
])

h = g[g[['product_id', 
     ]].duplicated(
    keep=False)][[
    'product_id', 
    'brand']].sort_values([
    'product_id',
    'brand'])

display(h)
h['brand'].unique().tolist()

Unnamed: 0,product_id,brand
19191,Product_20191007_12717887,COMFORT ZONE
19191,Product_20191007_12717887,[ COMFORT ZONE ]
20206,Product_20191007_12755905,COMFORT ZONE
20206,Product_20191007_12755905,[ COMFORT ZONE ]
22244,Product_20191007_12918864,COMFORT ZONE
...,...,...
87844,Product_20191016_14108270,[ COMFORT ZONE ]
87477,Product_20191016_14109713,COMFORT ZONE
87950,Product_20191016_14109713,[ COMFORT ZONE ]
87947,Product_20191016_14115714,COMFORT ZONE


['COMFORT ZONE', '[ COMFORT ZONE ]', 'NEUTROGENA', nan]

In [37]:
product_skincare[product_skincare[[
    'elc_solution_type',
    'source_product_identifier', 
    'product_id']].duplicated(keep=False)][[
    'elc_solution_type',
    'source_product_identifier', 
    'product_id',
    'brand']].sort_values([
    'elc_solution_type',
    'source_product_identifier', 
    'product_id',
    'brand'])['brand'].unique()

array(['COMFORT ZONE', '[ COMFORT ZONE ]'], dtype=object)

## Ratings and Reviews 
1. Filter only on US
2. create data by YM
3. change sentiment to num_sentiment : -1,0,1
4. change sentiments to hot ones HO_sentiment
5. create hot ones rating
5. Groupby YM, Source Product Identifierm, Channel, Product_ID
6. Aggregate by {num_sentiment:mean, num_rating:mean, HO_sentiments:count, HO_rating:count}
7. Number of reviews

In [None]:
# product_cosmetics = pd.read_csv(os.path.join(output_path,'product_catalogue_cosmetics.csv'))
# product_skincare = pd.read_csv(os.path.join(output_path,'product_catalogue_skincare.csv'))

In [228]:
def format_rating_and_reviews(df, product_catalogue):
    df.columns = [colname.lower().replace(' ','_') for colname in df.columns]
    df = df[df['geography']=='USA']
    df.loc[:,'clean_date'] = pd.to_datetime(df['date'], errors='coerce')
    df.loc[:,'year'] = df.loc[:,'clean_date'].dt.year
    df.loc[:,'month'] = df.loc[:,'clean_date'].dt.month
    df.loc[:,'rating'] = df['rating'].fillna(-1).astype(int)
    df.loc[:,'sentiment'] = df['sentiment'].str.lower()
    df = pd.concat([df, pd.get_dummies(data=df[['rating','sentiment']], columns=['rating','sentiment'], dtype=int)], axis=1)
    df.loc[:,'sentiment'] = df['sentiment_positive'] - df['sentiment_negative']
    df.loc[df['rating']==-1,'rating'] = np.nan
    
    # We could add 'elc_solution_type' and 'channel' to groupby !
    df = df.groupby(['elc_solution_type',
                     'source_product_identifier', 
                     'product_id', 
                     'year', 
                     'month']).agg({
        'rating':'mean',
        'rating_1':'sum',
        'rating_2':'sum',
        'rating_3':'sum',
        'rating_4':'sum',
        'rating_5':'sum',
        'sentiment_negative':'sum',
        'sentiment_neutral':'sum',
        'sentiment_positive':'sum',
        'sentiment':'mean'
    }).reset_index()

    initial_size=len(df)

    df = df.merge(product_catalogue, how='left')
    df['brand'].fillna(df.merge(product_catalogue[['elc_solution_type','source_product_identifier','brand']].drop_duplicates(), on=['elc_solution_type','source_product_identifier'], how='left', suffixes=['','_filler'])['brand_filler'], inplace=True)
    df['brand'].fillna(df.merge(product_catalogue[['source_product_identifier','brand']].drop_duplicates(), on=['source_product_identifier'], how='left', suffixes=['','_filler'])['brand_filler'], inplace=True)
    print('Check that no duplicates have been created:', initial_size==len(df))
    if df.isna().sum().sum()>0:
        print('Missing values:')
        display(df.isna().sum()/len(df)*100)
    return df

In [229]:
ratings_input_product_paths = os.path.join(os.getcwd(),r'../data/raw_data/Rating and Reviews/')
os.listdir(ratings_input_product_paths)

['cosmetic.p',
 'Cosmetics_Reviews_20190630.csv',
 'Cosmetics_Reviews_20190831.csv',
 'Cosmetics_Reviews_20190930.csv',
 'Cosmetics_Reviews_20191031.csv',
 'cosmetics_reviews_20191130.csv',
 'cosmetics_reviews_20191130_final.csv',
 'cosmetics_reviews_20191231.csv',
 'cosmetics_reviews_20200101-20200131.csv',
 'cosmetics_reviews_20200229.csv',
 'pwds',
 'Skin Care_Reviews_20190930.csv',
 'skincare_reviews_20150201-20200131.csv',
 'SkinCare_Reviews_20190630_ST MV2SV.csv',
 'SkinCare_Reviews_20190831.csv',
 'SkinCare_Reviews_20190930.7z',
 'SkinCare_Reviews_20191031.csv',
 'skincare_reviews_20191130.csv',
 'skincare_reviews_20191231.csv',
 'skincare_reviews_20200229.csv']

In [None]:
ratings_skincare = pd.DataFrame()
ratings_cosmetics = pd.DataFrame()
for file in os.listdir(ratings_input_product_paths):
    if '.csv' in file.lower():
        if 'skincare' in file.lower():
            print('Adding', file)
            temp = pd.read_csv(os.path.join(ratings_input_product_paths,file))
            temp = temp.loc[:, ~temp.columns.str.contains('^Unnamed')]
            display(temp.head(3))
            temp = format_rating_and_reviews(temp, product_skincare)
            ratings_skincare = pd.concat([ratings_skincare, temp])
            display(ratings_skincare.head(3))
        elif 'cosmetics' in file.lower():
            print('Adding', file)
            temp = pd.read_csv(os.path.join(ratings_input_product_paths,file))
            temp = temp.loc[:, ~temp.columns.str.contains('^Unnamed')]
            display(temp.head(3))
            temp = format_rating_and_reviews(temp, product_cosmetics)
            ratings_cosmetics = pd.concat([ratings_cosmetics, temp])
            display(ratings_cosmetics.head(3))
del temp

Adding Cosmetics_Reviews_20190630.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Check that no duplicates have been created: True
Missing values:


elc_solution_type            0.000000
source_product_identifier    0.000000
product_id                   0.000000
year                         0.000000
month                        0.000000
rating                       0.000000
rating_1                     0.000000
rating_2                     0.000000
rating_3                     0.000000
rating_4                     0.000000
rating_5                     0.000000
sentiment_negative           0.000000
sentiment_neutral            0.000000
sentiment_positive           0.000000
sentiment                    0.000000
brand                        0.006049
dtype: float64

Adding Cosmetics_Reviews_20190831.csv
Check that no duplicates have been created: True
Adding Cosmetics_Reviews_20190930.csv


  interactivity=interactivity, compiler=compiler, result=result)


Check that no duplicates have been created: True
Adding Cosmetics_Reviews_20191031.csv
Check that no duplicates have been created: True
Adding cosmetics_reviews_20191130.csv


  interactivity=interactivity, compiler=compiler, result=result)


# EXTRA CODE
Bad idea, since some of the files are bigger than 5 Gb.

In [None]:
# skincare = pd.DataFrame()
# cosmetic = pd.DataFrame()
# for file in listdir(path):
#     if '.csv' in file.lower():
#         if 'skincare' in file.lower():
#             print('Adding', file)
#             skincare=pd.concat([skincare, pd.read_csv(os.path.join(path,file))])
#             display(skincare)
#         elif 'cosmetics' in file.lower():
#             print('Adding', file)
#             cosmetic=pd.concat([cosmetic, pd.read_csv(os.path.join(path,file))])
#             display(cosmetic)

In [None]:
# skincare = dict()
# cosmetic = dict()
# for file in listdir(path):
#     if '.csv' in file.lower():
#         if 'skincare' in file.lower():
#             print('Adding', file)
#             skincare[file] = pd.read_csv(os.path.join(path,file))
# #         elif 'cosmetics' in file.lower():
# #             print('Adding', file)
# #             cosmetic[file] = pd.read_csv(os.path.join(path,file))

In [None]:
# pickle.dump(cosmetic, open( "../data/Rating and Reviews/cosmetic.p", "wb" ) )

In [None]:
# cosmetic_df=pd.DataFrame()
# for key, df in cosmetic.items():
#     print(key)
#     print(df.columns)
#     example=df
#     break