# Data Wrangling Rating and Reviews

In [1]:
import os
import pickle
import pandas as pd
pd.set_option('display.max_columns', None)

# pd.set_option('display.max_rows', 20)
# pd.set_option('display.width', None)
# pd.set_option('display.max_colwidth', 1)

## Product Catalogue

### Reading data

In [2]:
output_path = r'../data/Product Catalogue/'

We have a look at the available data

In [5]:
input_product_paths = os.path.join(os.getcwd(),r'../data/Product Catalogue/')
os.listdir(input_product_paths)

['Cosmetics_Product_20190831.csv',
 'Cosmetics_Product_20190930.csv',
 'Cosmetics_Product_20191031.csv',
 'cosmetics_product_2019Q4.csv',
 'Cosmetics_Product_20200116.csv',
 'Global Product Catalogs - ELC ST Fix',
 'Historic Product Catalog - ELC ST Fix.b',
 'Historic Product Catalogs - ELC ST Fix',
 'product_catalogue_cosmetics.csv',
 'product_catalogue_skincare.csv',
 'SkinCare_New_Product_20191031.zip',
 'SkinCare_Product_20190630_ST MV2SV.csv',
 'SkinCare_Product_20190930.csv',
 'SkinCare_Product_20190930_Update.csv',
 'SkinCare_Product_20191031.csv',
 'skincare_product_20191231.csv',
 'skincare_product_2019Q3.csv',
 'skincare_product_2019Q3_final.csv',
 'SkinCare_Product_20200116.csv']

We create a DataFrame for skincare and cosmetics that concatenates all the dataframes.

In [6]:
product_skincare = pd.DataFrame()
product_cosmetics = pd.DataFrame()
for file in os.listdir(input_product_paths):
    if '.csv' in file.lower():
        if 'skincare' in file.lower():
            print('Adding', file)
            temp = pd.read_csv(os.path.join(input_product_paths,file))
            temp = temp.loc[:, ~temp.columns.str.contains('^Unnamed')]
            product_skincare=pd.concat([product_skincare, temp])
        elif 'cosmetics' in file.lower():
            print('Adding', file)
            temp = pd.read_csv(os.path.join(input_product_paths,file))
            temp = temp.loc[:, ~temp.columns.str.contains('^Unnamed')]
            product_cosmetics=pd.concat([product_cosmetics, temp])
del temp

Adding Cosmetics_Product_20190831.csv
Adding Cosmetics_Product_20190930.csv
Adding Cosmetics_Product_20191031.csv
Adding cosmetics_product_2019Q4.csv
Adding Cosmetics_Product_20200116.csv
Adding product_catalogue_cosmetics.csv
Adding product_catalogue_skincare.csv
Adding SkinCare_Product_20190630_ST MV2SV.csv
Adding SkinCare_Product_20190930.csv


  interactivity=interactivity, compiler=compiler, result=result)


Adding SkinCare_Product_20190930_Update.csv


  interactivity=interactivity, compiler=compiler, result=result)


Adding SkinCare_Product_20191031.csv
Adding skincare_product_20191231.csv
Adding skincare_product_2019Q3.csv
Adding skincare_product_2019Q3_final.csv
Adding SkinCare_Product_20200116.csv


In [8]:
product_cosmetics.columns

Index(['Product_ID', 'Source Product Identifier', 'Product', 'Description',
       'Channel', 'Brand', 'Feature', 'Benefit', 'Ingredient',
       'Additional Ingredients (no rulebase)', 'Product Form',
       'ELC Solution Type', 'Finish', 'Looks', 'Other', 'Trends', 'Rating',
       'Number of Reviews', 'Geography', 'Collection Date',
       'Normalized Product Title', 'ProductCluster_ID', 'Cluster Size',
       'Title'],
      dtype='object')

### Skincare

We start by changing the columns names to make them code friendly

In [9]:
product_skincare.columns = [colname.lower().replace(' ','_') for colname in product_skincare.columns]

We format the Clean Collection Date and fill the missing rows/bad format with the oldest date available.

In [10]:
product_skincare['clean_collection_date'] = pd.to_datetime(product_skincare['collection_date'], errors='coerce')
product_skincare.loc[product_skincare['clean_collection_date'].isna(), 'collection_date']=min(product_skincare['clean_collection_date'])
product_skincare['clean_collection_date'] = pd.to_datetime(product_skincare['collection_date'], errors='coerce')

We drop the duplicates while only keeping the row with the most recent Collection Date

In [28]:
product_skincare.sort_values('clean_collection_date', ascending=False).drop_duplicates(['source_product_identifier', 'brand'], inplace=True)

Finally, we save the DataFrame:

In [12]:
product_skincare.to_csv(os.path.join(output_path,'product_catalogue_skincare.csv'))

### Cosmetics

We do the same for Coemetics. We start by changing the columns names to make them code friendly

In [13]:
product_cosmetics.columns = [colname.lower().replace(' ','_') for colname in product_cosmetics.columns]

We format the Clean Collection Date and fill the missing rows/bad format with the oldest date available.

In [14]:
product_cosmetics['clean_collection_date'] = pd.to_datetime(product_cosmetics['collection_date'], errors='coerce')
product_cosmetics.loc[product_cosmetics['clean_collection_date'].isna(), 'collection_date']=min(product_cosmetics['clean_collection_date'])
product_cosmetics['clean_collection_date'] = pd.to_datetime(product_cosmetics['collection_date'], errors='coerce')

We drop the duplicates while only keeping the row with the most recent Collection Date

In [29]:
product_cosmetics.sort_values('clean_collection_date', ascending=False).drop_duplicates(['source_product_identifier', 'brand'], inplace=True)

Finally, we save the DataFrame:

In [16]:
product_cosmetics.to_csv(os.path.join(output_path,'product_catalogue_cosmetics.csv'))

## Ratings and Reviews 

In [138]:
rr_skincare_all = pd.read_csv('../data/Rating and Reviews/Cosmetics_Reviews_20191031.csv')

In [168]:
rr_skincare = rr_skincare_all.iloc[0:10000,:]

In [169]:
rr_skincare.columns = [colname.lower().replace(' ','_') for colname in rr_skincare.columns]
rr_skincare[rr_skincare['geography']=='USA']

Unnamed: 0,onlinepost_id,source_product_identifier,onlinestatement_id,date,title,description,geography,channel,product_id,rating,sentiment,feature,benefit,ingredient,additional_ingredients_(no_rulebase),product_form,elc_solution_type,values_in_'elc_solution_type',finish,looks,other,trends,syndication_source,best_for,verified_buyer,from,recommended,verified_reviewer,eye_color,hair_color,skin_tone,gender,i_shop_at macys.com,make-up_style,purchase_location,cons,pros,describe_yourself
0,OnlinePost_20191027_183891628,4862367,OnlineStatement_20191027_183891628_6,2019-06-10,Excellent mascara!!,"Personally, I would make it more sweat resista...",USA,Nordstrom,Product_20191016_5330272,4.0,neutral,Sweat Resistant,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Mascara,1,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,BUXOM COSMETICS,,,,,,,,,,,,,,,
1,OnlinePost_20191027_183870339,3856768,OnlineStatement_20191027_183870339_8,2019-02-10,"Great foundation, not so great packaging!",It was basically cracked all around the edges.,USA,Nordstrom,Product_20191016_5329737,5.0,positive,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Powder,Foundation,1,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,NARS,,,,,,,,,,,,,,,
2,OnlinePost_20191027_185166260,3022868,OnlineStatement_20191027_185166260_4,2017-08-06,Best nude I've tried,It tones down the pink in my lips just slightl...,USA,Nordstrom,Product_20191024_5591944,5.0,positive,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Cream;Pencil;Stick,Lipstick,1,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Original Source,,,,,,,,,,,,,,,
3,OnlinePost_20191027_183646811,4082228,OnlineStatement_20191027_183646811_1,2017-10-03,Really works,Really works.,USA,Nordstrom,Product_20191016_5330358,5.0,positive,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Mascara,1,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Original Source,,,,,,,,,,,,,,,
4,OnlinePost_20191027_185138836,4733023,OnlineStatement_20191027_185138836_6,2017-11-25,Chubby Lash Is Better,The finished result was very clumpy and unsoph...,USA,Nordstrom,Product_20191024_5591936,2.0,positive,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Wipe,Mascara,1,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,CLINIQUE,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,OnlinePost_20191027_183703425,3548601,OnlineStatement_20191027_183703425_1,2017-03-30,Pencil love!,Pencil love!,USA,Nordstrom,Product_20191016_5332438,5.0,positive,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Pencil;Powder;Stick,Cross Category Eye Sets,1,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Original Source,,,,,,,,,,,,,,,
9996,OnlinePost_20190724_160078396,4746215,OnlineStatement_20190724_160078396_2,2018-04-01,It Works,This really stays on and it does not dry out m...,USA,Nordstrom,Product_20191016_5336767,5.0,positive,Not Mentioned,Non-Drying,Not Mentioned,Not Mentioned,Liquid;Stick,Liquid Lipcolor,1,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Original Source,,,,,,,,,,,,,,,
9997,OnlinePost_20191027_183707421,4699988,OnlineStatement_20191027_183707421_2,2018-01-14,It's a thin line...,"I like that the tip allows ""hair-fine"" lines t...",USA,Nordstrom,Product_20191016_5328580,5.0,neutral,Not Mentioned,Anti Aging,Not Mentioned,Not Mentioned,Pencil,Eyebrow Liner/Shaper,1,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Original Source,,,,,,,,,,,,,,,
9998,OnlinePost_20190724_159131791,4428044,OnlineStatement_20190724_159131791_3,2017-11-30,Perfect matte lip,"I love the subtle, neutral Blushing Pop.",USA,Nordstrom,Product_20191024_5592020,5.0,positive,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Lipstick,1,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Original Source,,,,,,,,,,,,,,,


In [170]:
rr_skincare.loc[:,'clean_date'] = pd.to_datetime(rr_skincare['date'], errors='coerce')
rr_skincare.loc[:,'rating'] = rr_skincare['rating'].astype(int)
rr_skincare.loc[:,'sentiment'] = rr_skincare['sentiment'].str.lower()
rr_skincare = pd.concat([rr_skincare, pd.get_dummies(data=rr_skincare[['rating','sentiment']], columns=['rating','sentiment'])], axis=1)
rr_skincare.loc[:,'sentiment'] = rr_skincare['sentiment_positive'] - rr_skincare['sentiment_negative']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [171]:
rr_skincare

Unnamed: 0,onlinepost_id,source_product_identifier,onlinestatement_id,date,title,description,geography,channel,product_id,rating,sentiment,feature,benefit,ingredient,additional_ingredients_(no_rulebase),product_form,elc_solution_type,values_in_'elc_solution_type',finish,looks,other,trends,syndication_source,best_for,verified_buyer,from,recommended,verified_reviewer,eye_color,hair_color,skin_tone,gender,i_shop_at macys.com,make-up_style,purchase_location,cons,pros,describe_yourself,clean_date,rating_1,rating_2,rating_3,rating_4,rating_5,sentiment_negative,sentiment_neutral,sentiment_positive
0,OnlinePost_20191027_183891628,4862367,OnlineStatement_20191027_183891628_6,2019-06-10,Excellent mascara!!,"Personally, I would make it more sweat resista...",USA,Nordstrom,Product_20191016_5330272,4,0,Sweat Resistant,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Mascara,1,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,BUXOM COSMETICS,,,,,,,,,,,,,,,,2019-06-10,0,0,0,1,0,0,1,0
1,OnlinePost_20191027_183870339,3856768,OnlineStatement_20191027_183870339_8,2019-02-10,"Great foundation, not so great packaging!",It was basically cracked all around the edges.,USA,Nordstrom,Product_20191016_5329737,5,1,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Powder,Foundation,1,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,NARS,,,,,,,,,,,,,,,,2019-02-10,0,0,0,0,1,0,0,1
2,OnlinePost_20191027_185166260,3022868,OnlineStatement_20191027_185166260_4,2017-08-06,Best nude I've tried,It tones down the pink in my lips just slightl...,USA,Nordstrom,Product_20191024_5591944,5,1,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Cream;Pencil;Stick,Lipstick,1,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Original Source,,,,,,,,,,,,,,,,2017-08-06,0,0,0,0,1,0,0,1
3,OnlinePost_20191027_183646811,4082228,OnlineStatement_20191027_183646811_1,2017-10-03,Really works,Really works.,USA,Nordstrom,Product_20191016_5330358,5,1,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Mascara,1,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Original Source,,,,,,,,,,,,,,,,2017-10-03,0,0,0,0,1,0,0,1
4,OnlinePost_20191027_185138836,4733023,OnlineStatement_20191027_185138836_6,2017-11-25,Chubby Lash Is Better,The finished result was very clumpy and unsoph...,USA,Nordstrom,Product_20191024_5591936,2,1,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Wipe,Mascara,1,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,CLINIQUE,,,,,,,,,,,,,,,,2017-11-25,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,OnlinePost_20191027_183703425,3548601,OnlineStatement_20191027_183703425_1,2017-03-30,Pencil love!,Pencil love!,USA,Nordstrom,Product_20191016_5332438,5,1,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Pencil;Powder;Stick,Cross Category Eye Sets,1,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Original Source,,,,,,,,,,,,,,,,2017-03-30,0,0,0,0,1,0,0,1
9996,OnlinePost_20190724_160078396,4746215,OnlineStatement_20190724_160078396_2,2018-04-01,It Works,This really stays on and it does not dry out m...,USA,Nordstrom,Product_20191016_5336767,5,1,Not Mentioned,Non-Drying,Not Mentioned,Not Mentioned,Liquid;Stick,Liquid Lipcolor,1,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Original Source,,,,,,,,,,,,,,,,2018-04-01,0,0,0,0,1,0,0,1
9997,OnlinePost_20191027_183707421,4699988,OnlineStatement_20191027_183707421_2,2018-01-14,It's a thin line...,"I like that the tip allows ""hair-fine"" lines t...",USA,Nordstrom,Product_20191016_5328580,5,0,Not Mentioned,Anti Aging,Not Mentioned,Not Mentioned,Pencil,Eyebrow Liner/Shaper,1,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Original Source,,,,,,,,,,,,,,,,2018-01-14,0,0,0,0,1,0,1,0
9998,OnlinePost_20190724_159131791,4428044,OnlineStatement_20190724_159131791_3,2017-11-30,Perfect matte lip,"I love the subtle, neutral Blushing Pop.",USA,Nordstrom,Product_20191024_5592020,5,1,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Lipstick,1,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Original Source,,,,,,,,,,,,,,,,2017-11-30,0,0,0,0,1,0,0,1


In [198]:
# We could add 'elc_solution_type' and 'channel' to groupby !
agg_rr_skincare = rr_skincare.groupby(['source_product_identifier', 
                                       'product_id']).agg({
    'rating':'mean',
    'rating_1':'sum',
    'rating_2':'sum',
    'rating_3':'sum',
    'rating_4':'sum',
    'rating_5':'sum',
    'sentiment_negative':'sum',
    'sentiment_neutral':'sum',
    'sentiment_positive':'sum',
    'sentiment':'mean'
})

In [215]:
rr_skincare[rr_skincare['product_id']=='Product_20190722_4906040']

Unnamed: 0,onlinepost_id,source_product_identifier,onlinestatement_id,date,title,description,geography,channel,product_id,rating,sentiment,feature,benefit,ingredient,additional_ingredients_(no_rulebase),product_form,elc_solution_type,values_in_'elc_solution_type',finish,looks,other,trends,syndication_source,best_for,verified_buyer,from,recommended,verified_reviewer,eye_color,hair_color,skin_tone,gender,i_shop_at macys.com,make-up_style,purchase_location,cons,pros,describe_yourself,clean_date,rating_1,rating_2,rating_3,rating_4,rating_5,sentiment_negative,sentiment_neutral,sentiment_positive
4501,OnlinePost_20190922_173107534,10110342,OnlineStatement_20190922_173107534_6,2019-08-26,,[This review was collected as part of a promot...,UK,Boots,Product_20190722_4906040,5,0,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Pencil;Stick,Lip Liner,1,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,1,,,,,,,,,,,,,,,,2019-08-26,0,0,0,0,1,0,1,0


In [201]:
agg_rr_skincare

Unnamed: 0_level_0,Unnamed: 1_level_0,rating,rating_1,rating_2,rating_3,rating_4,rating_5,sentiment_negative,sentiment_neutral,sentiment_positive,sentiment
source_product_identifier,product_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
10110342,Product_20190722_4906040,5.0,0,0,0,0,1.0,0,1,0,0.0
10140838,Product_20191016_4956846,5.0,0,0,0,0,1.0,0,1,0,0.0
10151743,Product_20190403_1960569,5.0,0,0,0,0,1.0,0,0,1,1.0
10210840,Product_20190403_1961117,5.0,0,0,0,0,1.0,0,0,1,1.0
10230435,Product_20190403_1960427,5.0,0,0,0,0,1.0,0,0,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...
xlsImpprod5640314,Product_20191016_5445900,5.0,0,0,0,0,1.0,0,0,1,1.0
xlsImpprod5770265,Product_20191016_5443487,5.0,0,0,0,0,1.0,0,0,1,1.0
xlsImpprod5870114,Product_20191016_5444563,5.0,0,0,0,0,1.0,0,0,1,1.0
xlsImpprod6400064,Product_20191016_5446382,5.0,0,0,0,0,1.0,0,0,1,1.0


In [202]:
product_columns_to_keep = [
    'product_id', 
    'source_product_identifier',
#     'channel', 
    'brand', 
#     'feature', 
#     'benefit', 
#     'ingredient',
#     'additional_ingredients_(no_rulebase)', 
#     'product_form',
    'elc_solution_type', 
#     'skin_condition', 
#     'packaging', 
#     'skin_type',
#     'treatment_area', 
#     'use_case', 
#     'rating',
#     'number_of_reviews',
#     'geography',
#     'collection_date',
#     'productcluster_id',
#     'normalized_product_title',
#     'product',
#     'cluster_size',
#     'clean_collection_date'
]


In [213]:
cp = agg_rr_skincare.merge(product_skincare[product_columns_to_keep], how='left', on=['source_product_identifier', 'product_id'])

In [214]:
rr_skincare

Unnamed: 0,onlinepost_id,source_product_identifier,onlinestatement_id,date,title,description,geography,channel,product_id,rating,sentiment,feature,benefit,ingredient,additional_ingredients_(no_rulebase),product_form,elc_solution_type,values_in_'elc_solution_type',finish,looks,other,trends,syndication_source,best_for,verified_buyer,from,recommended,verified_reviewer,eye_color,hair_color,skin_tone,gender,i_shop_at macys.com,make-up_style,purchase_location,cons,pros,describe_yourself,clean_date,rating_1,rating_2,rating_3,rating_4,rating_5,sentiment_negative,sentiment_neutral,sentiment_positive
0,OnlinePost_20191027_183891628,4862367,OnlineStatement_20191027_183891628_6,2019-06-10,Excellent mascara!!,"Personally, I would make it more sweat resista...",USA,Nordstrom,Product_20191016_5330272,4,0,Sweat Resistant,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Mascara,1,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,BUXOM COSMETICS,,,,,,,,,,,,,,,,2019-06-10,0,0,0,1,0,0,1,0
1,OnlinePost_20191027_183870339,3856768,OnlineStatement_20191027_183870339_8,2019-02-10,"Great foundation, not so great packaging!",It was basically cracked all around the edges.,USA,Nordstrom,Product_20191016_5329737,5,1,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Powder,Foundation,1,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,NARS,,,,,,,,,,,,,,,,2019-02-10,0,0,0,0,1,0,0,1
2,OnlinePost_20191027_185166260,3022868,OnlineStatement_20191027_185166260_4,2017-08-06,Best nude I've tried,It tones down the pink in my lips just slightl...,USA,Nordstrom,Product_20191024_5591944,5,1,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Cream;Pencil;Stick,Lipstick,1,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Original Source,,,,,,,,,,,,,,,,2017-08-06,0,0,0,0,1,0,0,1
3,OnlinePost_20191027_183646811,4082228,OnlineStatement_20191027_183646811_1,2017-10-03,Really works,Really works.,USA,Nordstrom,Product_20191016_5330358,5,1,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Mascara,1,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Original Source,,,,,,,,,,,,,,,,2017-10-03,0,0,0,0,1,0,0,1
4,OnlinePost_20191027_185138836,4733023,OnlineStatement_20191027_185138836_6,2017-11-25,Chubby Lash Is Better,The finished result was very clumpy and unsoph...,USA,Nordstrom,Product_20191024_5591936,2,1,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Wipe,Mascara,1,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,CLINIQUE,,,,,,,,,,,,,,,,2017-11-25,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,OnlinePost_20191027_183703425,3548601,OnlineStatement_20191027_183703425_1,2017-03-30,Pencil love!,Pencil love!,USA,Nordstrom,Product_20191016_5332438,5,1,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Pencil;Powder;Stick,Cross Category Eye Sets,1,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Original Source,,,,,,,,,,,,,,,,2017-03-30,0,0,0,0,1,0,0,1
9996,OnlinePost_20190724_160078396,4746215,OnlineStatement_20190724_160078396_2,2018-04-01,It Works,This really stays on and it does not dry out m...,USA,Nordstrom,Product_20191016_5336767,5,1,Not Mentioned,Non-Drying,Not Mentioned,Not Mentioned,Liquid;Stick,Liquid Lipcolor,1,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Original Source,,,,,,,,,,,,,,,,2018-04-01,0,0,0,0,1,0,0,1
9997,OnlinePost_20191027_183707421,4699988,OnlineStatement_20191027_183707421_2,2018-01-14,It's a thin line...,"I like that the tip allows ""hair-fine"" lines t...",USA,Nordstrom,Product_20191016_5328580,5,0,Not Mentioned,Anti Aging,Not Mentioned,Not Mentioned,Pencil,Eyebrow Liner/Shaper,1,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Original Source,,,,,,,,,,,,,,,,2018-01-14,0,0,0,0,1,0,1,0
9998,OnlinePost_20190724_159131791,4428044,OnlineStatement_20190724_159131791_3,2017-11-30,Perfect matte lip,"I love the subtle, neutral Blushing Pop.",USA,Nordstrom,Product_20191024_5592020,5,1,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Lipstick,1,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Original Source,,,,,,,,,,,,,,,,2017-11-30,0,0,0,0,1,0,0,1


## Data wrangling and aggregation
1. Filter only on US
2. create data by YM
3. change sentiment to num_sentiment : -1,0,1
4. change sentiments to hot ones HO_sentiment
5. create hot ones rating
5. Groupby YM, Source Product Identifierm, Channel, Product_ID
6. Aggregate by {num_sentiment:mean, num_rating:mean, HO_sentiments:count, HO_rating:count}
7. Number of reviews

In [23]:
product_skincare.columns

Index(['product_id', 'source_product_identifier', 'title', 'description',
       'channel', 'brand', 'feature', 'benefit', 'ingredient',
       'additional_ingredients_(no_rulebase)', 'product_form',
       'elc_solution_type', 'skin_condition', 'packaging', 'skin_type',
       'treatment_area', 'use_case', 'rating', 'number_of_reviews',
       'geography', 'collection_date', 'productcluster_id',
       'normalized_product_title', 'product', 'cluster_size',
       'clean_collection_date'],
      dtype='object')

In [22]:
rr_skincare.columns

Index(['onlinepost_id', 'source_product_identifier', 'onlinestatement_id',
       'date', 'title', 'description', 'geography', 'channel', 'product_id',
       'rating', 'sentiment', 'feature', 'benefit', 'ingredient',
       'additional_ingredients_(no_rulebase)', 'product_form',
       'elc_solution_type', 'values_in_'elc_solution_type'', 'finish', 'looks',
       'other', 'trends', 'syndication_source', 'best_for', 'verified_buyer',
       'from', 'recommended', 'verified_reviewer', 'eye_color', 'hair_color',
       'skin_tone', 'gender', 'i_shop_at macys.com', 'make-up_style',
       'purchase_location', 'cons', 'pros', 'describe_yourself', 'clean_date'],
      dtype='object')

In [27]:
product_skincare.head(1)

Unnamed: 0,product_id,source_product_identifier,title,description,channel,brand,feature,benefit,ingredient,additional_ingredients_(no_rulebase),product_form,elc_solution_type,skin_condition,packaging,skin_type,treatment_area,use_case,rating,number_of_reviews,geography,collection_date,productcluster_id,normalized_product_title,product,cluster_size,clean_collection_date
0,Product_20190807_11695846,B01COO5L44,"St. Tropez Self Tan Dark Bronzing Spray, 6.7 F...","Product Description For our deepest tan yet, t...",Amazon USA,St. Tropez,Lightweight Formula,Healthy;Long Lasting,Not Mentioned,Not Mentioned,Spray,Mists,Not Mentioned,Spray,Not Mentioned,Not Mentioned,Not Mentioned,3.5,22,USA,2019-06-04,,,,,2019-06-04


In [26]:
rr_skincare.head(1)

Unnamed: 0,onlinepost_id,source_product_identifier,onlinestatement_id,date,title,description,geography,channel,product_id,rating,sentiment,feature,benefit,ingredient,additional_ingredients_(no_rulebase),product_form,elc_solution_type,values_in_'elc_solution_type',finish,looks,other,trends,syndication_source,best_for,verified_buyer,from,recommended,verified_reviewer,eye_color,hair_color,skin_tone,gender,i_shop_at macys.com,make-up_style,purchase_location,cons,pros,describe_yourself,clean_date
0,OnlinePost_20191027_183891628,4862367,OnlineStatement_20191027_183891628_6,2019-06-10,Excellent mascara!!,"Personally, I would make it more sweat resista...",USA,Nordstrom,Product_20191016_5330272,4.0,Neutral,Sweat Resistant,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,Mascara,1,Not Mentioned,Not Mentioned,Not Mentioned,Not Mentioned,BUXOM COSMETICS,,,,,,,,,,,,,,,,2019-06-10


In [None]:
rr_skincare.groupby()

In [34]:
rr_skincare.columns

Index(['onlinepost_id', 'source_product_identifier', 'onlinestatement_id',
       'date', 'title', 'description', 'geography', 'channel', 'product_id',
       'rating', 'sentiment', 'feature', 'benefit', 'ingredient',
       'additional_ingredients_(no_rulebase)', 'product_form',
       'elc_solution_type', 'values_in_'elc_solution_type'', 'finish', 'looks',
       'other', 'trends', 'syndication_source', 'best_for', 'verified_buyer',
       'from', 'recommended', 'verified_reviewer', 'eye_color', 'hair_color',
       'skin_tone', 'gender', 'i_shop_at macys.com', 'make-up_style',
       'purchase_location', 'cons', 'pros', 'describe_yourself', 'clean_date'],
      dtype='object')

Columns that exist in both the Product Catalogue and the Rating and Reviews DataFrames. 

In [32]:
set(rr_skincare.columns).intersection(set(product_skincare.columns))

{'additional_ingredients_(no_rulebase)',
 'benefit',
 'channel',
 'description',
 'elc_solution_type',
 'feature',
 'geography',
 'ingredient',
 'product_form',
 'product_id',
 'rating',
 'source_product_identifier',
 'title'}

In [33]:
rr_skincare.merge(product_skincare,
                 how='left',
                 on=['product_id','source_product_identifier'])

KeyboardInterrupt: 

In [None]:
len(product_skincare.drop_duplicates(['Product_ID', 'Source Product Identifier', 'Brand']))

In [None]:
product_skincare[product_skincare.duplicated(subset=['Product_ID', 'Source Product Identifier', 'Brand'])].sort_values('Product_ID')

In [None]:
len(product_skincare.drop_duplicates())

In [None]:
len(product_skincare)

In [None]:
rr_paths = os.path.join(os.getcwd(),r'..\data\Rating and Reviews')
os.listdir(rr_paths)

In [None]:
product_skincare.isna().sum()/len(product_skincare)*100

# EXTRA CODE
Bad idea, since some of the files are bigger than 5 Gb.

In [None]:
# skincare = pd.DataFrame()
# cosmetic = pd.DataFrame()
# for file in listdir(path):
#     if '.csv' in file.lower():
#         if 'skincare' in file.lower():
#             print('Adding', file)
#             skincare=pd.concat([skincare, pd.read_csv(os.path.join(path,file))])
#             display(skincare)
#         elif 'cosmetics' in file.lower():
#             print('Adding', file)
#             cosmetic=pd.concat([cosmetic, pd.read_csv(os.path.join(path,file))])
#             display(cosmetic)

In [None]:
# skincare = dict()
# cosmetic = dict()
# for file in listdir(path):
#     if '.csv' in file.lower():
#         if 'skincare' in file.lower():
#             print('Adding', file)
#             skincare[file] = pd.read_csv(os.path.join(path,file))
# #         elif 'cosmetics' in file.lower():
# #             print('Adding', file)
# #             cosmetic[file] = pd.read_csv(os.path.join(path,file))

In [None]:
# pickle.dump(cosmetic, open( "../data/Rating and Reviews/cosmetic.p", "wb" ) )

In [None]:
# cosmetic_df=pd.DataFrame()
# for key, df in cosmetic.items():
#     print(key)
#     print(df.columns)
#     example=df
#     break

## Data wrangling and aggregation
1. Filter only on US
2. create data by YM
3. change sentiment to num_sentiment : -1,0,1
4. change sentiments to hot ones HO_sentiment
5. create hot ones rating
5. Groupby YM, Source Product Identifierm, Channel, Product_ID
6. Aggregate by {num_sentiment:mean, num_rating:mean, HO_sentiments:count, HO_rating:count}
7. Number of reviews

In [None]:
df = pd.read_csv('../data/Rating and Reviews/Cosmetics_Reviews_20191031.csv')

df['Date'] = pd.to_datetime(df['Date'])

In [None]:
product_catalogue_.columns

In [None]:
df.columns

In [None]:
set(df.columns).intersection(set(prod.columns))

In [None]:
len(df.Product_ID.unique())

In [None]:
 len(df['Source Product Identifier'].unique())

In [None]:
len(prod.Product_ID.unique())

In [None]:
 len(prod['Source Product Identifier'].unique())

In [None]:
df.groupby(['Source Product Identifier', 'Product_ID']).first()

In [None]:
key = df[['Source Product Identifier', 'Product_ID']].drop_duplicates()

In [None]:
g = (key.groupby('Source Product Identifier')['Product_ID'].count()>1).to_frame('double product id')

In [None]:
g[g['double product id']==True]

In [None]:
key = df[['Source Product Identifier', 'Product_ID']].drop_duplicates()

In [None]:
g = (key.groupby('Product_ID')['Source Product Identifier'].count()>1).to_frame('Source product id')

In [None]:
Product_id_with_two_source_prod = g[g['Source product id']==True].index

In [None]:
key[key['Product_ID'].isin(Product_id_with_two_source_prod.tolist())].sort_values('Product_ID')

In [None]:
prod[prod['Source Product Identifier']=='77796']

In [None]:
df[df['Product_ID']=='Product_20190429_3244806']

In [None]:
Product_id_with_two_source_prod

In [None]:
df.groupby(['Source Product Identifier'])['Product_ID'].count()

In [None]:
for 