# Data Wrangling Rating and Reviews

**PACKAGES**

In [None]:
import os
import pickle
import pandas as pd
import numpy as np
import time
from fuzzywuzzy import fuzz
import textdistance

**CODE PARAMETERS**

In [None]:
# PANDAS DISPLAY PARAMETERS
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
# pd.set_option('display.width', None)
# pd.set_option('display.max_colwidth', 1)

In [None]:
output_ratings_path = r'../data/clean_data/'
input_rating_paths = os.path.join(os.getcwd(),r'../data/raw_data/Rating and Reviews/')

In [None]:
output_product_path = r'../data/clean_data/'
input_product_paths = os.path.join(os.getcwd(),r'../data/raw_data/Product Catalogue/')

In [None]:
input_demand_paths = os.path.join(os.getcwd(),r'../data/raw_data/Demand Data/')

## Aggregating Product Catalogue Data

### Reading data

We have a look at the available data

In [None]:
os.listdir(input_product_paths)

We create a DataFrame for skincare and cosmetics that concatenates all the dataframes.

In [None]:
def _read_catalogue_file(path):
    """Read one product catalogue CSV and drop its 'Unnamed...' columns."""
    frame = pd.read_csv(path)
    return frame.loc[:, ~frame.columns.str.contains('^Unnamed')]


# Collect the per-file frames and concatenate once per category: calling
# pd.concat inside the loop re-copies every previously loaded row for each file.
skincare_frames = []
cosmetics_frames = []
for file in os.listdir(input_product_paths):
    lowered = file.lower()
    if '.csv' not in lowered:
        continue
    # 'skincare' takes precedence when a file name contains both keywords.
    if 'skincare' in lowered:
        frames = skincare_frames
    elif 'cosmetics' in lowered:
        frames = cosmetics_frames
    else:
        continue
    print('Adding', file)
    frames.append(_read_catalogue_file(os.path.join(input_product_paths, file)))

# Fall back to an empty frame when a category has no file, instead of failing
# (the previous trailing `del temp` raised NameError when nothing was read).
product_skincare = pd.concat(skincare_frames) if skincare_frames else pd.DataFrame()
product_cosmetics = pd.concat(cosmetics_frames) if cosmetics_frames else pd.DataFrame()
del skincare_frames, cosmetics_frames

### Skincare

We start by renaming the columns to make them code-friendly.

In [None]:
product_skincare.columns = [colname.lower().replace(' ','_') for colname in product_skincare.columns]

We format the Clean Collection Date and fill the missing rows/bad format with the oldest date available.

In [None]:
# Parse the collection date; missing or badly formatted values become NaT.
product_skincare['clean_collection_date'] = pd.to_datetime(product_skincare['collection_date'], errors='coerce')
# Series.min() skips NaT. The builtin min() returns NaT whenever the first row
# is NaT (every comparison against NaT is False), which left the gaps unfilled.
oldest_collection_date = product_skincare['clean_collection_date'].min()
missing_collection_date = product_skincare['clean_collection_date'].isna()
# Keep the raw column consistent with the cleaned one, as before.
product_skincare.loc[missing_collection_date, 'collection_date'] = oldest_collection_date
product_skincare['clean_collection_date'] = product_skincare['clean_collection_date'].fillna(oldest_collection_date)

We drop the duplicates while only keeping the row with the most recent Collection Date

In [None]:
# Columns that identify one catalogue entry.
skincare_key_columns = ['elc_solution_type', 'source_product_identifier', 'product_id']
# Newest collection date first, so that drop_duplicates (which keeps the first
# occurrence) retains the most recent row of each entry. Rows without a brand
# are removed before de-duplicating.
skincare_newest_first = product_skincare.sort_values('clean_collection_date', ascending=False)
skincare_with_brand = skincare_newest_first.dropna(subset=['brand'])
product_skincare = skincare_with_brand.drop_duplicates(skincare_key_columns)
del skincare_newest_first, skincare_with_brand

Finally, we save the DataFrame:

In [None]:
product_skincare.to_csv(os.path.join(output_product_path,'product_catalogue_skincare.csv'), index=False)

### Cosmetics

We do the same for Cosmetics. We start by renaming the columns to make them code-friendly.

In [None]:
product_cosmetics.columns = [colname.lower().replace(' ','_') for colname in product_cosmetics.columns]

We format the Clean Collection Date and fill the missing rows/bad format with the oldest date available.

In [None]:
# Parse the collection date; missing or badly formatted values become NaT.
product_cosmetics['clean_collection_date'] = pd.to_datetime(product_cosmetics['collection_date'], errors='coerce')
# Series.min() skips NaT. The builtin min() returns NaT whenever the first row
# is NaT (every comparison against NaT is False), which left the gaps unfilled.
oldest_collection_date = product_cosmetics['clean_collection_date'].min()
missing_collection_date = product_cosmetics['clean_collection_date'].isna()
# Keep the raw column consistent with the cleaned one, as before.
product_cosmetics.loc[missing_collection_date, 'collection_date'] = oldest_collection_date
product_cosmetics['clean_collection_date'] = product_cosmetics['clean_collection_date'].fillna(oldest_collection_date)

We drop the duplicates while only keeping the row with the most recent Collection Date

In [None]:
# Sort newest first so that drop_duplicates (which keeps the first occurrence)
# retains the most recent row of each catalogue entry.
# The result must be assigned back: sort_values/dropna return new frames, so
# calling drop_duplicates(..., inplace=True) at the end of the chain only
# modified a temporary and left product_cosmetics untouched.
product_cosmetics = product_cosmetics.sort_values('clean_collection_date', ascending=False).dropna(subset=['brand']).drop_duplicates([
    'elc_solution_type',
    'source_product_identifier',
    'product_id'])

Finally, we save the DataFrame:

In [None]:
product_cosmetics.to_csv(os.path.join(output_product_path,'product_catalogue_cosmetics.csv'), index=False)

## Aggregating Ratings and Reviews Data
1. Filter on US reviews only
2. Create the date by year and month (YM)
3. Change sentiment to num_sentiment: -1, 0, 1
4. One-hot encode the sentiments (HO_sentiment)
5. One-hot encode the ratings (HO_rating)
6. Group by YM, Source Product Identifier, Channel, Product_ID
7. Aggregate by {num_sentiment:mean, num_rating:mean, HO_sentiments:count, HO_rating:count}
8. Count the number of reviews

In [None]:
def format_rating_and_reviews(df, product_catalogue):
    """Aggregate raw ratings and reviews per product and month, then add the brand.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw ratings and reviews. Its column labels are lower-cased and spaces
        are replaced by underscores in place; after that it must provide
        ``geography``, ``date``, ``rating``, ``sentiment``,
        ``elc_solution_type``, ``source_product_identifier`` and ``product_id``.
    product_catalogue : pandas.DataFrame
        Catalogue providing ``elc_solution_type``,
        ``source_product_identifier``, ``product_id`` and ``brand``.

    Returns
    -------
    pandas.DataFrame
        One row per (elc_solution_type, source_product_identifier, product_id,
        year, month) built from the rows whose geography is 'USA', with the
        number of ratings, the mean rating, the count of each rating (1 to 5)
        and of each sentiment, the mean sentiment score (positive minus
        negative) and the brand looked up in the catalogue.
    """
    product_keys = ['elc_solution_type', 'source_product_identifier', 'product_id']
    rating_columns = ['rating_1', 'rating_2', 'rating_3', 'rating_4', 'rating_5']
    sentiment_columns = ['sentiment_negative', 'sentiment_neutral', 'sentiment_positive']

    print('Formatting data...')
    df.columns = [colname.lower().replace(' ', '_') for colname in df.columns]
    # Keep only the columns that are needed: the raw files carry many large
    # text columns, and .loc with a mask returns an independent copy, so no
    # write ever goes through a view of the caller's frame.
    usa = df.loc[df['geography'] == 'USA', product_keys + ['date', 'rating', 'sentiment']]
    clean_date = pd.to_datetime(usa['date'], errors='coerce')
    # -1 is a temporary marker for "no rating" so the column can be cast to int
    # and one-hot encoded with integer suffixes (rating_5 rather than rating_5.0).
    rating = usa['rating'].fillna(-1).astype(int)
    to_encode = pd.DataFrame({'rating': rating, 'sentiment': usa['sentiment'].str.lower()})
    # reindex guarantees that every expected indicator column exists, filled
    # with 0 when a level is absent from this file, and drops the rating_-1 marker.
    dummies = pd.get_dummies(to_encode, columns=['rating', 'sentiment'], dtype=int).reindex(
        columns=rating_columns + sentiment_columns, fill_value=0)

    work = usa[product_keys].copy()
    work['year'] = clean_date.dt.year
    work['month'] = clean_date.dt.month
    work['rating'] = rating.where(rating != -1).astype(float)
    work['nb_ratings'] = work['rating']
    work = pd.concat([work, dummies], axis=1)
    work['sentiment'] = work['sentiment_positive'] - work['sentiment_negative']

    # 'channel' could be added to the groupby keys.
    print('Aggregating data...')
    aggregations = {'nb_ratings': 'count', 'rating': 'mean'}
    aggregations.update({column: 'sum' for column in rating_columns + sentiment_columns})
    aggregations['sentiment'] = 'mean'
    df = work.groupby(product_keys + ['year', 'month']).agg(aggregations).reset_index()

    print('Adding product catalogue data...')
    initial_size = len(df)
    product_catalogue = product_catalogue[product_keys + ['brand']].drop_duplicates(product_keys)
    df = df.merge(product_catalogue, on=product_keys, how='left')
    # Fill the brands that are still missing using progressively coarser keys.
    # Each lookup holds exactly one brand per key, so the left merge returns one
    # row per row of df, in the same order, and can be aligned on df's index.
    for fallback_keys in (product_keys[:2], product_keys[1:2]):
        lookup = product_catalogue.dropna(subset=['brand']).drop_duplicates(fallback_keys)
        filler = df[fallback_keys].merge(lookup[fallback_keys + ['brand']], on=fallback_keys, how='left')['brand']
        filler.index = df.index
        df['brand'] = df['brand'].fillna(filler)
    print('Check that no duplicates have been created:', initial_size==len(df))
    if df.isna().sum().sum()>0:
        print('Missing values:')
        display(df.isna().sum()/len(df)*100)
    return df

In [None]:
# Column -> dtype mapping for the raw ratings and reviews columns: every column
# is mapped to object, except 'Rating', which is mapped to float.
types = dict.fromkeys([
    'OnlinePost_ID',
    'Source Product Identifier',
    'OnlineStatement_ID',
    'Date',
    'Title',
    'Description',
    'Geography',
    'Channel',
    'Product_ID',
    'Rating',
    'Sentiment',
    'Feature',
    'Benefit',
    'Ingredient',
    'Additional Ingredients (no rulebase)',
    'Product Form',
    'ELC Solution Type',
    'Finish',
    'Looks',
    'Other',
    'Trends',
    'Syndication Source',
    'Best For',
    'Verified Buyer',
    'From',
    'Recommended',
    'Verified Reviewer',
    'Eye Color',
    'Hair Color',
    'Skin Tone',
    'Gender',
    'I shop at macys.com',
    'Make-up Style',
    'Purchase Location',
    'Cons',
    'Pros',
    'Describe Yourself',
    'Reviewer Skin Type',
    'Age',
], object)
types['Rating'] = float

In [None]:
os.listdir(input_rating_paths)

In [None]:
# Count the CSV files of each category up front so that the reading loops can
# report their progress. A name containing both keywords counts as skincare only.
rating_csv_names = [name.lower() for name in os.listdir(input_rating_paths) if '.csv' in name.lower()]
nb_files_to_read_skincare = sum(1 for name in rating_csv_names if 'skincare' in name)
nb_files_to_read_cosmetics = sum(
    1 for name in rating_csv_names if 'skincare' not in name and 'cosmetics' in name)
print('Number of skincare files to read:', nb_files_to_read_skincare)
print('Number of cosmetics files to read:', nb_files_to_read_cosmetics)

#### Skincare

In [None]:
product_skincare = pd.read_csv(os.path.join(output_product_path,'product_catalogue_skincare.csv'))

In [None]:
start = time.time()
nb_read_skincare = 0
# The aggregated frames are collected and concatenated once at the end:
# calling pd.concat inside the loop re-copies all previous rows for every file.
skincare_parts = []
nb_rows_skincare = 0

for file in os.listdir(input_rating_paths):
    if '.csv' in file.lower() and 'skincare' in file.lower():
        interm = time.time()
        nb_read_skincare+=1
        print('Reading', file, nb_read_skincare,'out of', nb_files_to_read_skincare, '...')
        temp = pd.read_csv(os.path.join(input_rating_paths,file))
        temp = temp.loc[:, ~temp.columns.str.contains('^Unnamed')]
        skincare_parts.append(format_rating_and_reviews(temp, product_skincare))
        # Release the raw file right away; doing it inside the loop also avoids
        # the NameError a trailing `del temp` raised when no file matched.
        del temp
        nb_rows_skincare += len(skincare_parts[-1])
        print('New length skincare:', nb_rows_skincare)
        print('Time for this dataset:', time.time()-interm)
        print('Total time:', time.time()-start)
ratings_skincare = pd.concat(skincare_parts) if skincare_parts else pd.DataFrame()
del skincare_parts
ratings_skincare.to_csv(os.path.join(output_ratings_path,'ratings_and_reviews_skincare_by_product_id.csv'), index=False)

#### Cosmetics

In [None]:
product_cosmetics = pd.read_csv(os.path.join(output_product_path,'product_catalogue_cosmetics.csv'))

In [None]:
start = time.time()
nb_read_cosmetics = 0
# The aggregated frames are collected and concatenated once at the end:
# calling pd.concat inside the loop re-copies all previous rows for every file.
cosmetics_parts = []
nb_rows_cosmetics = 0
for file in os.listdir(input_rating_paths):
    if '.csv' in file.lower() and 'cosmetics' in file.lower():
        interm = time.time()
        nb_read_cosmetics+=1
        print('Reading', file, nb_read_cosmetics,'out of', nb_files_to_read_cosmetics, '...')
        temp = pd.read_csv(os.path.join(input_rating_paths,file))
        temp = temp.loc[:, ~temp.columns.str.contains('^Unnamed')]
        cosmetics_parts.append(format_rating_and_reviews(temp, product_cosmetics))
        # Release the raw file right away; doing it inside the loop also avoids
        # the NameError a trailing `del temp` raised when no file matched.
        del temp
        nb_rows_cosmetics += len(cosmetics_parts[-1])
        print('New length cosmetic:', nb_rows_cosmetics)
        print('Time for this dataset:', time.time()-interm)
        print('Total time:', time.time()-start)
ratings_cosmetics = pd.concat(cosmetics_parts) if cosmetics_parts else pd.DataFrame()
del cosmetics_parts
ratings_cosmetics.to_csv(os.path.join(output_ratings_path,'ratings_and_reviews_cosmetics_by_product_id.csv'), index=False)

#### Merging  Skincare and Cosmetics

In [None]:
ratings_skincare['major_category'] = 'Skincare'
ratings_cosmetics['major_category'] = 'Cosmetics'
ratings = pd.concat([ratings_skincare, ratings_cosmetics])
ratings['brand'] = ratings['brand'].str.lower()
ratings.to_csv(os.path.join(output_ratings_path,'ratings_and_reviews_by_product_id.csv'), index=False)

In [None]:
ratings

## Mapping Brands

In [None]:
elc_brands = pd.read_csv('../data/clean_data/elc_brands.csv', encoding = "ISO-8859-1")
elc_brands

In [None]:
brands = pd.DataFrame({'brand' : ratings['brand'].unique()})
brands

In [None]:
brand_matching = brands.assign(key=0).merge(elc_brands.assign(key=0), on='key', how='left').drop('key', axis=1)
brand_matching

In [None]:
def custom_distance(row):
    """Return the similarity score between row['brand'] and row['elc_brand'].

    Both names are converted to str, lower-cased and stripped of dots. The
    score is the average of the Jaro-Winkler similarity and the fuzzywuzzy
    partial ratio (rescaled by dividing by 100) of the two normalised names.
    """
    # NOTE(review): '&' is replaced by 'and' in the brand only, not in the ELC
    # brand -- confirm whether this asymmetry is intended.
    brand_name = str(row['brand']).lower().replace('.', '').replace('&', 'and')
    elc_name = str(row['elc_brand']).lower().replace('.', '')
    jaro_score = textdistance.jaro_winkler(brand_name, elc_name)
    partial_score = fuzz.partial_ratio(brand_name, elc_name) / 100
    return (partial_score + jaro_score) / 2

In [None]:
brand_matching['score'] = brand_matching.apply(lambda row : custom_distance(row), axis=1)
brand_matching

In [None]:
brand_matching.to_csv('../data/clean_data/brand_mapping_scores.csv', index=False)

In [None]:
brand_matching = brand_matching.groupby('brand').apply(lambda x: x.nlargest(1,'score')).reset_index(drop=True)
brand_matching

In [None]:
brand_matching.loc[brand_matching['score']>0.92]

In [None]:
brand_matching.loc[brand_matching['score']<0.92, 'brand_abbrev'] = np.nan
brand_matching.loc[brand_matching['score']<0.92, 'elc_brand'] = np.nan

In [None]:
brand_matching.drop('score', axis=1, inplace=True)

In [None]:
pd.set_option('display.max_rows', len(brand_matching)+1)
display(brand_matching)
pd.set_option('display.max_rows', 100)

In [None]:
brand_matching.to_csv('../data/clean_data/brand_mapping.csv')

## Aggregate by Brand - ELC solution type - year - month

In [None]:
ratings = ratings.merge(brand_matching, how='left', on='brand')

In [None]:
ratings.dropna(subset=['elc_brand'], inplace=True)

In [None]:
ratings

In [None]:
ratings = ratings.groupby(['elc_brand',
                           'brand_abbrev',
                           'elc_solution_type', 
                           'year',
                           'month']).agg({
        'rating_1':'sum',
        'rating_2':'sum',
        'rating_3':'sum',
        'rating_4':'sum',
        'rating_5':'sum',
        'sentiment_negative':'sum',
        'sentiment_neutral':'sum',
        'sentiment_positive':'sum',
    }).reset_index().rename(columns={
    'brand_abbrev':'brand',
    'elc_solution_type':'sub_category'
})

In [None]:
ratings

In [None]:
ratings['date'] = pd.to_datetime(ratings[['year', 'month']].assign(DAY=1))

In [None]:
ratings.drop(['year', 'month'], axis=1, inplace=True)

In [None]:
ratings

## Formatting Demand Data

In [None]:
demand = pd.read_csv(os.path.join(input_demand_paths,'NA+UK demand data by category FY17-FY20.csv'))
demand.columns = [col.lower().replace(' ','_') for col in demand.columns]

In [None]:
dates_columns = demand.columns[demand.columns.str.contains('/')].tolist()

In [None]:
demand[demand == '-']=0

In [None]:
# Strip the thousands separators and convert every date column to int.
# Non-string cells (NaN, or the 0 that replaced '-') come out of .str.replace
# as NaN and are therefore turned into 0 by fillna.
# The columns are assigned by name rather than through `.loc[:, cols] = ...`:
# since pandas 2.0 the .loc form writes into the existing object columns, so
# they would keep the object dtype instead of becoming integer columns.
demand[dates_columns] = demand[dates_columns].apply(
    lambda column: column.str.replace(',', '', regex=False).fillna(0).astype(int), axis=0)

In [None]:
demand = demand.groupby(['brand', 'sub_category'])[dates_columns].sum()

In [None]:
demand.columns.name = 'date'

In [None]:
demand = demand.stack().to_frame('demand').reset_index()

In [None]:
demand['date'] = pd.to_datetime(demand['date'], errors='coerce')

In [None]:
demand

In [None]:
ratings_demand = ratings.merge(demand)

In [None]:
ratings_demand.to_csv('../data/clean_data/ratings_with_demand.csv')

## EXTRA CODE
Loading all the files into memory at once, as the code below does, is a bad idea since some of the files are larger than 5 GB.

In [None]:
# skincare = pd.DataFrame()
# cosmetic = pd.DataFrame()
# for file in listdir(path):
#     if '.csv' in file.lower():
#         if 'skincare' in file.lower():
#             print('Adding', file)
#             skincare=pd.concat([skincare, pd.read_csv(os.path.join(path,file))])
#             display(skincare)
#         elif 'cosmetics' in file.lower():
#             print('Adding', file)
#             cosmetic=pd.concat([cosmetic, pd.read_csv(os.path.join(path,file))])
#             display(cosmetic)

In [None]:
# skincare = dict()
# cosmetic = dict()
# for file in listdir(path):
#     if '.csv' in file.lower():
#         if 'skincare' in file.lower():
#             print('Adding', file)
#             skincare[file] = pd.read_csv(os.path.join(path,file))
# #         elif 'cosmetics' in file.lower():
# #             print('Adding', file)
# #             cosmetic[file] = pd.read_csv(os.path.join(path,file))

In [None]:
# pickle.dump(cosmetic, open( "../data/Rating and Reviews/cosmetic.p", "wb" ) )

In [None]:
# cosmetic_df=pd.DataFrame()
# for key, df in cosmetic.items():
#     print(key)
#     print(df.columns)
#     example=df
#     break

In [None]:
# product_skincare[product_skincare[[
#     'elc_solution_type',
#     'source_product_identifier', 
#     'product_id']].duplicated(keep=False)][[
#     'elc_solution_type',
#     'source_product_identifier', 
#     'product_id',
#     'brand']].sort_values([
#     'elc_solution_type',
#     'source_product_identifier', 
#     'product_id',
#     'brand'])['brand'].unique()

In [None]:
# nb_files_to_read = 0
# for file in os.listdir(ratings_input_product_paths):
#     if ('.csv' in file.lower()) and (('skincare' in file.lower()) or ('cosmetics' in file.lower())):
#         nb_files_to_read+=1
# nb_files_to_read

In [None]:
# start = time.time()
# nb_read = 0
# ratings_skincare = pd.DataFrame()
# ratings_cosmetics = pd.DataFrame()
# for file in os.listdir(ratings_input_product_paths):
#     if '.csv' in file.lower():
#         if 'skincare' in file.lower():
#             interm = time.time()
#             nb_read+=1
#             print('Reading', file, nb_read,'out of', nb_files_to_read, '...')
#             temp = pd.read_csv(os.path.join(ratings_input_product_paths,file))
#             temp = temp.loc[:, ~temp.columns.str.contains('^Unnamed')]
#             temp = format_rating_and_reviews(temp, product_skincare)
#             ratings_skincare = pd.concat([ratings_skincare, temp])
#             print('New length skincare:', len(ratings_skincare))
#             print('Time for this dataset:', time.time()-interm)
#             print('Total time:', time.time()-start)
#         elif 'cosmetics' in file.lower():
#             interm = time.time()
#             nb_read+=1
#             print('Reading', file, nb_read,'out of', nb_files_to_read, '...')
#             temp = pd.read_csv(os.path.join(ratings_input_product_paths,file))
#             temp = temp.loc[:, ~temp.columns.str.contains('^Unnamed')]
#             temp = format_rating_and_reviews(temp, product_cosmetics)
#             ratings_cosmetics = pd.concat([ratings_cosmetics, temp])
#             print('New length cosmetic:', len(ratings_cosmetics))
#             print('Time for this dataset:', time.time()-interm)
#             print('Total time:', time.time()-start)
# del temp

In [None]:
# ratings_cosmetics.to_csv(os.path.join(output_ratings_path,'ratings_and_reviews_cosmetics.csv'), index=False)
# ratings_skincare.to_csv(os.path.join(output_ratings_path,'ratings_and_reviews_skincare.csv'), index=False)