In [40]:
import os
import random
import pandas as pd
import numpy as np
import gzip
import subprocess
from loguru import logger
from langchain.prompts import PromptTemplate
from tqdm import tqdm

def parse(path: str) -> dict:
    """Lazily yield one evaluated record per line of a gzip-compressed file.

    This is a generator: nothing is read until it is iterated.

    Args:
        path: Path of the gzip file to read.

    Yields:
        The object produced by evaluating each line (presumably a dict per
        line; verify against the data files).
    """
    # The context manager closes the archive once the generator is exhausted
    # or discarded, instead of leaking the open file handle.
    with gzip.open(path, 'rb') as g:
        for line in g:
            # SECURITY: eval() executes whatever code the file contains. Only
            # use this on trusted dumps; consider ast.literal_eval or
            # json.loads if the file format allows it.
            yield eval(line)

def get_df(path: str) -> pd.DataFrame:
    """Collect every record yielded by ``parse(path)`` into a DataFrame.

    Each record is keyed by its 0-based position in the file, and that
    position becomes the row index of the returned frame.
    """
    records = {row_id: record for row_id, record in enumerate(parse(path))}
    return pd.DataFrame.from_dict(records, orient='index')

In [2]:
def read_data(dir: str, dataset: str) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Load the review file and the metadata file of one dataset.

    Both files are looked up inside ``<dir>/raw_data``.

    Args:
        dir: Directory that contains the ``raw_data`` folder.
        dataset: Dataset name inserted into the two file names.

    Returns:
        A ``(reviews, metadata)`` tuple of DataFrames, in that order.
    """
    raw_dir = os.path.join(dir, 'raw_data')
    file_names = (
        'reviews_{}_5.json.gz'.format(dataset),
        'meta_{}.json.gz'.format(dataset),
    )
    # Load in the same order as the file names: reviews first, then metadata.
    reviews_df, meta_df = [
        get_df(os.path.join(raw_dir, file_name)) for file_name in file_names
    ]
    return reviews_df, meta_df


In [3]:
# Data directory and dataset name used by the cells below.
# NOTE(review): `dir` shadows the built-in dir() for the rest of the notebook.
dir = '../data/Beauty'
dataset = 'Beauty'

In [4]:
# read_data returns a (reviews, metadata) tuple, so `df` is a tuple of two
# DataFrames rather than a single DataFrame.
df = read_data(dir, dataset)

In [5]:
# Display the whole (reviews, metadata) tuple.
df

(            reviewerID        asin  \
 0       A1YJEY40YUW4SE  7806397051   
 1        A60XNB876KYML  7806397051   
 2       A3G6XNM240RMWA  7806397051   
 3       A1PQFP6SAJ6D80  7806397051   
 4       A38FVHZTNQ271F  7806397051   
 ...                ...         ...   
 198497  A2BLFCOPSMBOZ9  B00LLPT4HI   
 198498  A1UQBFCERIP7VJ  B00LLPT4HI   
 198499  A35Q0RBM3YNQNF  B00LLPT4HI   
 198500  A3LGT6UZL99IW1  B00LLPT4HI   
 198501  A3UJRNI8UR4871  B00LLPT4HI   
 
                                             reviewerName helpful  \
 0                                                 Andrea  [3, 4]   
 1                                             Jessica H.  [1, 1]   
 2                                                  Karen  [0, 1]   
 3                                                  Norah  [2, 2]   
 4                                              Nova Amor  [0, 0]   
 ...                                                  ...     ...   
 198497                                     Dav

In [6]:
# Check the container type returned by read_data.
type(df)

tuple

In [7]:
# First element of the tuple: the reviews DataFrame.
df[0]

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A1YJEY40YUW4SE,7806397051,Andrea,"[3, 4]",Very oily and creamy. Not at all what I expect...,1.0,Don't waste your money,1391040000,"01 30, 2014"
1,A60XNB876KYML,7806397051,Jessica H.,"[1, 1]",This palette was a decent price and I was look...,3.0,OK Palette!,1397779200,"04 18, 2014"
2,A3G6XNM240RMWA,7806397051,Karen,"[0, 1]",The texture of this concealer pallet is fantas...,4.0,great quality,1378425600,"09 6, 2013"
3,A1PQFP6SAJ6D80,7806397051,Norah,"[2, 2]",I really can't tell what exactly this thing is...,2.0,Do not work on my face,1386460800,"12 8, 2013"
4,A38FVHZTNQ271F,7806397051,Nova Amor,"[0, 0]","It was a little smaller than I expected, but t...",3.0,It's okay.,1382140800,"10 19, 2013"
...,...,...,...,...,...,...,...,...,...
198497,A2BLFCOPSMBOZ9,B00LLPT4HI,Dave Edmiston,"[0, 0]",Just a little dab of this shea butter should b...,5.0,A little dab...,1405468800,"07 16, 2014"
198498,A1UQBFCERIP7VJ,B00LLPT4HI,Margaret Picky,"[0, 0]",This shea butter is completely raw and unrefin...,5.0,Pure organic raw shea butter,1405296000,"07 14, 2014"
198499,A35Q0RBM3YNQNF,B00LLPT4HI,M. Hill,"[0, 0]",The skin is the body's largest organ and it ab...,5.0,One Pound Organic Grade A Unrefined Shea Butter,1405468800,"07 16, 2014"
198500,A3LGT6UZL99IW1,B00LLPT4HI,"Richard C. Drew ""Anaal Nathra/Uthe vas Bethod...","[0, 0]",I have very dry elbows and knees. I have a to...,5.0,This stuff is amazing!,1405382400,"07 15, 2014"


In [8]:
# Attach the metadata frame (df[1]) to the reviews frame (df[0]) on 'asin'.
# merge() defaults to an inner join, so rows without a match on either side
# are dropped.
joined_dataset = df[0].merge(df[1], on = 'asin')


In [9]:
# Display the joined reviews + metadata table.
joined_dataset

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,description,title,imUrl,salesRank,categories,price,related,brand
0,A1YJEY40YUW4SE,7806397051,Andrea,"[3, 4]",Very oily and creamy. Not at all what I expect...,1.0,Don't waste your money,1391040000,"01 30, 2014",An extensive range of 15 multiple vibrant long...,WAWO 15 Color Professionl Makeup Eyeshadow Cam...,http://ecx.images-amazon.com/images/I/41Rn18Oe...,{'Beauty': 10486},"[[Beauty, Makeup, Face, Concealers & Neutraliz...",5.04,"{'also_bought': ['B00KR26VFE', 'B00E7LQHZ0', '...",COKA
1,A60XNB876KYML,7806397051,Jessica H.,"[1, 1]",This palette was a decent price and I was look...,3.0,OK Palette!,1397779200,"04 18, 2014",An extensive range of 15 multiple vibrant long...,WAWO 15 Color Professionl Makeup Eyeshadow Cam...,http://ecx.images-amazon.com/images/I/41Rn18Oe...,{'Beauty': 10486},"[[Beauty, Makeup, Face, Concealers & Neutraliz...",5.04,"{'also_bought': ['B00KR26VFE', 'B00E7LQHZ0', '...",COKA
2,A3G6XNM240RMWA,7806397051,Karen,"[0, 1]",The texture of this concealer pallet is fantas...,4.0,great quality,1378425600,"09 6, 2013",An extensive range of 15 multiple vibrant long...,WAWO 15 Color Professionl Makeup Eyeshadow Cam...,http://ecx.images-amazon.com/images/I/41Rn18Oe...,{'Beauty': 10486},"[[Beauty, Makeup, Face, Concealers & Neutraliz...",5.04,"{'also_bought': ['B00KR26VFE', 'B00E7LQHZ0', '...",COKA
3,A1PQFP6SAJ6D80,7806397051,Norah,"[2, 2]",I really can't tell what exactly this thing is...,2.0,Do not work on my face,1386460800,"12 8, 2013",An extensive range of 15 multiple vibrant long...,WAWO 15 Color Professionl Makeup Eyeshadow Cam...,http://ecx.images-amazon.com/images/I/41Rn18Oe...,{'Beauty': 10486},"[[Beauty, Makeup, Face, Concealers & Neutraliz...",5.04,"{'also_bought': ['B00KR26VFE', 'B00E7LQHZ0', '...",COKA
4,A38FVHZTNQ271F,7806397051,Nova Amor,"[0, 0]","It was a little smaller than I expected, but t...",3.0,It's okay.,1382140800,"10 19, 2013",An extensive range of 15 multiple vibrant long...,WAWO 15 Color Professionl Makeup Eyeshadow Cam...,http://ecx.images-amazon.com/images/I/41Rn18Oe...,{'Beauty': 10486},"[[Beauty, Makeup, Face, Concealers & Neutraliz...",5.04,"{'also_bought': ['B00KR26VFE', 'B00E7LQHZ0', '...",COKA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198497,A2BLFCOPSMBOZ9,B00LLPT4HI,Dave Edmiston,"[0, 0]",Just a little dab of this shea butter should b...,5.0,A little dab...,1405468800,"07 16, 2014","Ingredients:\n100% Pure, Unrefined, Organic Af...",100% Organic Raw Unrefined African Shea Butter...,http://ecx.images-amazon.com/images/I/41up5%2B...,{'Beauty': 1265},"[[Beauty, Skin Care, Body, Moisturizers, Lotio...",9.97,"{'also_bought': ['B00LFPS0CY', 'B00KZNOHTW', '...",
198498,A1UQBFCERIP7VJ,B00LLPT4HI,Margaret Picky,"[0, 0]",This shea butter is completely raw and unrefin...,5.0,Pure organic raw shea butter,1405296000,"07 14, 2014","Ingredients:\n100% Pure, Unrefined, Organic Af...",100% Organic Raw Unrefined African Shea Butter...,http://ecx.images-amazon.com/images/I/41up5%2B...,{'Beauty': 1265},"[[Beauty, Skin Care, Body, Moisturizers, Lotio...",9.97,"{'also_bought': ['B00LFPS0CY', 'B00KZNOHTW', '...",
198499,A35Q0RBM3YNQNF,B00LLPT4HI,M. Hill,"[0, 0]",The skin is the body's largest organ and it ab...,5.0,One Pound Organic Grade A Unrefined Shea Butter,1405468800,"07 16, 2014","Ingredients:\n100% Pure, Unrefined, Organic Af...",100% Organic Raw Unrefined African Shea Butter...,http://ecx.images-amazon.com/images/I/41up5%2B...,{'Beauty': 1265},"[[Beauty, Skin Care, Body, Moisturizers, Lotio...",9.97,"{'also_bought': ['B00LFPS0CY', 'B00KZNOHTW', '...",
198500,A3LGT6UZL99IW1,B00LLPT4HI,"Richard C. Drew ""Anaal Nathra/Uthe vas Bethod...","[0, 0]",I have very dry elbows and knees. I have a to...,5.0,This stuff is amazing!,1405382400,"07 15, 2014","Ingredients:\n100% Pure, Unrefined, Organic Af...",100% Organic Raw Unrefined African Shea Butter...,http://ecx.images-amazon.com/images/I/41up5%2B...,{'Beauty': 1265},"[[Beauty, Skin Care, Body, Moisturizers, Lotio...",9.97,"{'also_bought': ['B00LFPS0CY', 'B00KZNOHTW', '...",


In [10]:
# Use the (reviewerID, asin) pair as a two-level row index.
reindex_dataset = joined_dataset.set_index(['reviewerID', 'asin'])

In [11]:
# Display the re-indexed table.
reindex_dataset

Unnamed: 0_level_0,Unnamed: 1_level_0,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,description,title,imUrl,salesRank,categories,price,related,brand
reviewerID,asin,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
A1YJEY40YUW4SE,7806397051,Andrea,"[3, 4]",Very oily and creamy. Not at all what I expect...,1.0,Don't waste your money,1391040000,"01 30, 2014",An extensive range of 15 multiple vibrant long...,WAWO 15 Color Professionl Makeup Eyeshadow Cam...,http://ecx.images-amazon.com/images/I/41Rn18Oe...,{'Beauty': 10486},"[[Beauty, Makeup, Face, Concealers & Neutraliz...",5.04,"{'also_bought': ['B00KR26VFE', 'B00E7LQHZ0', '...",COKA
A60XNB876KYML,7806397051,Jessica H.,"[1, 1]",This palette was a decent price and I was look...,3.0,OK Palette!,1397779200,"04 18, 2014",An extensive range of 15 multiple vibrant long...,WAWO 15 Color Professionl Makeup Eyeshadow Cam...,http://ecx.images-amazon.com/images/I/41Rn18Oe...,{'Beauty': 10486},"[[Beauty, Makeup, Face, Concealers & Neutraliz...",5.04,"{'also_bought': ['B00KR26VFE', 'B00E7LQHZ0', '...",COKA
A3G6XNM240RMWA,7806397051,Karen,"[0, 1]",The texture of this concealer pallet is fantas...,4.0,great quality,1378425600,"09 6, 2013",An extensive range of 15 multiple vibrant long...,WAWO 15 Color Professionl Makeup Eyeshadow Cam...,http://ecx.images-amazon.com/images/I/41Rn18Oe...,{'Beauty': 10486},"[[Beauty, Makeup, Face, Concealers & Neutraliz...",5.04,"{'also_bought': ['B00KR26VFE', 'B00E7LQHZ0', '...",COKA
A1PQFP6SAJ6D80,7806397051,Norah,"[2, 2]",I really can't tell what exactly this thing is...,2.0,Do not work on my face,1386460800,"12 8, 2013",An extensive range of 15 multiple vibrant long...,WAWO 15 Color Professionl Makeup Eyeshadow Cam...,http://ecx.images-amazon.com/images/I/41Rn18Oe...,{'Beauty': 10486},"[[Beauty, Makeup, Face, Concealers & Neutraliz...",5.04,"{'also_bought': ['B00KR26VFE', 'B00E7LQHZ0', '...",COKA
A38FVHZTNQ271F,7806397051,Nova Amor,"[0, 0]","It was a little smaller than I expected, but t...",3.0,It's okay.,1382140800,"10 19, 2013",An extensive range of 15 multiple vibrant long...,WAWO 15 Color Professionl Makeup Eyeshadow Cam...,http://ecx.images-amazon.com/images/I/41Rn18Oe...,{'Beauty': 10486},"[[Beauty, Makeup, Face, Concealers & Neutraliz...",5.04,"{'also_bought': ['B00KR26VFE', 'B00E7LQHZ0', '...",COKA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
A2BLFCOPSMBOZ9,B00LLPT4HI,Dave Edmiston,"[0, 0]",Just a little dab of this shea butter should b...,5.0,A little dab...,1405468800,"07 16, 2014","Ingredients:\n100% Pure, Unrefined, Organic Af...",100% Organic Raw Unrefined African Shea Butter...,http://ecx.images-amazon.com/images/I/41up5%2B...,{'Beauty': 1265},"[[Beauty, Skin Care, Body, Moisturizers, Lotio...",9.97,"{'also_bought': ['B00LFPS0CY', 'B00KZNOHTW', '...",
A1UQBFCERIP7VJ,B00LLPT4HI,Margaret Picky,"[0, 0]",This shea butter is completely raw and unrefin...,5.0,Pure organic raw shea butter,1405296000,"07 14, 2014","Ingredients:\n100% Pure, Unrefined, Organic Af...",100% Organic Raw Unrefined African Shea Butter...,http://ecx.images-amazon.com/images/I/41up5%2B...,{'Beauty': 1265},"[[Beauty, Skin Care, Body, Moisturizers, Lotio...",9.97,"{'also_bought': ['B00LFPS0CY', 'B00KZNOHTW', '...",
A35Q0RBM3YNQNF,B00LLPT4HI,M. Hill,"[0, 0]",The skin is the body's largest organ and it ab...,5.0,One Pound Organic Grade A Unrefined Shea Butter,1405468800,"07 16, 2014","Ingredients:\n100% Pure, Unrefined, Organic Af...",100% Organic Raw Unrefined African Shea Butter...,http://ecx.images-amazon.com/images/I/41up5%2B...,{'Beauty': 1265},"[[Beauty, Skin Care, Body, Moisturizers, Lotio...",9.97,"{'also_bought': ['B00LFPS0CY', 'B00KZNOHTW', '...",
A3LGT6UZL99IW1,B00LLPT4HI,"Richard C. Drew ""Anaal Nathra/Uthe vas Bethod...","[0, 0]",I have very dry elbows and knees. I have a to...,5.0,This stuff is amazing!,1405382400,"07 15, 2014","Ingredients:\n100% Pure, Unrefined, Organic Af...",100% Organic Raw Unrefined African Shea Butter...,http://ecx.images-amazon.com/images/I/41up5%2B...,{'Beauty': 1265},"[[Beauty, Skin Care, Body, Moisturizers, Lotio...",9.97,"{'also_bought': ['B00LFPS0CY', 'B00KZNOHTW', '...",


In [12]:
'''
TODO:
- turn dataset to {id: (reviewerID, asin), attribute: string}
- export to csv
'''

'\nTODO:\n- turn dataset to {id: (reviewerID, asin), attribute: string}\n- export to csv\n'

In [13]:
# Write the joined table to <dir>/table_data.csv; index=True stores the
# (reviewerID, asin) index levels as the leading CSV columns.
name = 'table_data'
reindex_dataset.to_csv(f'{dir}/{name}.csv', index = True)

In [14]:
# Reload the CSV: reviewerID and asin come back as ordinary columns under a
# default integer index, and list/dict-valued cells come back as plain strings.
table_data = pd.read_csv(f'{dir}/{name}.csv')

In [15]:
# Preview the first rows of the reloaded table.
table_data.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,description,title,imUrl,salesRank,categories,price,related,brand
0,A1YJEY40YUW4SE,7806397051,Andrea,"[3, 4]",Very oily and creamy. Not at all what I expect...,1.0,Don't waste your money,1391040000,"01 30, 2014",An extensive range of 15 multiple vibrant long...,WAWO 15 Color Professionl Makeup Eyeshadow Cam...,http://ecx.images-amazon.com/images/I/41Rn18Oe...,{'Beauty': 10486},"[['Beauty', 'Makeup', 'Face', 'Concealers & Ne...",5.04,"{'also_bought': ['B00KR26VFE', 'B00E7LQHZ0', '...",COKA
1,A60XNB876KYML,7806397051,Jessica H.,"[1, 1]",This palette was a decent price and I was look...,3.0,OK Palette!,1397779200,"04 18, 2014",An extensive range of 15 multiple vibrant long...,WAWO 15 Color Professionl Makeup Eyeshadow Cam...,http://ecx.images-amazon.com/images/I/41Rn18Oe...,{'Beauty': 10486},"[['Beauty', 'Makeup', 'Face', 'Concealers & Ne...",5.04,"{'also_bought': ['B00KR26VFE', 'B00E7LQHZ0', '...",COKA
2,A3G6XNM240RMWA,7806397051,Karen,"[0, 1]",The texture of this concealer pallet is fantas...,4.0,great quality,1378425600,"09 6, 2013",An extensive range of 15 multiple vibrant long...,WAWO 15 Color Professionl Makeup Eyeshadow Cam...,http://ecx.images-amazon.com/images/I/41Rn18Oe...,{'Beauty': 10486},"[['Beauty', 'Makeup', 'Face', 'Concealers & Ne...",5.04,"{'also_bought': ['B00KR26VFE', 'B00E7LQHZ0', '...",COKA
3,A1PQFP6SAJ6D80,7806397051,Norah,"[2, 2]",I really can't tell what exactly this thing is...,2.0,Do not work on my face,1386460800,"12 8, 2013",An extensive range of 15 multiple vibrant long...,WAWO 15 Color Professionl Makeup Eyeshadow Cam...,http://ecx.images-amazon.com/images/I/41Rn18Oe...,{'Beauty': 10486},"[['Beauty', 'Makeup', 'Face', 'Concealers & Ne...",5.04,"{'also_bought': ['B00KR26VFE', 'B00E7LQHZ0', '...",COKA
4,A38FVHZTNQ271F,7806397051,Nova Amor,"[0, 0]","It was a little smaller than I expected, but t...",3.0,It's okay.,1382140800,"10 19, 2013",An extensive range of 15 multiple vibrant long...,WAWO 15 Color Professionl Makeup Eyeshadow Cam...,http://ecx.images-amazon.com/images/I/41Rn18Oe...,{'Beauty': 10486},"[['Beauty', 'Makeup', 'Face', 'Concealers & Ne...",5.04,"{'also_bought': ['B00KR26VFE', 'B00E7LQHZ0', '...",COKA


In [16]:
'''
Flow:
- create a new column named "attributes" by converting every feature except 'reviewerID' and 'asin' to a string, then concatenating all of them
'''

'\nFLow:\n- create new attributes named "attributes" by turn all features except reviewId\n'

In [34]:
# Example slice: the first row only, skipping the two leading key columns
# (reviewerID, asin).
df_examples = table_data.iloc[[0],2:]

df_examples

Unnamed: 0,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,description,title,imUrl,salesRank,categories,price,related,brand
0,Andrea,"[3, 4]",Very oily and creamy. Not at all what I expect...,1.0,Don't waste your money,1391040000,"01 30, 2014",An extensive range of 15 multiple vibrant long...,WAWO 15 Color Professionl Makeup Eyeshadow Cam...,http://ecx.images-amazon.com/images/I/41Rn18Oe...,{'Beauty': 10486},"[['Beauty', 'Makeup', 'Face', 'Concealers & Ne...",5.04,"{'also_bought': ['B00KR26VFE', 'B00E7LQHZ0', '...",COKA


In [44]:
# Register DataFrame.progress_apply so the row-wise apply shows a progress bar.
tqdm.pandas()
# Serialise every column after the two leading key columns (reviewerID, asin)
# into one JSON string per row. An 'attributes' column left over from an
# earlier run is excluded, so re-running this cell does not nest the old JSON
# inside the new one.
table_data['attributes'] = (
    table_data.iloc[:, 2:]
    .drop(columns='attributes', errors='ignore')
    .progress_apply(lambda row: row.to_json(), axis=1)
)

100%|██████████| 198502/198502 [00:09<00:00, 21599.11it/s]


In [41]:
# Inspect the JSON string generated for the row labelled 0. The stray
# tqdm.pandas() call was dropped: this cell only displays a value, and
# progress_apply is registered in the cell that builds the column.
table_data.loc[0, 'attributes']

'{"reviewerName":"Andrea","helpful":"[3, 4]","reviewText":"Very oily and creamy. Not at all what I expected... ordered this to try to highlight and contour and it just looked awful!!! Plus, took FOREVER to arrive.","overall":1.0,"summary":"Don\'t waste your money","unixReviewTime":1391040000,"reviewTime":"01 30, 2014","description":"An extensive range of 15 multiple vibrant long wear concealer colour with different skin tones to create more than 10,000 amazing looks. Using the most commonly applied shades, ensures the best skin colour match and guarantees a traceless and natural finish. Enabling layering and mixing, provides total camouflage for almost any skin problem including blemishes, scars, birthmarks and black circles. It is also suitable to use as bronzer. The light colour is suitable for redness, acne and so on. The medium colour is perfect for dark shadows in the under-eye area. The dark colour provides exceptional camouflage and adheres well to the skin. Silky glossy colour 

In [1]:
# Export the current table_data frame to <dir>/rag_data.csv.
# NOTE(review): this needs `table_data` from the cells above; the NameError in
# the recorded output is consistent with running this cell first in a fresh
# kernel (execution count 1).
# NOTE(review): the default index=True also writes the integer row index as an
# unnamed first column — confirm that downstream readers expect it.
table_data.to_csv(f'{dir}/rag_data.csv')

NameError: name 'table_data' is not defined