<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Build-Product-Data" data-toc-modified-id="Build-Product-Data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Build Product Data</a></span><ul class="toc-item"><li><span><a href="#basic-columns" data-toc-modified-id="basic-columns-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>basic columns</a></span></li><li><span><a href="#createdAt" data-toc-modified-id="createdAt-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>createdAt</a></span></li><li><span><a href="#updatedAt" data-toc-modified-id="updatedAt-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>updatedAt</a></span></li><li><span><a href="#Base-Price" data-toc-modified-id="Base-Price-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>Base Price</a></span></li><li><span><a href="#Convert-Brazilian-to-IDR" data-toc-modified-id="Convert-Brazilian-to-IDR-1.5"><span class="toc-item-num">1.5&nbsp;&nbsp;</span>Convert Brazilian to IDR</a></span></li><li><span><a href="#Translate-categories-to-English" data-toc-modified-id="Translate-categories-to-English-1.6"><span class="toc-item-num">1.6&nbsp;&nbsp;</span>Translate categories to English</a></span></li></ul></li></ul></div>

In [62]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display
from tqdm import tqdm

import datetime

In [181]:
# --- Order-side tables ---------------------------------------------------
orders_df = pd.read_csv('../data/olist_orders_dataset.csv')
items_df = pd.read_csv('../data/olist_order_items_dataset.csv')
payments_df = pd.read_csv('../data/olist_order_payments_dataset.csv')
customers_df = pd.read_csv('../data/olist_customers_dataset.csv')

# --- Product-side tables -------------------------------------------------
products_df = pd.read_csv('../data/olist_products_dataset.csv')
# index_col=0 turns the first CSV column into the index, so the English name
# can later be looked up by that column's value.
name_translation_df = pd.read_csv(
    '../data/product_category_name_translation.csv',
    index_col=0,
)

## Build Product Data

### basic columns

In [64]:
# Seed the product table with the raw id and category columns, renamed to the
# output column names; `name` is added as an all-None placeholder column.
data = (
    products_df[['product_id', 'product_category_name']]
    .rename(columns={'product_id': 'id', 'product_category_name': 'productCategory'})
    .assign(name=None)
)

In [65]:
# Display the frame built so far (id, productCategory and the placeholder name column).
data

Unnamed: 0,id,productCategory,name
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,
1,3aa071139cb16b67ca9e5dea641aaa2f,artes,
2,96bd76ec8810374ed1b65e291975717f,esporte_lazer,
3,cef67bcfe19066a932b7673e239eb23d,bebes,
4,9dc1a7de274444849c219cff195d0b71,utilidades_domesticas,
...,...,...,...
32946,a0b7d5a992ccda646f2d34e418fff5a0,moveis_decoracao,
32947,bf4538d88321d0fd4412a93c974510e6,construcao_ferramentas_iluminacao,
32948,9a7c6041fa9592d9d9ef6cfe62a71f8c,cama_mesa_banho,
32949,83808703fc0706a22e264b9d75f04a2e,informatica_acessorios,


### createdAt

In [89]:
# One row per sold item, carrying the purchase timestamp of the order it belongs to.
purchase_df = pd.merge(
    orders_df[['order_id', 'order_purchase_timestamp']],
    items_df[['order_id', 'product_id']],
    on='order_id',
)
purchase_df['order_purchase_timestamp'] = pd.to_datetime(purchase_df['order_purchase_timestamp'])

# createdAt = earliest purchase timestamp seen for each product; the index is
# named 'id' so the result can be merged onto `data` by that key.
createdAt = (
    purchase_df.groupby('product_id')['order_purchase_timestamp']
    .min()
    .to_frame('createdAt')
    .rename_axis('id')
)
createdAt

Unnamed: 0_level_0,createdAt
id,Unnamed: 1_level_1
00066f42aeeb9f3007548bb9d3f33c38,2018-05-20 18:45:21
00088930e925c41fd95ebfe695fd2655,2017-12-12 19:20:28
0009406fd7479715e4bef61dd91f2462,2017-12-21 16:21:47
000b8f95fcb9e0096488278317764d19,2018-08-01 22:00:33
000d9be29b5207b54e86aa1b1ac54872,2018-04-03 09:24:12
...,...
fff6177642830a9a94a0f2cba5e476d1,2017-07-15 17:37:28
fff81cc3158d2725c0655ab9ba0f712c,2018-07-30 09:18:59
fff9553ac224cec9d15d49f5a263411f,2017-10-06 16:44:05
fffdb2d0ec8d6a61f0a0a0db3f25b441,2018-04-01 11:58:04


In [93]:
# Attach createdAt to every product. The merge uses the default inner join on
# `id`, so only ids present on both sides are kept.
data = pd.merge(data, createdAt, on='id')
data

Unnamed: 0,id,productCategory,name,createdAt
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,,2018-04-24 16:16:53
1,3aa071139cb16b67ca9e5dea641aaa2f,artes,,2018-01-31 18:55:36
2,96bd76ec8810374ed1b65e291975717f,esporte_lazer,,2018-07-09 21:18:01
3,cef67bcfe19066a932b7673e239eb23d,bebes,,2018-08-03 08:55:50
4,9dc1a7de274444849c219cff195d0b71,utilidades_domesticas,,2018-04-11 01:06:37
...,...,...,...,...
32946,a0b7d5a992ccda646f2d34e418fff5a0,moveis_decoracao,,2017-12-29 19:03:07
32947,bf4538d88321d0fd4412a93c974510e6,construcao_ferramentas_iluminacao,,2018-04-12 11:29:56
32948,9a7c6041fa9592d9d9ef6cfe62a71f8c,cama_mesa_banho,,2017-08-21 15:37:43
32949,83808703fc0706a22e264b9d75f04a2e,informatica_acessorios,,2017-10-19 09:26:47


### updatedAt

In [94]:
# One row per sold item, carrying the purchase timestamp of the order it belongs to.
purchase_df = pd.merge(
    orders_df[['order_id', 'order_purchase_timestamp']],
    items_df[['order_id', 'product_id']],
    on='order_id',
)
purchase_df['order_purchase_timestamp'] = pd.to_datetime(purchase_df['order_purchase_timestamp'])

# updatedAt = latest purchase timestamp seen for each product; the index is
# named 'id' so the result can be merged onto `data` by that key.
updatedAt = (
    purchase_df.groupby('product_id')['order_purchase_timestamp']
    .max()
    .to_frame('updatedAt')
    .rename_axis('id')
)
updatedAt

Unnamed: 0_level_0,updatedAt
id,Unnamed: 1_level_1
00066f42aeeb9f3007548bb9d3f33c38,2018-05-20 18:45:21
00088930e925c41fd95ebfe695fd2655,2017-12-12 19:20:28
0009406fd7479715e4bef61dd91f2462,2017-12-21 16:21:47
000b8f95fcb9e0096488278317764d19,2018-08-10 13:24:35
000d9be29b5207b54e86aa1b1ac54872,2018-04-03 09:24:12
...,...
fff6177642830a9a94a0f2cba5e476d1,2017-09-03 11:38:54
fff81cc3158d2725c0655ab9ba0f712c,2018-07-30 09:18:59
fff9553ac224cec9d15d49f5a263411f,2017-10-06 16:44:05
fffdb2d0ec8d6a61f0a0a0db3f25b441,2018-08-10 17:15:07


In [95]:
# Attach updatedAt to every product (default inner join on `id`).
data = pd.merge(data, updatedAt, on='id')
data

Unnamed: 0,id,productCategory,name,createdAt,updatedAt
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,,2018-04-24 16:16:53,2018-04-24 16:16:53
1,3aa071139cb16b67ca9e5dea641aaa2f,artes,,2018-01-31 18:55:36,2018-01-31 18:55:36
2,96bd76ec8810374ed1b65e291975717f,esporte_lazer,,2018-07-09 21:18:01,2018-07-09 21:18:01
3,cef67bcfe19066a932b7673e239eb23d,bebes,,2018-08-03 08:55:50,2018-08-03 08:55:50
4,9dc1a7de274444849c219cff195d0b71,utilidades_domesticas,,2018-04-11 01:06:37,2018-04-11 01:06:37
...,...,...,...,...,...
32946,a0b7d5a992ccda646f2d34e418fff5a0,moveis_decoracao,,2017-12-29 19:03:07,2018-08-20 17:06:21
32947,bf4538d88321d0fd4412a93c974510e6,construcao_ferramentas_iluminacao,,2018-04-12 11:29:56,2018-08-06 17:28:26
32948,9a7c6041fa9592d9d9ef6cfe62a71f8c,cama_mesa_banho,,2017-08-21 15:37:43,2018-06-24 13:59:43
32949,83808703fc0706a22e264b9d75f04a2e,informatica_acessorios,,2017-10-19 09:26:47,2018-03-06 11:51:44


### Base Price

In [108]:
# One row per sold item with its price and the purchase timestamp of its order.
purchase_df = pd.merge(
    orders_df[['order_id', 'order_purchase_timestamp']],
    items_df[['order_id', 'product_id', 'price']],
    on='order_id',
)
purchase_df['order_purchase_timestamp'] = pd.to_datetime(purchase_df['order_purchase_timestamp'])

# basePrice = price taken from the chronologically last purchase of each
# product (rows are sorted by timestamp before taking the group's last price).
basePrice = (
    purchase_df.sort_values(by='order_purchase_timestamp')
    .groupby('product_id')['price']
    .last()
    .to_frame('basePrice')
    .rename_axis('id')
)
basePrice

Unnamed: 0_level_0,basePrice
id,Unnamed: 1_level_1
00066f42aeeb9f3007548bb9d3f33c38,101.65
00088930e925c41fd95ebfe695fd2655,129.90
0009406fd7479715e4bef61dd91f2462,229.00
000b8f95fcb9e0096488278317764d19,58.90
000d9be29b5207b54e86aa1b1ac54872,199.00
...,...
fff6177642830a9a94a0f2cba5e476d1,109.99
fff81cc3158d2725c0655ab9ba0f712c,90.00
fff9553ac224cec9d15d49f5a263411f,32.00
fffdb2d0ec8d6a61f0a0a0db3f25b441,34.99


In [109]:
# Attach basePrice to every product (default inner join on `id`).
data = pd.merge(data, basePrice, on='id')
data

Unnamed: 0,id,productCategory,name,createdAt,updatedAt,basePrice
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,,2018-04-24 16:16:53,2018-04-24 16:16:53,10.91
1,3aa071139cb16b67ca9e5dea641aaa2f,artes,,2018-01-31 18:55:36,2018-01-31 18:55:36,248.00
2,96bd76ec8810374ed1b65e291975717f,esporte_lazer,,2018-07-09 21:18:01,2018-07-09 21:18:01,79.80
3,cef67bcfe19066a932b7673e239eb23d,bebes,,2018-08-03 08:55:50,2018-08-03 08:55:50,112.30
4,9dc1a7de274444849c219cff195d0b71,utilidades_domesticas,,2018-04-11 01:06:37,2018-04-11 01:06:37,37.90
...,...,...,...,...,...,...
32946,a0b7d5a992ccda646f2d34e418fff5a0,moveis_decoracao,,2017-12-29 19:03:07,2018-08-20 17:06:21,69.90
32947,bf4538d88321d0fd4412a93c974510e6,construcao_ferramentas_iluminacao,,2018-04-12 11:29:56,2018-08-06 17:28:26,240.00
32948,9a7c6041fa9592d9d9ef6cfe62a71f8c,cama_mesa_banho,,2017-08-21 15:37:43,2018-06-24 13:59:43,127.50
32949,83808703fc0706a22e264b9d75f04a2e,informatica_acessorios,,2017-10-19 09:26:47,2018-03-06 11:51:44,29.90


### Convert Brazilian to IDR

In [117]:
# Hard-coded conversion factor from the source currency (presumably Brazilian
# real, BRL) to Indonesian rupiah (IDR).
# NOTE(review): the date and source of this rate are not recorded in the
# notebook — confirm before reusing the output.
BRL_TO_IDR_RATE = 3053.52

# Recompute from the untouched `basePrice` frame (indexed by product id)
# instead of scaling data['basePrice'] in place, so that re-running this cell
# does not multiply the prices by the rate a second time.
data['basePrice'] = data['id'].map(basePrice['basePrice']) * BRL_TO_IDR_RATE

### Translate categories to English

In [190]:
# Lookup table: original category name (the index of name_translation_df)
# -> English category name.
name_translation_dict = dict(
    name_translation_df['product_category_name_english'].items()
)

In [194]:
# Replace each category with its English name; any value that is not a key of
# the lookup table becomes None.
# NOTE: re-running this cell on already-translated data would blank out every
# category whose English name is not itself a key of the lookup table.
data['productCategory'] = data['productCategory'].map(name_translation_dict.get)

In [196]:
# Persist the prepared product table. index=True (the pandas default, spelled
# out here) also writes the row index as the first, unnamed CSV column.
data.to_csv('../data/prepared/products.csv', index=True)