In [2]:
%matplotlib inline

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

In [3]:
# Directory (relative to the notebook) that holds the Olist CSV files. The trailing slash
# matters: the loading cell builds each file path by plain string concatenation.
path = "dataset-files/"

In [4]:
def _read_olist(file_name):
    """Read one Olist CSV file located under `path` into a DataFrame."""
    return pd.read_csv(path + file_name)


# One DataFrame per raw CSV file of the dataset.
olist_customer = _read_olist('olist_customers_dataset.csv')
olist_geolocation = _read_olist('olist_geolocation_dataset.csv')
olist_order_items = _read_olist('olist_order_items_dataset.csv')
olist_order_payments = _read_olist('olist_order_payments_dataset.csv')
olist_order_reviews = _read_olist('olist_order_reviews_dataset.csv')
olist_orders = _read_olist('olist_orders_dataset.csv')
olist_products = _read_olist('olist_products_dataset.csv')
olist_sellers = _read_olist('olist_sellers_dataset.csv')

# 1. Limpando os Dados

### Datas – alterando o dtype de objeto para datetime.

In [5]:
# Parse the shipping deadline column into datetime values, then inspect the resulting dtypes.
shipping_limit_parsed = pd.to_datetime(olist_order_items['shipping_limit_date'])
olist_order_items['shipping_limit_date'] = shipping_limit_parsed
olist_order_items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112650 entries, 0 to 112649
Data columns (total 7 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   order_id             112650 non-null  object        
 1   order_item_id        112650 non-null  int64         
 2   product_id           112650 non-null  object        
 3   seller_id            112650 non-null  object        
 4   shipping_limit_date  112650 non-null  datetime64[ns]
 5   price                112650 non-null  float64       
 6   freight_value        112650 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(1), object(3)
memory usage: 6.0+ MB


In [6]:
# Review columns that hold dates; each one is converted with pd.to_datetime.
colunas_data_reviews = ['review_creation_date', 'review_answer_timestamp']
olist_order_reviews[colunas_data_reviews] = olist_order_reviews[colunas_data_reviews].apply(pd.to_datetime)

olist_order_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99224 entries, 0 to 99223
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   review_id                99224 non-null  object        
 1   order_id                 99224 non-null  object        
 2   review_score             99224 non-null  int64         
 3   review_comment_title     11568 non-null  object        
 4   review_comment_message   40977 non-null  object        
 5   review_creation_date     99224 non-null  datetime64[ns]
 6   review_answer_timestamp  99224 non-null  datetime64[ns]
dtypes: datetime64[ns](2), int64(1), object(4)
memory usage: 5.3+ MB


In [7]:
# Order columns that hold dates; naming the list once avoids repeating it on both sides
# of the assignment.
colunas_data_orders = [
    'order_purchase_timestamp',
    'order_approved_at',
    'order_delivered_carrier_date',
    'order_delivered_customer_date',
    'order_estimated_delivery_date',
]
olist_orders[colunas_data_orders] = olist_orders[colunas_data_orders].apply(pd.to_datetime)
olist_orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   order_id                       99441 non-null  object        
 1   customer_id                    99441 non-null  object        
 2   order_status                   99441 non-null  object        
 3   order_purchase_timestamp       99441 non-null  datetime64[ns]
 4   order_approved_at              99281 non-null  datetime64[ns]
 5   order_delivered_carrier_date   97658 non-null  datetime64[ns]
 6   order_delivered_customer_date  96476 non-null  datetime64[ns]
 7   order_estimated_delivery_date  99441 non-null  datetime64[ns]
dtypes: datetime64[ns](5), object(3)
memory usage: 6.1+ MB


### Removendo colunas

As mensagens das avaliações, bem como os seus títulos, não serão utilizadas nesta análise e, portanto, serão removidas.

In [8]:
# Drop the free-text review columns, which this analysis does not use.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps the cell
# re-runnable: executing it a second time no longer raises KeyError for columns that
# were already removed.
olist_order_reviews = olist_order_reviews.drop(
    columns=["review_comment_title", "review_comment_message"],
    errors="ignore",
)

In [9]:
# Re-inspect the reviews frame to confirm that the two comment columns are gone.
olist_order_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99224 entries, 0 to 99223
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   review_id                99224 non-null  object        
 1   order_id                 99224 non-null  object        
 2   review_score             99224 non-null  int64         
 3   review_creation_date     99224 non-null  datetime64[ns]
 4   review_answer_timestamp  99224 non-null  datetime64[ns]
dtypes: datetime64[ns](2), int64(1), object(2)
memory usage: 3.8+ MB


In [10]:
# Physical-dimension columns are left out of the working copy; olist_products itself
# is not modified.
colunas_dimensoes = ["product_weight_g", "product_length_cm", "product_height_cm", "product_width_cm"]
df_products = olist_products.drop(labels=colunas_dimensoes, axis="columns")
df_products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32951 entries, 0 to 32950
Data columns (total 5 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   product_id                  32951 non-null  object 
 1   product_category_name       32341 non-null  object 
 2   product_name_lenght         32341 non-null  float64
 3   product_description_lenght  32341 non-null  float64
 4   product_photos_qty          32341 non-null  float64
dtypes: float64(3), object(2)
memory usage: 1.3+ MB


### Removendo Linhas
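Também é necessário remover linhas: em 610 produtos (32951 − 32341), 'product_id' é a única coluna preenchida e todas as demais são nulas, o que não tem utilidade para a análise. Removendo essas linhas, 'product_id' passa a ter a mesma contagem de valores não nulos que as demais colunas.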

In [11]:
# Number of non-null cells in each row; a row whose count is 1 has only one column filled in.
row_non_null_counts = df_products.count(axis=1)
# Keep the rows that have more than one non-null value.
df_products = df_products.loc[row_non_null_counts.gt(1)]
df_products.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32341 entries, 0 to 32950
Data columns (total 5 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   product_id                  32341 non-null  object 
 1   product_category_name       32341 non-null  object 
 2   product_name_lenght         32341 non-null  float64
 3   product_description_lenght  32341 non-null  float64
 4   product_photos_qty          32341 non-null  float64
dtypes: float64(3), object(2)
memory usage: 1.5+ MB


# Quais são as categorias com maior e menor receita dos últimos 12 meses?

Primeiro, unimos 'olist_orders' com 'olist_order_items' pelo 'order_id', para relacionar a data de compra com o preço.

In [12]:
# Attach the order-level columns (status, purchase timestamp, ...) to every order item.
# how='inner' is the pandas default, spelled out here for the reader.
preco_e_data = olist_order_items.merge(olist_orders, how='inner', on='order_id')
preco_e_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112650 entries, 0 to 112649
Data columns (total 14 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   order_id                       112650 non-null  object        
 1   order_item_id                  112650 non-null  int64         
 2   product_id                     112650 non-null  object        
 3   seller_id                      112650 non-null  object        
 4   shipping_limit_date            112650 non-null  datetime64[ns]
 5   price                          112650 non-null  float64       
 6   freight_value                  112650 non-null  float64       
 7   customer_id                    112650 non-null  object        
 8   order_status                   112650 non-null  object        
 9   order_purchase_timestamp       112650 non-null  datetime64[ns]
 10  order_approved_at              112635 non-null  datetime64[ns]
 11  

Em seguida, precisamos unir o resultado acima com df_products, para fazer a análise por categorias.

In [13]:
# Add the product columns (including the category name) to every order item.
# The inner join keeps only items whose product_id is present in df_products.
preco_por_categoria = preco_e_data.merge(df_products, how='inner', on='product_id')

preco_por_categoria.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111047 entries, 0 to 111046
Data columns (total 18 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   order_id                       111047 non-null  object        
 1   order_item_id                  111047 non-null  int64         
 2   product_id                     111047 non-null  object        
 3   seller_id                      111047 non-null  object        
 4   shipping_limit_date            111047 non-null  datetime64[ns]
 5   price                          111047 non-null  float64       
 6   freight_value                  111047 non-null  float64       
 7   customer_id                    111047 non-null  object        
 8   order_status                   111047 non-null  object        
 9   order_purchase_timestamp       111047 non-null  datetime64[ns]
 10  order_approved_at              111033 non-null  datetime64[ns]
 11  

E então, selecionamos apenas os 12 meses (365 dias) anteriores a 3/9/2018, que é a data da compra mais recente contida na tabela.

In [14]:
# Reference point of the analysis, hard-coded to the timestamp used by the original notebook.
# NOTE(review): presumably the latest purchase in the table — confirm against the data.
data_referencia = pd.Timestamp('2018-09-03 09:06:57')
ha_doze_meses = data_referencia - pd.Timedelta(days=365)

# Keep only the items purchased on or after the cutoff.
compras_recentes = preco_por_categoria['order_purchase_timestamp'].ge(ha_doze_meses)
ultimos_12_meses = preco_por_categoria.loc[compras_recentes]


In [15]:
# Preview the first rows of the last-12-months selection.
ultimos_12_meses.head()

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.9,13.29,3ce436f183e68e07877b285a838db11a,delivered,2017-09-13 08:59:02,2017-09-13 09:45:35,2017-09-19 18:34:16,2017-09-20 23:43:48,2017-09-29,cool_stuff,58.0,598.0,4.0
2,532ed5e14e24ae1f0d735b91524b98b9,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2018-05-23 10:56:25,64.9,18.33,4ef55bf80f711b372afebcb7c715344a,delivered,2018-05-18 10:25:53,2018-05-18 12:31:43,2018-05-23 14:05:00,2018-06-04 18:34:26,2018-06-07,cool_stuff,58.0,598.0,4.0
6,bbf796534aaf9c59f8da8c7982db56e0,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2018-03-26 03:28:44,64.9,38.5,c32c207d9ebf75958011557ac1624e2f,delivered,2018-03-18 21:00:44,2018-03-20 03:28:44,2018-03-20 18:08:45,2018-03-28 21:57:44,2018-04-12,cool_stuff,58.0,598.0,4.0
10,000229ec398224ef6ca0657da4fc703e,1,c777355d18b72b67abbeef9df44fd0fd,5b51032eddd242adc84c38acab88f23d,2018-01-18 14:48:30,199.0,17.87,6489ae5e4333f3693df5ad4372dab6d3,delivered,2018-01-14 14:33:31,2018-01-14 14:48:30,2018-01-16 12:36:48,2018-01-22 13:19:16,2018-02-05,moveis_decoracao,59.0,695.0,2.0
11,45211f00dfa76aed1c20d6910d973222,2,c777355d18b72b67abbeef9df44fd0fd,5b51032eddd242adc84c38acab88f23d,2017-11-06 15:46:14,199.0,20.08,903ac1be571f2dc655f9083ffc936ce6,delivered,2017-10-30 15:35:32,2017-10-30 17:31:35,2017-10-31 20:36:37,2017-11-08 17:34:39,2017-11-24,moveis_decoracao,59.0,695.0,2.0


In [16]:
# Sum of item prices per product category within the selected period, highest first.
receita_por_categoria = ultimos_12_meses.groupby('product_category_name')['price'].sum()
categoria_soma_preco = receita_por_categoria.sort_values(ascending=False)

#### As categorias com maior receita nos últimos 12 meses:

In [17]:
# The series is sorted in descending order, so these are the ten highest summed prices.
categoria_soma_preco.head(10)

product_category_name
beleza_saude              1003548.12
relogios_presentes         987238.61
cama_mesa_banho            774483.45
esporte_lazer              754111.67
informatica_acessorios     687562.56
moveis_decoracao           540737.22
utilidades_domesticas      495849.44
automotivo                 457494.42
cool_stuff                 416885.66
brinquedos                 356850.42
Name: price, dtype: float64

#### As categorias com menor receita nos últimos 12 meses:

In [18]:
# The series is sorted in descending order, so these are the ten lowest summed prices.
categoria_soma_preco.tail(10)

product_category_name
fraldas_higiene                  1432.69
flores                           1110.04
la_cuisine                       1055.00
fashion_roupa_feminina           1052.24
pc_gamer                         1025.99
fashion_esporte                   933.80
cds_dvds_musicais                 370.00
fashion_roupa_infanto_juvenil     299.95
casa_conforto_2                   267.19
seguros_e_servicos                100.00
Name: price, dtype: float64

# Top 10 maiores sellers

In [19]:
# Sum of item prices per seller over the whole merged table, highest first.
# NOTE(review): preco_por_categoria is an inner merge with the filtered df_products, so items
# whose product was removed from df_products are not counted here — confirm this is intended.
preco_dos_sellers = preco_por_categoria.groupby('seller_id')['price']
receita_sellers = preco_dos_sellers.sum().sort_values(ascending=False)
receita_sellers.head(10)

seller_id
4869f7a5dfa277a7dca6462dcf3b52b2    229472.63
53243585a1d6dc2643021fd1853d8905    222776.05
4a3ca9315b744ce9f8e9374361493884    200472.92
fa1c13f2614d7b5c4749cbc52fecda94    194042.03
7c67e1448b00f6e969d365cea6b010ab    187923.89
7e93a43ef30c4f03f38b393420bc753a    176431.87
da8622b14eb17ae2831f4ac5b9dab84a    160236.57
7a67c85e85bb2ce8582c35f2203ad736    141745.53
1025f0e2d44d7041d6cf58b6550e0bfa    138968.55
955fee9216a65b617aa5c0531780ce60    135171.70
Name: price, dtype: float64

# Top 10 piores sellers

In [20]:
# Same per-seller sum of item prices, now sorted from lowest to highest.
# Note that this rebinds receita_sellers to the ascending ordering.
receita_total_por_seller = preco_por_categoria.groupby('seller_id')['price'].sum()
receita_sellers = receita_total_por_seller.sort_values(ascending=True)
receita_sellers.head(10)

seller_id
cf6f6bc4df3999b9c6440f124fb2f687     3.50
77128dec4bec4878c37ab7d6169d6f26     6.50
1fa2d3def6adfa70e58c276bb64fe5bb     6.90
34aefe746cd81b7f3b23253ea28bef39     8.00
ad14615bdd492b01b0d97922e87cb87f     8.25
4965a7002cca77301c82d3f91b82e1a9     8.49
0f94588695d71662beec8d883ffacf09     9.00
c18309219e789960add0b2255ca4b091     9.90
95cca791657aabeff15a07eb152d7841     9.99
344223b2a90784f64136a8a5da012e7f    10.90
Name: price, dtype: float64

# Sellers que vendem o mesmo produto

Abaixo podemos ver os produtos que possuem mais de um seller.

In [21]:
# Number of distinct sellers observed for each product.
seller_contagem = preco_por_categoria.groupby('product_id')['seller_id'].nunique()

# Product ids that were sold by more than one seller.
tem_multiplos_sellers = seller_contagem.gt(1)
produtos_com_multiplos_sellers = seller_contagem.loc[tem_multiplos_sellers].index

# Restrict the item table to those products.
linhas_selecionadas = preco_por_categoria['product_id'].isin(produtos_com_multiplos_sellers)
df_filtrada = preco_por_categoria.loc[linhas_selecionadas]

# Index by (product, seller) so rows of the same pair are labelled together.
produto_por_seller = df_filtrada.set_index(['product_id', 'seller_id'])
produto_por_seller


Unnamed: 0_level_0,Unnamed: 1_level_0,order_id,order_item_id,shipping_limit_date,price,freight_value,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty
product_id,seller_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
99a4788cb24856965c36a24e339b6058,4a3ca9315b744ce9f8e9374361493884,0006ec9db01a64e59a68b2c340bf65a7,1,2018-07-26 17:24:20,74.0,23.32,5d178120c29c61748ea95bac23cb8f25,delivered,2018-07-24 17:04:17,2018-07-24 17:24:20,2018-07-25 11:02:00,2018-07-31 01:04:15,2018-08-22,cama_mesa_banho,54.0,245.0,1.0
99a4788cb24856965c36a24e339b6058,4a3ca9315b744ce9f8e9374361493884,00c763284c0056eed753352f5559ff0a,1,2018-06-11 00:10:44,79.9,14.65,0bbbcba4fb2d97e129f2f2622d82eabc,delivered,2018-05-31 23:51:24,2018-06-01 00:10:44,2018-06-01 14:42:00,2018-06-13 15:34:39,2018-07-04,cama_mesa_banho,54.0,245.0,1.0
99a4788cb24856965c36a24e339b6058,4a3ca9315b744ce9f8e9374361493884,01be661b8196707ef60f062632d6d1bd,1,2017-05-24 10:42:27,89.9,12.13,e7232b4d5318177ea3ab911dfb920cbf,delivered,2017-05-17 10:26:53,2017-05-17 10:42:27,2017-05-17 15:00:10,2017-05-23 13:03:32,2017-06-07,cama_mesa_banho,54.0,245.0,1.0
99a4788cb24856965c36a24e339b6058,4a3ca9315b744ce9f8e9374361493884,01be661b8196707ef60f062632d6d1bd,2,2017-05-24 10:42:27,89.9,12.13,e7232b4d5318177ea3ab911dfb920cbf,delivered,2017-05-17 10:26:53,2017-05-17 10:42:27,2017-05-17 15:00:10,2017-05-23 13:03:32,2017-06-07,cama_mesa_banho,54.0,245.0,1.0
99a4788cb24856965c36a24e339b6058,4a3ca9315b744ce9f8e9374361493884,028e1ce5e085cc7810f340572c316b35,1,2017-05-26 09:04:28,89.9,15.38,98250a166957267b24eafb8fd85d8415,delivered,2017-05-21 08:03:50,2017-05-21 09:04:28,2017-05-23 15:48:57,2017-05-30 14:24:31,2017-06-14,cama_mesa_banho,54.0,245.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6d7e259fc0d18d051d87bac45bd45798,bccf933e006e9b94a6184af782963e77,fe98bde348882015e26b7660d5db70f6,1,2018-08-09 23:05:12,148.9,20.23,22e9b06384774854fe0716250ee2ebee,delivered,2018-08-07 22:46:23,2018-08-07 23:05:12,2018-08-08 08:53:00,2018-08-15 19:32:48,2018-08-21,beleza_saude,51.0,1648.0,1.0
87c91527351ebab1c422be0f4738480b,c3cfdc648177fdbbbb35635a37472c53,e236416fb32f25eb00a7da9097cf36a4,1,2017-07-17 12:10:21,94.9,20.15,7a09a14821e574a7774323bdfab4908c,delivered,2017-07-07 11:09:00,2017-07-07 12:10:21,2017-07-07 16:24:56,2017-07-13 19:32:22,2017-08-02,esporte_lazer,42.0,877.0,1.0
87c91527351ebab1c422be0f4738480b,6b3bd31ad8fcda4b2635ec9f3ff2ecdf,fa8d60735eb7db0f4659893592e3fec0,1,2018-03-29 15:08:02,85.9,11.47,f5ced6e0d7c20ef958cf5f9269f4287d,delivered,2018-03-24 14:57:23,2018-03-24 15:08:02,2018-03-26 23:46:59,2018-05-18 01:19:41,2018-04-20,esporte_lazer,42.0,877.0,1.0
17f221c334109f71ffc36e54fc32a836,7142540dd4c91e2237acb7e911c4eba2,e3a645e56f26072a4ea4ca6187ae208a,1,2017-11-16 18:27:35,89.9,29.03,107e9ad240231ed3055437dc2bf81d28,delivered,2017-11-09 18:10:19,2017-11-09 18:27:35,2017-11-10 18:02:52,2017-11-29 15:51:40,2017-12-06,automotivo,59.0,544.0,3.0


In [22]:
# Rows whose product is sold by more than one seller.
vendidos_por_varios = preco_por_categoria['product_id'].isin(produtos_com_multiplos_sellers)

# Distinct sellers appearing in those rows. dropna=False mirrors len(Series.unique()),
# which also counts a missing value as one entry.
num_sellers = preco_por_categoria.loc[vendidos_por_varios, 'seller_id'].nunique(dropna=False)
num_produtos = len(produtos_com_multiplos_sellers)

# The first label previously read "sellers com múltiplos produtos", which is not what
# num_sellers measures: it counts sellers that sell at least one multi-seller product.
print("Número de sellers que vendem produtos com múltiplos sellers:", num_sellers)
print("Número de produtos com múltiplos sellers:", num_produtos)

Número de sellers com múltiplos produtos: 786
Número de produtos com múltiplos sellers: 1225


### Variação de preço entre os sellers

Abaixo podemos ver a variação de preços entre os sellers para cada produto.

In [23]:
# Lowest and highest price recorded for each product (grouping on the 'product_id' index level).
precos_por_produto = produto_por_seller.groupby('product_id')['price']
variacao_preco = precos_por_produto.agg(['min', 'max'])

# Spread between the highest and the lowest recorded price.
variacao_preco['range'] = variacao_preco['max'].sub(variacao_preco['min'])

# Largest spread first.
variacao_preco_ordenada = variacao_preco.sort_values('range', ascending=False)

variacao_preco_ordenada

Unnamed: 0_level_0,min,max,range
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
fd8a5b9a8a79d7ba0739d69be5dc5aa1,616.0,1399.00,783.00
5237739bb5fee495dbd337755a138660,809.0,1549.00,740.00
ba3fea9ec13fb882dda6c9e4295d9130,1084.9,1800.00,715.10
68f3adaef1620e7b0c4c7cd9f78d7ed0,799.9,1297.25,497.35
f819f0c84a64f02d3a5606ca95edd272,499.9,899.99,400.09
...,...,...,...
ae3e0cf8b9e4a3a027fc5d4b0a3eb2a0,39.9,39.90,0.00
268a26bc460de02a94ec05d9ebb4c3e7,69.0,69.00,0.00
4943769229463d7bd0fd5af9baac46b5,65.5,65.50,0.00
aef29ba8f524551d01c5497f60832b2c,14.0,14.00,0.00


Podemos ver acima que a maior variação de preço é de R$ 783,00.

# EXTRA – há relação entre a variação de preço e faturamento?

In [32]:
# Turn the (product_id, seller_id) index levels back into ordinary columns.
produto_por_seller_reset_index = produto_por_seller.reset_index(drop=False)

# Chronological view of the rows, ordered by purchase time.
nova_df = produto_por_seller_reset_index.sort_values('order_purchase_timestamp')
nova_df


Unnamed: 0,product_id,seller_id,order_id,order_item_id,shipping_limit_date,price,freight_value,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty
11421,9c7bdf67b06b419aefb93cfdfc96c55d,ed49da7cadefc454d6a67de249eab240,65d1e226dfaeb8cdc42f665422522d14,1,2016-10-21 16:24:43,21.50,14.11,70fc57eeae292675927697fe03ad3ff5,canceled,2016-10-03 21:01:41,2016-10-04 10:18:57,2016-10-25 12:14:28,2016-11-08 10:58:34,2016-11-25,esporte_lazer,25.0,823.0,1.0
8785,4c7d4a2efde21e3bf1660926fabc6a9d,ce27a3cc3c8cc1ea79d11e561e9bebb6,6b3ee7697a02619a0ace2b3f0aa46bde,1,2016-10-21 16:25:38,57.00,8.77,21a6abdf0197fbe57451bd0a1d3c59a2,delivered,2016-10-04 17:08:39,2016-10-06 15:46:02,2016-11-07 16:53:44,2016-11-21 10:38:51,2016-11-24,beleza_saude,60.0,316.0,3.0
12861,b73f6899a58fe7a37e55149e9a11c717,a7f13822ceb966b076af67121f87b063,95e01270fcbae9863423400103359279,1,2016-10-10 15:51:43,86.99,28.23,7a692b1ff18c8c21156b886e02e80040,delivered,2016-10-04 18:52:56,2016-10-06 15:51:42,2016-11-01 07:27:42,2016-11-04 17:54:00,2016-11-24,moveis_escritorio,19.0,917.0,1.0
14144,c3b8f6be3507bb05fd83797f81f679bc,2138ccb85b11a4ec1e37afbd1c8eda1f,d1eb8e4e276a4eea13a5c462c0765e60,1,2016-10-10 03:10:34,19.99,20.80,9031f9dcde5860b34e6c65ac5c796d30,delivered,2016-10-04 19:30:28,2016-10-06 03:10:33,2016-10-10 03:10:34,2016-10-13 03:10:34,2016-12-06,telefonia,32.0,331.0,1.0
10024,3410cbd7df2130ec3cb9300ef0fe3df1,897060da8b9a21f655304d50fd935913,5cc475c7c03290048eb2e742cd64cb5e,1,2016-10-09 03:10:31,71.00,11.69,75e8f990b9e289013b1d092614b52487,delivered,2016-10-04 21:54:52,2016-10-05 03:10:31,2016-12-06 17:24:00,2016-12-12 20:31:54,2016-11-24,informatica_acessorios,34.0,324.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7480,86ecc269de40ba13205e7beeee12f26f,b76dba6c951ab00dc4edf0a1aa88037e,0785459f4688c430590e9ef6eff2fecb,1,2018-08-30 20:15:23,29.99,7.47,54081452326c25cde023a4c5a7850275,delivered,2018-08-27 20:03:07,2018-08-27 20:15:23,2018-08-28 12:07:00,2018-08-30 18:48:47,2018-08-31,telefonia,31.0,329.0,4.0
9980,afce90df29b9a0c1bfe5e1033f1197b1,2138ccb85b11a4ec1e37afbd1c8eda1f,7b4f660a72183fc98991998a65320863,1,2018-08-31 13:44:24,14.50,7.39,fa951727fe95bbd6c7488b84d147b7bd,delivered,2018-08-28 13:28:34,2018-08-28 13:44:54,2018-08-29 14:58:00,2018-08-30 22:41:16,2018-09-04,telefonia,42.0,356.0,4.0
12103,719d571299707561c34ba04ab867b32a,0ef83d7d83ed97cd2a0049ac8be5f88a,dec9169d4f8c2e229d7421e28ef70781,1,2018-08-30 13:44:22,49.00,16.53,acffc70df5f0a2ddc0e094eea7f5f069,delivered,2018-08-28 13:29:17,2018-08-28 13:44:22,2018-08-28 15:30:00,2018-08-30 17:51:57,2018-09-13,beleza_saude,41.0,1178.0,1.0
6418,d6aa421b9567e3e2a3a0caf8fb846cbc,99a54764c341d5dc80b4a8fac4eba3fb,2c1f8fd9aa5f50ec17f4c95e52223a3a,1,2018-09-04 16:05:19,41.80,16.56,0b0e8ff0c3d490e69901bd6f0efa190a,delivered,2018-08-28 15:53:09,2018-08-28 16:05:19,2018-08-29 19:11:00,2018-08-30 15:11:38,2018-09-06,utilidades_domesticas,48.0,546.0,1.0


In [25]:

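# TODO: unfinished experiment, kept commented out. `preco_por_categoria_filtered` and the
# 'invoice' column are not defined in the cells above, so this cannot run as written.
# # Group by seller_id and calculate the correlation between 'invoice' and 'price'
# seller_corr = preco_por_categoria_filtered.groupby('seller_id')[['invoice', 'price']].corr().iloc[0::2,-1].reset_index()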



# Inflação de Preços

Podemos ver abaixo que alguns produtos sofreram aumentos altíssimos ao longo do tempo. A variação compara o preço da primeira venda de cada produto com o preço da venda mais recente. Alguns produtos aumentaram mais de 300%, e a maior inflação observada foi de 900%.

Por outro lado, o preço de alguns produtos caiu, sendo a maior deflação de 78,7%.

In [26]:
# Work on a narrow copy holding only what the comparison needs.
colunas_selecionadas = ['product_id', 'price', 'order_purchase_timestamp']
df_preco_tempo_produto = preco_por_categoria.loc[:, colunas_selecionadas].copy()

dados_agrupados = df_preco_tempo_produto.groupby('product_id')
datas_por_produto = dados_agrupados['order_purchase_timestamp']

# Row labels of the earliest and of the latest purchase of each product.
precos_antigos_idx = datas_por_produto.idxmin()
precos_novos_idx = datas_por_produto.idxmax()

precos_antigos = df_preco_tempo_produto.loc[precos_antigos_idx]
precos_novos = df_preco_tempo_produto.loc[precos_novos_idx]

# One row per product in each frame, so product_id can serve as the alignment key.
antigos_por_produto = precos_antigos.set_index('product_id')
novos_por_produto = precos_novos.set_index('product_id')
diferenca_preco = novos_por_produto['price'] - antigos_por_produto['price']

# NOTE(review): the R$ column is the magnitude only (abs), while the % column keeps its
# sign — confirm that this asymmetry is intended.
inflacao_absoluto = diferenca_preco.abs()
inflacao_porcentagem = (diferenca_preco / antigos_por_produto['price'] * 100).round(1)

# Assemble the summary and order it from the largest increase to the largest decrease.
inflacao_df = pd.DataFrame({
    'oldest_timestamp': antigos_por_produto['order_purchase_timestamp'],
    'newest_timestamp': novos_por_produto['order_purchase_timestamp'],
    'variação em R$': inflacao_absoluto,
    'variação em %': inflacao_porcentagem,
}).sort_values(by='variação em %', ascending=False)

inflacao_df


Unnamed: 0_level_0,oldest_timestamp,newest_timestamp,variação em R$,variação em %
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
613a1da144b1ad65ab2cba8d417ad350,2017-02-26 00:04:26,2017-03-16 11:08:23,171.00,900.0
668627d7997bde540866a91f02484ae6,2017-04-04 18:02:11,2018-05-04 13:31:32,43.56,688.2
f03859e4cc21018569f5ae7a03897ff4,2017-07-07 16:25:41,2018-06-14 21:18:25,124.99,357.1
2eb384017334e47db9ccf364216799df,2017-11-24 22:04:54,2018-03-20 18:02:29,67.35,318.4
6ae7f4ef0ca670ca2609d039af87d057,2017-01-25 22:04:39,2017-04-17 06:41:44,75.10,314.2
...,...,...,...,...
69455f41626a745aea9ee9164cb9eafd,2017-02-27 16:22:04,2018-08-22 10:51:09,219.69,-65.1
1fa52682d6938b129312ef4cffac1711,2018-01-13 15:59:39,2018-08-21 20:18:31,131.00,-68.6
ba16581014183c8415da15145f3d4c24,2017-04-15 14:40:01,2018-08-06 20:31:45,660.99,-68.9
5be25cbe1b150d16b7809060ffe1ce0b,2017-03-11 22:10:14,2017-11-23 22:13:26,29.99,-75.0
