In [1]:
import pandas as pd

In [2]:
# Path to the input CSV file (replace with your own).
caminho_csv = r"C:\Users\andre\Downloads\sample.csv"  # Windows

# Output Parquet path: same location and stem as the CSV.
# Only the trailing ".csv" extension is swapped; str.replace would also rewrite
# any ".csv" occurring earlier in the path (e.g. inside a folder name).
if caminho_csv.lower().endswith(".csv"):
    caminho_parquet = caminho_csv[: -len(".csv")] + ".parquet"
else:
    # No CSV extension to swap: append so the two paths never coincide.
    caminho_parquet = caminho_csv + ".parquet"

In [3]:
# Load the dataset from the Parquet path derived from the CSV path.
df = pd.read_parquet(caminho_parquet)
df.head()  # Show the first rows

Unnamed: 0,loja_id,produto_id,categoria_id,is_medicamento,curva,data,estoque,venda,custo,preco
0,2,5124,2,False,E,2024-11-12,0,0.0,0.111198,0.171087
1,2,208,2,False,D,2023-08-13,3,0.0,0.069276,0.13442
2,7,208,2,False,D,2022-06-14,2,0.0,0.069276,0.124337
3,10,5124,2,False,E,2022-09-01,1,0.0,0.111198,0.171087
4,4,5124,2,False,E,2024-10-11,0,0.0,0.111198,0.171087


## Tratando missing values

In [4]:
# (rows, columns) of the loaded frame.
df.shape

(41389576, 10)

In [5]:
# Per-column count of missing values.
print(df.isna().sum())

loja_id           0
produto_id        0
categoria_id      0
is_medicamento    0
curva             0
data              0
estoque           0
venda             0
custo             0
preco             0
dtype: int64


In [6]:
# Number of rows per (loja_id, produto_id) pair, fewest first.
(
    df.groupby(['loja_id', 'produto_id'])['data']
    .count()
    .reset_index()
    .sort_values('data')
)

Unnamed: 0,loja_id,produto_id,data
15056,4,2528,1
11093,3,2809,1
36805,9,1368,1
17034,4,4868,1
9845,3,1094,1
...,...,...,...
22,1,27,1148
20,1,25,1148
19,1,24,1148
18,1,23,1148


In [7]:
# Distribution of the per-pair row counts: how many pairs have 1 row, 2 rows, ...
(
    df.groupby(['loja_id', 'produto_id'])['data']
    .count()
    .value_counts()
    .sort_index()
)

data
1          21
2          26
4           5
5           1
7           8
        ...  
1144       13
1145       13
1146        2
1147        1
1148    29466
Name: count, Length: 1063, dtype: int64

In [8]:
# Every distinct (loja_id, produto_id) pair present in the data.
comb_loja_produto = df.loc[:, ['loja_id', 'produto_id']].drop_duplicates(keep='first')

In [9]:
# Daily calendar running from the first to the last date found in the data.
coluna_data = df['data']
min_data = coluna_data.min()
max_data = coluna_data.max()
todas_datas = pd.date_range(min_data, max_data, freq='D')

In [10]:
# Build the full grid: every (loja_id, produto_id) pair crossed with every calendar day.
# The dates are cast to strings *before* the cross join, so only the calendar
# itself is converted instead of every row of the resulting grid. The outcome
# is the same: `data` ends up as a string column in `df_grade`.
calendario = pd.DataFrame({'data': todas_datas.astype('str')})
df_grade = comb_loja_produto.merge(calendario, how='cross')

In [11]:
# Sanity check: the grid should hold one row per (pair, calendar day).
# The expected size uses the actual calendar length rather than a hard-coded
# day count, so the check stays valid if the date range of the data changes.
n_pares = comb_loja_produto.shape[0]
print(n_pares)
print(n_pares * len(todas_datas))
print(df_grade.shape)

44736
51356928
(51356928, 3)


In [12]:
# Inspect the grid's column dtypes (the join on `data` needs matching types on both sides).
df_grade.dtypes

loja_id        int64
produto_id     int64
data          object
dtype: object

In [13]:
# Left-join the observed rows onto the full grid: days without an observation
# keep their grid row, with the remaining columns left missing.
df_completo = pd.merge(
    df_grade,
    df,
    how='left',
    on=['loja_id', 'produto_id', 'data'],
)

In [14]:
# Spot-check the tail of one (store, product) pair after the grid join.
df_completo[(df_completo['loja_id'] == 4) & (df_completo['produto_id'] == 3136)].tail(15)

Unnamed: 0,loja_id,produto_id,data,categoria_id,is_medicamento,curva,estoque,venda,custo,preco
51356913,4,3136,2025-02-07,,,,,,,
51356914,4,3136,2025-02-08,,,,,,,
51356915,4,3136,2025-02-09,,,,,,,
51356916,4,3136,2025-02-10,,,,,,,
51356917,4,3136,2025-02-11,,,,,,,
51356918,4,3136,2025-02-12,,,,,,,
51356919,4,3136,2025-02-13,,,,,,,
51356920,4,3136,2025-02-14,,,,,,,
51356921,4,3136,2025-02-15,,,,,,,
51356922,4,3136,2025-02-16,,,,,,,


In [15]:
# Earliest date with an actual observation (non-null `venda`) for each pair.
comb_loja_produto_data_minima = (
    df_completo[df_completo['venda'].notna()]
    .groupby(['loja_id', 'produto_id'])['data']
    .min()
    .reset_index()
)

In [16]:
# First observed date per (loja_id, produto_id) pair, earliest first.
comb_loja_produto_data_minima.sort_values('data')

Unnamed: 0,loja_id,produto_id,data
0,1,1,2022-01-01
27194,7,744,2022-01-01
27193,7,743,2022-01-01
27192,7,742,2022-01-01
27191,7,741,2022-01-01
...,...,...,...
12561,3,4830,2025-02-21
10941,3,2602,2025-02-21
11662,3,3615,2025-02-21
11682,3,3642,2025-02-21


In [17]:
# Attach each pair's first observed date; the clashing `data` column coming
# from the right-hand frame is suffixed to `data_min`.
df_completo = pd.merge(
    df_completo,
    comb_loja_produto_data_minima,
    on=['loja_id', 'produto_id'],
    suffixes=('', '_min'),
)

In [18]:
# Keep only grid rows dated on or after the pair's first observation.
df_completo = df_completo[df_completo['data'] >= df_completo['data_min']]

In [19]:
# (rows, columns) after dropping grid rows that precede each pair's first observation.
df_completo.shape

(41515461, 11)

In [20]:
# Helper columns for counting: one unit per row, plus a flag for rows that
# have no observation (missing `venda`).
df_completo['linhas'] = 1
df_completo['linhas_nulas'] = df_completo['venda'].isna()

In [21]:
# Total rows and missing rows for each (loja_id, produto_id) pair.
nulos_por_loja_produto = (
    df_completo.groupby(['loja_id', 'produto_id'])[['linhas', 'linhas_nulas']]
    .sum()
    .reset_index()
)

In [22]:
# Share of each pair's rows that are missing.
nulos_por_loja_produto['prop'] = nulos_por_loja_produto['linhas_nulas'].div(
    nulos_por_loja_produto['linhas']
)

In [23]:
# Pairs with at least one missing row, highest missing share first.
nulos_por_loja_produto[nulos_por_loja_produto['linhas_nulas'] > 0].sort_values(
    'prop', ascending=False
)

Unnamed: 0,loja_id,produto_id,linhas,linhas_nulas,prop
15252,4,2768,634,85,0.134069
783,1,908,635,85,0.133858
44209,10,4703,635,85,0.133858
44185,10,4676,635,85,0.133858
401,1,466,635,85,0.133858
...,...,...,...,...,...
44125,10,4607,635,85,0.133858
44147,10,4633,635,85,0.133858
44174,10,4663,635,85,0.133858
44178,10,4668,635,85,0.133858


In [24]:
# Pairs with at least one missing row, largest missing count first.
nulos_por_loja_produto[nulos_por_loja_produto['linhas_nulas'] > 0].sort_values(
    'linhas_nulas', ascending=False
)

Unnamed: 0,loja_id,produto_id,linhas,linhas_nulas,prop
44723,10,5277,635,85,0.133858
91,1,101,635,85,0.133858
172,1,199,635,85,0.133858
197,1,230,635,85,0.133858
278,1,322,635,85,0.133858
...,...,...,...,...,...
593,1,690,635,85,0.133858
505,1,582,635,85,0.133858
440,1,508,635,85,0.133858
431,1,498,635,85,0.133858


In [25]:
# How many affected pairs each store has.
nulos_por_loja_produto.loc[
    nulos_por_loja_produto['linhas_nulas'] > 0, 'loja_id'
].value_counts()

loja_id
1     163
6     161
9     149
7     149
10    148
2     145
3     144
5     143
4     142
8     137
Name: count, dtype: int64

In [26]:
# In how many stores each affected product has missing rows.
nulos_por_loja_produto.loc[
    nulos_por_loja_produto['linhas_nulas'] > 0, 'produto_id'
].value_counts()

produto_id
199     10
230     10
328     10
373     10
349     10
        ..
2070     1
2271     1
4615     1
2657     1
4009     1
Name: count, Length: 199, dtype: int64

In [27]:
# Full history of store 10 / product 4296.
df_completo[(df_completo['loja_id'] == 10) & (df_completo['produto_id'] == 4296)]

Unnamed: 0,loja_id,produto_id,data,categoria_id,is_medicamento,curva,estoque,venda,custo,preco,data_min,linhas,linhas_nulas
49495385,10,4296,2023-05-29,9.0,True,C,8.0,0.0,0.039942,0.097754,2023-05-29,1,False
49495386,10,4296,2023-05-30,9.0,True,C,8.0,0.0,0.039942,0.097754,2023-05-29,1,False
49495387,10,4296,2023-05-31,9.0,True,C,8.0,0.0,0.039942,0.097754,2023-05-29,1,False
49495388,10,4296,2023-06-01,,,,,,,,2023-05-29,1,True
49495389,10,4296,2023-06-02,,,,,,,,2023-05-29,1,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
49496015,10,4296,2025-02-17,9.0,True,C,15.0,0.0,0.037913,0.054756,2023-05-29,1,False
49496016,10,4296,2025-02-18,9.0,True,C,15.0,0.0,0.037913,0.054756,2023-05-29,1,False
49496017,10,4296,2025-02-19,9.0,True,C,15.0,0.0,0.037913,0.054756,2023-05-29,1,False
49496018,10,4296,2025-02-20,9.0,True,C,15.0,0.0,0.037913,0.054756,2023-05-29,1,False


In [28]:
# Only the missing rows of store 10 / product 4296.
df_completo[
    (df_completo['loja_id'] == 10)
    & (df_completo['produto_id'] == 4296)
    & df_completo['venda'].isna()
]

Unnamed: 0,loja_id,produto_id,data,categoria_id,is_medicamento,curva,estoque,venda,custo,preco,data_min,linhas,linhas_nulas
49495388,10,4296,2023-06-01,,,,,,,,2023-05-29,1,True
49495389,10,4296,2023-06-02,,,,,,,,2023-05-29,1,True
49495390,10,4296,2023-06-03,,,,,,,,2023-05-29,1,True
49495391,10,4296,2023-06-04,,,,,,,,2023-05-29,1,True
49495392,10,4296,2023-06-05,,,,,,,,2023-05-29,1,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
49495468,10,4296,2023-08-20,,,,,,,,2023-05-29,1,True
49495469,10,4296,2023-08-21,,,,,,,,2023-05-29,1,True
49495470,10,4296,2023-08-22,,,,,,,,2023-05-29,1,True
49495471,10,4296,2023-08-23,,,,,,,,2023-05-29,1,True


In [29]:
# Full history of store 10 / product 4633.
df_completo[(df_completo['loja_id'] == 10) & (df_completo['produto_id'] == 4633)]

Unnamed: 0,loja_id,produto_id,data,categoria_id,is_medicamento,curva,estoque,venda,custo,preco,data_min,linhas,linhas_nulas
12968321,10,4633,2023-05-29,3.0,True,B,0.0,0.0,0.021389,0.048865,2023-05-29,1,False
12968322,10,4633,2023-05-30,3.0,True,B,0.0,0.0,0.021389,0.048865,2023-05-29,1,False
12968323,10,4633,2023-05-31,3.0,True,B,0.0,0.0,0.021389,0.048865,2023-05-29,1,False
12968324,10,4633,2023-06-01,,,,,,,,2023-05-29,1,True
12968325,10,4633,2023-06-02,,,,,,,,2023-05-29,1,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12968951,10,4633,2025-02-17,3.0,True,B,7.0,1.0,0.041996,0.061673,2023-05-29,1,False
12968952,10,4633,2025-02-18,3.0,True,B,7.0,0.0,0.041996,0.061673,2023-05-29,1,False
12968953,10,4633,2025-02-19,3.0,True,B,7.0,0.0,0.041996,0.061673,2023-05-29,1,False
12968954,10,4633,2025-02-20,3.0,True,B,7.0,0.0,0.041996,0.061673,2023-05-29,1,False


In [30]:
# Only the missing rows of store 10 / product 4633.
df_completo[
    (df_completo['loja_id'] == 10)
    & (df_completo['produto_id'] == 4633)
    & df_completo['venda'].isna()
]

Unnamed: 0,loja_id,produto_id,data,categoria_id,is_medicamento,curva,estoque,venda,custo,preco,data_min,linhas,linhas_nulas
12968324,10,4633,2023-06-01,,,,,,,,2023-05-29,1,True
12968325,10,4633,2023-06-02,,,,,,,,2023-05-29,1,True
12968326,10,4633,2023-06-03,,,,,,,,2023-05-29,1,True
12968327,10,4633,2023-06-04,,,,,,,,2023-05-29,1,True
12968328,10,4633,2023-06-05,,,,,,,,2023-05-29,1,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12968404,10,4633,2023-08-20,,,,,,,,2023-05-29,1,True
12968405,10,4633,2023-08-21,,,,,,,,2023-05-29,1,True
12968406,10,4633,2023-08-22,,,,,,,,2023-05-29,1,True
12968407,10,4633,2023-08-23,,,,,,,,2023-05-29,1,True


In [31]:
# Full history of store 4 / product 2768.
df_completo[(df_completo['loja_id'] == 4) & (df_completo['produto_id'] == 2768)]

Unnamed: 0,loja_id,produto_id,data,categoria_id,is_medicamento,curva,estoque,venda,custo,preco,data_min,linhas,linhas_nulas
9383118,4,2768,2023-05-30,3.0,True,C,2.0,0.0,0.065438,0.110000,2023-05-30,1,False
9383119,4,2768,2023-05-31,3.0,True,C,2.0,0.0,0.065438,0.110000,2023-05-30,1,False
9383120,4,2768,2023-06-01,,,,,,,,2023-05-30,1,True
9383121,4,2768,2023-06-02,,,,,,,,2023-05-30,1,True
9383122,4,2768,2023-06-03,,,,,,,,2023-05-30,1,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9383747,4,2768,2025-02-17,3.0,True,C,2.0,0.0,0.081229,0.114669,2023-05-30,1,False
9383748,4,2768,2025-02-18,3.0,True,C,2.0,0.0,0.081229,0.114669,2023-05-30,1,False
9383749,4,2768,2025-02-19,3.0,True,C,2.0,0.0,0.081229,0.114669,2023-05-30,1,False
9383750,4,2768,2025-02-20,3.0,True,C,2.0,0.0,0.081229,0.114669,2023-05-30,1,False


In [32]:
# Only the missing rows of store 4 / product 2768.
df_completo[
    (df_completo['loja_id'] == 4)
    & (df_completo['produto_id'] == 2768)
    & df_completo['venda'].isna()
]

Unnamed: 0,loja_id,produto_id,data,categoria_id,is_medicamento,curva,estoque,venda,custo,preco,data_min,linhas,linhas_nulas
9383120,4,2768,2023-06-01,,,,,,,,2023-05-30,1,True
9383121,4,2768,2023-06-02,,,,,,,,2023-05-30,1,True
9383122,4,2768,2023-06-03,,,,,,,,2023-05-30,1,True
9383123,4,2768,2023-06-04,,,,,,,,2023-05-30,1,True
9383124,4,2768,2023-06-05,,,,,,,,2023-05-30,1,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9383200,4,2768,2023-08-20,,,,,,,,2023-05-30,1,True
9383201,4,2768,2023-08-21,,,,,,,,2023-05-30,1,True
9383202,4,2768,2023-08-22,,,,,,,,2023-05-30,1,True
9383203,4,2768,2023-08-23,,,,,,,,2023-05-30,1,True


In [33]:
# For each pair with missing rows: first and last missing date, and how many
# rows are missing. The multi-function agg yields two-level column labels.
somente_nulos = df_completo[df_completo['venda'].isna()]
datas_nulas_minimas_maximas = (
    somente_nulos.groupby(['loja_id', 'produto_id'])
    .agg({'data': ['min', 'max'], 'linhas_nulas': 'sum'})
    .reset_index()
)

In [34]:
# Replace the column labels produced by the multi-function agg with flat names.
datas_nulas_minimas_maximas.columns = ['loja_id','produto_id','data_min','data_max','linhas_nulas']

In [35]:
# Range of the gap start dates, then range of the gap end dates, across pairs.
for coluna in ('data_min', 'data_max'):
    print(datas_nulas_minimas_maximas[coluna].min())
    print(datas_nulas_minimas_maximas[coluna].max())

2023-06-01
2023-06-01
2023-08-24
2023-08-24


In [36]:
# Number of pairs that have missing rows, followed by a sample of the summary.
print(datas_nulas_minimas_maximas.shape)
datas_nulas_minimas_maximas.head()

(1481, 5)


Unnamed: 0,loja_id,produto_id,data_min,data_max,linhas_nulas
0,1,101,2023-06-01,2023-08-24,85
1,1,199,2023-06-01,2023-08-24,85
2,1,230,2023-06-01,2023-08-24,85
3,1,322,2023-06-01,2023-08-24,85
4,1,328,2023-06-01,2023-08-24,85


In [37]:
# Cut-off date per pair: the day right after its last missing date, so that
# filtering on `data >= data_corte` discards the gap and everything before it.
# Deriving it from `data_max` (instead of hard-coding one date) keeps the cut
# correct if the gap does not end on the same day for every pair.
# `data_max` holds 'YYYY-MM-DD' strings, so the result is formatted back to that form.
datas_nulas_minimas_maximas['data_corte'] = (
    pd.to_datetime(datas_nulas_minimas_maximas['data_max']) + pd.Timedelta(days=1)
).dt.strftime('%Y-%m-%d')

In [38]:
# Attach the cut-off date to every row of the affected pairs.
df_completo = df_completo.merge(
    datas_nulas_minimas_maximas[['loja_id', 'produto_id', 'data_corte']],
    how="left",
    on=['loja_id', 'produto_id'],
)
# Pairs without missing rows get no cut-off from the join; give them the
# earliest date in the data (`min_data`) so the `data >= data_corte` filter
# keeps all of their rows. Using `min_data` avoids a hard-coded start date.
df_completo['data_corte'] = df_completo['data_corte'].fillna(min_data)

In [39]:
# Keep only rows dated on or after the pair's cut-off date.
df_completo = df_completo[df_completo['data'] >= df_completo['data_corte']]

In [40]:
# (rows, columns) after applying the per-pair cut-off date.
df_completo.shape

(41385134, 14)

## Tratando inconsistências

In [41]:
# Number of distinct values each product takes for attributes that are
# expected to be constant per product; anything above 1 is an inconsistency.
produto_groupby = (
    df_completo.groupby('produto_id')[['categoria_id', 'is_medicamento', 'curva']]
    .nunique()
    .reset_index()
)

In [42]:
# Largest number of distinct values any product has, per attribute.
for coluna in ('categoria_id', 'is_medicamento', 'curva'):
    print(produto_groupby[coluna].max())

2
2
1


In [43]:
# How many products carry two different values for each attribute.
for coluna in ('categoria_id', 'is_medicamento'):
    print(produto_groupby[produto_groupby[coluna] == 2].shape)

(26, 4)
(4, 4)


In [44]:
# Products flagged both as medicine and as non-medicine.
produto_groupby[produto_groupby['is_medicamento'] == 2]

Unnamed: 0,produto_id,categoria_id,is_medicamento,curva
110,111,2,2,1
717,718,2,2,1
2621,2622,2,2,1
2996,2997,2,2,1


In [45]:
# For every product with conflicting `is_medicamento` values, show how many
# rows carry each value.
produtos_conflitantes = produto_groupby.loc[
    produto_groupby['is_medicamento'] == 2, 'produto_id'
].unique()
for produto in produtos_conflitantes:
    print(produto)
    linhas_produto = df_completo[df_completo['produto_id'] == produto]
    print(linhas_produto['is_medicamento'].value_counts())
    print()

111
is_medicamento
True     2204
False      92
Name: count, dtype: int64

718
is_medicamento
True     62
False    47
Name: count, dtype: int64

2622
is_medicamento
False    598
True     120
Name: count, dtype: int64

2997
is_medicamento
False    425
True      60
Name: count, dtype: int64



In [46]:
# For every product with conflicting `categoria_id` values, show how many
# rows carry each value.
produtos_conflitantes = produto_groupby.loc[
    produto_groupby['categoria_id'] == 2, 'produto_id'
].unique()
for produto in produtos_conflitantes:
    print(produto)
    linhas_produto = df_completo[df_completo['produto_id'] == produto]
    print(linhas_produto['categoria_id'].value_counts())
    print()

22
categoria_id
5.0    2916
3.0     528
Name: count, dtype: int64

111
categoria_id
3.0    2204
4.0      92
Name: count, dtype: int64

388
categoria_id
4.0    1340
6.0     540
Name: count, dtype: int64

474
categoria_id
4.0    1340
6.0     540
Name: count, dtype: int64

563
categoria_id
9.0    239
5.0     52
Name: count, dtype: int64

718
categoria_id
3.0    62
7.0    47
Name: count, dtype: int64

1403
categoria_id
4.0    707
6.0     24
Name: count, dtype: int64

1556
categoria_id
4.0    7450
7.0    1730
Name: count, dtype: int64

1684
categoria_id
4.0    11360
6.0      120
Name: count, dtype: int64

1918
categoria_id
5.0    2223
3.0    1760
Name: count, dtype: int64

2091
categoria_id
4.0    1340
6.0     540
Name: count, dtype: int64

2622
categoria_id
7.0    598
3.0    120
Name: count, dtype: int64

2930
categoria_id
4.0     4968
11.0     193
Name: count, dtype: int64

2995
categoria_id
4.0    1340
6.0     540
Name: count, dtype: int64

2997
categoria_id
7.0    425
3.0     60
Name: c

In [47]:
# Pick one reference `categoria_id` / `is_medicamento` per product: the value
# on the product's most recent row.
# 'last' simply takes the final row of each group in the frame's current row
# order, and at this point the frame is not ordered by date across stores.
# Sorting by `data` first makes "last" mean "most recent" regardless of how
# the rows happen to be laid out. Only the needed columns are sorted to keep
# the operation light; the stable sort keeps ties in their existing order.
produto_groupby_last = (
    df_completo[['produto_id', 'data', 'categoria_id', 'is_medicamento']]
    .sort_values('data', kind='stable')
    .groupby('produto_id')
    .agg({'categoria_id': 'last', 'is_medicamento': 'last'})
    .reset_index()
)

In [48]:
# Bring each product's reference values alongside the original columns;
# the clashing names from the right-hand frame get the `_last` suffix.
df_completo = pd.merge(
    df_completo,
    produto_groupby_last,
    how='left',
    on='produto_id',
    suffixes=('', '_last'),
)

In [49]:
# Overwrite the inconsistent attributes with the per-product reference values.
for coluna in ('categoria_id', 'is_medicamento'):
    df_completo[coluna] = df_completo[coluna + '_last']

# Drop the helper columns created during the cleaning steps.
colunas_auxiliares = [
    'categoria_id_last',
    'is_medicamento_last',
    'data_min',
    'linhas',
    'linhas_nulas',
    'data_corte',
]
df_completo = df_completo.drop(columns=colunas_auxiliares)

In [50]:
# Rows where the day's sales exceed the recorded stock, and how many such
# rows each (loja_id, produto_id) pair has.
venda_maior_estoque = df_completo[df_completo['venda'] > df_completo['estoque']]
venda_maior_estoque_loja_prod = (
    venda_maior_estoque.groupby(['loja_id', 'produto_id'])['data']
    .count()
    .reset_index()
)

In [51]:
# Count of sales-above-stock rows per (loja_id, produto_id) pair.
venda_maior_estoque_loja_prod

Unnamed: 0,loja_id,produto_id,data
0,1,1,1
1,1,3,9
2,1,4,3
3,1,5,5
4,1,6,10
...,...,...,...
35282,10,5281,20
35283,10,5283,3
35284,10,5285,2
35285,10,5286,5


In [52]:
# Sales-above-stock rows with positive stock, restricted to curve "A".
venda_maior_estoque[
    (venda_maior_estoque['estoque'] > 0) & (venda_maior_estoque['curva'] == "A")
]

Unnamed: 0,loja_id,produto_id,data,categoria_id,is_medicamento,curva,estoque,venda,custo,preco
57851,9,4584,2022-10-10,1.0,False,A,1.0,2.0,0.091276,0.105014
64521,10,909,2022-03-06,1.0,False,A,2.0,5.0,0.067565,0.070865
64570,10,909,2022-04-24,1.0,False,A,2.0,3.0,0.067565,0.070865
64608,10,909,2022-06-01,1.0,False,A,2.0,7.0,0.067565,0.073309
64610,10,909,2022-06-03,1.0,False,A,1.0,2.0,0.067565,0.073309
...,...,...,...,...,...,...,...,...,...,...
41380475,10,3928,2023-07-22,11.0,False,A,1.0,6.0,0.054145,0.061087
41380477,10,3928,2023-07-24,11.0,False,A,1.0,2.0,0.054145,0.068420
41380512,10,3928,2023-08-28,11.0,False,A,4.0,5.0,0.060622,0.073309
41380599,10,3928,2023-11-23,11.0,False,A,5.0,22.0,0.060622,0.058642


In [53]:
# First rows of store 10 / product 3928, to see how stock and sales relate day by day.
df_completo[(df_completo['loja_id'] == 10) & (df_completo['produto_id'] == 3928)].head(25)

Unnamed: 0,loja_id,produto_id,data,categoria_id,is_medicamento,curva,estoque,venda,custo,preco
41379908,10,3928,2022-01-01,11.0,False,A,22.0,1.0,0.049133,0.058642
41379909,10,3928,2022-01-02,11.0,False,A,19.0,3.0,0.049133,0.058642
41379910,10,3928,2022-01-03,11.0,False,A,19.0,0.0,0.049133,0.058642
41379911,10,3928,2022-01-04,11.0,False,A,16.0,3.0,0.049133,0.058642
41379912,10,3928,2022-01-05,11.0,False,A,16.0,6.0,0.049133,0.055676
41379913,10,3928,2022-01-06,11.0,False,A,13.0,3.0,0.049133,0.04642
41379914,10,3928,2022-01-07,11.0,False,A,13.0,0.0,0.049133,0.04642
41379915,10,3928,2022-01-08,11.0,False,A,0.0,19.0,0.049133,0.04642
41379916,10,3928,2022-01-09,11.0,False,A,54.0,6.0,0.049133,0.04642
41379917,10,3928,2022-01-10,11.0,False,A,62.0,4.0,0.049133,0.04642


In [54]:
# Rename the recorded stock column to `estoque_final`.
df_completo = df_completo.rename({'estoque': 'estoque_final'}, axis='columns')

In [55]:
# Opening stock: the recorded stock plus the units sold on that row.
df_completo['estoque_inicial'] = df_completo['estoque_final'].add(df_completo['venda'])

In [56]:
# Order rows by store, product and date so that row-order-dependent operations
# (such as a grouped shift) see each pair's rows in chronological order.
df_completo = df_completo.sort_values(['loja_id','produto_id','data'])

In [57]:
# `estoque_final` of the preceding row within the same (loja_id, produto_id)
# pair; the first row of each pair has no predecessor and gets NaN.
df_completo['estoque_final_anterior'] = df_completo.groupby(['loja_id','produto_id'])['estoque_final'].shift(1)

In [58]:
# Percentage of rows whose opening stock is below the previous row's closing stock.
estoque_caiu = df_completo['estoque_inicial'] < df_completo['estoque_final_anterior']
len(df_completo[estoque_caiu]) / len(df_completo) * 100

0.1750556129647907

In [59]:
# Rows whose opening stock is below the previous row's closing stock, in date order.
df_completo[
    df_completo['estoque_inicial'] < df_completo['estoque_final_anterior']
].sort_values('data')

Unnamed: 0,loja_id,produto_id,data,categoria_id,is_medicamento,curva,estoque_final,venda,custo,preco,estoque_inicial,estoque_final_anterior
38948131,5,4775,2022-01-02,9.0,True,C,0.0,0.0,0.032413,0.085556,0.0,1.0
37072197,7,426,2022-01-02,9.0,True,C,0.0,0.0,0.071427,0.219976,0.0,2.0
34683224,4,656,2022-01-02,9.0,True,D,2.0,0.0,0.081156,0.136865,2.0,3.0
7506202,5,1105,2022-01-02,3.0,True,B,5.0,2.0,0.044073,0.067919,7.0,9.0
36309853,4,3674,2022-01-03,9.0,True,D,4.0,0.0,0.019898,0.053778,4.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...
28440031,4,5260,2025-02-21,5.0,True,B,8.0,0.0,0.021511,0.053081,8.0,9.0
30334944,1,2078,2025-02-21,6.0,False,A,7.0,0.0,0.155394,0.204564,7.0,8.0
12858727,7,1447,2025-02-21,3.0,True,B,0.0,0.0,0.077196,0.114229,0.0,1.0
4928806,10,4188,2025-02-21,3.0,True,D,0.0,0.0,0.025080,0.037107,0.0,1.0


In [60]:
# Difference between the opening stock and the previous row's closing stock.
# It is negative on rows where stock dropped without a matching sale.
df_completo['reposicao'] = df_completo['estoque_inicial'].sub(
    df_completo['estoque_final_anterior']
)

In [61]:
# Check the derived stock columns on store 10 / product 3928.
df_completo[(df_completo['loja_id'] == 10) & (df_completo['produto_id'] == 3928)].head(25)

Unnamed: 0,loja_id,produto_id,data,categoria_id,is_medicamento,curva,estoque_final,venda,custo,preco,estoque_inicial,estoque_final_anterior,reposicao
41379908,10,3928,2022-01-01,11.0,False,A,22.0,1.0,0.049133,0.058642,23.0,,
41379909,10,3928,2022-01-02,11.0,False,A,19.0,3.0,0.049133,0.058642,22.0,22.0,0.0
41379910,10,3928,2022-01-03,11.0,False,A,19.0,0.0,0.049133,0.058642,19.0,19.0,0.0
41379911,10,3928,2022-01-04,11.0,False,A,16.0,3.0,0.049133,0.058642,19.0,19.0,0.0
41379912,10,3928,2022-01-05,11.0,False,A,16.0,6.0,0.049133,0.055676,22.0,16.0,6.0
41379913,10,3928,2022-01-06,11.0,False,A,13.0,3.0,0.049133,0.04642,16.0,16.0,0.0
41379914,10,3928,2022-01-07,11.0,False,A,13.0,0.0,0.049133,0.04642,13.0,13.0,0.0
41379915,10,3928,2022-01-08,11.0,False,A,0.0,19.0,0.049133,0.04642,19.0,13.0,6.0
41379916,10,3928,2022-01-09,11.0,False,A,54.0,6.0,0.049133,0.04642,60.0,0.0,60.0
41379917,10,3928,2022-01-10,11.0,False,A,62.0,4.0,0.049133,0.04642,66.0,54.0,12.0


In [62]:
# Check the derived stock columns on store 4 / product 3674.
df_completo[(df_completo['loja_id'] == 4) & (df_completo['produto_id'] == 3674)].head(25)

Unnamed: 0,loja_id,produto_id,data,categoria_id,is_medicamento,curva,estoque_final,venda,custo,preco,estoque_inicial,estoque_final_anterior,reposicao
36309851,4,3674,2022-01-01,9.0,True,D,6.0,0.0,0.019898,0.053778,6.0,,
36309852,4,3674,2022-01-02,9.0,True,D,6.0,0.0,0.019898,0.053778,6.0,6.0,0.0
36309853,4,3674,2022-01-03,9.0,True,D,4.0,0.0,0.019898,0.053778,4.0,6.0,-2.0
36309854,4,3674,2022-01-04,9.0,True,D,4.0,0.0,0.019898,0.053778,4.0,4.0,0.0
36309855,4,3674,2022-01-05,9.0,True,D,3.0,1.0,0.019898,0.053778,4.0,4.0,0.0
36309856,4,3674,2022-01-06,9.0,True,D,5.0,0.0,0.019898,0.063543,5.0,3.0,2.0
36309857,4,3674,2022-01-07,9.0,True,D,6.0,0.0,0.019898,0.063543,6.0,5.0,1.0
36309858,4,3674,2022-01-08,9.0,True,D,7.0,0.0,0.019898,0.063543,7.0,6.0,1.0
36309859,4,3674,2022-01-09,9.0,True,D,7.0,0.0,0.019898,0.063543,7.0,7.0,0.0
36309860,4,3674,2022-01-10,9.0,True,D,6.0,1.0,0.019898,0.073309,7.0,7.0,0.0


In [63]:
# Display the Parquet path the data was read from.
caminho_parquet

'C:\\Users\\andre\\Downloads\\sample.parquet'

In [64]:
# Output directory prefix; it ends with a path separator because file names
# are appended to it by plain string concatenation.
caminho = 'C:\\Users\\andre\\Downloads\\'

In [65]:
# File name for the full cleaned dataset.
file_name = 'sample_trat.parquet'

In [66]:
# Persist the full cleaned dataset.
destino = caminho + file_name
df_completo.to_parquet(destino, engine="pyarrow")

In [69]:
# Write one Parquet file per `curva` value.
# A single groupby pass replaces one full-frame scan plus an extra copy of the
# subset for every value; each file still receives exactly the rows of its value.
# sort=False visits the values in order of first appearance, as unique() did.
# Rows with a missing `curva` are skipped (groupby drops NaN keys by default)
# instead of failing on the string concatenation that builds the file name.
for c, df_aux in df_completo.groupby('curva', sort=False):
    df_aux.to_parquet(caminho + 'sample_trat_curva_' + c + '.parquet', engine="pyarrow")

In [72]:
# Write one Parquet file per `categoria_id` value.
# A single groupby pass replaces one full-frame scan plus an extra copy of the
# subset for every value; each file still receives exactly the rows of its value.
# sort=False visits the values in order of first appearance, as unique() did.
# Rows with a missing `categoria_id` are skipped (groupby drops NaN keys by
# default) instead of failing on int(NaN) when the file name is built.
for c, df_aux in df_completo.groupby('categoria_id', sort=False):
    # The ids are stored as floats here, so cast to int for a clean file name.
    df_aux.to_parquet(caminho + 'sample_trat_categoria_' + str(int(c)) + '.parquet', engine="pyarrow")

In [73]:
# Distinct category ids present in the cleaned data.
df_completo.categoria_id.unique()

array([ 8.,  9.,  4.,  3., 10.,  5.,  1.,  6., 11.,  7.,  2.])