In [45]:
import os
import urllib
import urllib.request

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [90]:
# Download every CSV linked from the ANP open-data "dsan" page of each year
# into ./data/raw. Files already present on disk are skipped, so the cell can
# be re-run (or resumed after an interruption) without fetching everything again.
anos = [2020, 2021, 2022, 2023]

# create the target directory once, up front
os.makedirs('./data/raw', exist_ok=True)

for ano in anos:
    link_anp = f'https://www.gov.br/anp/pt-br/centrais-de-conteudo/dados-abertos/arquivos/shpc/dsan/{ano}'

    # get the html of the page; the timeout keeps the cell from hanging forever
    # and raise_for_status stops us from silently parsing an HTTP error page
    response = requests.get(link_anp, timeout=60)
    response.raise_for_status()
    html = response.content

    # create the soup object
    soup = BeautifulSoup(html, 'html.parser')

    # collect the href of every anchor on the page and keep those mentioning csv;
    # anchors without an href return None and must be dropped before the `in` test
    hrefs = [anchor.get('href') for anchor in soup.find_all('a')]
    hrefs = [href for href in hrefs if href and 'csv' in href]

    # create a dict per link following the structure
    # - {year: 2020, month: 01, file: precos-glp-01, link: url}
    # 'file' is the second-to-last path segment with the third-to-last segment
    # inserted before the '.csv' extension; 'link' is the href without '/view'
    links = [{'year': href.split('/')[-3],
              'month': href.split('-')[-1].split('.')[0],
              'file': href.split('/')[-2].replace('.csv', href.split('/')[-3] + '.csv'),
              'link': href.replace('/view', '')} for href in hrefs]

    for link in links:
        target = f'./data/raw/{link["file"]}'

        # skip files that were already downloaded
        if os.path.exists(target):
            continue

        # download to a temporary name and rename only on success, so an
        # interrupted transfer never leaves a truncated file under the final name
        partial = target + '.part'
        try:
            urllib.request.urlretrieve(link['link'], partial)
        except BaseException:
            # BaseException also covers KeyboardInterrupt (manual cell interrupt)
            if os.path.exists(partial):
                os.remove(partial)
            raise
        os.replace(partial, target)

        

KeyboardInterrupt: 

In [124]:
#!pip install polars
import polars as pl
import numpy as np

# Build the processed fuel-price dataset from the raw CSVs in data/raw:
# read the gasolina/diesel files, normalise the CNPJ and the price columns,
# derive the purchase price and the profit, and export the result as JSON and CSV.

# list files inside data/raw (sorted so the row order is reproducible between runs)
files = sorted(os.listdir('data/raw'))
# keep only the files whose name contains gasolina or diesel
files = [f for f in files if 'gasolina' in f or 'diesel' in f]
if not files:
    # fail with a clear message instead of an opaque error from pl.concat([])
    raise FileNotFoundError("no 'gasolina' or 'diesel' files found in data/raw")

# read all files (they are read as ';'-separated)
dfs = [pl.read_csv(f'data/raw/{f}', separator=';') for f in files]
# concat all files; 'diagonal' takes the union of the columns of all frames
df = pl.concat(dfs, how='diagonal')

df = df.with_columns([
    # remove the '.', '/' and '-' characters from column CNPJ da Revenda
    pl.col('CNPJ da Revenda').str.replace_all(r'[./-]', '').alias('CnpjPosto'),
    # decimal comma -> decimal point, then parse as float
    pl.col('Valor de Venda').str.replace_all(r',', '.').cast(pl.Float32).alias('ValorVenda'),
    # purchase price: parse the reported value the same way; rows where it is
    # missing (or not parseable as a number) get 0, which marks "unknown" below
    pl.col('Valor de Compra')
      .cast(pl.Utf8)
      .str.replace_all(',', '.', literal=True)
      .cast(pl.Float32, strict=False)
      .fill_null(0.0)
      .alias('ValorCompra'),
])

# create column lucro (profit): sale price minus purchase price when a
# purchase price is known (> 0), otherwise 0
df = df.with_columns(
  pl.when(pl.col('ValorCompra') > 0)
    .then(pl.col('ValorVenda') - pl.col('ValorCompra'))
    .otherwise(pl.lit(0)).cast(pl.Float32).alias('Lucro'),
)

# keep only the columns that go into the processed dataset
df = df.select([
  'CnpjPosto',
  'Produto',
  'Data da Coleta',
  'Bandeira',
  'ValorVenda',
  'ValorCompra',
  'Lucro',
])

# write the processed dataset, creating the output directory if needed
os.makedirs('./data/processed', exist_ok=True)
df.write_json('./data/processed/preco_combustivel.json', row_oriented=True)
df.write_csv('./data/processed/preco_combustivel.csv')
