In [10]:
# Install into the running kernel's environment (%pip, not !pip); pymongo is needed by the MongoDB cells below.
%pip install pandas polars bs4 requests pymongo

import pandas as pd
import polars as pl
import bs4
import requests
import re
import os



In [None]:
# Collect every link on the ANP PMQC open-data page that points to a .csv file.
relative_path = './pmqc/'
url = 'https://www.gov.br/anp/pt-br/centrais-de-conteudo/dados-abertos/pmqc-programa-de-monitoramento-da-qualidade-dos-combustiveis'
# A timeout keeps the cell from hanging forever on a stalled connection;
# raise_for_status stops an HTTP error page from being parsed as "zero links found".
response = requests.get(url, timeout=60)
response.raise_for_status()
soup = bs4.BeautifulSoup(response.text, 'html.parser')
# The dot is escaped so only a literal ".csv" matches (an unescaped "." matches any character).
anchors = soup.find_all('a', href=re.compile(r'\.csv'))
# Keep the href of anchors that carry a "target" attribute, dropping repeated URLs
# (order preserved) so two worker threads never write the same file at once.
links = list(dict.fromkeys(a['href'] for a in anchors if 'target' in a.attrs))

# # download all files using multiprocessing and urllib
import urllib.request
from multiprocessing.pool import ThreadPool

def download_file(link, dest_dir="./pmqc/"):
    """Download one file into ``dest_dir`` on a best-effort basis.

    Args:
        link: URL of the file, e.g.
            https://www.gov.br/anp/pt-br/centrais-de-conteudo/dados-abertos/arquivos/pmqc/2023/pmqc_2023_04.csv
        dest_dir: Folder that receives the file; created when missing.

    Returns:
        None. Success or failure is reported on stdout; a failed download
        does not raise, so one bad link cannot abort a whole batch.
    """
    # The local file name is the last path segment of the URL.
    name_file = link.split('/')[-1]
    # exist_ok=True makes this safe when several pool threads get here at once.
    os.makedirs(dest_dir, exist_ok=True)
    try:
        urllib.request.urlretrieve(link, os.path.join(dest_dir, name_file))
    except (OSError, ValueError) as err:
        # OSError covers URLError/HTTPError and local I/O failures;
        # ValueError is raised for malformed or relative URLs.
        print(f"Error to download {name_file}: {err}")
    else:
        print(f"Downloaded {name_file}")

# Download with 8 worker threads. map() blocks until every link has been
# processed, and the context manager releases the workers afterwards
# (close() alone, without join(), leaves them alive).
with ThreadPool(8) as pool:
    pool.map(download_file, links)



In [None]:
# NOTE(review): not referenced by any other visible cell — confirm it is still needed.
folder_destination = ""

In [38]:
%%time 

# Folder that receives the processed JSON files.
os.makedirs("pmqc_processed/", exist_ok=True)

# Read every downloaded CSV (semicolon-separated) into a single frame.
df = pl.read_csv("pmqc/*.csv", separator=';', infer_schema_length=10000)
df = df.with_columns(
    # strict=False: values that do not match the format become null instead of raising.
    pl.col('DataColeta').str.strptime(pl.Date, "%Y-%m-%d", strict=False),
    # Keep only the digits of CnpjPosto (strips punctuation and any other character).
    pl.col('CnpjPosto').str.replace_all(r'[^0-9]', '')
)
# Row/column count of the combined frame.
print(df.shape)

# Build the "postos" (fuel station) dimension: one row per CnpjPosto.
# NOTE(review): when a CNPJ appears with differing attributes, the surviving row
# is whichever one unique() keeps by default — confirm that is acceptable.
postos = (
    df.select([
        'CnpjPosto',
        'RazaoSocialPosto',
        'Distribuidora',
        'Endereço',
        'Latitude',
        'Longitude',
        'Bairro',
        'Município',
        'Uf'])
    # De-duplicate once; adding a column afterwards cannot introduce new duplicates.
    .unique(subset=["CnpjPosto"])
    .with_columns(
        # First 8 digits of the cleaned CNPJ, stored as an integer.
        # NOTE(review): the integer cast drops leading zeros and raises on an
        # empty/non-numeric prefix — confirm an integer key is intended.
        pl.col('CnpjPosto').str.slice(0, 8).cast(pl.Int32).alias('CnpjMatriz')
    )
)

# apply function to get new column "geometry" with values like {'type': 'Point', 'coordinates': [-54.61611004, -20.46871167]}} in string format

def get_geometry(row):
    """Build a GeoJSON Point from a row's ``Longitude`` / ``Latitude``.

    Args:
        row: Mapping with ``'Longitude'`` and ``'Latitude'`` keys.

    Returns:
        ``{'type': 'Point', 'coordinates': [longitude, latitude]}``. The
        placeholder point ``[0.0, 0.0]`` is returned when a coordinate is
        missing, NaN, or outside the valid longitude (-180..180) /
        latitude (-90..90) range, because such values are rejected by a
        2dsphere index.
    """
    lon, lat = row['Longitude'], row['Latitude']
    if lon is None or lat is None:
        # Floats, so the placeholder has the same element type as real coordinates.
        return {'type': 'Point', 'coordinates': [0.0, 0.0]}
    if isinstance(lon, (int, float)) and isinstance(lat, (int, float)):
        # Comparisons with NaN are False, so NaN also falls into this branch.
        if not (-180 <= lon <= 180 and -90 <= lat <= 90):
            return {'type': 'Point', 'coordinates': [0.0, 0.0]}
    # Non-numeric values are passed through unchanged, as before.
    return {'type': 'Point', 'coordinates': [lon, lat]}

# Pack the two coordinate columns into one struct so get_geometry receives
# both values of a row together, keyed by column name.
coordinates = pl.struct(
    pl.col('Longitude'),
    pl.col('Latitude')
)
geometry = coordinates.apply(get_geometry).alias('geometry')
postos = postos.with_columns(geometry)

# Dump the dimension as row-oriented JSON (one object per posto).
postos.write_json("pmqc_processed/postos.json", row_oriented=True)

(4601837, 19)




CPU times: user 4.04 s, sys: 2.62 s, total: 6.66 s
Wall time: 4.13 s


In [32]:
%%time

# Build the "coletas" fact table from the combined frame.
fact_columns = [
    'DataColeta',
    'IdNumeric',
    'CnpjPosto',
    'Produto',
    'Ensaio',
    'Resultado',
    'UnidadeEnsaio',
    'Conforme',
]
coletas = (
    df.select(fact_columns)
    # One row per IdNumeric.
    # NOTE(review): if one IdNumeric carries several Ensaio rows, all but one are dropped here — confirm.
    .unique(subset=['IdNumeric'])
    # Inner join keeps only coletas whose CnpjPosto exists in postos; it also
    # brings the postos columns into every coleta row.
    .join(postos, on='CnpjPosto', how='inner')
)

# Dump the fact table as row-oriented JSON.
coletas.write_json('pmqc_processed/coletas.json', row_oriented=True)


CPU times: user 985 ms, sys: 170 ms, total: 1.16 s
Wall time: 580 ms


In [37]:
## Insert postos into MongoDB with a unique index on CnpjPosto and a 2dsphere index on geometry

from urllib.parse import quote_plus

from pymongo import MongoClient
import json

# Connection settings are read from the environment so real credentials never
# live in the notebook. The fallbacks are the previous hard-coded values,
# kept so an existing local setup keeps working unchanged.
MONGO_PORT = int(os.environ.get("MONGO_PORT", "27017"))
MONGO_HOST = os.environ.get("MONGO_HOST", "localhost")
MONGO_PASS = os.environ.get("MONGO_PASS", "example")
MONGO_USER = os.environ.get("MONGO_USER", "root")
# User and password are percent-escaped so reserved characters (":", "@", "/")
# cannot corrupt the URI. A full MONGO_URL in the environment takes precedence.
MONGO_URL = os.environ.get(
    "MONGO_URL",
    f"mongodb://{quote_plus(MONGO_USER)}:{quote_plus(MONGO_PASS)}@{MONGO_HOST}:{MONGO_PORT}",
)

client = MongoClient(MONGO_URL)
db = client['pmqc']
collection = db['postos']
collection.create_index('CnpjPosto', unique=True)
collection.create_index([('geometry', '2dsphere')])

# Load the exported postos. The encoding is explicit because the keys contain
# non-ASCII characters ('Endereço', 'Município') and the platform default may not be UTF-8.
with open('pmqc_processed/postos.json', encoding='utf-8') as f:
    data = json.load(f)

# The file is closed before the bulk insert. insert_many rejects an empty list,
# so that case is reported instead of raising.
# NOTE: re-running against an already populated collection raises a
# duplicate-key error because of the unique CnpjPosto index.
if data:
    collection.insert_many(data)
else:
    print("No postos to insert")


In [15]:
# Insert coletas into MongoDB, with a unique index on IdNumeric.
collection = db['coletas']
collection.create_index('IdNumeric', unique=True)
# The encoding is explicit because the exported keys contain non-ASCII characters
# and the platform default may not be UTF-8.
with open('pmqc_processed/coletas.json', encoding='utf-8') as f:
    data = json.load(f)

# The file is closed before the bulk insert. insert_many rejects an empty list,
# so that case is reported instead of raising.
if data:
    collection.insert_many(data)
else:
    print("No coletas to insert")

In [16]:
# Number of documents in the collection currently bound to `collection`
# (an empty filter matches every document).
match_all = {}
collection.count_documents(match_all)


488901