In [1]:
!pip install pandas polars bs4 requests 

import pandas as pd
import polars as pl
import bs4
import requests
import re
import os

Collecting pandas
  Downloading pandas-2.1.4-cp39-cp39-macosx_11_0_arm64.whl.metadata (18 kB)
Collecting polars
  Downloading polars-0.20.3-cp38-abi3-macosx_11_0_arm64.whl.metadata (14 kB)
Collecting bs4
  Using cached bs4-0.0.1.tar.gz (1.1 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting requests
  Using cached requests-2.31.0-py3-none-any.whl.metadata (4.6 kB)
Collecting numpy<2,>=1.22.4 (from pandas)
  Downloading numpy-1.26.3-cp39-cp39-macosx_11_0_arm64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.2/61.2 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2023.3.post1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.1 (from pandas)
  Using cached tzdata-2023.4-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting beautifulsoup4 (from bs4)
  Usin



In [2]:
# get all links that contains .csv 
relative_path = './pmqc/'
url = 'https://www.gov.br/anp/pt-br/centrais-de-conteudo/dados-abertos/pmqc-programa-de-monitoramento-da-qualidade-dos-combustiveis'
response = requests.get(url)
soup = bs4.BeautifulSoup(response.text, 'html.parser')
links = soup.find_all('a', href=re.compile(r'.csv'))
# get only links between href and " target
links = [link['href'] for link in links if 'target' in link.attrs]

# # download all files using multiprocessing and urllib
import urllib.request
from multiprocessing.pool import ThreadPool

def download_file(link):
    # from string https://www.gov.br/anp/pt-br/centrais-de-conteudo/dados-abertos/arquivos/pmqc/2023/pmqc_2023_04.csv
    # get year of file 
    year = link.split('/')[-2]
    name_file = link.split('/')[-1]
    if not os.path.exists(f"./pmqc/"):
        os.makedirs(f"./pmqc/")
    try:
        urllib.request.urlretrieve(link, f"./pmqc/{name_file}")
        print(f"Downloaded {name_file}")
    except:
        print(f"Error to download {name_file}")

pool = ThreadPool(8)
pool.map(download_file, links)
pool.close()



Downloaded 2022-01-pmqc.csv
Downloaded 2022-01-pmqc.csv
Downloaded 2021-08-pmqc.csv
Downloaded 2020-09-pmqc.csv
Downloaded 2021-01-pmqc.csv
Downloaded 2021-12-pmqc.csv
Downloaded 2021-05-pmqc-csv.csv
Downloaded pmqc-03.csv
Downloaded 2022-04-pmqc.csv
Downloaded 2022-02-pmqc.csv
Downloaded 2021-07-pmqc.csv
Downloaded 2021-11-pmqc.csv
Downloaded 2022-01-pmqc.csv
Downloaded 2020-12-pmqc.csv
Downloaded 2022-01-pmqc.csv
Downloaded 2021-04-pmqc-csv.csv
Downloaded 2021-10-pmqc.csv
Downloaded 2020-08-pmqc.csv
Downloaded pmqc-02.csv
Downloaded 2022-01-pmqc.csv
Downloaded 2021-03-pmqc.csv
Downloaded 2020-11-pmqc.csv
Downloaded 2021-06-pmqc.csv
Downloaded 2022-03-pmqc.csv
Error to download 2020-10-pmqc.csv
Downloaded 2020-07-pmqc.csv
Downloaded 2021-09-pmqc.csv
Downloaded 2021-02-pmqc.csv
Downloaded pmqc-01.csv
Downloaded 2020-01-pmqc.csv
Downloaded 2020-05-pmqc.csv
Downloaded pmqc_2022_12.csv
Downloaded 2019-01-pmqc.csv
Downloaded 2020-04-pmqc.csv
Downloaded 2020-06-pmqc.csv
Downloaded 2019-09-p

FileExistsError: [Errno 17] File exists: './pmqc/'

In [None]:
folder_destination = ''

In [3]:
%%time 

if not os.path.exists(f"pmqc_processed/"):
    os.makedirs(f"pmqc_processed/")
    
df = pl.read_csv("pmqc/*.csv", separator=';', infer_schema_length=10000)#.limit(1000)
df = df.with_columns(
    pl.col('DataColeta').str.strptime(pl.Date, "%Y-%m-%d", strict=False).cast(pl.Date),
    # remove all special characters from column CnpjMatriz
    pl.col('CnpjPosto').str.replace_all(r'[^0-9]', '')
)
## print count rows
print(df.shape)

# # ## generate postos dimensions
postos = df.select([
    'CnpjPosto',
    'RazaoSocialPosto',
    'Distribuidora',
    'Endereço',
    'Latitude',
    'Longitude',
    'Bairro',
    'Município',
    'Uf'])
postos = postos.unique(subset=["CnpjPosto"])
postos = postos.with_columns(
    pl.col('CnpjPosto').str.slice(0, 8).cast(pl.Int32).alias('CnpjMatriz')
)


postos = postos.unique(subset=["CnpjPosto"])

# apply function to get new column "geometry" with values like {'type': 'Point', 'coordinates': [-54.61611004, -20.46871167]}} in string format

def get_geometry(row):
    if row['Longitude'] is None or row['Latitude'] is None:
        return {'type': 'Point', 'coordinates': [0, 0]}  # You can use any default values here
    return {'type': 'Point', 'coordinates': [row['Longitude'], row['Latitude']]}

postos = postos.with_columns(
    pl.struct(
        pl.col('Longitude'),
        pl.col('Latitude')
    ).apply(get_geometry).alias('geometry')
)

# write json
postos.write_json(f"pmqc_processed/postos.json", row_oriented=True)

(4847141, 19)




CPU times: user 4.2 s, sys: 2.16 s, total: 6.35 s
Wall time: 3.59 s


In [4]:
%%time

# generate fact table
coletas = df.select([
    'DataColeta',
    'IdNumeric',
    'CnpjPosto',
    'Produto',
    'Ensaio',
    'Resultado',
    'UnidadeEnsaio',
    'Conforme'])
coletas = coletas.unique(subset=['IdNumeric'])
# get only postos in postos dataframe
coletas = coletas.join(postos, on='CnpjPosto', how='inner')

coletas.write_json('pmqc_processed/coletas.json', row_oriented=True)


CPU times: user 931 ms, sys: 359 ms, total: 1.29 s
Wall time: 725 ms


In [6]:
## insert postos to MongoDB with column CnpjMatriz as index, and latitute and longitude as 2dsphere

from pymongo import MongoClient
import json

MONGO_PORT = 27017
MONGO_HOST = "localhost"
MONGO_PASS = "example"
MONGO_USER = "root"
MONGO_URL = "mongodb+srv://***REMOVED***/?retryWrites=true&w=majority"

client = MongoClient(MONGO_URL)
db = client['pmqc']
collection = db['postos']
collection.create_index('CnpjPosto', unique=True)
collection.create_index([('geometry', '2dsphere')])
 

# transform json to latitude and longitude to column geometry as 2dsphere
with open('pmqc_processed/postos.json') as f:
    data = json.load(f)
    collection.insert_many(data)


In [9]:
# insert coletas to MongoDB
collection = db['coletas']
collection.create_index('IdNumeric', unique=True)
# load and insert json to mongodb bulk size 1000 items
with open('pmqc_processed/coletas.json') as f:
    index = 0
    data = json.load(f)
    while index < len(data):
        collection.insert_many(data[index:index+1000])
        index += 1000
        print(f"Inserted {index} items")
    print(f"Inserted {index} items")
    

Inserted 1000 items
Inserted 2000 items
Inserted 3000 items
Inserted 4000 items
Inserted 5000 items
Inserted 6000 items
Inserted 7000 items
Inserted 8000 items
Inserted 9000 items
Inserted 10000 items
Inserted 11000 items
Inserted 12000 items
Inserted 13000 items
Inserted 14000 items
Inserted 15000 items
Inserted 16000 items
Inserted 17000 items
Inserted 18000 items
Inserted 19000 items
Inserted 20000 items
Inserted 21000 items
Inserted 22000 items
Inserted 23000 items
Inserted 24000 items
Inserted 25000 items
Inserted 26000 items
Inserted 27000 items
Inserted 28000 items
Inserted 29000 items
Inserted 30000 items
Inserted 31000 items
Inserted 32000 items
Inserted 33000 items
Inserted 34000 items
Inserted 35000 items
Inserted 36000 items
Inserted 37000 items
Inserted 38000 items
Inserted 39000 items
Inserted 40000 items
Inserted 41000 items
Inserted 42000 items
Inserted 43000 items
Inserted 44000 items
Inserted 45000 items
Inserted 46000 items
Inserted 47000 items
Inserted 48000 items
I

OperationFailure: you are over your space quota, using 515 MB of 512 MB, full error: {'ok': 0, 'errmsg': 'you are over your space quota, using 515 MB of 512 MB', 'code': 8000, 'codeName': 'AtlasError'}

In [None]:
# count rows
collection.count_documents({})
