In [2]:
!pip install pandas polars bs4 requests 

import pandas as pd
import polars as pl
import bs4
import requests
import re
import os



In [9]:
%%time 

# Make sure the output directory exists. exist_ok=True replaces the separate
# exists()/makedirs() pair and keeps the cell safe to re-run.
os.makedirs("pmqc_processed/", exist_ok=True)

# Load every CSV under pmqc/ (semicolon-separated) into one frame.
df = pl.read_csv("pmqc/*.csv", separator=';', infer_schema_length=10000)
df = df.with_columns(
    # Parse the collection date as a Date; strptime already yields pl.Date,
    # so no extra cast is needed.
    pl.col('DataColeta').str.strptime(pl.Date, "%Y-%m-%d", strict=False),
    # Strip every non-digit character from CnpjPosto.
    pl.col('CnpjPosto').str.replace_all(r'[^0-9]', '')
)
# Report (rows, columns) of the loaded data.
print(df.shape)

# Build the "postos" dimension: the descriptive columns, one row per CnpjPosto.
postos = df.select([
    'CnpjPosto',
    'RazaoSocialPosto',
    'Distribuidora',
    'Endereço',
    'Latitude',
    'Longitude',
    'Bairro',
    'Município',
    'Uf'])
# De-duplicate once; the column added below does not create new rows, so a
# second unique() pass is unnecessary.
postos = postos.unique(subset=["CnpjPosto"])
# CnpjMatriz is the first 8 digits of CnpjPosto stored as Int32, which means
# leading zeros are not preserved in this column.
postos = postos.with_columns(
    pl.col('CnpjPosto').str.slice(0, 8).cast(pl.Int32).alias('CnpjMatriz')
)

# Add a "geometry" column built by get_geometry for each row, with values shaped like {'type': 'Point', 'coordinates': [-54.61611004, -20.46871167]}

def get_geometry(row):
    """Return a GeoJSON-style Point dict for one row.

    Args:
        row: mapping with 'Longitude' and 'Latitude' entries.

    Returns:
        {'type': 'Point', 'coordinates': [longitude, latitude]}; when either
        coordinate is None the coordinates fall back to [0, 0].
    """
    fallback = [0, 0]  # placeholder used when a coordinate is missing
    lon = row['Longitude']
    # Latitude is only looked up when Longitude is present.
    lat = None if lon is None else row['Latitude']
    coordinates = fallback if lat is None else [lon, lat]
    return {'type': 'Point', 'coordinates': coordinates}

# Pack Longitude and Latitude into one struct column so get_geometry can read
# both values from a single row object, then store the result as 'geometry'.
postos = postos.with_columns(
    pl.struct(['Longitude', 'Latitude']).apply(get_geometry).alias('geometry')
)

# Persist the postos dimension as row-oriented JSON.
postos.write_json("pmqc_processed/postos.json", row_oriented=True)

(4601837, 19)




CPU times: user 4.04 s, sys: 1.96 s, total: 6 s
Wall time: 3.85 s


In [10]:
# Convert the polars frame to pandas. The name `postos` is rebound in place,
# so guard the conversion: on a re-run of this cell `postos` is already a
# pandas DataFrame, which has no to_pandas() method.
if isinstance(postos, pl.DataFrame):
    postos = postos.to_pandas()

def generate_xml_row(row):
    """Render the sitemap <url> entry for one station row.

    Args:
        row: mapping (e.g. a pandas row) exposing a 'CnpjPosto' entry.

    Returns:
        A multi-line string holding one <url> element whose <loc> points at
        the station page for that CnpjPosto, with fixed lastmod, changefreq
        and priority values.
    """
    cnpj = row['CnpjPosto']
    # The leading empty item and trailing indent reproduce the surrounding
    # whitespace of the fragment exactly.
    pieces = [
        "",
        "    <url>",
        f"        <loc>https://www.olhonocombustivel.com/posto/{cnpj}</loc>",
        "        <lastmod>2024-01-01</lastmod>",
        "        <changefreq>monthly</changefreq>",
        "        <priority>0.8</priority>",
        "    </url>",
        "    ",
    ]
    return "\n".join(pieces)

# Render one <url> entry per row of the DataFrame.
xml_rows = postos.apply(generate_xml_row, axis=1)

# Assemble the complete sitemap document. The sitemap protocol requires the
# xmlns namespace on <urlset>; without it the file is not a valid sitemap.
# NOTE(review): the protocol also caps a single sitemap at 50,000 URLs —
# confirm the number of postos stays under that, or split into several files.
sitemap_xml = f'''<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
    {"".join(xml_rows)}
</urlset>
'''

# Save the sitemap to disk, encoding explicitly as UTF-8 so the bytes match
# the encoding announced in the XML declaration regardless of platform default.
with open('sitemap.xml', 'w', encoding='utf-8') as file:
    file.write(sitemap_xml)