In [1]:
import csv
import os
import re
from datetime import datetime

import numpy as np
import pandas as pd
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By

In [2]:
# Start the Chrome session that later cells drive through `driver`, using the
# chromedriver binary stored next to this notebook.
driver = webdriver.Chrome(executable_path='./chromedriver.exe')

## Realizar requisição e obter, a partir do sitemap XML, as URLs das páginas dos países

In [3]:
# Download the site's sitemap and collect every URL found inside a <loc> tag.
# The resulting `html_paises` list is what the scraping cell iterates over.
response = requests.get(
    url='http://127.0.0.1:8000/places/default/sitemap.xml',
    timeout=30,  # without a timeout, requests waits indefinitely on a stalled server
)
# Stop here on a 4xx/5xx reply instead of silently extracting zero URLs from
# an error page and producing an empty dataset downstream.
response.raise_for_status()
xml = response.content.decode()
html_paises = re.findall(r'<loc>(.*?)<', xml)

## Cria a lista com as informações atuais para criação do csv ou comparação

In [4]:
# Scrape every page listed in `html_paises` into `paises`, a list of rows.
# The first entry of `paises` is the header row; later cells rely on that
# layout (paises[0] = column names, paises[1:] = data).
paises = []
now = datetime.now()
# A single timestamp is shared by all rows of this extraction run.
data_extracao = now.strftime("%m/%d/%Y, %H:%M:%S")
paises.append(['bandeira', 'area', 'populacao', 'iso', 'pais', 'capital', 'continente', 'tld', 'codigo_moeda', 'nome_moeda', 'fone', 'formato_codigo_postal', 'regex_codigo_postal', 'linguas', 'vizinhos', 'data_extracao'])

# The flag is the only field read from an attribute (the image's `src`).
xpath_bandeira = '//*[@id="places_national_flag__row"]/td[2]/img'
# XPaths of the text fields, in the same order as header columns 'area'..'vizinhos'.
xpaths_texto = [
    '//*[@id="places_area__row"]/td[2]',
    '//*[@id="places_population__row"]/td[2]',
    '//*[@id="places_iso__row"]/td[2]',
    '//*[@id="places_country__row"]/td[2]',
    '//*[@id="places_capital__row"]/td[2]',
    '//*[@id="places_continent__row"]/td[2]/a',
    '//*[@id="places_tld__row"]/td[2]',
    '//*[@id="places_currency_code__row"]/td[2]',
    '//*[@id="places_currency_name__row"]/td[2]',
    '//*[@id="places_phone__row"]/td[2]',
    '//*[@id="places_postal_code_format__row"]/td[2]',
    '//*[@id="places_postal_code_regex__row"]/td[2]',
    '//*[@id="places_languages__row"]/td[2]',
    '//*[@id="places_neighbours__row"]/td[2]',
]

for url in html_paises:
    driver.get(url)
    # find_element(By.XPATH, ...) replaces find_element_by_xpath, which was
    # removed in Selenium 4.3; this form also works on Selenium 3.
    linha = [driver.find_element(By.XPATH, xpath_bandeira).get_attribute("src")]
    linha.extend(driver.find_element(By.XPATH, xpath).text for xpath in xpaths_texto)
    linha.append(data_extracao)
    paises.append(linha)

## Cria o dataframe. Caso ele já exista, esta célula NÃO deve ser executada para o propósito de comparação

In [5]:
# Build the DataFrame from the scraped rows: paises[0] is the header row and
# the remaining entries are the data rows.
df_csv = pd.DataFrame(paises[1:], columns = paises[0])
nome_do_arquivo = "dados_paises.csv"
# Write the snapshot only when none exists yet. An existing file is the
# baseline that the comparison cell reads back; overwriting it with the fresh
# scrape on every run would make that comparison always report "no changes".
if not os.path.exists(nome_do_arquivo):
    df_csv.to_csv(nome_do_arquivo, index=False, header=True, quoting=csv.QUOTE_NONNUMERIC)

## Lê o arquivo e compara com a lista, atualiza as linhas cuja informação for divergente

In [6]:
# Load the saved snapshot and compare it, row by row and by position, with the
# freshly scraped `paises` list. Rows that differ are replaced with the scraped
# values and stamped with the current time; the CSV is rewritten only if at
# least one row changed.
# 'NA' is a legitimate value here, so the default NA markers are disabled
# and only 'NaN' and empty cells are read as missing.
df = pd.read_csv('dados_paises.csv', keep_default_na=False, na_values=['NaN',''])
alteracao = False

# Every column except the trailing 'data_extracao' timestamp is compared.
n_campos = len(paises[0]) - 1

# Turn blank scraped values into NaN so they compare equal to the empty CSV
# cells that read_csv loaded as NaN (both stringify to 'nan').
for linha in paises[1:]:
    for j in range(n_campos):
        if str(linha[j]).isspace() or linha[j] == '':
            linha[j] = np.nan

# Compare each scraped row with the DataFrame row at the same position.
# NOTE(review): rows are matched by position only; this assumes the CSV and the
# sitemap list the same pages in the same order -- confirm.
for i, linha in enumerate(paises[1:]):
    # Values are compared as strings because read_csv may have parsed some
    # columns as numbers while the scraped values are always text.
    # df.iat is explicit positional access; df.iloc[i][j] relied on the
    # deprecated integer-key fallback of a label-indexed Series.
    divergente = any(str(linha[j]) != str(df.iat[i, j]) for j in range(n_campos))
    if divergente:
        alteracao = True
        now = datetime.now()
        df.iloc[i] = linha
        df.loc[i, 'data_extracao'] = now.strftime("%m/%d/%Y, %H:%M:%S")

if alteracao:
    nome_do_arquivo = "dados_paises.csv"
    df.to_csv(nome_do_arquivo, index=False, header=True, quoting=csv.QUOTE_NONNUMERIC)
    print("Houveram atualizações no dataframe!")
else:
    print("Não houveram atualizações no dataframe.")

Não houveram atualizações no dataframe.


## Mostra as 10 linhas atualizadas mais recentemente

In [7]:
# Show the 10 most recently updated rows.
# 'data_extracao' is stored as "%m/%d/%Y, %H:%M:%S" text, so a plain string
# sort puts the month first and is not chronological across years
# (e.g. "12/31/2022" would sort after "01/01/2023"). Sort on the parsed
# datetimes instead; the column itself is left as text.
df_ordenado = df.sort_values(
    'data_extracao',
    key=lambda coluna: pd.to_datetime(coluna, format="%m/%d/%Y, %H:%M:%S"),
    ascending=False,
)
df_ordenado.head(10)

Unnamed: 0,bandeira,area,populacao,iso,pais,capital,continente,tld,codigo_moeda,nome_moeda,fone,formato_codigo_postal,regex_codigo_postal,linguas,vizinhos,data_extracao
0,http://127.0.0.1:8000/places/static/images/fla...,647500 square kilometres,29121286,AF,Afghanistan,Kabul,AS,.af,AFN,Afghani,93,,,"fa-AF,ps,uz-AF,tk",TM CN IR TJ PK UZ,"04/07/2023, 17:54:28"
173,http://127.0.0.1:8000/places/static/images/fla...,406750 square kilometres,6375830,PY,Paraguay,Asuncion,SA,.py,PYG,Guarani,595,####,^(\d{4})$,"es-PY,gn",BO BR AR,"04/07/2023, 17:54:28"
160,http://127.0.0.1:8000/places/static/images/fla...,1267000 square kilometres,15878271,NE,Niger,Niamey,AF,.ne,XOF,Franc,227,####,^(\d{4})$,"fr-NE,ha,kr,dje",TD BJ DZ LY BF NG ML,"04/07/2023, 17:54:28"
161,http://127.0.0.1:8000/places/static/images/fla...,923768 square kilometres,154000000,NG,Nigeria,Abuja,AF,.ng,NGN,Naira,234,######,^(\d{6})$,"en-NG,ha,yo,ig,ff",TD NE BJ CM,"04/07/2023, 17:54:28"
162,http://127.0.0.1:8000/places/static/images/fla...,260 square kilometres,2166,NU,Niue,Alofi,OC,.nu,NZD,Dollar,683,,,"niu,en-NU",,"04/07/2023, 17:54:28"
163,http://127.0.0.1:8000/places/static/images/fla...,34 square kilometres,1828,NF,Norfolk Island,Kingston,OC,.nf,AUD,Dollar,672,####,^(\d{4})$,en-NF,,"04/07/2023, 17:54:28"
164,http://127.0.0.1:8000/places/static/images/fla...,120540 square kilometres,22912177,KP,North Korea,Pyongyang,AS,.kp,KPW,Won,850,###-###,^(\d{6})$,ko-KP,CN KR RU,"04/07/2023, 17:54:28"
165,http://127.0.0.1:8000/places/static/images/fla...,477 square kilometres,53883,MP,Northern Mariana Islands,Saipan,OC,.mp,USD,Dollar,+1-670,,,"fil,tl,zh,ch-MP,en-MP",,"04/07/2023, 17:54:28"
166,http://127.0.0.1:8000/places/static/images/fla...,324220 square kilometres,5009150,NO,Norway,Oslo,EU,.no,NOK,Krone,47,####,^(\d{4})$,"no,nb,nn,se,fi",FI RU SE,"04/07/2023, 17:54:28"
167,http://127.0.0.1:8000/places/static/images/fla...,212460 square kilometres,2967717,OM,Oman,Muscat,AS,.om,OMR,Rial,968,###,^(\d{3})$,"ar-OM,en,bal,ur",SA YE AE,"04/07/2023, 17:54:28"
