# Web Scraping to Download Data Sources

## Environment Setup

In [0]:
%pip install playwright
# Playwright CLI: download the browser binaries the library drives.
!playwright install
# Playwright CLI: install the operating-system packages those browsers need.
!playwright install-deps

In [0]:
%restart_python
# NOTE(review): presumably restarts the Python process so the package installed in the previous cell becomes importable — confirm.

In [0]:
from playwright.async_api import async_playwright
import asyncio
import unicodedata
import re
import requests

## Web Scraping Script

In [0]:
async def fetch_file_names_and_download(
    url="https://dados.gov.br/dados/conjuntos-dados/grandes-nmeros-do-imposto-de-renda-da-pessoa-fsica",
    wait_seconds=10,
):
    """Collect the titles of the CSV resources listed on a dataset page.

    Opens ``url`` in headless Chromium, pauses for ``wait_seconds``, then
    scans every ``div.row.flex.mb-5`` element. A row is kept when its
    ``span.resource-icon-left`` reads exactly "CSV" (after stripping
    whitespace) and it contains an ``h4``; the ``h4`` text is the title.
    Titles containing "Faixa de Rendimento" get the suffix
    "-em-salarios-minimos" appended.

    Despite its name, this function downloads nothing; it only returns titles.

    Args:
        url: Page to scrape. Defaults to the dados.gov.br dataset page.
        wait_seconds: Fixed pause, in seconds, between navigation and scraping.

    Returns:
        list[str]: Collected titles in page order (empty when nothing matches).
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(url)
            # Fixed pause before querying the DOM — presumably gives the page
            # time to finish rendering its resource list.
            await asyncio.sleep(wait_seconds)
            rows = await page.query_selector_all("div.row.flex.mb-5")
            file_names = []
            for row in rows:
                span = await row.query_selector('span.resource-icon-left')
                if not span:
                    continue
                span_text = await span.inner_text()
                if span_text.strip() != "CSV":
                    continue
                h4 = await row.query_selector("h4")
                if not h4:
                    continue
                text = await h4.inner_text()
                if "Faixa de Rendimento" in text:
                    text = f"{text}-em-salarios-minimos"
                file_names.append(text)
        finally:
            # Close the browser even when navigation or scraping fails.
            await browser.close()
    return file_names

In [0]:
# Top-level await: run the scraping coroutine and keep the list of titles it returns.
file_names = await fetch_file_names_and_download()

## Data Sources' Download URL Formatting

In [0]:
# Slugs whose final name differs from the generically normalized form.
_FILENAME_OVERRIDES = {
    "rendimentos-sujeitos-a-tributacao-exclusiva-definitiva":
        "rendimentos-sujeitos-a-tributacao-exclusiva_definitiva",
    "faixa-de-rendimentos-totais-em-salarios-minimos":
        "faixa-de-rendimentos-totais",
    "recebedores-de-lucros-e-dividendos-rend-socio-e-titular-microempresa-por-faixa-de-rendimento-total-em-salarios-minimos":
        "recebedores-de-lucros-e-dividendos-rend-socio-e-titular-microempresa-por-faixa-de-rendimento-total",
}


def normalize_filename(name):
    """Turn a resource title into the slug used to build its file name.

    Steps: lowercase, replace spaces with "-", strip diacritics (NFKD
    decomposition followed by dropping every non-ASCII character, which also
    maps "ç" to "c"), replace every character other than ASCII letters,
    digits, "-", ":" and "/" with "-", then apply the fixed overrides in
    ``_FILENAME_OVERRIDES``.

    A "." is not an allowed character, so it always becomes "-"; the result
    can never end in ".csv". A special case keyed on a ".csv" name could
    therefore never match and is intentionally not part of the overrides.

    Args:
        name: Title text to normalize (str).

    Returns:
        str: The normalized slug, or its override when one is defined.
    """
    slug = name.lower().replace(' ', '-')
    slug = unicodedata.normalize('NFKD', slug).encode('ASCII', 'ignore').decode('utf-8')
    slug = re.sub(r'[^a-zA-Z0-9\-:/]', '-', slug)
    return _FILENAME_OVERRIDES.get(slug, slug)

# Per title: drop any ".csv" text, normalize it into a slug, then remove
# every "--" pair that normalization left behind.
file_names = [
    normalize_filename(raw_title.replace('.csv', '')).replace('--', '')
    for raw_title in file_names
]

## Downloading Data Sources and Saving in Unity Catalog

In [0]:
# Download each CSV from the Receita Federal site and write it to the volume.
BASE_URL = "https://www.gov.br/receitafederal/dados/"
VOLUME_DIR = "/Volumes/brazilian_tax_big_numbers/data_sources/data_sources"
# Without a timeout, requests waits indefinitely on an unresponsive server.
REQUEST_TIMEOUT_SECONDS = 60

for file_name in file_names:
    url = BASE_URL + file_name + ".csv"
    # NOTE(review): verify=False turns off TLS certificate validation. It is
    # kept to preserve existing behavior — confirm whether it is still needed.
    response = requests.get(url, verify=False, timeout=REQUEST_TIMEOUT_SECONDS)
    # Stop on the first HTTP error instead of saving an error page as a CSV.
    response.raise_for_status()
    with open(f"{VOLUME_DIR}/{file_name}.csv", "wb") as f:
        f.write(response.content)