## Ingestão de arquivo CSV no HDFS
Ingestão feita com a biblioteca Python `hdfs` (cliente WebHDFS): o arquivo CSV é baixado por HTTP e gravado no HDFS.

### Importando as bibliotecas

In [3]:
import requests
import os
import pandas as pd 
import hdfs
import urllib3

from hdfs import InsecureClient
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util import Retry


### Parâmetros de conexão

In [4]:
# Size of the HTTP connection pool shared by every request made through `session`
# (used for both pool_connections and pool_maxsize below).
max_threads = 50
session = requests.Session()

# Retry policy: up to 10 attempts per failure category, retrying on throttling (429)
# and transient server errors (5xx), and only for the HTTP methods listed below.
_retry_methods = ["HEAD", "GET", "OPTIONS"]
_retry_options = dict(
    total=10,
    connect=10,
    read=10,
    redirect=10,
    status_forcelist=[429, 500, 502, 503, 504],
)

try:
    # `allowed_methods` is the keyword accepted since urllib3 1.26; the older
    # `method_whitelist` spelling was removed in urllib3 2.0 and raises TypeError there.
    retry_strategy = Retry(allowed_methods=_retry_methods, **_retry_options)
except TypeError:
    # urllib3 < 1.26 only understands the legacy keyword.
    retry_strategy = Retry(method_whitelist=_retry_methods, **_retry_options)

adapter = HTTPAdapter(
    max_retries=retry_strategy,
    pool_connections=max_threads,
    pool_maxsize=max_threads,
)

# Apply the same retry/pooling adapter to both URL schemes.
session.mount("https://", adapter)
session.mount("http://", adapter)

# HDFS endpoint URL, using the Docker host IP.
# NOTE(review): environment-specific address hardcoded here; consider reading it
# from configuration if this notebook is run elsewhere.
client = 'http://192.168.56.1:50070'
# URL of the crimes CSV table
URL = 'https://query.data.world/s/giu6keldi3v7dbiijhxwfxyclaw7xn'

# HDFS client; it issues its HTTP calls through the retry-enabled session above.
hdfs_client = InsecureClient(client, session=session)

### Gravando o arquivo CSV no HDFS

In [6]:
# Local scratch copy of the download; it is always removed at the end of the cell.
local_path = "crimes.csv"

# Download the CSV through the retry-enabled session. The (connect, read) timeout
# keeps the cell from hanging forever on an unresponsive server.
response = session.get(URL, timeout=(10, 60))
# Fail loudly on an HTTP error instead of saving an error page as if it were the CSV.
response.raise_for_status()

try:
    # The context manager guarantees the file is flushed and closed before pandas reads it.
    with open(local_path, "wb") as local_file:
        local_file.write(response.content)

    # Read the CSV file with pandas
    df = pd.read_csv(local_path)

    # Write the CSV file to HDFS
    try:
        with hdfs_client.write('/datalake/dadosbrutos/crimes.csv', overwrite = True, encoding = 'utf-8') as writer:
            # NOTE(review): to_csv also writes the DataFrame index as an extra leading
            # column; confirm that downstream readers of this file expect it.
            df.to_csv(writer)
        print("Gravado com sucesso")
    except hdfs.util.HdfsError as e:
        print(f"[ERRO] {e}")
    except (urllib3.exceptions.NewConnectionError, requests.exceptions.ConnectionError) as e:
        # requests re-raises urllib3 connection failures as its own ConnectionError,
        # so both types are handled here.
        print(f"[ERRO] {e}")
finally:
    # Clean up the scratch file even when parsing or the upload fails.
    if os.path.exists(local_path):
        os.remove(local_path)

Gravado com sucesso
