In [5]:
!pip install kaggle
!pip install hdfs

Collecting kaggle
  Downloading kaggle-1.5.12.tar.gz (58 kB)
[K     |████████████████████████████████| 58 kB 3.1 MB/s eta 0:00:011
Collecting tqdm
  Downloading tqdm-4.63.1-py2.py3-none-any.whl (76 kB)
[K     |████████████████████████████████| 76 kB 4.0 MB/s  eta 0:00:01
[?25hCollecting python-slugify
  Downloading python_slugify-6.1.1-py2.py3-none-any.whl (9.1 kB)
Collecting importlib-resources; python_version < "3.7"
  Downloading importlib_resources-5.4.0-py3-none-any.whl (28 kB)
Collecting text-unidecode>=1.3
  Downloading text_unidecode-1.3-py2.py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 5.5 MB/s  eta 0:00:01
[?25hCollecting zipp>=3.1.0; python_version < "3.10"
  Downloading zipp-3.6.0-py3-none-any.whl (5.3 kB)
Building wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25ldone
[?25h  Created wheel for kaggle: filename=kaggle-1.5.12-py3-none-any.whl size=73053 sha256=b68cc4d6b216d4c32c0dfbe376d2305128ee758a5edc036

# Kaggle configuration, API Token

In [5]:
import os

# Root folder of this project; later cells build their local paths from it.
base_path = "/mnt/notebooks/Individual_Desafio1_FIAP"

# Folder that holds kaggle.json, exported through KAGGLE_CONFIG_DIR.
kaggle_config_dir = f'{base_path}/kaggle_config_dir/'
os.environ["KAGGLE_CONFIG_DIR"] = kaggle_config_dir

# Restrict the API token file to owner read/write (mode 600). The path is
# derived from base_path instead of being repeated as a literal, so moving the
# project only requires changing base_path. Unlike the shell `chmod`, this
# raises FileNotFoundError when the token file is missing.
os.chmod(os.path.join(kaggle_config_dir, "kaggle.json"), 0o600)

# Download dataset files from Kaggle

In [3]:
import kaggle

# Authenticate the Kaggle API client before requesting the download.
kaggle.api.authenticate()

# Download the Olist dataset and unzip it into the project's olist_dataset
# folder. The destination is built from base_path (defined in the Kaggle
# configuration cell) so it is the same folder the HDFS ingestion loop lists.
kaggle.api.dataset_download_files(
    'olistbr/brazilian-ecommerce',
    path=f'{base_path}/olist_dataset',
    unzip=True,
)

# Ingest dataset files to HDFS

## Connection parameters

In [4]:
import requests
import os
import pandas as pd 
import hdfs
import urllib3

from hdfs import InsecureClient
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util import Retry

# Size of the HTTP connection pool used by the shared session.
max_threads = 50
session = requests.Session()

# HTTP methods the retry policy applies to, and the remaining retry limits.
retry_methods = ["HEAD", "GET", "OPTIONS"]
retry_options = dict(
    total=10,
    connect=10,
    read=10,
    redirect=10,
    status_forcelist=[429, 500, 502, 503, 504],
)

# urllib3 1.26 renamed `method_whitelist` to `allowed_methods`, and urllib3 2.0
# removed the old keyword (passing it raises TypeError). Try the current name
# first and fall back to the legacy one so the cell runs on either version.
try:
    retry_strategy = Retry(allowed_methods=retry_methods, **retry_options)
except TypeError:
    retry_strategy = Retry(method_whitelist=retry_methods, **retry_options)

adapter = HTTPAdapter(
    max_retries=retry_strategy,
    pool_connections=max_threads,
    pool_maxsize=max_threads,
)

# Use the same retrying adapter for both URL schemes.
session.mount("https://", adapter)
session.mount("http://", adapter)

# HDFS endpoint URL, reached through the Docker host's IP.
client = 'http://192.168.56.1:50070'

# HDFS client that sends its requests through the retrying session.
hdfs_client = InsecureClient(client, session=session)

## Writing to the HDFS landing zone

In [17]:
# Write every downloaded CSV file to the HDFS landing zone.
dataset_dir = f'{base_path}/olist_dataset'
for filename in os.listdir(dataset_dir):
    # os.listdir also returns sub-folders and non-CSV files; skip them so
    # pd.read_csv is only called on CSV files.
    if not filename.lower().endswith('.csv'):
        continue

    df = pd.read_csv(f'{dataset_dir}/{filename}')
    try:
        # The file is re-serialised without header row and without index.
        with hdfs_client.write(f'/datalake/landing_zone/{filename}', overwrite = True, encoding='utf-8') as writer:
            df.to_csv(writer, header=False, index=False)
        print(f"{filename} Gravado com sucesso")
    except (
        hdfs.util.HdfsError,
        # Connection failures raised through the requests session arrive as
        # requests.exceptions.ConnectionError; the urllib3 exception is kept
        # for backward compatibility.
        requests.exceptions.ConnectionError,
        urllib3.exceptions.NewConnectionError,
    ) as e:
        # Report the failure and continue with the next file.
        print(f"{filename} falhou")
        print(f"[ERRO] {e}")

olist_customers_dataset.csv Gravado com sucesso
olist_geolocation_dataset.csv Gravado com sucesso
olist_orders_dataset.csv Gravado com sucesso
olist_order_items_dataset.csv Gravado com sucesso
olist_order_payments_dataset.csv Gravado com sucesso
olist_order_reviews_dataset.csv Gravado com sucesso
olist_products_dataset.csv Gravado com sucesso
olist_sellers_dataset.csv Gravado com sucesso
product_category_name_translation.csv Gravado com sucesso


## Converting the CSV files to ORC

In [23]:
import os

from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one, otherwise create it.
spark = (
    SparkSession.builder
    .appName("Ingest Olist Dataset")
    .getOrCreate()
)

landing_zone = '/datalake/landing_zone/'
files = hdfs_client.list(landing_zone)

for filename in files:
    # landing_zone already ends with '/', so strip it before joining to avoid
    # a double slash in the path.
    source_path = f"{landing_zone.rstrip('/')}/{filename}"

    # The landing-zone files are written without a header row and no header
    # option is passed here, so Spark assigns its default column names.
    csv_df = spark.read.csv(source_path)

    # Swap only the file extension; str.replace('csv', 'orc') would also
    # rewrite any "csv" that appears elsewhere in the file name.
    orc_name = f'{os.path.splitext(filename)[0]}.orc'

    # Overwrite existing output so the cell can be re-run; the default save
    # mode raises an error when the target path already exists.
    csv_df.write.mode('overwrite').orc(f'/datalake/dadosbrutos/{orc_name}')
    