In [1]:
!pip install kaggle
!pip install hdfs

Collecting kaggle
  Downloading kaggle-1.5.12.tar.gz (58 kB)
[K     |████████████████████████████████| 58 kB 3.5 MB/s eta 0:00:011
Collecting tqdm
  Downloading tqdm-4.64.0-py2.py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 5.4 MB/s  eta 0:00:01
[?25hCollecting python-slugify
  Downloading python_slugify-6.1.2-py2.py3-none-any.whl (9.4 kB)
Collecting importlib-resources; python_version < "3.7"
  Downloading importlib_resources-5.4.0-py3-none-any.whl (28 kB)
Collecting text-unidecode>=1.3
  Downloading text_unidecode-1.3-py2.py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 4.4 MB/s eta 0:00:011
[?25hCollecting zipp>=3.1.0; python_version < "3.10"
  Downloading zipp-3.6.0-py3-none-any.whl (5.3 kB)
Building wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25ldone
[?25h  Created wheel for kaggle: filename=kaggle-1.5.12-py3-none-any.whl size=73053 sha256=8ff4c3f9d2763c9df52e63016efe4cfdb100d8e92be8702

# Kaggle configuration, API Token

In [2]:
import os

# Root folder of the challenge; the other local paths used in this notebook
# are meant to be derived from it.
base_path = "/mnt/notebooks/Desafio1_FIAP"

# Folder that holds kaggle.json (the API token), exposed to the Kaggle client
# through the KAGGLE_CONFIG_DIR environment variable.
kaggle_config_dir = f'{base_path}/kaggle_config_dir/'
os.environ["KAGGLE_CONFIG_DIR"] = kaggle_config_dir

# Restrict the token file to owner read/write (mode 600). The path is built
# from base_path so it cannot drift from KAGGLE_CONFIG_DIR; os.chmod raises
# immediately if the token file is missing.
os.chmod(os.path.join(kaggle_config_dir, 'kaggle.json'), 0o600)

# Download dataset files from Kaggle

In [3]:
import kaggle

# Authenticate with the token found through KAGGLE_CONFIG_DIR.
kaggle.api.authenticate()

# Download and unzip the Olist dataset into the folder the HDFS upload step
# reads from ({base_path}/olist_dataset). The previous hard-coded path pointed
# to a different root folder than base_path, so a fresh run would download the
# files to one place and look for them in another.
kaggle.api.dataset_download_files('olistbr/brazilian-ecommerce',
                                  path=f'{base_path}/olist_dataset',
                                  unzip=True)

# Ingest dataset files to HDFS

## Parâmetros de conexão

In [4]:
import requests
import os
import pandas as pd 
import hdfs
import urllib3

from hdfs import InsecureClient
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util import Retry

# Size of the HTTP connection pool used by the adapter below.
max_threads = 50
session = requests.Session()

# Retry settings that are spelled the same way in every urllib3 version.
retry_options = dict(
    total=10,
    connect=10,
    read=10,
    redirect=10,
    status_forcelist=[429, 500, 502, 503, 504],
)
# HTTP methods eligible for retry.
retry_methods = ["HEAD", "GET", "OPTIONS"]

# urllib3 1.26 renamed `method_whitelist` to `allowed_methods` and urllib3 2.x
# removed the old keyword. Try the current name first and fall back to the
# legacy one so the cell runs on both old and new installations.
try:
    retry_strategy = Retry(allowed_methods=retry_methods, **retry_options)
except TypeError:
    retry_strategy = Retry(method_whitelist=retry_methods, **retry_options)

adapter = HTTPAdapter(
    max_retries=retry_strategy, pool_connections=max_threads, pool_maxsize=max_threads,
)

# Use the same retrying adapter for both schemes.
session.mount("https://", adapter)
session.mount("http://", adapter)

# WebHDFS endpoint, addressed through the docker host IP
client = 'http://192.168.56.1:50070'

# HDFS client sharing the retrying session configured above
hdfs_client = InsecureClient(client, session=session)

## Escrevendo na landing zone HDFS

In [15]:
# Write every CSV file of the local dataset folder to the HDFS landing zone.
dataset_dir = f'{base_path}/olist_dataset'

# sorted() makes the processing order (and therefore the log) deterministic.
for filename in sorted(os.listdir(dataset_dir)):
    if not filename.endswith('.csv'):
        # Anything else in the folder would make read_csv fail and abort the loop.
        continue

    df = pd.read_csv(f'{dataset_dir}/{filename}', sep=',')
    # Remove embedded tabs / line breaks, both as real control characters and
    # as the literal two-character sequences "\t", "\n", "\r".
    df = df.replace(to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value=["", ""], regex=True)

    try:
        with hdfs_client.write(f'/datalake/landing_zone/{filename}', overwrite=True, encoding='utf-8') as writer:
            df.to_csv(writer, header=True, index=False)
        print(f"{filename} Gravado com sucesso")
    except (hdfs.util.HdfsError, urllib3.exceptions.NewConnectionError) as e:
        # Expected failure modes: HDFS rejected the write or the host is unreachable.
        print(f"{filename} falhou")
        print(f"[ERRO] {e}")
    except Exception as e:
        # Still best-effort (keep going with the next file), but report which
        # file failed and the kind of error instead of only the bare message.
        print(f"{filename} falhou")
        print(f"[ERRO] {type(e).__name__}: {e}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 5 columns):
customer_id                 99441 non-null object
customer_unique_id          99441 non-null object
customer_zip_code_prefix    99441 non-null int64
customer_city               99441 non-null object
customer_state              99441 non-null object
dtypes: int64(1), object(4)
memory usage: 3.8+ MB
olist_customers_dataset.csv Gravado com sucesso
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000163 entries, 0 to 1000162
Data columns (total 5 columns):
geolocation_zip_code_prefix    1000163 non-null int64
geolocation_lat                1000163 non-null float64
geolocation_lng                1000163 non-null float64
geolocation_city               1000163 non-null object
geolocation_state              1000163 non-null object
dtypes: float64(2), int64(1), object(2)
memory usage: 38.2+ MB
olist_geolocation_dataset.csv Gravado com sucesso
<class 'pandas.core.frame.DataFrame'>
RangeI

## Transformando arquivos de csv para orc

In [16]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lpad

# Spark session used to convert the landing-zone CSV files to ORC.
spark = (
    SparkSession.builder
    .appName("Ingest Olist Dataset")
    .getOrCreate()
)

landing_zone = '/datalake/landing_zone/'
files = hdfs_client.list(landing_zone)

# Files whose zip-code prefix column must be left-padded with zeros to five
# characters (schema inference reads it as a number, dropping leading zeros).
zip_prefix_columns = {
    'olist_customers_dataset.csv': 'customer_zip_code_prefix',
    'olist_sellers_dataset.csv': 'seller_zip_code_prefix',
    'olist_geolocation_dataset.csv': 'geolocation_zip_code_prefix',
}

for filename in files:
    # landing_zone already ends with '/', so strip it to avoid a '//' in the path.
    source_path = f"{landing_zone.rstrip('/')}/{filename}"
    csv = spark.read.csv(source_path, header=True, inferSchema=True, sep=',')

    zip_column = zip_prefix_columns.get(filename)
    if zip_column is not None:
        csv = csv.withColumn(zip_column, lpad(csv[zip_column], 5, '0'))

    csv.printSchema()

    # Swap only the extension; str.replace('csv', 'orc') would also rewrite any
    # "csv" that happens to appear in the middle of the file name.
    stem, _extension = os.path.splitext(filename)
    orc_name = f'{stem}.orc'
    csv.write.orc(f'/datalake/dadosbrutos/{orc_name}', 'overwrite')
    

root
 |-- customer_id: string (nullable = true)
 |-- customer_unique_id: string (nullable = true)
 |-- customer_zip_code_prefix: string (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)

root
 |-- geolocation_zip_code_prefix: string (nullable = true)
 |-- geolocation_lat: double (nullable = true)
 |-- geolocation_lng: double (nullable = true)
 |-- geolocation_city: string (nullable = true)
 |-- geolocation_state: string (nullable = true)

root
 |-- order_id: string (nullable = true)
 |-- order_item_id: integer (nullable = true)
 |-- product_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- shipping_limit_date: timestamp (nullable = true)
 |-- price: double (nullable = true)
 |-- freight_value: double (nullable = true)

root
 |-- order_id: string (nullable = true)
 |-- payment_sequential: integer (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- payment_installments: integer (nullable = 