<a href="https://colab.research.google.com/github/DuarteVn/PySpark-no-Google-Colab/blob/main/Notebook_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Etapa 1: Configurar PySpark




In [1]:
# Mount Google Drive inside the Colab VM so that files stored there are
# reachable through the regular filesystem under the mount point below.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
# Instalar PySpark
!pip install pyspark



In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, upper

In [16]:
# Build the SparkSession used by every cell below. getOrCreate() hands back an
# already-running session when one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("Notebook1_Notafiscal")
    .getOrCreate()
)


# Etapa 2: Importar base de dados

In [19]:
# Load the invoice CSV from the mounted Drive folder. The first row supplies
# the column names and Spark infers each column's type from the data.
notafiscal_path = "/content/drive/MyDrive/Colab Notebooks/Excel/notafiscal.csv"
df = spark.read.csv(
    notafiscal_path,
    sep=',',
    header=True,
    inferSchema=True,
)

# Inspect the inferred schema, then preview the first five rows.
df.printSchema()
df.show(5)


root
 |-- id_venda: integer (nullable = true)
 |-- cod_cliente: integer (nullable = true)
 |-- nom_cliente: string (nullable = true)
 |-- cod_vendedor: integer (nullable = true)
 |-- nom_vendedor: string (nullable = true)
 |-- cod_cidade: integer (nullable = true)
 |-- nom_cidade: string (nullable = true)
 |-- cod_produto: integer (nullable = true)
 |-- Nom_produto: string (nullable = true)
 |-- cod_marca: integer (nullable = true)
 |-- Nom_marca: string (nullable = true)
 |-- cod_departamento: integer (nullable = true)
 |-- nom_departamento: string (nullable = true)
 |-- cod_gerencia: integer (nullable = true)
 |-- nom_gerencia: string (nullable = true)
 |-- dtc_venda: date (nullable = true)
 |-- qtd_venda: integer (nullable = true)
 |-- val_venda: double (nullable = true)
 |-- num_nota: integer (nullable = true)

+--------+-----------+---------------+------------+---------------+----------+-----------+-----------+-----------+---------+-------------+----------------+----------------+-

# Etapa 3: Criar DataFrames

## Etapa 3.1 Criar DataFrame CLIENTE

In [9]:
# Customer table: the customer id plus upper-cased name and city, keeping one
# row per distinct (id, name, city) combination.
# NOTE(review): the recorded output lists ID_CLIENTE 3 under two different
# cities, so ID_CLIENTE alone is not a unique key for this table - confirm
# whether that is intended.
cliente_columns = [
    col("cod_cliente").alias("ID_CLIENTE"),
    upper(col("nom_cliente")).alias("NOME_CLIENTE"),
    upper(col("nom_cidade")).alias("CIDADE_CLIENTE"),
]
df_cliente = df.select(*cliente_columns).dropDuplicates()

# Preview the first five rows.
df_cliente.show(5)

+----------+---------------+--------------+
|ID_CLIENTE|   NOME_CLIENTE|CIDADE_CLIENTE|
+----------+---------------+--------------+
|         6|REGINALDO ROSSI|   NOVA IORQUE|
|         1|   JOÃO DO PULO|          ROMA|
|         2|BENITO DE PAULA|          ROMA|
|         3|   RENATO RUSSO|          ROMA|
|         3|   RENATO RUSSO|      SALVADOR|
+----------+---------------+--------------+
only showing top 5 rows



## Etapa 3.2 Criar DataFrame GERENCIA


In [13]:
# Management-area table: the area id plus its upper-cased name, keeping one
# row per distinct (id, name) pair.
gerencia_columns = [
    col("cod_gerencia").alias("ID_GERENCIA"),
    upper(col("nom_gerencia")).alias("NOME_GERENCIA"),
]
df_gerencia = df.select(*gerencia_columns).dropDuplicates()

# Preview the first five rows.
df_gerencia.show(5)


+-----------+-------------------+
|ID_GERENCIA|      NOME_GERENCIA|
+-----------+-------------------+
|          2|PAPELARIA/COSMÉTICO|
|          3|  ELETRO ELETRÔNICO|
|          1|         TECNOLOGIA|
|          4|   CASA E DECORAÇÃO|
+-----------+-------------------+



## Etapa 3.3 Criar DataFrame DEPARTAMENTO

In [14]:
# Department table: the department id plus its upper-cased name, keeping one
# row per distinct (id, name) pair.
departamento_columns = [
    col("cod_departamento").alias("ID_DEPARTAMENTO"),
    upper(col("nom_departamento")).alias("NOME_DEPARTAMENTO"),
]
df_departamento = df.select(*departamento_columns).dropDuplicates()

# Preview the first five rows.
df_departamento.show(5)

+---------------+-----------------+
|ID_DEPARTAMENTO|NOME_DEPARTAMENTO|
+---------------+-----------------+
|              3|        PAPELARIA|
|              6|  ELETRODOMESTICO|
|              5|       ELETRÓNICO|
|              1|      INFORMÁTICA|
|              7|           MOVÉIS|
+---------------+-----------------+
only showing top 5 rows



# Etapa 4: Exportar DataFrames para CSV

In [15]:
# Export each table as CSV with a header row.
# - coalesce(1) collapses the frame to a single partition before writing.
# - mode("overwrite") replaces whatever a previous run left at the target.
# Note that each target name ends up as a *directory* holding the part file
# (see the zip listing in the next cell's output), not as a single flat file.
csv_exports = (
    (df_cliente, "clientes.csv"),
    (df_gerencia, "gerencia.csv"),
    (df_departamento, "departamento.csv"),
)
for frame, target in csv_exports:
    frame.coalesce(1).write.mode("overwrite").csv(target, header=True)


In [17]:
!zip -r arquivos_csv.zip clientes.csv gerencia.csv departamento.csv
from google.colab import files
files.download("arquivos_csv.zip")


  adding: clientes.csv/ (stored 0%)
  adding: clientes.csv/.part-00000-36132742-bc9c-4183-8342-fa26c2858e11-c000.csv.crc (stored 0%)
  adding: clientes.csv/_SUCCESS (stored 0%)
  adding: clientes.csv/._SUCCESS.crc (stored 0%)
  adding: clientes.csv/part-00000-36132742-bc9c-4183-8342-fa26c2858e11-c000.csv (deflated 73%)
  adding: gerencia.csv/ (stored 0%)
  adding: gerencia.csv/_SUCCESS (stored 0%)
  adding: gerencia.csv/._SUCCESS.crc (stored 0%)
  adding: gerencia.csv/part-00000-bfbc37c2-9911-4ebe-ae69-58b9e9f5ac2a-c000.csv (deflated 12%)
  adding: gerencia.csv/.part-00000-bfbc37c2-9911-4ebe-ae69-58b9e9f5ac2a-c000.csv.crc (stored 0%)
  adding: departamento.csv/ (stored 0%)
  adding: departamento.csv/.part-00000-46122180-54a3-4f49-935b-13ec16e0942d-c000.csv.crc (stored 0%)
  adding: departamento.csv/_SUCCESS (stored 0%)
  adding: departamento.csv/._SUCCESS.crc (stored 0%)
  adding: departamento.csv/part-00000-46122180-54a3-4f49-935b-13ec16e0942d-c000.csv (deflated 20%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>