In [0]:
# Bare expression: the notebook displays the value bound to `spark`
# (presumably the SparkSession injected by the Databricks runtime — confirm).
spark

## Conectando o Azure ADLS Gen2 ao Databricks

### Mostrando os pontos de montagem no cluster Databricks

In [0]:
# List the mount points currently registered, as returned by dbutils.fs.mounts().
display(dbutils.fs.mounts())

mountPoint,source,encryptionType
/databricks-datasets,databricks-datasets,
/Volumes,UnityCatalogVolumes,
/databricks/mlflow-tracking,databricks/mlflow-tracking,
/databricks-results,databricks-results,
/databricks/mlflow-registry,databricks/mlflow-registry,
/mnt/datalake7a68c04c876ba15d/silver,wasbs://silver@datalake7a68c04c876ba15d.blob.core.windows.net,
/mnt/datalake7a68c04c876ba15d/gold,wasbs://gold@datalake7a68c04c876ba15d.blob.core.windows.net,
/Volume,DbfsReserved,
/volumes,DbfsReserved,
/,DatabricksRoot,


### Definindo a storage account e o SAS token

In [0]:
# Storage account name; it is used below to build the wasbs:// source URLs and
# the /mnt/<account>/<container> mount paths.
storageAccountName = "datalake7a68c04c876ba15d"
# The SAS token is read from a secret scope instead of being written in the notebook.
sasToken = dbutils.secrets.get(scope="sas-token", key="sas-tkn")

### Desmontando os pontos de montagem não utilizados

In [0]:
# Unmount only the containers that are currently mounted.
# dbutils.fs.unmount raises when the mount point does not exist, which made this
# cell fail on re-runs and required commenting lines in and out by hand depending
# on the state of the cluster. Checking dbutils.fs.mounts() first makes the cell
# safe to run repeatedly.
# Add 'landing-zone' and/or 'bronze' to the tuple to refresh those mounts as well.
containers_to_unmount = ('silver', 'gold')
mounted_points = {mount.mountPoint for mount in dbutils.fs.mounts()}

for container in containers_to_unmount:
    mount_point = f'/mnt/{storageAccountName}/{container}'
    if mount_point in mounted_points:
        dbutils.fs.unmount(mount_point)
    else:
        # Nothing to do: report it instead of letting unmount raise.
        print(f"{mount_point} não está montado; nada a desmontar.")

/mnt/datalake7a68c04c876ba15d/bronze has been unmounted.
/mnt/datalake7a68c04c876ba15d/silver has been unmounted.
/mnt/datalake7a68c04c876ba15d/gold has been unmounted.


True

### Definindo uma função para montar um container do ADLS em um ponto de montagem usando SAS token

In [0]:
def mount_adls(blobContainerName):
    """Mount one blob container of the storage account via its wasbs:// endpoint.

    Reads the notebook-level names ``storageAccountName`` and ``sasToken``.
    The container is mounted at ``/mnt/<storageAccountName>/<blobContainerName>``
    and the SAS token is passed through the ``fs.azure.sas.<container>.<account>``
    configuration key.

    Args:
        blobContainerName: Name of the container to mount.

    Returns:
        None. Prints "OK!" when the mount call succeeds; any exception is caught
        and printed after "Falha" instead of being raised.
    """
    try:
        endpoint = f"{storageAccountName}.blob.core.windows.net"
        sas_option = ".".join(("fs.azure.sas", blobContainerName, endpoint))
        dbutils.fs.mount(
            source=f"wasbs://{blobContainerName}@{endpoint}",
            mount_point=f"/mnt/{storageAccountName}/{blobContainerName}",
            extra_configs={sas_option: sasToken},
        )
        print("OK!")
    except Exception as error:
        print("Falha", error)

### Montando todos os containers

In [0]:
# Mount each container in turn; mount_adls prints the outcome of every attempt.
for container_name in ('landing-zone', 'bronze', 'silver', 'gold'):
    mount_adls(container_name)

OK!
OK!
OK!
OK!


### Mostrando os pontos de montagem no cluster Databricks

In [0]:
# List the registered mount points again so the result of the mount calls can be checked.
display(dbutils.fs.mounts())

mountPoint,source,encryptionType
/databricks-datasets,databricks-datasets,
/Volumes,UnityCatalogVolumes,
/mnt/datalake7a68c04c876ba15d/bronze,wasbs://bronze@datalake7a68c04c876ba15d.blob.core.windows.net,
/databricks/mlflow-tracking,databricks/mlflow-tracking,
/databricks-results,databricks-results,
/databricks/mlflow-registry,databricks/mlflow-registry,
/mnt/datalake7a68c04c876ba15d/silver,wasbs://silver@datalake7a68c04c876ba15d.blob.core.windows.net,
/mnt/datalake7a68c04c876ba15d/landing-zone,wasbs://landing-zone@datalake7a68c04c876ba15d.blob.core.windows.net,
/mnt/datalake7a68c04c876ba15d/gold,wasbs://gold@datalake7a68c04c876ba15d.blob.core.windows.net,
/Volume,DbfsReserved,


## Transformações de dados

### Mostrando todos os arquivos da camada landing-zone

In [0]:
# List the entries under the landing-zone mount of the storage account.
display(dbutils.fs.ls(f"/mnt/{storageAccountName}/landing-zone"))

path,name,size,modificationTime
dbfs:/mnt/datalake7a68c04c876ba15d/landing-zone/Agendamentos.csv,Agendamentos.csv,784550,1719190940000
dbfs:/mnt/datalake7a68c04c876ba15d/landing-zone/Cargas.csv,Cargas.csv,512810,1719190941000
dbfs:/mnt/datalake7a68c04c876ba15d/landing-zone/Clientes.csv,Clientes.csv,1222768,1719190941000
dbfs:/mnt/datalake7a68c04c876ba15d/landing-zone/Motoristas.csv,Motoristas.csv,647073,1719190939000
dbfs:/mnt/datalake7a68c04c876ba15d/landing-zone/Rotas.csv,Rotas.csv,283294,1719190938000
dbfs:/mnt/datalake7a68c04c876ba15d/landing-zone/Veiculos.csv,Veiculos.csv,522623,1719190939000


### Gerando um dataframe para cada arquivo a partir dos arquivos CSV gravados no container landing-zone do Azure Data Lake Storage

In [0]:
def _read_landing_csv(file_name):
    """Load one CSV file from the landing-zone mount into a DataFrame.

    The first row is used as the header and column types are inferred from the data.
    The option key was previously misspelled as "infeschema"; Spark does not
    recognise that key, so no inference was applied and every column was read
    as a string. "inferSchema" is the key Spark recognises.

    Args:
        file_name: File name inside /mnt/<storageAccountName>/landing-zone.

    Returns:
        The DataFrame produced by the CSV reader.
    """
    return (
        spark.read
        .option("inferSchema", "true")
        .option("header", "true")
        .csv(f"/mnt/{storageAccountName}/landing-zone/{file_name}")
    )


df_agendamentos = _read_landing_csv("Agendamentos.csv")
df_cargas = _read_landing_csv("Cargas.csv")
df_clientes = _read_landing_csv("Clientes.csv")
df_motoristas = _read_landing_csv("Motoristas.csv")
df_rotas = _read_landing_csv("Rotas.csv")
df_veiculos = _read_landing_csv("Veiculos.csv")

### Adicionando metadados de data e hora de processamento e nome do arquivo de origem

In [0]:
from pyspark.sql.functions import current_timestamp, lit

def _add_bronze_metadata(df, source_file):
    """Return `df` with the processing timestamp and source file name columns added.

    Adds "data_hora_bronze" (current_timestamp()) followed by "nome_arquivo"
    (a literal holding `source_file`).
    """
    return (
        df
        .withColumn("data_hora_bronze", current_timestamp())
        .withColumn("nome_arquivo", lit(source_file))
    )


df_agendamentos = _add_bronze_metadata(df_agendamentos, "Agendamentos.csv")
df_cargas = _add_bronze_metadata(df_cargas, "Cargas.csv")
df_clientes = _add_bronze_metadata(df_clientes, "Clientes.csv")
df_motoristas = _add_bronze_metadata(df_motoristas, "Motoristas.csv")
df_rotas = _add_bronze_metadata(df_rotas, "Rotas.csv")
df_veiculos = _add_bronze_metadata(df_veiculos, "Veiculos.csv")


### Salvando os dataframes em Delta Lake (formato de arquivo) no data lake (repositório cloud)

In [0]:
# Write every DataFrame in Delta format under the bronze mount, one folder per
# dataset, overwriting both the existing data and the existing schema.
bronze_outputs = {
    "agendamentos": df_agendamentos,
    "cargas": df_cargas,
    "clientes": df_clientes,
    "motoristas": df_motoristas,
    "rotas": df_rotas,
    "veiculos": df_veiculos,
}

for table_name, bronze_df in bronze_outputs.items():
    (
        bronze_df.write
        .format('delta')
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .save(f"/mnt/{storageAccountName}/bronze/{table_name}")
    )

###Verificando os dados gravados em delta na camada bronze

In [0]:
# List the entries under the bronze mount to confirm that the output folders exist.
display(dbutils.fs.ls(f"/mnt/{storageAccountName}/bronze/"))

path,name,size,modificationTime
dbfs:/mnt/datalake7a68c04c876ba15d/bronze/agendamentos/,agendamentos/,0,0
dbfs:/mnt/datalake7a68c04c876ba15d/bronze/cargas/,cargas/,0,0
dbfs:/mnt/datalake7a68c04c876ba15d/bronze/clientes/,clientes/,0,0
dbfs:/mnt/datalake7a68c04c876ba15d/bronze/motoristas/,motoristas/,0,0
dbfs:/mnt/datalake7a68c04c876ba15d/bronze/rotas/,rotas/,0,0
dbfs:/mnt/datalake7a68c04c876ba15d/bronze/veiculos/,veiculos/,0,0


### Lendo um exemplo de uma tabela Delta para validar a existência dos dados e das colunas de metadados

In [0]:
# Read the "veiculos" Delta folder back from the bronze mount and show ten rows,
# including the metadata columns added before the write.
(
    spark.read
    .format("delta")
    .load(f"/mnt/{storageAccountName}/bronze/veiculos")
    .limit(10)
    .display()
)

VeiculoID,TipoVeiculo,DataAquisicao,EstadoVeiculo,PlacaVeiculo,MotoristaID,data_hora_bronze,nome_arquivo
1,caminhao,2023-09-13,Próximo a manutenção,MFC3479G,7290,2024-06-28T16:57:27.251Z,Veiculos.csv
2,caminhao,2019-09-06,Manutenção,ETC8083H,4176,2024-06-28T16:57:27.251Z,Veiculos.csv
3,caminhao,2021-11-28,Próximo a manutenção,LFY9976F,9940,2024-06-28T16:57:27.251Z,Veiculos.csv
4,caminhao,2024-04-26,Próximo a manutenção,JHF0759M,1087,2024-06-28T16:57:27.251Z,Veiculos.csv
5,caminhao,2022-05-24,Bom,OEP1666A,1479,2024-06-28T16:57:27.251Z,Veiculos.csv
6,caminhao,2022-01-01,Próximo a manutenção,VDW3967D,9139,2024-06-28T16:57:27.251Z,Veiculos.csv
7,caminhao,2020-03-05,Bom,LUL8938Q,1347,2024-06-28T16:57:27.251Z,Veiculos.csv
8,caminhao,2022-07-27,Próximo a manutenção,QYD6280L,4589,2024-06-28T16:57:27.251Z,Veiculos.csv
9,caminhao,2024-05-31,Manutenção,AVB7055D,1717,2024-06-28T16:57:27.251Z,Veiculos.csv
10,caminhao,2024-05-23,Manutenção,ZQR6073P,8098,2024-06-28T16:57:27.251Z,Veiculos.csv
