# Load Data Streaming

Notebook de desenvolvimento responsável por realizar o processo de ingestão de dados em streaming: o Spark Structured Streaming lê os dados de um Event Hub e os grava em formato Delta Table no
Azure Data Lake Storage.


<div style="text-align: center; line-height: 0; padding-top: 9px;">
  <img src="https://raw.githubusercontent.com/Foiac/MobileFraudDetectSolution/main/Editaveis/Imagens/eventhubstreamingingestion.png" alt="SparkStreaming Ingest" style="width: 800px">
</div>

#### Import dependencies

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import *
import json

#### Path definitions and secrets

In [0]:
# --- Storage account and metastore targets ---------------------------------
storage_account_name  = "stacmfraud"

database_name = "bronze_mobile"
table_name = "access"

# --- Azure AD identifiers ----------------------------------------------------
# tenant_id is interpolated into the OAuth token endpoint URL and client_id into
# the Kafka JAAS config (see the Kafka definitions cell).
tenant_id = "dc25df03-ffa5-4111-b188-46fe6cd26a3a"
client_id = "775d1965-2ada-4399-adb9-ead9caaf4d72"

# --- ADLS Gen2 locations -----------------------------------------------------
container_name = "cont-fraud"
# Database root; also passed as LOCATION to CREATE DATABASE.
container_path = f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/bronze/{database_name}"
# Table directory: the database root plus the table name.
delta_table_path = f"{container_path}/{table_name}"

# --- Event Hubs (Kafka endpoint) ---------------------------------------------
event_hubs_server = "fraud-eh-namespace-standard.servicebus.windows.net"
event_hubs_topic = "fraud-detect"

# --- Secrets -----------------------------------------------------------------
# The Kafka definitions cell interpolates `client_secret` into the JAAS config,
# so the "spn-secret" value must be bound to that name; binding it only to
# `connection_string` leaves `client_secret` undefined on a fresh kernel.
client_secret = dbutils.secrets.get(scope="dbwsscope", key="spn-secret")
# Backward-compatible alias for the name this value was previously bound to.
connection_string = client_secret
# Suffix appended to sensitive fields before they are SHA-256 hashed.
hash_word = dbutils.secrets.get(scope="dbwsscope", key="hash-word")

#### `Kafka` definitions

In [0]:
# JAAS login-module line for OAuth (service principal) authentication against the
# Event Hubs Kafka endpoint. Assembled piece by piece; the resulting string is a
# single line with one space between each attribute.
sasl_config = (
    "kafkashaded.org.apache.kafka.common.security.oauthbearer.OAuthBearerLoginModule required "
    f'clientId="{client_id}" '
    f'clientSecret="{client_secret}" '
    f'scope="https://{event_hubs_server}/.default" '
    'ssl.protocol="SSL";'
)

# Port 9093 is the EventHubs Kafka port
bootstrap_servers = f"{event_hubs_server}:9093"
# Azure AD endpoint that issues the OAuth tokens for this tenant.
token_endpoint_url = f"https://login.microsoft.com/{tenant_id}/oauth2/v2.0/token"

kafka_options = dict([
    # Environment-specific settings.
    ("kafka.bootstrap.servers", bootstrap_servers),
    ("kafka.sasl.jaas.config", sasl_config),
    ("kafka.sasl.oauthbearer.token.endpoint.url", token_endpoint_url),
    ("subscribe", event_hubs_topic),
    # You should not need to modify these
    ("kafka.security.protocol", "SASL_SSL"),
    ("kafka.sasl.mechanism", "OAUTHBEARER"),
    (
        "kafka.sasl.login.callback.handler.class",
        "kafkashaded.org.apache.kafka.common.security.oauthbearer.secured.OAuthBearerLoginCallbackHandler",
    ),
])

#### Create database and table

In [0]:
# DDL for the bronze database, rooted at the container path.
create_database_sql = f"""CREATE DATABASE IF NOT EXISTS {database_name} LOCATION '{container_path}'"""

# DDL for the Delta table stored at `delta_table_path`; every column is a STRING.
create_table_sql = f"""CREATE TABLE IF NOT EXISTS {database_name}.{table_name} (
        imei STRING,
        mac STRING,
        network STRING,
        client_ip STRING,
        latitude STRING,
        longitude STRING,
        uid STRING,
        password STRING,
        `transaction` STRING,
        api STRING,
        endpoint STRING,
        os STRING,
        phone_brand STRING,
        app_version STRING,
        error STRING,
        `timestamp` STRING,
        dat_ref STRING
        )
    USING DELTA
    LOCATION '{delta_table_path}'"""

# Both statements use IF NOT EXISTS, so this cell can be re-run safely.
spark.sql(create_database_sql)
spark.sql(create_table_sql)

DataFrame[]

#### Read Data Stream

In [0]:
# Build a streaming reader on the Kafka source and apply the connection and
# authentication settings collected in `kafka_options`.
kafka_stream_reader = spark.readStream.format("kafka")
df = kafka_stream_reader.options(**kafka_options).load()

# Print the columns exposed by the Kafka source.
df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



#### Parse JSON payload and hash sensitive fields

In [0]:
# Fields read from each JSON event, in the column order of the bronze table.
event_fields = (
    "imei", "mac", "network", "client_ip", "latitude", "longitude",
    "uid", "password", "transaction", "api", "endpoint", "os",
    "phone_brand", "app_version", "error", "timestamp",
)

# Fields that are replaced by a SHA-256 digest before being stored.
hashed_fields = {"imei", "mac", "uid", "password"}

# Define schema for the JSON: every attribute is a nullable string.
# `dat_ref` is part of the schema but is recomputed from `timestamp` below.
schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in event_fields + ("dat_ref",)
])

# Converting the event hub body field to a DataFrame.
# Each Kafka `value` is decoded as a JSON array of events and exploded to one row
# per event. The result gets its own name instead of rebinding `df`, so this cell
# can be re-run: `df` still carries the raw `value` column.
# NOTE(review): a payload that does not parse as a JSON array yields a null
# `json_list`, for which explode emits no rows, so it is dropped silently;
# confirm this is acceptable.
df_events = (df
      .select(F.col("value").cast("string"))
      .withColumn("json_list", F.from_json(F.col("value"), ArrayType(schema)))
      .select(F.explode(F.col("json_list")).alias("json_data"))
      )


def _event_column(field_name):
    """Return the output column for one event field.

    Fields listed in `hashed_fields` become the SHA-256 hex digest of the value
    concatenated with `hash_word`; all other fields are passed through unchanged.
    """
    source = F.col(f"json_data.{field_name}")
    if field_name in hashed_fields:
        return F.sha2(F.concat(source, F.lit(hash_word)), 256).alias(field_name)
    return source


# Select individual fields (hashing the sensitive ones) and derive `dat_ref`.
df_body = (df_events
      .select(*[_event_column(field_name) for field_name in event_fields])
      # `timestamp` holds epoch milliseconds as a string, so it is cast to long and
      # divided by 1000 to get seconds before being formatted as a date.
      # NOTE(review): from_unixtime formats in the Spark session time zone;
      # confirm that is the intended zone for dat_ref.
      .withColumn("dat_ref", F.from_unixtime(F.col("timestamp").cast("long")/1000, "yyyy-MM-dd"))
      )

#### Write data on `Bronze Table`

In [0]:
# Append the parsed events to the bronze Delta table. With `once=True` the query
# processes the data available at start and then stops; progress is tracked in the
# checkpoint directory under the table path.
query = (df_body.writeStream
         .format("delta")
         .outputMode("append")
         .trigger(once=True)
         .option("checkpointLocation", f"{delta_table_path}/_checkpoints/")
         .start(delta_table_path))

# start() returns as soon as the query is launched. Block until the one-shot
# batch has finished so that later cells read the table only after the write has
# completed, and so that a failure in the stream surfaces in this cell.
query.awaitTermination()


In [0]:
# Read the bronze table back by its qualified name and render it for inspection.
bronze_table_name = ".".join([database_name, table_name])
df = spark.table(bronze_table_name)
display(df)

imei,mac,network,client_ip,latitude,longitude,uid,password,transaction,api,endpoint,os,phone_brand,app_version,error,timestamp,dat_ref
8013a84716f3db1a15e90fa085c5b353e41ad0d7b167d129f737628d27618579,933a9ff5c439b555936ab5edbc3b9e48d2688c9e343a1d0a89e38de514f72242,OI,220.78.99.50,-23.77563100432002,-51.262987693146776,677f7e365aedb68907d75345fa73468391bfbdc909f1848d7390881abf1d1e67,2f727553c360ec78cc026ae2b477f89dba4b582bb2f45c0b4fdf0fc48dc017a9,True,login-authentication,v1/login,iOS 16,iPhone 8,2.0.3,0,1730491222000.0,2024-11-01
271e7373bd8700a81c95c66561f1c980a22adaaf25c48d51224339ad513e6003,faa026689361a6b8560bd840b90194427d77aeedcf315b88079dafcf6df77156,VIVO,81.177.184.248,-21.703336134274167,-43.454286695170055,afaa06a8bebc3119b8a491a9634bba55e8583977ac359787e3d17e135b4c3dda,6765e2574e8235784ab2810fb7c5f3132ecba4b2409fb3e11c55d6283eae2eae,True,login-authentication,v1/login,iOS 17,iPhone SE 2020,2.0.3,0,1730565877000.0,2024-11-02
82a0936f125b41b754437d93c5ec1eeee65192d146ac350874e828a788982499,ab7b78b59f76ab1c11abb8c79ffce22b3b28615088836a7f4d58c811beb8bbe7,OI,66.182.23.205,-9.370609183608783,-36.000740168887646,ea23bb98bd7a761acc1c0a57e0eb29c6916dc38e2f2c74880b081ea7d0451750,06ba8d4934f3fd5323bace59690ad5d625ce3380d9214e16961280548bf23983,True,login-authentication,v1/login,Android 13 (Funtouch OS 13),Vivo V29e,2.0.3,0,1730495312000.0,2024-11-01
f0ca3836190245d30fc2484a9a4208e29e37371c58c59f6e27aa3808e779f7a3,5b326303862b0af7997c810ca75a834a7ba5a0d1cf2398a9ad68bddcca48362a,TIM,163.34.180.118,-19.649171045838926,-44.13518572870252,609ca3852d3c5ba968104a9718ea486445b3c68bd19eb91e538315d84489ec38,2f36c5c52248f0d66a4a080a62ed6c68e63e79c5757a545897dea64441400166,False,login-authentication,v1/login,iOS 10.3.4,iPhone 5,2.0.2,0,1730503785000.0,2024-11-01
033b1dbebf1914bc2248c0f6e4b0c326d5bb7ed627dd0da3b734a33621e4f231,00542d0f230e3fc04a26f3f5ba74074a2bb5135262202ae9f29ad5af29b5f2db,OI,87.218.173.230,-8.23590352456598,-34.93756363014488,c567cf37d440dd0f572dd68a66f7f548b2947f3602cd09ff80be4b433a12781a,7cf1ff25309278232199d8d6e05251eb53192064f0d6b10d705cef596e1f3fa2,True,login-authentication,v1/login,iOS 17,iPhone 11 Pro Max,2.0.3,0,1730491469000.0,2024-11-01
178e4b452fa639ec9b5ab64acdce8d1f394e111d840920aa5c73bd38e801fcbf,c223bfbe394d2411fcfee111b116a5adde4907ed8035041aac3014ace0f2f5f1,TIM,125.41.132.238,-9.86805392718289,-35.48945302108216,54dcd44b7c21f3374fe5e2bd524529515bc0173c97b4064c34c4c34612e2c33f,1134d1f3fabcadafd20664af17a7eadb8ffd5db91bf98d6732b0c5cf8f6c6536,False,login-authentication,v1/login,iOS 4.2.1,iPhone 3G,2.0.3,0,1730558263000.0,2024-11-02
8108ea805da4742e4d526643fac5762ef51cedb5ca528d00e72e0f8a83934d41,be1089680bc500d44219f25331a4e8c3e234c7fc27402877aa710d18204fc19c,OI,118.24.160.145,-2.1890253983261907,-57.010920594040726,848250c3a46fcc9f6cc1ddbad6a1f249b8bb71f06ee4d442f1b9ecc9e2b23a41,cab4135d86383a4806fa0597f13ff88f46bfdce921f5159615f5a65f04b65a30,True,login-authentication,v1/login,Android 13 (MyUX),Motorola Moto G Power 2023,2.0.3,0,1730534166000.0,2024-11-02
4ce4bf96ba295b1c55708b7011d7895e3e22c23e6419f742f94a23806dc5b890,fe93a68c342082762a50a436f0ff616f679ee11e2f8c6595f1da49d12a6ea6b8,TIM,47.179.147.165,-5.984354395007907,-35.71067742536712,b827e15aff333dad2438751515e3396f76d5ee3069c2f3e0b6f1feef3965c133,4790ceb7f0579e26c733b52f03f9e14c0176a0c30b19516ee0248dab22d58194,False,login-authentication,v1/login,Android 14 (Stock Android),Google Pixel 8 Pro,2.0.2,0,1730489420000.0,2024-11-01
9c2073c296ec2ef7b8625ba4009c68233e9c14527854a3f238bbde4d92d83e85,af9a27864578fb7813069bbbb93b5e1244f3709979598a63eaf9a1d5243edee0,CLARO,44.111.9.101,-30.18451361755625,-51.03876300886537,9a512eebc9e93b3a430973cac6dacf64d5529c7d6d2cb0fb2d4fe2fa5909dc00,750f9d697110c99bd682e95fef7ceaedd1bb77daf42fb1870b458eeb3e57471b,False,login-authentication,v1/login,Android 12 (XOS 12),Infinix Zero Ultra,2.0.3,0,1730457558000.0,2024-11-01
840bac1ea1b52a998e343e34f5c70af613c1b9155997b0e604329ff1cdca2be6,20b1f4f2fab6975d1b58b7493bfc15f77f3d476524898f076bdd68a54f2aa1f4,CLARO,198.142.165.78,-3.388612414467726,-40.503998141398895,ce5bcc5462f75528c2c1c1cc8d04a084235a712958f37cf8f6625387fa84c9bd,da9a4f5af0f51bdca448c8f9022ec786f6c76ccb7fc5a223e3d13a08a698c97d,True,login-authentication,v1/login,Android 13 (MIUI 14),Xiaomi 13 Lite,2.0.3,0,1730420332000.0,2024-11-01
