In [0]:
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType, DoubleType, BooleanType, DateType

In [0]:
# Configure OAuth (service-principal) access to the ADLS Gen2 storage account for this
# Spark session.
#
# The client secret is fetched from a Databricks secret scope at run time instead of being
# embedded in the notebook, so it is not stored in source control or in exported notebooks.
# NOTE(review): the secret that used to be hardcoded in this cell must be treated as
# leaked — rotate it in Azure AD before relying on this notebook again.
# NOTE(review): the scope/key names below are placeholders; point them at the secret
# scope provisioned for this workspace. Assumes a Databricks runtime where `dbutils`
# and `spark` are predefined notebook globals.
storage_account_host = "fdadls.dfs.core.windows.net"
tenant_id = "1aff2d1a-b404-4b47-a8f3-eca6b373b83f"
client_id = "ab75fa94-e653-43f4-8e3b-59524ed11b4d"
client_secret = dbutils.secrets.get(scope="fdadls-scope", key="fdadls-sp-client-secret")

spark.conf.set(f"fs.azure.account.auth.type.{storage_account_host}", "OAuth")
spark.conf.set(
    f"fs.azure.account.oauth.provider.type.{storage_account_host}",
    "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
)
spark.conf.set(f"fs.azure.account.oauth2.client.id.{storage_account_host}", client_id)
spark.conf.set(f"fs.azure.account.oauth2.client.secret.{storage_account_host}", client_secret)
spark.conf.set(
    f"fs.azure.account.oauth2.client.endpoint.{storage_account_host}",
    f"https://login.microsoftonline.com/{tenant_id}/oauth2/token",
)

In [0]:
# Echo the session object as the cell output (presumably a quick check that the
# Spark session exists before any data is loaded).
spark

### Data loading

In [0]:
# Root URI of the raw-data container; every source CSV is addressed relative to it.
base_path = "abfss://raw-data@fdadls.dfs.core.windows.net"


def _load_raw_csv(relative_path):
    """Read one CSV located under ``base_path``.

    The first row is treated as the header and column types are inferred by Spark.

    Args:
        relative_path: Path of the file relative to ``base_path`` (no leading slash).

    Returns:
        The DataFrame produced by ``spark.read ... .csv(...)`` for that file.
    """
    return (
        spark.read
        .option("header", True)
        .option("inferSchema", True)
        .csv(f"{base_path}/{relative_path}")
    )


# Customer profile
account_activity = _load_raw_csv("Customer_Profile/account_activity.csv")
customer_data = _load_raw_csv("Customer_Profile/customer_data.csv")

# Fraudulent patterns
fraud_indicators = _load_raw_csv("Fraudulent Patterns/fraud_indicators.csv")
suspicious_activity = _load_raw_csv("Fraudulent Patterns/suspicious_activity.csv")

# Merchant information
merchant_data = _load_raw_csv("Merchant Information/merchant_data.csv")
transaction_category_labels = _load_raw_csv("Merchant Information/transaction_category_labels.csv")

# Transaction amounts
amount_data = _load_raw_csv("Transaction Amounts/amount_data.csv")
anomaly_scores = _load_raw_csv("Transaction Amounts/anomaly_scores.csv")

# Transaction data
transaction_metadata = _load_raw_csv("Transaction Data/transaction_metadata.csv")
transaction_records = _load_raw_csv("Transaction Data/transaction_records.csv")


### Data Cleaning

In [0]:
# Preview the first five rows of account_activity.
account_activity.show(n=5)

+-----------+------------------+----------+
|customer_id|    AccountBalance| LastLogin|
+-----------+------------------+----------+
|       1001|  9507.27205955636|2022-01-01|
|       1002| 7408.704535557173|2022-01-02|
|       1003| 1715.321988918382|2022-01-03|
|       1004|3101.5091338402535|2022-01-04|
|       1005| 5405.766913696731|2022-01-05|
+-----------+------------------+----------+
only showing top 5 rows



In [0]:
# Print column names, types and nullability for account_activity.
# NOTE(review): the recorded output lists CustomerID, but the rename from customer_id
# only happens in the following cell — the saved output looks like it came from an
# out-of-order run. Re-run the notebook top to bottom to refresh it.
account_activity.printSchema()

root
 |-- CustomerID: integer (nullable = true)
 |-- AccountBalance: double (nullable = true)
 |-- LastLogin: date (nullable = true)



In [0]:
# Rename the key column so it uses the same CustomerID spelling that the other
# customer-keyed tables (customer_data, transaction_records, suspicious_activity) show.
account_activity = account_activity.withColumnRenamed(
    existing="customer_id",
    new="CustomerID",
)

In [0]:
# Preview account_activity again; after the rename the key column header reads CustomerID.
account_activity.show(n=5)

+----------+------------------+----------+
|CustomerID|    AccountBalance| LastLogin|
+----------+------------------+----------+
|      1001|  9507.27205955636|2022-01-01|
|      1002| 7408.704535557173|2022-01-02|
|      1003| 1715.321988918382|2022-01-03|
|      1004|3101.5091338402535|2022-01-04|
|      1005| 5405.766913696731|2022-01-05|
+----------+------------------+----------+
only showing top 5 rows



In [0]:
# Inspect amount_data: five sample rows, then the inferred schema.
amount_data.show(n=5)
amount_data.printSchema()

+-------------+------------------+
|TransactionID| TransactionAmount|
+-------------+------------------+
|            1| 79.41360746377397|
|            2|12.053087153568082|
|            3| 33.31035719105734|
|            4| 46.12111728963105|
|            5| 54.05161801340517|
+-------------+------------------+
only showing top 5 rows

root
 |-- TransactionID: integer (nullable = true)
 |-- TransactionAmount: double (nullable = true)



In [0]:
# Inspect anomaly_scores: five sample rows, then the inferred schema.
anomaly_scores.show(n=5)
anomaly_scores.printSchema()

+-------------+--------------------+
|TransactionID|        AnomalyScore|
+-------------+--------------------+
|            1|  0.6866994638180963|
|            2| 0.08174887080114657|
|            3|0.023856830105308702|
|            4|  0.8769943477359176|
|            5| 0.03405877267311075|
+-------------+--------------------+
only showing top 5 rows

root
 |-- TransactionID: integer (nullable = true)
 |-- AnomalyScore: double (nullable = true)



In [0]:
# Inspect customer_data: five sample rows, then the inferred schema.
customer_data.show(n=5)
customer_data.printSchema()

+----------+-------------+---+------------+
|CustomerID|         Name|Age|     Address|
+----------+-------------+---+------------+
|      1001|Customer 1001| 54|Address 1001|
|      1002|Customer 1002| 35|Address 1002|
|      1003|Customer 1003| 40|Address 1003|
|      1004|Customer 1004| 30|Address 1004|
|      1005|Customer 1005| 46|Address 1005|
+----------+-------------+---+------------+
only showing top 5 rows

root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Address: string (nullable = true)



In [0]:
# Inspect fraud_indicators: five sample rows, then the inferred schema.
fraud_indicators.show(n=5)
fraud_indicators.printSchema()

+-------------+--------------+
|TransactionID|FraudIndicator|
+-------------+--------------+
|            1|             0|
|            2|             0|
|            3|             0|
|            4|             0|
|            5|             0|
+-------------+--------------+
only showing top 5 rows

root
 |-- TransactionID: integer (nullable = true)
 |-- FraudIndicator: integer (nullable = true)



In [0]:
# Inspect merchant_data: five sample rows, then the inferred schema.
merchant_data.show(n=5)
merchant_data.printSchema()

+----------+-------------+-------------+
|MerchantID| MerchantName|     Location|
+----------+-------------+-------------+
|      2001|Merchant 2001|Location 2001|
|      2002|Merchant 2002|Location 2002|
|      2003|Merchant 2003|Location 2003|
|      2004|Merchant 2004|Location 2004|
|      2005|Merchant 2005|Location 2005|
+----------+-------------+-------------+
only showing top 5 rows

root
 |-- MerchantID: integer (nullable = true)
 |-- MerchantName: string (nullable = true)
 |-- Location: string (nullable = true)



In [0]:
# Inspect transaction_category_labels: five sample rows, then the inferred schema.
transaction_category_labels.show(n=5)
transaction_category_labels.printSchema()

+-------------+--------+
|TransactionID|Category|
+-------------+--------+
|            1|   Other|
|            2|  Online|
|            3|  Travel|
|            4|  Travel|
|            5|   Other|
+-------------+--------+
only showing top 5 rows

root
 |-- TransactionID: integer (nullable = true)
 |-- Category: string (nullable = true)



In [0]:
# Inspect transaction_metadata: five sample rows, then the inferred schema.
transaction_metadata.show(n=5)
transaction_metadata.printSchema()

+-------------+-------------------+----------+
|TransactionID|          Timestamp|MerchantID|
+-------------+-------------------+----------+
|            1|2022-01-01 00:00:00|      2701|
|            2|2022-01-01 01:00:00|      2070|
|            3|2022-01-01 02:00:00|      2238|
|            4|2022-01-01 03:00:00|      2879|
|            5|2022-01-01 04:00:00|      2966|
+-------------+-------------------+----------+
only showing top 5 rows

root
 |-- TransactionID: integer (nullable = true)
 |-- Timestamp: timestamp (nullable = true)
 |-- MerchantID: integer (nullable = true)



In [0]:
# Inspect transaction_records: five sample rows, then the inferred schema.
# NOTE(review): in the recorded samples, Amount here differs from TransactionAmount in
# amount_data for the same TransactionID — confirm which column is authoritative.
transaction_records.show(n=5)
transaction_records.printSchema()

+-------------+------------------+----------+
|TransactionID|            Amount|CustomerID|
+-------------+------------------+----------+
|            1|55.530334429869185|      1952|
|            2|12.881180192784143|      1027|
|            3|50.176321517065674|      1955|
|            4| 41.63400105303006|      1796|
|            5| 78.12285326574603|      1946|
+-------------+------------------+----------+
only showing top 5 rows

root
 |-- TransactionID: integer (nullable = true)
 |-- Amount: double (nullable = true)
 |-- CustomerID: integer (nullable = true)



In [0]:
# Inspect suspicious_activity: five sample rows, then the inferred schema.
suspicious_activity.show(n=5)
suspicious_activity.printSchema()

+----------+--------------+
|CustomerID|SuspiciousFlag|
+----------+--------------+
|      1001|             0|
|      1002|             0|
|      1003|             0|
|      1004|             0|
|      1005|             0|
+----------+--------------+
only showing top 5 rows

root
 |-- CustomerID: integer (nullable = true)
 |-- SuspiciousFlag: integer (nullable = true)



### Exporting transformed data to ADLS

In [0]:
# Root URI of the transformed-data container that receives the Delta output.
output_base_path = "abfss://transformed-data@fdadls.dfs.core.windows.net"

# (DataFrame, destination folder relative to output_base_path), listed in write order.
# The folder layout mirrors the grouping used for the raw CSV inputs.
delta_exports = [
    (account_activity, "Customer_Profile/account_activity"),
    (customer_data, "Customer_Profile/customer_data"),
    (fraud_indicators, "Fraudulent Patterns/fraud_indicators"),
    (suspicious_activity, "Fraudulent Patterns/suspicious_activity"),
    (merchant_data, "Merchant Information/merchant_data"),
    (transaction_category_labels, "Merchant Information/transaction_category_labels"),
    (amount_data, "Transaction Amounts/amount_data"),
    (anomaly_scores, "Transaction Amounts/anomaly_scores"),
    (transaction_metadata, "Transaction Data/transaction_metadata"),
    (transaction_records, "Transaction Data/transaction_records"),
]

# Write each DataFrame as a Delta table, replacing whatever is already at the destination.
for frame, relative_dir in delta_exports:
    destination = f"{output_base_path}/{relative_dir}"
    frame.write.format("delta").mode("overwrite").save(destination)
