In [1]:
from datetime import datetime, timedelta
from pyspark.sql import SparkSession 
from pyspark.sql.functions import col, when, to_json, struct, lit, udf, collect_list, round as round_, max as max_, min as min_, sum as sum_, first, last
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, LongType, IntegerType, MapType
from pyspark.sql import functions as F
import boto3
import sys
from pandas import Timestamp
from pandas import DataFrame
import time
import io
import calendar
from operator import itemgetter
import freqtrade.vendor.qtpylib.indicators as qtpylib
import numpy as np  # noqa
import pandas as pd  # noqa
import pandas_ta as pta
import talib.abstract as ta
from dateutil.relativedelta import relativedelta
from freqtrade.strategy import IStrategy
from technical.indicators import *
from pandas import Timestamp
import requests
from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile
import s3fs
import dotenv
from dotenv import load_dotenv
import os
import sys
import os

# Make the folder that holds spark_config.py importable; the entry is appended
# at most once, so re-running the cell leaves sys.path unchanged.
module_path = '/root/Cicada-binance/cores/aggTrades/historical/transformation/src/VolumeProfileCluster'
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

# Now you can import the spark_config module
import spark_config


In [2]:
# Load variables from a .env file (if any) into the process environment, then
# capture the AWS settings as module-level constants. Each constant is None
# when the corresponding variable is not set.
load_dotenv()

ACCESS_KEY, SECRET_KEY, REGION_NAME, BUCKET_NAME = (
    os.getenv(env_name)
    for env_name in (
        "AWS_ACCESS_KEY_ID",
        "AWS_SECRET_ACCESS_KEY",
        "AWS_REGION_NAME",
        "AWS_BUCKET_NAME",
    )
)

In [8]:
import sys
import os

# Set SPARK_HOME and make sure Spark's bin directory is on PATH.
os.environ['SPARK_HOME'] = '/opt/spark'
_spark_bin = os.path.join(os.environ['SPARK_HOME'], 'bin')
_current_path = os.environ.get('PATH', '')
# Append only when the entry is missing: re-running this cell must not keep
# growing PATH, and an environment without PATH must not raise KeyError.
if _spark_bin not in _current_path.split(os.pathsep):
    os.environ['PATH'] = (
        _current_path + os.pathsep + _spark_bin if _current_path else _spark_bin
    )

# Add the directory containing spark_config.py to the Python path
module_path = '/root/Cicada-binance/cores/aggTrades/historical/transformation/src/VolumeProfileCluster'
if module_path not in sys.path:
    sys.path.append(module_path)

# Import the spark_config module
import spark_config

# Get the Spark session.
# NOTE: despite its name, `spark_conf` is used as a session object by the
# later cells (`spark_conf.read.parquet(...)`), not as a configuration object.
spark_conf = spark_config.get_spark_session()

In [10]:
# Use boto3 and s3fs for additional S3 interaction.
# The credentials come from the constants defined in the dotenv cell
# (ACCESS_KEY / SECRET_KEY / REGION_NAME) instead of re-reading the
# environment in several places.
s3_resource = boto3.resource(
    's3',
    region_name=REGION_NAME,
    aws_access_key_id=ACCESS_KEY,
    aws_secret_access_key=SECRET_KEY,
)

s3 = s3fs.S3FileSystem(anon=False, key=ACCESS_KEY, secret=SECRET_KEY)

# Read data from S3
s3_path = 'xtrus31/cicada-data/HistoricalTradeAggregator/binance_futures/BTCUSDT/2020/01.parquet'
renko = spark_conf.read.parquet(f"s3a://{s3_path}")

new_column_names = [
    "agg_trade_id",
    "price",
    "quantity",
    "first_trade_id",
    "last_trade_id",
    "time",
    "is_buyer_maker",
]

# Rename columns by position in a single projection. Renaming one column at a
# time by *name* can touch the wrong column when the file already contains one
# of the target names in another position. Columns beyond the list keep their
# original names, and a shorter file only consumes the names it needs, which
# matches the pairing the previous zip()-based loop produced.
_positional_names = (
    new_column_names[: len(renko.columns)]
    + renko.columns[len(new_column_names):]
)
renko = renko.toDF(*_positional_names)

24/06/11 13:53:11 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
24/06/11 13:53:12 WARN BasicProfileConfigLoader: Your profile name includes a 'profile ' prefix. This is considered part of the profile name in the Java SDK, so you will need to include this prefix in your profile name when you reference this profile from your Java code.


In [11]:
renko.show()

                                                                                

+------------+-------+--------+--------------+-------------+-------------+--------------+
|agg_trade_id|  price|quantity|first_trade_id|last_trade_id|         time|is_buyer_maker|
+------------+-------+--------+--------------+-------------+-------------+--------------+
|    18374167|7189.43|    0.03|      25247504|     25247504|1577836801481|          true|
|    18374168|7189.42|    1.47|      25247505|     25247505|1577836801481|          true|
|    18374169|7189.42|   4.222|      25247506|     25247506|1577836801708|          true|
|    18374170| 7189.0|   0.778|      25247507|     25247507|1577836806290|         false|
|    18374171| 7189.5|   0.683|      25247508|     25247508|1577836806290|         false|
|    18374172|7189.99|   0.042|      25247509|     25247509|1577836806291|         false|
|    18374173|7190.22|     0.3|      25247510|     25247510|1577836806291|         false|
|    18374174|7190.23|     0.3|      25247511|     25247511|1577836806291|         false|
|    18374

In [12]:
def round_down_to_interval(df, timeframe):
    """Add a ``time_rounded`` timestamp column holding each row's bucket start.

    :param df: DataFrame with a ``time`` column in epoch milliseconds.
    :param timeframe: Bucket width in seconds.
    :return: ``df`` with an extra ``time_rounded`` column of timestamp type,
        equal to the epoch second of ``time`` rounded down to a multiple of
        ``timeframe``.
    """
    epoch_seconds = (F.col("time") / 1000).cast("long")
    bucket_start = epoch_seconds - epoch_seconds % timeframe
    # Cast the epoch seconds straight to a timestamp. The previous version went
    # through from_unixtime(), i.e. formatted the instant as a local-time string
    # and parsed it back; in a session time zone with daylight saving that is
    # ambiguous during the repeated hour and folds two distinct buckets into one.
    return df.withColumn("time_rounded", bucket_start.cast("timestamp"))
    
def calc_df(df, aggregate_trades, chart_type, timeframe):
    """Aggregate trade rows into time bars with OHLC, volume and footprint lists.

    :param df: DataFrame with ``time`` (epoch ms), ``price``, ``quantity`` and
        ``is_buyer_maker`` columns; ``agg_trade_id`` is used as a tie-breaker
        when present.
    :param aggregate_trades: When truthy, each footprint entry carries a
        ``<side>_trades_aggr`` field set to 1; otherwise that field is null.
    :param chart_type: Accepted for interface compatibility; not used here.
    :param timeframe: One of "1m", "5m", "15m", "30m", "1h", "4h", "12h",
        "1d", "1w".
    :return: DataFrame ordered by ``time_rounded`` with columns ``time_rounded``,
        ``open``, ``close``, ``high``, ``low``, ``qty``, ``foot_bid`` and
        ``foot_ask`` (the last two are arrays of JSON strings).
    :raises ValueError: If ``timeframe`` is not one of the supported keys.
    """
    intervals = {
        "1m": 60,
        "5m": 300,
        "15m": 900,
        "30m": 1800,
        "1h": 3600,
        "4h": 14400,
        "12h": 60 * 60 * 12,
        "1d": 86400,
        "1w": 604800,
    }

    if timeframe not in intervals:
        raise ValueError("Unsupported timeframe provided.")

    interval_seconds = intervals[timeframe]
    df = round_down_to_interval(df, interval_seconds)

    def footprint_entry(side):
        # One JSON object per trade; `side` is "bid" or "ask" and only changes
        # the field names.
        return to_json(
            struct(
                col("price").alias("price_level"),
                col("quantity").alias(f"{side}_qty"),
                lit(1).alias(f"{side}_trades"),
                when(lit(aggregate_trades), lit(1))
                .otherwise(lit(None))
                .alias(f"{side}_trades_aggr"),
            )
        )

    # Rows whose is_buyer_maker equals "true" feed foot_bid, "false" feed
    # foot_ask; the other column stays null for that row.
    df = df.withColumn(
        "foot_bid", when(col("is_buyer_maker") == "true", footprint_entry("bid"))
    ).withColumn(
        "foot_ask", when(col("is_buyer_maker") == "false", footprint_entry("ask"))
    )

    # first()/last() inside a groupBy depend on row order, which Spark does not
    # guarantee after a shuffle. Select open/close explicitly as the price of
    # the earliest/latest trade: structs compare field by field, so min/max of
    # (time[, agg_trade_id], price) picks the wanted row deterministically.
    ordering = ["time"]
    if "agg_trade_id" in df.columns:
        ordering.append("agg_trade_id")
    trade_key = F.struct(*ordering, "price")

    # Aggregate to calculate OHLCV and footprint data
    df_agg = (
        df.groupBy(F.window("time_rounded", f"{interval_seconds} seconds"))
        .agg(
            F.min(trade_key).getField("price").alias("open"),
            F.max(trade_key).getField("price").alias("close"),
            F.max("price").alias("high"),
            F.min("price").alias("low"),
            F.sum("quantity").alias("qty"),
            F.collect_list("foot_bid").alias("foot_bid"),
            F.collect_list("foot_ask").alias("foot_ask"),
        )
        .select(
            col("window.start").alias("time_rounded"),
            "open",
            "close",
            "high",
            "low",
            "qty",
            "foot_bid",
            "foot_ask",
        )
        .orderBy("time_rounded")
    )

    return df_agg

In [13]:
# Parameters forwarded to calc_df.
chart_type, aggregate_trades, timeframe = "candlestick", True, "15m"

# Date-range settings (year / month as strings); no statement in this cell
# consumes them.
start_year, start_month = "2024", "04"
end_year, end_month = "2024", "05"

renko_df = calc_df(
    df=renko,
    aggregate_trades=aggregate_trades,
    chart_type=chart_type,
    timeframe=timeframe,
)

In [14]:
renko_df.show()

                                                                                

+-------------------+-------+-------+-------+-------+------------------+--------------------+--------------------+
|       time_rounded|   open|  close|   high|    low|               qty|            foot_bid|            foot_ask|
+-------------------+-------+-------+-------+-------+------------------+--------------------+--------------------+
|2020-01-01 01:00:00|7189.43|7176.26|7190.52|7172.94|1037.3369999999989|[{"price_level":7...|[{"price_level":7...|
|2020-01-01 01:15:00|7176.22|7172.36|7179.41|7170.69| 707.8329999999991|[{"price_level":7...|[{"price_level":7...|
|2020-01-01 01:30:00|7172.79|7174.83|7179.45|7170.61| 325.2459999999999|[{"price_level":7...|[{"price_level":7...|
|2020-01-01 01:45:00|7174.51|7171.55|7179.36|7170.15| 378.6329999999997|[{"price_level":7...|[{"price_level":7...|
|2020-01-01 02:00:00|7171.43| 7186.6|7188.77| 7171.1| 555.3889999999992|[{"price_level":7...|[{"price_level":7...|
|2020-01-01 02:15:00| 7186.6| 7205.9| 7210.0|7184.16|1332.0769999999925|[{"price

In [20]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType, MapType, StringType
from pyspark.sql.functions import udf, collect_list
import logging

# Logging setup: INFO level on the root configuration, plus a named logger
# used by the processing helpers in this notebook.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("BidAskDataProcessing")

# Schema of one JSON footprint entry on the bid side (all fields nullable).
json_schema_bid = (
    StructType()
    .add("price_level", DoubleType(), True)
    .add("bid_qty", DoubleType(), True)
    .add("bid_trades", IntegerType(), True)
    .add("bid_trades_aggr", IntegerType(), True)
)

# Same layout for the ask side.
json_schema_ask = (
    StructType()
    .add("price_level", DoubleType(), True)
    .add("ask_qty", DoubleType(), True)
    .add("ask_trades", IntegerType(), True)
    .add("ask_trades_aggr", IntegerType(), True)
)

def process_foot_data(renko_df, column_name, json_schema, prefix):
    """Aggregate one footprint column per bar and rounded price level.

    :param renko_df: DataFrame with ``time_rounded`` and an array-of-JSON-strings
        column named ``column_name``.
    :param column_name: Footprint column to process, e.g. ``"foot_bid"``; the
        part after the first underscore selects the JSON field names
        (``<side>_qty``, ``<side>_trades``).
    :param json_schema: StructType used to parse each JSON entry.
    :param prefix: Used in the output column names (``total_<prefix>_qty`` ...).
    :return: Tuple ``(df_agg_sorted, df_total_qty)``: per
        (``time_rounded``, ``price_level``) totals ordered by those two columns,
        and the per-``time_rounded`` sum of ``total_<prefix>_qty``.
    """
    side = column_name.split('_')[1]

    # The column is already an array with one JSON object per element, so it can
    # be exploded directly; gluing the elements into one string and splitting it
    # back with regular expressions only reproduced the same elements.
    entries = renko_df.select(
        F.col("time_rounded"),
        F.explode(F.col(column_name)).alias("entry"),
    )

    # Parse JSON and select the necessary fields. Price levels are rounded to
    # whole numbers, which defines the footprint bin.
    levels = entries.select(
        "time_rounded",
        F.from_json(F.col("entry"), json_schema).alias("data"),
    ).select(
        "time_rounded",
        F.round(F.col("data.price_level")).alias("price_level"),
        F.col(f"data.{side}_qty").alias("qty"),
        F.col(f"data.{side}_trades").alias("trades"),
    ).filter(
        F.col("price_level").isNotNull()
        & F.col("qty").isNotNull()
        & F.col("trades").isNotNull()
    )

    # Aggregate data
    df_agg = levels.groupBy("time_rounded", "price_level").agg(
        F.sum("qty").alias(f"total_{prefix}_qty"),
        F.sum("trades").alias(f"total_{prefix}_trades"),
    )

    # Sort the aggregated DataFrame by time_rounded and price_level
    df_agg_sorted = df_agg.orderBy("time_rounded", "price_level")

    # Per-bar total; computed from the unsorted aggregate because a sort in
    # front of another groupBy has no effect on the result.
    df_total_qty = df_agg.groupBy("time_rounded").agg(
        F.sum(f"total_{prefix}_qty").alias(f"sum_{prefix}_total_qty")
    )

    return df_agg_sorted, df_total_qty

In [21]:
# Build the per-level bid aggregates (plus per-bar totals) and preview them.
df_bid_agg_sorted, df_bid_total_qty = process_foot_data(
    renko_df,
    column_name="foot_bid",
    json_schema=json_schema_bid,
    prefix="bid",
)
df_bid_agg_sorted.show(n=20)

[Stage 31:>                                                         (0 + 2) / 2]

+-------------------+-----------+------------------+----------------+
|       time_rounded|price_level|     total_bid_qty|total_bid_trades|
+-------------------+-----------+------------------+----------------+
|2020-01-01 01:00:00|     7173.0|17.825999999999997|              16|
|2020-01-01 01:00:00|     7174.0|15.869000000000002|              30|
|2020-01-01 01:00:00|     7175.0| 29.78100000000001|              39|
|2020-01-01 01:00:00|     7176.0| 51.82800000000002|              64|
|2020-01-01 01:00:00|     7177.0| 52.13700000000002|              60|
|2020-01-01 01:00:00|     7178.0|105.05199999999999|             100|
|2020-01-01 01:00:00|     7179.0| 71.12200000000001|              67|
|2020-01-01 01:00:00|     7180.0| 46.92099999999999|              54|
|2020-01-01 01:00:00|     7181.0| 38.78600000000001|              38|
|2020-01-01 01:00:00|     7182.0|            57.685|              54|
|2020-01-01 01:00:00|     7183.0|35.980000000000004|              29|
|2020-01-01 01:00:00

                                                                                

##  STRUCTURE PREPROCESSING

In [23]:
# NOTE: transform_and_aggregate_footprint is only defined in cells further down
# this notebook, so this cell fails on a fresh kernel run from top to bottom.
df_bid_footprint_sorted = transform_and_aggregate_footprint(
    df_agg=df_bid_agg_sorted,
    column_prefix="bid",
)
df_bid_footprint_sorted.show(n=20)

[Stage 39:>                                                         (0 + 2) / 2]

+-------------------+--------------------+
|       time_rounded| aggregated_foot_bid|
+-------------------+--------------------+
|2020-01-01 01:00:00|[{7173.0 -> {17.8...|
|2020-01-01 01:15:00|[{7171.0 -> {52.4...|
|2020-01-01 01:30:00|[{7171.0 -> {14.8...|
|2020-01-01 01:45:00|[{7170.0 -> {5.51...|
|2020-01-01 02:00:00|[{7171.0 -> {3.05...|
|2020-01-01 02:15:00|[{7184.0 -> {8.39...|
|2020-01-01 02:30:00|[{7201.0 -> {0.79...|
|2020-01-01 02:45:00|[{7200.0 -> {34.1...|
|2020-01-01 03:00:00|[{7206.0 -> {0.03...|
|2020-01-01 03:15:00|[{7210.0 -> {0.05...|
|2020-01-01 03:30:00|[{7224.0 -> {17.3...|
|2020-01-01 03:45:00|[{7228.0 -> {21.4...|
|2020-01-01 04:00:00|[{7218.0 -> {9.41...|
|2020-01-01 04:15:00|[{7223.0 -> {0.09...|
|2020-01-01 04:30:00|[{7215.0 -> {8.56...|
|2020-01-01 04:45:00|[{7215.0 -> {11.6...|
|2020-01-01 05:00:00|[{7213.0 -> {5.78...|
|2020-01-01 05:15:00|[{7211.0 -> {10.2...|
|2020-01-01 05:30:00|[{7217.0 -> {2.70...|
|2020-01-01 05:45:00|[{7214.0 -> {3.51...|
+----------

                                                                                

In [22]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType, MapType, StringType
from pyspark.sql.functions import udf, collect_list
import logging

# Named logger for the processing helpers; root logging is configured at INFO.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("BidAskDataProcessing")

# Schemas for one JSON footprint entry per side; every field is nullable.
json_schema_bid = StructType([
    StructField(field_name, field_type(), True)
    for field_name, field_type in (
        ("price_level", DoubleType),
        ("bid_qty", DoubleType),
        ("bid_trades", IntegerType),
        ("bid_trades_aggr", IntegerType),
    )
])

json_schema_ask = StructType([
    StructField(field_name, field_type(), True)
    for field_name, field_type in (
        ("price_level", DoubleType),
        ("ask_qty", DoubleType),
        ("ask_trades", IntegerType),
        ("ask_trades_aggr", IntegerType),
    )
])

def process_foot_data(renko_df, column_name, json_schema, prefix):
    """Aggregate one footprint column per bar and rounded price level.

    :param renko_df: DataFrame with ``time_rounded`` and an array-of-JSON-strings
        column named ``column_name``.
    :param column_name: Footprint column to process, e.g. ``"foot_bid"``; the
        part after the first underscore selects the JSON field names
        (``<side>_qty``, ``<side>_trades``, ``<side>_trades_aggr``).
    :param json_schema: StructType used to parse each JSON entry.
    :param prefix: Used in the output column names (``total_<prefix>_qty`` ...).
    :return: Tuple ``(df_agg_sorted, df_total_qty)``: per
        (``time_rounded``, ``price_level``) totals ordered by those two columns,
        and the per-``time_rounded`` sum of ``total_<prefix>_qty``.
        ``total_<prefix>_trades_aggr`` is null for a level whose entries carry
        no ``<side>_trades_aggr`` value.
    """
    side = column_name.split('_')[1]

    # The column is already an array with one JSON object per element, so it can
    # be exploded directly; gluing the elements into one string and splitting it
    # back with regular expressions only reproduced the same elements.
    entries = renko_df.select(
        F.col("time_rounded"),
        F.explode(F.col(column_name)).alias("entry"),
    )

    # Parse JSON and select the necessary fields. Price levels are rounded to
    # whole numbers, which defines the footprint bin.
    levels = entries.select(
        "time_rounded",
        F.from_json(F.col("entry"), json_schema).alias("data"),
    ).select(
        "time_rounded",
        F.round(F.col("data.price_level")).alias("price_level"),
        F.col(f"data.{side}_qty").alias("qty"),
        F.col(f"data.{side}_trades").alias("trades"),
        F.col(f"data.{side}_trades_aggr").alias("trades_aggr"),
    ).filter(
        # trades_aggr is intentionally NOT required here: calc_df leaves it null
        # when aggregate_trades is falsy, and requiring it would silently drop
        # every row. F.sum skips nulls, so the aggregate below is unaffected
        # whenever the value is present.
        F.col("price_level").isNotNull()
        & F.col("qty").isNotNull()
        & F.col("trades").isNotNull()
    )

    # Aggregate data
    df_agg = levels.groupBy("time_rounded", "price_level").agg(
        F.sum("qty").alias(f"total_{prefix}_qty"),
        F.sum("trades").alias(f"total_{prefix}_trades"),
        F.sum("trades_aggr").alias(f"total_{prefix}_trades_aggr"),
    )

    # Sort the aggregated DataFrame by time_rounded and price_level
    df_agg_sorted = df_agg.orderBy("time_rounded", "price_level")

    # Per-bar total; computed from the unsorted aggregate because a sort in
    # front of another groupBy has no effect on the result.
    df_total_qty = df_agg.groupBy("time_rounded").agg(
        F.sum(f"total_{prefix}_qty").alias(f"sum_{prefix}_total_qty")
    )

    return df_agg_sorted, df_total_qty

def transform_and_aggregate_footprint(df_agg, column_prefix):
    """Collect per-level totals into one list of single-entry maps per bar.

    :param df_agg: DataFrame with ``time_rounded``, ``price_level``,
        ``total_<prefix>_qty`` and ``total_<prefix>_trades`` columns.
    :param column_prefix: Side label (e.g. ``"bid"`` or ``"ask"``) used in the
        input column names, the struct field names and the output column name.
    :return: DataFrame ordered by ``time_rounded`` with an
        ``aggregated_foot_<prefix>`` column: an array of maps
        ``{str(price_level): {<prefix>_qty, <prefix>_trades}}``.
    :raises Exception: Any error is logged and re-raised unchanged.
    """
    try:
        qty_field = f"{column_prefix}_qty"
        trades_field = f"{column_prefix}_trades"

        level_struct = StructType(
            [
                StructField(qty_field, DoubleType(), True),
                StructField(trades_field, DoubleType(), True),
            ]
        )

        # Python UDF wrapping one row's totals into {price: {qty, trades}}.
        to_level_map = udf(
            lambda price, qty, trades: {
                str(price): {qty_field: qty, trades_field: trades}
            },
            MapType(StringType(), level_struct),
        )

        # Both totals are cast to double to match the declared struct fields.
        qty_as_double = F.col(f"total_{column_prefix}_qty").cast(DoubleType())
        trades_as_double = F.col(f"total_{column_prefix}_trades").cast(DoubleType())

        per_level = df_agg.withColumn(
            "transformed_aggregated_foot",
            to_level_map("price_level", qty_as_double, trades_as_double),
        )

        per_bar = per_level.groupBy("time_rounded").agg(
            collect_list("transformed_aggregated_foot").alias(
                f"aggregated_foot_{column_prefix}"
            )
        )

        return per_bar.orderBy("time_rounded")
    except Exception as e:
        logger.error(f"Error in transform_and_aggregate_footprint: {e}")
        raise

# Per-level aggregates and per-bar totals for each side.
df_bid_agg_sorted, df_bid_total_qty = process_foot_data(
    renko_df, column_name="foot_bid", json_schema=json_schema_bid, prefix="bid"
)
df_ask_agg_sorted, df_ask_total_qty = process_foot_data(
    renko_df, column_name="foot_ask", json_schema=json_schema_ask, prefix="ask"
)

# One list of {price_level: totals} maps per bar, for each side.
df_bid_footprint_sorted = transform_and_aggregate_footprint(
    df_agg=df_bid_agg_sorted, column_prefix="bid"
)
df_ask_footprint_sorted = transform_and_aggregate_footprint(
    df_agg=df_ask_agg_sorted, column_prefix="ask"
)


def join_bid_ask_footprints(df, df_bid_qty, df_ask_qty, df_footprint_bids, df_footprint_asks):
    """Outer-join the bar frame with the bid/ask totals and footprints.

    :param df: Bar DataFrame; its ``foot_bid`` / ``foot_ask`` columns are dropped.
    :param df_bid_qty: Per-bar bid totals, keyed by ``time_rounded``.
    :param df_ask_qty: Per-bar ask totals, keyed by ``time_rounded``.
    :param df_footprint_bids: Per-bar bid footprint, keyed by ``time_rounded``.
    :param df_footprint_asks: Per-bar ask footprint, keyed by ``time_rounded``.
    :return: The joined DataFrame ordered by ``time_rounded``.
    :raises Exception: Any error is logged and re-raised unchanged.
    """
    try:
        df_joined = df.drop("foot_bid", "foot_ask")
        # Apply the four outer joins in the fixed order bid qty, ask qty,
        # bid footprint, ask footprint.
        for other in (df_bid_qty, df_ask_qty, df_footprint_bids, df_footprint_asks):
            df_joined = df_joined.join(other, on="time_rounded", how="outer")
        return df_joined.orderBy("time_rounded")
    except Exception as e:
        logger.error(f"Error in join_bid_ask_footprints: {e}")
        raise

# Combine bars, per-bar totals and per-bar footprints into one frame.
df_joined_final = join_bid_ask_footprints(
    df=renko_df,
    df_bid_qty=df_bid_total_qty,
    df_ask_qty=df_ask_total_qty,
    df_footprint_bids=df_bid_footprint_sorted,
    df_footprint_asks=df_ask_footprint_sorted,
)

def transform_footprint(row):
    """Merge a row's bid and ask footprint lists into one dict keyed by price.

    :param row: Mapping-like row exposing ``aggregated_foot_bid`` and
        ``aggregated_foot_ask``; each is a list of ``{price: level}`` maps (or
        None / empty), where ``level`` exposes ``<side>_qty`` and
        ``<side>_trades`` by key.
    :return: ``{price: {"bid_qty", "bid_trades", "ask_qty", "ask_trades"}}``
        with quantities and trade counts summed per price. Prices appear in
        first-seen order, bids before asks.
    """
    aggregated_data = {}

    def _accumulate(side_items, side):
        # Add every level of one side into aggregated_data.
        qty_key = f"{side}_qty"
        trades_key = f"{side}_trades"
        for level_map in side_items or ():
            if not level_map:
                continue
            # `level` is deliberately not called `row`, so the function
            # parameter is no longer shadowed inside the loop.
            for price, level in level_map.items():
                totals = aggregated_data.setdefault(
                    price,
                    {"bid_qty": 0, "bid_trades": 0, "ask_qty": 0, "ask_trades": 0},
                )
                if level is None:
                    continue
                # The struct fields are nullable; a missing value counts as 0
                # instead of raising TypeError on `+=`.
                totals[qty_key] += level[qty_key] or 0
                totals[trades_key] += level[trades_key] or 0

    _accumulate(row["aggregated_foot_bid"], "bid")
    _accumulate(row["aggregated_foot_ask"], "ask")

    return aggregated_data

import json  # used only to serialise the merged footprint dict below

# A UDF without an explicit return type is declared as returning a string, so
# the dict produced by transform_footprint used to be stringified implicitly
# into a form that cannot be parsed back. Serialise it to JSON explicitly; the
# column stays a string. Only the two columns transform_footprint reads are
# shipped to Python.
df_joined_final = df_joined_final.withColumn(
    "footprint",
    F.udf(lambda row: json.dumps(transform_footprint(row)), StringType())(
        F.struct("aggregated_foot_bid", "aggregated_foot_ask")
    ),
)

In [7]:
def join_bid_ask_footprints(df, df_bid_qty, df_ask_qty, df_footprint_bids, df_footprint_asks):
    """Outer-join bars with bid/ask totals and footprints on ``time_rounded``.

    :param df: Bar DataFrame; ``foot_bid`` and ``foot_ask`` are dropped first.
    :param df_bid_qty: Per-bar bid totals.
    :param df_ask_qty: Per-bar ask totals.
    :param df_footprint_bids: Per-bar bid footprint.
    :param df_footprint_asks: Per-bar ask footprint.
    :return: Joined DataFrame ordered by ``time_rounded``.
    :raises Exception: Any error is logged and re-raised unchanged.
    """
    try:
        bars = df.drop("foot_bid", "foot_ask")

        # Each step is a full outer join on the bar timestamp.
        with_bid_qty = bars.join(df_bid_qty, on="time_rounded", how="outer")
        with_ask_qty = with_bid_qty.join(df_ask_qty, on="time_rounded", how="outer")
        with_bid_foot = with_ask_qty.join(df_footprint_bids, on="time_rounded", how="outer")
        with_ask_foot = with_bid_foot.join(df_footprint_asks, on="time_rounded", how="outer")

        return with_ask_foot.orderBy("time_rounded")
    except Exception as e:
        logger.error(f"Error in join_bid_ask_footprints: {e}")
        raise
    
# Combine bars, per-bar totals and per-bar footprints into one frame.
df_joined_final = join_bid_ask_footprints(
    renko_df,
    df_bid_total_qty,
    df_ask_total_qty,
    df_footprint_bids=df_bid_footprint_sorted,
    df_footprint_asks=df_ask_footprint_sorted,
)
# Preview (disabled): df_joined_final.show(100, truncate=False)


In [8]:
def transform_footprint(row):
    """Merge a row's bid and ask footprint lists into one dict keyed by price.

    :param row: Mapping-like row exposing ``aggregated_foot_bid`` and
        ``aggregated_foot_ask``; each is a list of ``{price: level}`` maps (or
        None / empty), where ``level`` exposes ``<side>_qty`` and
        ``<side>_trades`` by key.
    :return: ``{price: {"bid_qty", "bid_trades", "ask_qty", "ask_trades"}}``
        with quantities and trade counts summed per price. Prices appear in
        first-seen order, bids before asks.
    """
    aggregated_data = {}

    # Bids are processed before asks, which fixes the first-seen key order.
    for side in ("bid", "ask"):
        qty_key = f"{side}_qty"
        trades_key = f"{side}_trades"
        side_items = row[f"aggregated_foot_{side}"]
        if not side_items:
            continue
        for level_map in side_items:
            if not level_map:
                continue
            # `level` is deliberately not called `row`, so the function
            # parameter is no longer shadowed inside the loop.
            for price, level in level_map.items():
                if price not in aggregated_data:
                    aggregated_data[price] = {
                        "bid_qty": 0,
                        "bid_trades": 0,
                        "ask_qty": 0,
                        "ask_trades": 0,
                    }
                if level is None:
                    continue
                # The struct fields are nullable; a missing value counts as 0
                # instead of raising TypeError on `+=`.
                aggregated_data[price][qty_key] += level[qty_key] or 0
                aggregated_data[price][trades_key] += level[trades_key] or 0

    return aggregated_data

import json  # used only to serialise the merged footprint dict below

# A UDF without an explicit return type is declared as returning a string, so
# the dict produced by transform_footprint used to be stringified implicitly
# into a form that cannot be parsed back. Serialise it to JSON explicitly; the
# column stays a string. Only the two columns transform_footprint reads are
# shipped to Python.
df_joined_final = df_joined_final.withColumn(
    "footprint",
    F.udf(lambda row: json.dumps(transform_footprint(row)), StringType())(
        F.struct("aggregated_foot_bid", "aggregated_foot_ask")
    ),
)

In [10]:
df_joined_final.printSchema()

root
 |-- time_rounded: timestamp (nullable = true)
 |-- open: double (nullable = true)
 |-- close: double (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- qty: double (nullable = true)
 |-- sum_bid_total_qty: double (nullable = true)
 |-- sum_ask_total_qty: double (nullable = true)
 |-- aggregated_foot_bid: array (nullable = true)
 |    |-- element: map (containsNull = false)
 |    |    |-- key: string
 |    |    |-- value: struct (valueContainsNull = true)
 |    |    |    |-- bid_qty: double (nullable = true)
 |    |    |    |-- bid_trades: double (nullable = true)
 |-- aggregated_foot_ask: array (nullable = true)
 |    |-- element: map (containsNull = false)
 |    |    |-- key: string
 |    |    |-- value: struct (valueContainsNull = true)
 |    |    |    |-- ask_qty: double (nullable = true)
 |    |    |    |-- ask_trades: double (nullable = true)
 |-- footprint: string (nullable = true)



In [11]:
df_joined_final.select("footprint").show(20, truncate=False)

                                                                                

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [12]:
df_joined_final.select("aggregated_foot_ask").show(20, truncate=False)
df_joined_final.select("aggregated_foot_bid").show(20, truncate=False)

                                                                                

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|aggregated_foot_ask                                                    



+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|aggregated_foot_bid                                                                                                                                           

                                                                                

# DATA VALIDATION

# BREAK

In [14]:
def transform_and_aggregate_footprint(df_agg, column_prefix):
    """
    Transforms and aggregates the footprint data for bids or asks.

    :param df_agg: DataFrame with ``time_rounded``, ``price_level``,
        ``total_<prefix>_qty`` and ``total_<prefix>_trades`` columns.
    :param column_prefix: Prefix indicating whether it's for bids or asks ('bid' or 'ask').
    :return: DataFrame sorted by ``time_rounded`` with an
        ``aggregated_foot_<prefix>`` column: an array of maps
        ``{str(price_level): {<prefix>_qty, <prefix>_trades}}``.
    """
    # Define the return schema based on prefix
    return_schema = MapType(
        StringType(),
        StructType([
            StructField(f"{column_prefix}_qty", DoubleType(), True),
            StructField(f"{column_prefix}_trades", DoubleType(), True)
        ])
    )

    # Define the transformation UDF
    def transform_row(price, qty, trades):
        return {str(price): {f"{column_prefix}_qty": qty, f"{column_prefix}_trades": trades}}

    transform_row_udf = udf(transform_row, return_schema)

    # Apply the transformation UDF and aggregate.
    # Both totals are cast to double first: the struct fields are declared as
    # DoubleType, and a Python int handed back for a double field (the summed
    # trade count is an integer) is not converted and comes out as null.
    df_transformed = df_agg.withColumn(
        "transformed_aggregated_foot",
        transform_row_udf(
            F.col("price_level"),
            F.col(f"total_{column_prefix}_qty").cast(DoubleType()),
            F.col(f"total_{column_prefix}_trades").cast(DoubleType()),
        ),
    )

    df_footprint = df_transformed.groupBy("time_rounded").agg(
        collect_list("transformed_aggregated_foot").alias(f"aggregated_foot_{column_prefix}")
    ).sort("time_rounded")

    return df_footprint

In [5]:
# Load the locally stored test parquet file through the existing session.
path = '/root/Cicada-binance/cores/aggTrades/historical/transformation/src/VolumeProfileCluster/data/test.parquet'
df = spark_conf.read.format("parquet").load(path)

                                                                                

In [7]:
df.show(20)

+------------+-----------+-------------+----------------+---------------------+
|time_rounded|price_level|total_bid_qty|total_bid_trades|total_bid_trades_aggr|
+------------+-----------+-------------+----------------+---------------------+
+------------+-----------+-------------+----------------+---------------------+

