# Forecasting Realised Volatility with ML

In [0]:
# Import functions and libraries
from pyspark.sql.window import Window
from pyspark.sql import functions as sf

In [0]:
# Load the source table into a DataFrame and preview it
source_table = "workspace.default.nifty_100_combined_data"
df = spark.table(source_table)
display(df)

## Feature Engineering

In [0]:
# Row-based window specs, evaluated per ticker in "date" order.
# NOTE(review): the "1h" naming presumes 12 rows per hour (e.g. 5-minute
# bars) — confirm against the source data.
ticks_per_hour = 12

# Base ordering used for lag lookups.
window = Window.partitionBy("ticker").orderBy("date")

# Current row plus the 11 rows before it.
rolling_1h_back_window = window.rowsBetween(-(ticks_per_hour - 1), Window.currentRow)

# The 12 rows strictly after the current row.
rolling_1h_forward_window = window.rowsBetween(1, ticks_per_hour)


In [0]:
# Feature engineering: per-ticker lagged values, log returns, price range,
# log volume change, hour of day, and rolling realised volatility over the
# backward window and the forward window.
# NOTE(review): the rolling windows are row-based, so they may span gaps in
# the timestamps (e.g. between sessions) — confirm this is intended.

# Number of non-null log returns a window must contain before its realised
# volatility is emitted; must match the row bounds of rolling_1h_back_window
# and rolling_1h_forward_window.
ticks_per_window = 12


def _log_ratio(current_name, previous_name):
    """Return a Column holding log(current / previous).

    The result is null unless both operands are strictly positive, so a
    missing lag, a zero denominator, or non-positive values never produce a
    division error or a spurious finite value.
    """
    current = sf.col(current_name)
    previous = sf.col(previous_name)
    # A null operand makes the comparison null, which `when` treats as not
    # matched, so rows without a lagged value fall through to null.
    return sf.when(
        (current > 0) & (previous > 0),
        sf.log(current / previous),
    ).otherwise(sf.lit(None))


def _realised_vol(window_spec):
    """Return a Column with sqrt(sum(log_return^2)) over `window_spec`.

    The result is null unless the window holds exactly `ticks_per_window`
    non-null log returns, i.e. incomplete windows yield no value.
    """
    observed_returns = sf.count("log_return").over(window_spec)
    squared_return_sum = sf.sum(sf.pow(sf.col("log_return"), 2)).over(window_spec)
    return sf.when(
        observed_returns == ticks_per_window,
        sf.sqrt(squared_return_sum),
    ).otherwise(sf.lit(None))


df = (
    df
    # Previous row's close for the same ticker
    .withColumn("lag_close", sf.lag("close").over(window))
    # Log return, defined only on strictly positive closes
    .withColumn("log_return", _log_ratio("close", "lag_close"))
    # Absolute log return
    .withColumn("abs_return", sf.abs(sf.col("log_return")))
    # High minus low
    .withColumn("hl_range", sf.col("high") - sf.col("low"))
    # Previous row's volume for the same ticker
    .withColumn("lag_volume", sf.lag("volume").over(window))
    # Log volume change, defined only on strictly positive volumes
    .withColumn("log_volume_change", _log_ratio("volume", "lag_volume"))
    # Hour component of the timestamp
    .withColumn("hour", sf.hour(sf.col("date")))
    # Realised volatility over the current row and the 11 before it
    .withColumn("realised_rolling_vol_backward", _realised_vol(rolling_1h_back_window))
    # Realised volatility over the next 12 rows
    .withColumn("realised_rolling_vol_forward", _realised_vol(rolling_1h_forward_window))
)



In [0]:
# Preview the DataFrame with the engineered feature columns
display(df)

## Data Cleaning

In [0]:
# Cleaning step: discard every row that has a null in any column. This
# removes, among others, rows without a lagged value and rows whose rolling
# windows were incomplete.
df = df.dropna(how="any")

In [0]:
# Preview the DataFrame after null rows have been removed
display(df)

## Save Data as Spark Table

In [0]:
# Persist the cleaned data as a table, replacing any existing contents.
# N.B. for improved performance, the output could be partitioned by ticker:
# df.write.partitionBy("ticker").mode("overwrite").saveAsTable("cleaned_volatility_data")
table_writer = df.write.mode("overwrite")
table_writer.saveAsTable("cleaned_volatility_data")

## Random Forest Regressor Model