# Feature Engineering

* **Description**: COMP4103 (Big Data) – Group Project
* **Author**: Aaron
* **Version**: 0.1

## 1. Load packages

In [None]:
# Apache Spark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.window import Window

## 2. Create a Spark Session

In [None]:
# Build a SparkSession (or reuse an already running one) with a local master
# and the application name "data preprocessing".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("data preprocessing")
    .getOrCreate()
)

# Keep a handle on the SparkContext that backs the session.
sc = spark.sparkContext

## 3. Data combination

In [None]:
# Input CSV files
bitcoin_data = "bitcoin_10y_1min_interpolate.csv"
blockChain_data = "blockChain_10y_1min_interpolate.csv"


def _load_with_row_id(path):
    """Read a headered CSV with schema inference and add a 0-based ``id``.

    ``id`` is ``row_number() - 1`` over a window ordered by
    ``monotonically_increasing_id()``, so it runs 0, 1, 2, ... without gaps.
    NOTE(review): this presumably mirrors the order of rows in the file;
    confirm for multi-partition reads.
    """
    read_order = Window.orderBy(F.monotonically_increasing_id())
    frame = (
        spark.read.format("csv")
        .option("inferSchema", "True")
        .option("header", True)
        .load(path)
    )
    return frame.withColumn("id", F.row_number().over(read_order) - 1)


df = _load_with_row_id(bitcoin_data)
blockChain_df = _load_with_row_id(blockChain_data)

# Keep only rows whose (id, Timestamp) pair appears in both datasets.
df = df.join(blockChain_df, on=["id", "Timestamp"], how="inner")

## 4. Generate the label column

In [None]:
# Label column: NEXT_BTC_CLOSE holds the "Close" value of the following row
# (rows ordered by "id"), i.e. the next-step bitcoin price.
# https://sparkbyexamples.com/pyspark/pyspark-window-functions/
# F.lead(col, 1) reads the value one row ahead of the current row.
df = df.withColumn(
    "NEXT_BTC_CLOSE",
    F.lead("Close", 1).over(Window.orderBy("id")),
)

# The last row has no successor, so its label is null. dropna() removes it,
# together with any other row that has a null in any column.
df = df.dropna()

## 5. Generate financial indicators

In [None]:
# Generate additional valuable features

# Rate of Change (ROC) lets investors spot momentum and other trends.
# Here it is computed per row as the percentage change of NEXT_BTC_CLOSE
# relative to Previous_close.
# NOTE(review): NEXT_BTC_CLOSE is the label column, so this feature is derived
# from the value being predicted -- confirm this is intended (possible leakage).
df = df.withColumn(
    "Rate_of_Change",
    (F.col("NEXT_BTC_CLOSE") / F.col("Previous_close") - 1) * 100,
)

# Simple Moving Average (SMA) over a trailing window of rows.
# Adapted from: https://stackoverflow.com/questions/45806194/pyspark-rolling-average-using-timeseries-data
def simple_moving_average(df, period, col="NEXT_BTC_CLOSE", orderby="id"):
    """Append a trailing simple-moving-average column to ``df``.

    The new column is named ``SMA_<period>`` and holds the mean of ``col``
    over the current row and the ``period - 1`` rows before it, with rows
    ordered by ``orderby``. Near the start of the data, where fewer than
    ``period`` rows exist, the mean is taken over the rows that are available.

    Args:
        df: Spark DataFrame containing the ``col`` and ``orderby`` columns.
        period: Window length in rows; must be at least 1.
        col: Name of the column to average.
        orderby: Name of the column that defines the row order.

    Returns:
        A new DataFrame with the additional ``SMA_<period>`` column.

    Raises:
        ValueError: If ``period`` is smaller than 1.
    """
    if period < 1:
        raise ValueError(f"period must be >= 1, got {period}")

    # A window of `period` rows is the current row plus the (period - 1) rows
    # before it; starting the frame at -period would average period + 1 rows.
    trailing_rows = Window.orderBy(orderby).rowsBetween(
        -(period - 1), Window.currentRow
    )
    return df.withColumn(f"SMA_{period}", F.avg(col).over(trailing_rows))

# Moving-average window lengths for 5/7/10/20/50/100 days, expressed as a
# number of rows (60 * 24 rows per day).
# NOTE(review): assumes one row per minute, as the input file names suggest.
MINUTES_PER_DAY = 60 * 24
MA5 = MINUTES_PER_DAY * 5
MA7 = MINUTES_PER_DAY * 7
MA10 = MINUTES_PER_DAY * 10
MA20 = MINUTES_PER_DAY * 20
MA50 = MINUTES_PER_DAY * 50
MA100 = MINUTES_PER_DAY * 100

# periods selected based on this article:
# https://www.investopedia.com/ask/answers/122414/what-are-most-common-periods-used-creating-moving-average-
# ma-lines.asp#:~:text=Traders%20and%20market%20analysts%20commonly,averages%20are%20the%20most%20common.

# to analyze short-term trends
short_term_windows = (MA5, MA7, MA10, MA20, MA50)
# to analyze long-term trends
long_term_windows = (MA100,)

# NOTE(review): confirm whether each window should span `period` or
# `period - 1` preceding rows; that is decided inside simple_moving_average.
for window_rows in short_term_windows + long_term_windows:
    df = simple_moving_average(df, window_rows)

In [None]:
# Persist the fully engineered dataset as CSV, including a header row.
df.write.csv("complete_10y_1min_interpolate.csv", header=True)