# Environment Setup

In [1]:
import glob
import json

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, functions as F, types as T, Window as W

In [2]:
# Collect every setting in ONE SparkConf before anything starts the JVM.
# The "-Xss4m" thread-stack flags are JVM launch options: Spark documents that
# driver Java options cannot be applied once the driver JVM is running, so they
# must be part of the configuration that creates the SparkContext rather than
# being added to a session builder afterwards.
conf = (
    SparkConf()
    .setAppName("BatchProcessor")
    .setMaster("local[*]")
    .set("spark.driver.extraJavaOptions", "-Xss4m")
    .set("spark.executor.extraJavaOptions", "-Xss4m")
)

# getOrCreate() reuses a live session/context if one exists, so this cell can be
# re-run without hitting "Cannot run multiple SparkContexts at once".
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Expose the underlying context under the name the rest of the notebook uses.
sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/05 13:10:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Data Preparation

In [3]:
# Explicit schema for the news-rating JSON records; every field is nullable.
schema = (
    T.StructType()
    .add("authors", T.ArrayType(T.StringType()), True)
    .add("date_google", T.StringType(), True)
    .add("date_metadata", T.StringType(), True)
    .add("date_published", T.StringType(), True)
    .add("date_target", T.StringType(), True)
    .add("description", T.StringType(), True)
    .add("explanation", T.StringType(), True)
    .add("groq_usage", T.StringType(), True)
    .add("metadata", T.MapType(T.StringType(), T.StringType()), True)
    .add("rating_democrats", T.FloatType(), True)
    .add("rating_republicans", T.FloatType(), True)
    .add("source_url", T.StringType(), True)
    .add("summary", T.StringType(), True)
    .add("text", T.StringType(), True)
    .add("title", T.StringType(), True)
    .add("url", T.StringType(), True)
)

# Read the JSON directory in multi-line mode using the schema above.
news_df = (
    spark.read
    .schema(schema)
    .option("multiline", "true")
    .json("../news_ratings/data/")
)
news_df.show(n=5)

                                                                                

+--------------------+-------------------+--------------------+-------------------+-------------------+--------------------+--------------------+--------------------+--------------------+----------------+------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|             authors|        date_google|       date_metadata|     date_published|        date_target|         description|         explanation|          groq_usage|            metadata|rating_democrats|rating_republicans|          source_url|             summary|                text|               title|                 url|
+--------------------+-------------------+--------------------+-------------------+-------------------+--------------------+--------------------+--------------------+--------------------+----------------+------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|            

In [4]:
# Load the ticker CSV: first row is the header, column types are inferred by Spark.
market_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("../stocks_data/ticker_data.csv")
)
market_df.show(n=5)

                                                                                

+------+-------------------+-----------------+------------------+-----------------+------------------+------------------+-----------+
|Ticker|               Date|             Open|              High|              Low|             Close|         Adj Close|     Volume|
+------+-------------------+-----------------+------------------+-----------------+------------------+------------------+-----------+
|   XLP|2023-11-16 20:00:00| 69.8499984741211| 69.86000061035156|69.55999755859375| 69.66500091552734| 69.66500091552734|  4150005.0|
|   IJR|2023-11-16 20:00:00|96.30000305175781| 96.58000183105469|95.19999694824219| 95.44999694824219| 95.44999694824219|   997585.0|
|   SPY|2023-11-16 20:00:00|449.2200012207031|450.55999755859375|449.1300048828125|449.95001220703125|449.95001220703125|1.0474895E7|
|   XLU|2023-11-16 20:00:00|62.29999923706055|  62.6150016784668|62.06999969482422|62.470001220703125|62.470001220703125|  3710716.0|
|   XLB|2023-11-16 20:00:00|79.86000061035156| 80.209999084472

In [5]:
# Print the column types Spark inferred for the ticker CSV (the load above uses
# inferSchema=True), to confirm that `Date` is a timestamp and the price columns
# are numeric before any window calculations are built on them.
market_df.printSchema()

root
 |-- Ticker: string (nullable = true)
 |-- Date: timestamp (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Adj Close: double (nullable = true)
 |-- Volume: double (nullable = true)



# Data Pre-processing

In [6]:
# Derive one publication timestamp per article.
# Each candidate column is parsed FIRST and the results are coalesced afterwards:
# coalescing the raw strings would let a non-null but unparseable `date_google`
# hide a perfectly usable `date_metadata` / `date_published`, and the row would
# then be dropped below for having no timestamp.
news_df = (
    news_df
    .withColumn(
        "published_at",
        F.coalesce(
            F.to_timestamp("date_google"),
            F.to_timestamp("date_metadata"),
            F.to_timestamp("date_published"),
        ),
    )
    .withColumn("date_target", F.to_date("date_target"))
    # Keep only the columns used downstream.
    .select('published_at', 'date_target', 'rating_democrats', 'rating_republicans', 'title', 'summary', 'url')
    # Rows without any usable timestamp cannot be placed in a time window.
    .dropna(subset=["published_at"])
)

# Preview a small random sample of the cleaned articles.
news_df.sample(fraction=0.01).show(n=5)

+-------------------+-----------+----------------+------------------+--------------------+--------------------+--------------------+
|       published_at|date_target|rating_democrats|rating_republicans|               title|             summary|                 url|
+-------------------+-----------+----------------+------------------+--------------------+--------------------+--------------------+
|2024-02-12 22:28:29| 2024-02-12|             0.0|               0.0|Primary Election ...| San Diego County...|https://www.kpbs....|
|2024-02-16 16:57:52| 2024-02-16|             1.0|               0.0|President Biden R...| President Biden ...|https://www.c-spa...|
|2024-02-21 20:31:14| 2024-02-20|             1.0|               1.0|Which is the bett...| In the past, spe...|https://abcnews.g...|
|2024-02-21 11:45:34| 2024-02-21|             0.0|               0.0|AP Decision Notes...| Donald Trump and...|https://apnews.co...|
|2024-02-23 05:00:04| 2024-02-21|             0.0|               0.0|

# Feature Engineering

In [7]:
# Rolling-window lengths, in hours (1 hour, 10 hours, 1 day).
# Currently disabled candidates: 5 (5 hours), 7*24 (1 week), 14*24 (2 weeks), 28*24 (4 weeks).
time_windows = [1, 10, 24]

# Statistics computed for every (window, column) pair.
statistics = ["count", "mean", "std", "min", "max", "median", "spread"]

# Numeric columns of the market (ticker) data to aggregate.
ticker_cols = ["Open", "High", "Low", "Close", "Adj Close", "Volume"]

# Numeric columns of the news data to aggregate.
news_cols = ["rating_republicans", "rating_democrats"]

In [8]:
# Define a helper function for rolling window calculations
def calculate_rolling_stats(df, cols, datetime_col, partition_col=None):
    """
    Append trailing-window statistics for the given columns.

    For every window length in the module-level ``time_windows`` (hours), every
    column in ``cols`` and every name in the module-level ``statistics``, one
    column named ``rolling_{hours}h_{column}_{stat}`` is added. Each window
    covers the rows whose ``datetime_col`` lies between ``hours`` before the
    current row and the current row itself (both inclusive).

    All expressions are built first and added in a single projection. Adding
    them one ``withColumn`` call at a time nests one projection per feature,
    which the PySpark documentation warns can create very large plans and even
    a ``StackOverflowException``.

    Args:
        df: Input Spark DataFrame.
        cols: Names of the columns to aggregate.
        datetime_col: Name of the column that orders rows in time; it is cast
            to a timestamp before use.
        partition_col: Optional column name to partition the window by. When
            omitted, a single window spans the whole DataFrame.

    Returns:
        A DataFrame with the original columns plus the rolling-statistic columns.

    Raises:
        ValueError: If ``statistics`` contains a name this function does not
            support (such names were previously skipped silently).
    """
    # Statistic name -> builder taking (column name, window spec).
    stat_builders = {
        "count": lambda c, w: F.count(c).over(w),
        "mean": lambda c, w: F.mean(c).over(w),
        "std": lambda c, w: F.stddev(c).over(w),
        "min": lambda c, w: F.min(c).over(w),
        "max": lambda c, w: F.max(c).over(w),
        # NOTE(review): accuracy=10 is kept from the original; Spark documents the
        # relative error as 1/accuracy, so this median is coarse - confirm intent.
        "median": lambda c, w: F.approx_percentile(c, 0.5, 10).over(w),
        "spread": lambda c, w: F.max(c).over(w) - F.min(c).over(w),
    }
    unsupported = [stat for stat in statistics if stat not in stat_builders]
    if unsupported:
        raise ValueError(f"Unsupported statistics: {unsupported}")

    # A timestamp cast to long is whole epoch seconds, so the range frame can be
    # expressed directly in seconds; the ordering key is the same for every window.
    order_key = F.col(datetime_col).cast("timestamp").cast("long")

    # Keyed by output name so a repeated name overwrites, as withColumn would.
    rolling_columns = {}
    for window_hours in time_windows:
        window_spec = (
            W
            .partitionBy(partition_col if partition_col else [])
            .orderBy(order_key)
            .rangeBetween(-window_hours * 3600, 0)
        )
        for col_name in cols:
            for stat in statistics:
                out_name = f"rolling_{window_hours}h_{col_name}_{stat}"
                rolling_columns[out_name] = stat_builders[stat](col_name, window_spec)

    if not rolling_columns:
        return df

    # Single projection. withColumns (Spark >= 3.3) also replaces same-named
    # columns exactly like withColumn; older versions fall back to one select.
    if hasattr(df, "withColumns"):
        return df.withColumns(rolling_columns)
    return df.select("*", *[expr.alias(name) for name, expr in rolling_columns.items()])

In [None]:
# %%script false --no-raise-error
# (The commented-out cell magic above is kept as a switch for skipping this cell.)

# Market data: rolling statistics per ticker, ordered by the `Date` column.
market_rollstats_df = calculate_rolling_stats(
    df=market_df,
    cols=ticker_cols,
    datetime_col="Date",
    partition_col="Ticker",
)

# News data: rolling statistics over all articles, ordered by `published_at`.
news_rollstats_df = calculate_rolling_stats(
    df=news_df,
    cols=news_cols,
    datetime_col="published_at",
)

In [11]:
# %%script false --no-raise-error
# sc.stop()       # Stop the Spark context