In [None]:
import json
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

# Load configuration
def load_config(config_path="/dbfs/FileStore/configs/config.json"):
    """Load the pipeline configuration from a JSON file.

    Args:
        config_path: Path of the JSON configuration file. Defaults to the
            location this notebook has always read, so existing
            ``load_config()`` calls behave as before.

    Returns:
        The parsed JSON document, as returned by ``json.load``.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # Read as UTF-8 explicitly instead of relying on the platform's default
    # encoding, which varies between environments.
    with open(config_path, "r", encoding="utf-8") as f:
        return json.load(f)

# Parse the configuration once, then pick out the storage paths used below.
config = load_config()
storage_cfg = config["data_storage"]
bronze_path = storage_cfg["bronze_path"]
silver_path = storage_cfg["silver_path"]

print(f"Bronze data path: {bronze_path}")
print(f"Silver data path: {silver_path}")

In [None]:
# Bronze ingestion and data-quality overview

# Step 1: read the raw records from the bronze Delta location.
bronze_reader = spark.read.format("delta")
bronze_df = bronze_reader.load(bronze_path)

# Show a handful of rows to eyeball the schema and values.
print("Sample bronze data:")
bronze_sample = bronze_df.limit(5)
display(bronze_sample)


# Step 2: per-column null tally.
def _null_tally(column_name):
    """Aggregate expression counting the rows where `column_name` is null.

    when() without otherwise() evaluates to NULL for non-matching rows, and
    count() skips NULLs, so only the null rows of the column are counted.
    """
    return count(when(col(column_name).isNull(), column_name)).alias(column_name)


null_counts = bronze_df.select(*[_null_tally(name) for name in bronze_df.columns])
print("Null count by column:")
display(null_counts)

# Step 3: number of records per symbol, sorted by symbol.
records_per_symbol = bronze_df.groupBy("symbol").count()
symbol_counts = records_per_symbol.orderBy("symbol")
print("Record count by symbol:")
display(symbol_counts)


In [None]:

# Basic Transformations - Add Time Features and Price Metrics

# COMMAND ----------
# Derive calendar features and intraday price metrics from the bronze rows.
# NOTE: year/month/round/when/... are the pyspark.sql.functions versions
# brought in by the star import, not Python built-ins.
intraday_move = col("close") - col("open")

silver_df = (
    bronze_df
    .withColumn("year", year("date"))
    .withColumn("month", month("date"))
    .withColumn("day_of_month", dayofmonth("date"))
    # Spark's dayofweek() numbers days from 1 (Sunday) to 7 (Saturday).
    .withColumn("day_of_week", dayofweek("date"))
    .withColumn("price_change", intraday_move)
    # Guard the division: with ANSI SQL mode enabled, dividing by a zero
    # `open` raises a divide-by-zero error and aborts the whole job. when()
    # without otherwise() yields NULL instead, which is also what the
    # unguarded division returns when ANSI mode is disabled.
    .withColumn(
        "price_change_pct",
        when(col("open") != 0, round(intraday_move / col("open") * 100, 2)),
    )
    .withColumn("daily_range", col("high") - col("low"))
    .withColumn("processing_date", current_timestamp())
)

# Remove any rows with null values in critical columns
silver_df = silver_df.dropna(subset=["open", "close", "high", "low"])

# Preview basic transformations
print("Preview basic transformations:")
display(
    silver_df.select(
        "symbol", "date", "close", "year", "month",
        "day_of_week", "price_change", "price_change_pct",
    ).limit(5)
)


In [None]:
# Technical indicators computed over trailing windows, per symbol in date order.
# The windows are row-based: "10-day"/"30-day" means the current row plus the
# 9/29 rows before it (presumably one row per symbol per trading day - verify
# against the bronze data). Early rows of each symbol use however many rows exist.
per_symbol_by_date = Window.partitionBy("symbol").orderBy("date")
windowSpec10 = per_symbol_by_date.rowsBetween(-9, 0)
windowSpec30 = per_symbol_by_date.rowsBetween(-29, 0)

close_over_10 = avg("close").over(windowSpec10)
close_over_30 = avg("close").over(windowSpec30)
# pyspark's stddev() is the sample standard deviation (alias of stddev_samp).
close_spread_30 = stddev("close").over(windowSpec30)

silver_df = (
    silver_df
    .withColumn("ma10", close_over_10)
    .withColumn("ma30", close_over_30)
    .withColumn("volatility_30d", close_spread_30)
    # Relative strength: current close divided by its 30-row moving average.
    .withColumn("rel_strength", round(col("close") / col("ma30"), 4))
)

# Preview technical indicators
print("Preview technical indicators:")
indicator_preview = silver_df.select(
    "symbol", "date", "close", "ma10", "ma30", "volatility_30d", "rel_strength"
)
display(indicator_preview.limit(5))


In [None]:
# Write to silver layer: replace any previous contents at silver_path with the
# current DataFrame, stored as Delta and partitioned by symbol and year.
(
    silver_df.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("symbol", "year")
    .save(silver_path)
)

# Report the number of rows that were actually persisted. Calling
# silver_df.count() here would re-execute the whole bronze -> silver lineage
# (read, transformations, window functions) a second time just to produce a
# number for the log line, and that number would describe the recomputed
# DataFrame rather than the data that was written.
silver_record_count = spark.read.format("delta").load(silver_path).count()
print(f"Silver layer created with {silver_record_count} records")

In [None]:
# Select one symbol for detailed analysis
example_symbol = config["stock_symbols"][0]  # First symbol from config
example_df = (
    silver_df
    .filter(col("symbol") == example_symbol)
    .orderBy(col("date").desc())
    .select(
        "symbol", "date", "close", "ma10", "ma30",
        "price_change_pct", "volatility_30d", "rel_strength",
    )
)

print(f"Example transformations for {example_symbol}:")
display(example_df.limit(30))

# Calculate additional metrics for this symbol in a single aggregation (one
# Spark job instead of two). avg/max/abs are the pyspark.sql.functions versions
# brought in by the star import, not the Python built-ins.
metrics_row = example_df.agg(
    avg("volatility_30d").alias("avg_volatility"),
    max(abs(col("price_change_pct"))).alias("max_daily_change"),
).first()
avg_volatility = metrics_row["avg_volatility"]
max_daily_change = metrics_row["max_daily_change"]

# Both aggregates are None when the symbol has no rows (or only NULL inputs);
# formatting None with a float format spec would raise TypeError, so report
# the missing value explicitly instead.
if avg_volatility is None:
    print(f"Average 30-day volatility for {example_symbol}: n/a (no data)")
else:
    print(f"Average 30-day volatility for {example_symbol}: {avg_volatility:.4f}")

if max_daily_change is None:
    print(f"Maximum daily price change % for {example_symbol}: n/a (no data)")
else:
    print(f"Maximum daily price change % for {example_symbol}: {max_daily_change:.2f}%")