In [0]:
pip install yfinance

Collecting yfinance
  Downloading yfinance-0.2.66-py2.py3-none-any.whl.metadata (6.0 kB)
Collecting multitasking>=0.0.7 (from yfinance)
  Downloading multitasking-0.0.12.tar.gz (19 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting frozendict>=2.3.4 (from yfinance)
  Downloading frozendict-2.4.6-py312-none-any.whl.metadata (23 kB)
Collecting peewee>=3.16.2 (from yfinance)
  Downloading peewee-3.18.2.tar.gz (949 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/949.2 kB[0m [31m?[0m eta [36m-:--:--[0m
[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━[0m [32m768.0/949.2 kB[0m [31m23.1 MB/s[0m eta [36m0:00:01[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m949.2/949.2 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build 

In [0]:
pip install s3fs

Collecting s3fs
  Downloading s3fs-2025.9.0-py3-none-any.whl.metadata (1.4 kB)
Collecting aiobotocore<3.0.0,>=2.5.4 (from s3fs)
  Downloading aiobotocore-2.24.2-py3-none-any.whl.metadata (25 kB)
Collecting fsspec==2025.9.0 (from s3fs)
  Downloading fsspec-2025.9.0-py3-none-any.whl.metadata (10 kB)
Collecting aioitertools<1.0.0,>=0.5.1 (from aiobotocore<3.0.0,>=2.5.4->s3fs)
  Downloading aioitertools-0.12.0-py3-none-any.whl.metadata (3.8 kB)
Collecting botocore<1.40.19,>=1.40.15 (from aiobotocore<3.0.0,>=2.5.4->s3fs)
  Downloading botocore-1.40.18-py3-none-any.whl.metadata (5.7 kB)
Downloading s3fs-2025.9.0-py3-none-any.whl (30 kB)
Downloading fsspec-2025.9.0-py3-none-any.whl (199 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/199.3 kB[0m [31m?[0m eta [36m-:--:--[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.3/199.3 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading aiobotocore-2.24.2-py3-none-any.whl (85 kB)
[?25l   [

In [0]:
dbutils.library.restartPython()

In [0]:
import yfinance as yf
import pandas as pd
from pathlib import PurePosixPath

In [0]:
#Check the file info of AWS S3
dbutils.fs.ls("s3a://databricks-stock-project-2025-10-02/")

[FileInfo(path='s3a://databricks-stock-project-2025-10-02/curated/', name='curated/', size=0, modificationTime=1759714252598),
 FileInfo(path='s3a://databricks-stock-project-2025-10-02/processed/', name='processed/', size=0, modificationTime=1759714252598),
 FileInfo(path='s3a://databricks-stock-project-2025-10-02/raw/', name='raw/', size=0, modificationTime=1759714252598)]

In [0]:
import yfinance as yf
import pandas as pd
import os
import time
from datetime import datetime
import boto3
from botocore.exceptions import ClientError

# === CONFIG ===
bucket = "databricks-stock-project-2025-10-02"
prefix = "raw/stocks"
tickers = ["AAPL", "MSFT", "AMZN", "GOOGL", "META", "NVDA", "TSLA", "NFLX", "AVGO", "AMD"]
start = "2015-01-01"

# === AWS S3 Client ===
s3 = boto3.client("s3")

def s3_exists(bucket, key):
    """Check if S3 object already exists"""
    try:
        s3.head_object(Bucket=bucket, Key=key)
        return True
    except ClientError:
        return False

# === MAIN LOOP ===
for t in tickers:
    print(f"\n⬇️ Downloading {t} ...")

    # --- Download Yahoo Finance Data ---
    try:
        df = yf.download(t, start=start, interval="1d", auto_adjust=True, progress=False)
        if df.empty:
            print(f" {t} No data, pass")
            continue
    except Exception as e:
        print(f"Error occurred while downloading {t}: {e}")
        continue

    df = df.reset_index()

    # --- Flatten MultiIndex Column ---
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = [
            "_".join([str(c) for c in col if c]).strip().lower().replace(" ", "_")
            for col in df.columns
        ]
    else:
        df.columns = [str(c).lower().replace(" ", "_") for c in df.columns]

    # --- Standardize column names (remove ticker suffix, e.g., close_nvda → close)---
    df.columns = [c.replace(f"_{t.lower()}", "") for c in df.columns]

    # --- Ensure all required columns exist ---
    required_cols = ["date", "open", "high", "low", "close", "adj_close", "volume"]
    for col_name in required_cols:
        if col_name not in df.columns:
            df[col_name] = None  # Fill with None if column doesn't exist

    # --- Add metadata column ---
    df["ticker"] = t
    df["source"] = "yfinance"
    df["interval"] = "1d"
    df["ingestion_date"] = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
    df["is_valid"] = True

    # --- Reorder columns ---
    df = df[
        [
            "date",
            "open",
            "high",
            "low",
            "close",
            "adj_close",
            "volume",
            "ticker",
            "source",
            "interval",
            "ingestion_date",
            "is_valid",
        ]
    ]

    # --- Save temporary CSV locally ---
    local_file = f"/tmp/stocks_{t}.csv"
    df.to_csv(local_file, index=False)

    # --- Define S3 target path ---
    s3_key = f"{prefix}/ticker={t}/stocks_{t}.csv"

    # --- Check if file already exists on S3 ---
    if s3_exists(bucket, s3_key):
        print(f"{t} already exists in S3, will overwrite.")

    # --- Upload file to S3 ---
    try:
        s3.upload_file(local_file, bucket, s3_key)
        print(f"Successfully uploaded to s3://{bucket}/{s3_key}")
    except Exception as e:
            print(f"Upload failed for {t}: {e}")
        continue
    finally:
        # Remove temporary file
        os.remove(local_file)

    # --- Sleep to prevent API rate limiting ---
    time.sleep(1)



⬇️ Downloading AAPL ...


  df["ingestion_date"] = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")


✅ 成功上傳到 s3://databricks-stock-project-2025-10-02/raw/stocks/ticker=AAPL/stocks_AAPL.csv

⬇️ Downloading MSFT ...


  df["ingestion_date"] = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")


✅ 成功上傳到 s3://databricks-stock-project-2025-10-02/raw/stocks/ticker=MSFT/stocks_MSFT.csv

⬇️ Downloading AMZN ...


  df["ingestion_date"] = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")


✅ 成功上傳到 s3://databricks-stock-project-2025-10-02/raw/stocks/ticker=AMZN/stocks_AMZN.csv

⬇️ Downloading GOOGL ...


  df["ingestion_date"] = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")


✅ 成功上傳到 s3://databricks-stock-project-2025-10-02/raw/stocks/ticker=GOOGL/stocks_GOOGL.csv

⬇️ Downloading META ...


  df["ingestion_date"] = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")


✅ 成功上傳到 s3://databricks-stock-project-2025-10-02/raw/stocks/ticker=META/stocks_META.csv

⬇️ Downloading NVDA ...


  df["ingestion_date"] = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")


✅ 成功上傳到 s3://databricks-stock-project-2025-10-02/raw/stocks/ticker=NVDA/stocks_NVDA.csv

⬇️ Downloading TSLA ...


  df["ingestion_date"] = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")


✅ 成功上傳到 s3://databricks-stock-project-2025-10-02/raw/stocks/ticker=TSLA/stocks_TSLA.csv

⬇️ Downloading NFLX ...


  df["ingestion_date"] = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")


✅ 成功上傳到 s3://databricks-stock-project-2025-10-02/raw/stocks/ticker=NFLX/stocks_NFLX.csv

⬇️ Downloading AVGO ...


  df["ingestion_date"] = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")


✅ 成功上傳到 s3://databricks-stock-project-2025-10-02/raw/stocks/ticker=AVGO/stocks_AVGO.csv

⬇️ Downloading AMD ...


  df["ingestion_date"] = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")


✅ 成功上傳到 s3://databricks-stock-project-2025-10-02/raw/stocks/ticker=AMD/stocks_AMD.csv


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year

spark = SparkSession.builder.getOrCreate()

# 1. Read raw data
df = spark.read.option("header", True).csv("s3://databricks-stock-project-2025-10-02/raw/stocks/")

# 2️. Data cleaning and type casting
df_clean = (df
    .dropna(subset=["close"])                 # Drop rows where 'close' is null
    .withColumn("close", col("close").cast("double"))
    .withColumn("open", col("open").cast("double"))
    .withColumn("high", col("high").cast("double"))
    .withColumn("low", col("low").cast("double"))
    .withColumn("volume", col("volume").cast("bigint"))
    .withColumn("year", year(col("date")))    # Extract year for partitioning
)

# 3. Write processed data (Parquet format + partitioned)
df_clean.write.mode("overwrite") \
    .partitionBy("ticker", "year") \
    .parquet("s3://databricks-stock-project-2025-10-02/processed/stocks/")


In [0]:
df.printSchema()

root
 |-- date: string (nullable = true)
 |-- open: string (nullable = true)
 |-- high: string (nullable = true)
 |-- low: string (nullable = true)
 |-- close: string (nullable = true)
 |-- adj_close: string (nullable = true)
 |-- volume: string (nullable = true)
 |-- ticker: string (nullable = true)
 |-- source: string (nullable = true)
 |-- interval: string (nullable = true)
 |-- ingestion_date: string (nullable = true)
 |-- is_valid: string (nullable = true)



In [0]:
df = spark.read.option("header", True).csv("s3://databricks-stock-project-2025-10-02/raw/stocks/")
df.select("ticker", "date", "close").show(5)

+------+----------+-------------------+
|ticker|      date|              close|
+------+----------+-------------------+
|  NVDA|2015-01-02| 0.4830383360385895|
|  NVDA|2015-01-05| 0.4748796820640564|
|  NVDA|2015-01-06|0.46048209071159363|
|  NVDA|2015-01-07| 0.4592823088169098|
|  NVDA|2015-01-08| 0.4765594005584717|
+------+----------+-------------------+
only showing top 5 rows


In [0]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Read raw data directly from S3
df = spark.read.option("header", True).csv("s3://databricks-stock-project-2025-10-02/raw/stocks/")

df.printSchema()
df.show(5)

root
 |-- date: string (nullable = true)
 |-- open: string (nullable = true)
 |-- high: string (nullable = true)
 |-- low: string (nullable = true)
 |-- close: string (nullable = true)
 |-- adj_close: string (nullable = true)
 |-- volume: string (nullable = true)
 |-- ticker: string (nullable = true)
 |-- source: string (nullable = true)
 |-- interval: string (nullable = true)
 |-- ingestion_date: string (nullable = true)
 |-- is_valid: string (nullable = true)

+----------+-------------------+-------------------+-------------------+-------------------+---------+---------+------+--------+--------+-------------------+--------+
|      date|               open|               high|                low|              close|adj_close|   volume|ticker|  source|interval|     ingestion_date|is_valid|
+----------+-------------------+-------------------+-------------------+-------------------+---------+---------+------+--------+--------+-------------------+--------+
|2015-01-02| 0.4830383360385895|

In [0]:
# Read Parquet data from the processed layer
df_clean = spark.read.parquet("s3://databricks-stock-project-2025-10-02/processed/stocks/")

# View schema
df_clean.printSchema()

# Preview data
df_clean.show(5)

root
 |-- date: string (nullable = true)
 |-- open: double (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- close: double (nullable = true)
 |-- adj_close: string (nullable = true)
 |-- volume: long (nullable = true)
 |-- source: string (nullable = true)
 |-- interval: string (nullable = true)
 |-- ingestion_date: string (nullable = true)
 |-- is_valid: string (nullable = true)
 |-- ticker: string (nullable = true)
 |-- year: integer (nullable = true)

+----------+------------------+------------------+------------------+------------------+---------+---------+--------+--------+-------------------+--------+------+----+
|      date|              open|              high|               low|             close|adj_close|   volume|  source|interval|     ingestion_date|is_valid|ticker|year|
+----------+------------------+------------------+------------------+------------------+---------+---------+--------+--------+-------------------+--------+------+

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import (
    col, avg, stddev, lag, when
)

spark = SparkSession.builder.getOrCreate()

# Step 1. Read the processed layer (df_clean)
# df_clean = spark.read.parquet("s3://databricks-stock-project-2025-10-02/processed/stocks/")

# === Define window specifications ===
w_ticker = Window.partitionBy("ticker").orderBy("date")
window_sma20 = w_ticker.rowsBetween(-19, 0)
window_sma50 = w_ticker.rowsBetween(-49, 0)
window_sma200 = w_ticker.rowsBetween(-199, 0)
window_rsi = w_ticker.rowsBetween(-13, 0)
window_vol = w_ticker.rowsBetween(-19, 0)
window_macd12 = w_ticker.rowsBetween(-11, 0)
window_macd26 = w_ticker.rowsBetween(-25, 0)
window_signal = w_ticker.rowsBetween(-8, 0)

# Step 2. 
# === Technical indicator calculations ===

# Moving averages (SMA)
df_feat = (
    df_clean
    # Moving averages (SMA)
    .withColumn("sma_20", avg(col("close")).over(window_sma20))
    .withColumn("sma_50", avg(col("close")).over(window_sma50))
    .withColumn("sma_200", avg(col("close")).over(window_sma200))
)

# 20-day volatility
df_feat = df_feat.withColumn("prev_close", lag("close").over(w_ticker))
df_feat = df_feat.withColumn("return", (col("close") - col("prev_close")) / col("prev_close"))
df_feat = df_feat.withColumn("volatility_20", stddev(col("return")).over(window_vol))

# RSI(14)
df_feat = df_feat.withColumn("delta", col("close") - lag("close").over(w_ticker))
df_feat = df_feat.withColumn("gain", when(col("delta") > 0, col("delta")).otherwise(0.0))
df_feat = df_feat.withColumn("loss", when(col("delta") < 0, -col("delta")).otherwise(0.0))
df_feat = df_feat.withColumn("avg_gain", avg(col("gain")).over(window_rsi))
df_feat = df_feat.withColumn("avg_loss", avg(col("loss")).over(window_rsi))
df_feat = df_feat.withColumn("rs", col("avg_gain") / col("avg_loss"))
df_feat = df_feat.withColumn("rsi", 100 - (100 / (1 + col("rs"))))

# MACD(12,26,9) — simplified version using rolling mean
df_feat = df_feat.withColumn("ema12", avg(col("close")).over(window_macd12))
df_feat = df_feat.withColumn("ema26", avg(col("close")).over(window_macd26))
df_feat = df_feat.withColumn("macd", col("ema12") - col("ema26"))
df_feat = df_feat.withColumn("signal_line", avg(col("macd")).over(window_signal))

# Bollinger Bands
df_feat = df_feat.withColumn("bb_mid", avg(col("close")).over(window_sma20))
df_feat = df_feat.withColumn("bb_std", stddev(col("close")).over(window_sma20))
df_feat = df_feat.withColumn("bollinger_upper", col("bb_mid") + 2 * col("bb_std"))
df_feat = df_feat.withColumn("bollinger_lower", col("bb_mid") - 2 * col("bb_std"))

# Volume moving average
df_feat = df_feat.withColumn("vol_ma_20", avg(col("volume")).over(window_vol))

# Simple buy/sell flags
df_feat = df_feat.withColumn("buy_flag", (col("sma_20") > col("sma_50")) & (col("rsi") < 30))
df_feat = df_feat.withColumn("sell_flag", (col("sma_20") < col("sma_50")) & (col("rsi") > 70))

# Golden Cross / Death Cross
df_feat = df_feat.withColumn("prev_sma50", lag("sma_50").over(w_ticker))
df_feat = df_feat.withColumn("prev_sma200", lag("sma_200").over(w_ticker))
df_feat = df_feat.withColumn(
    "golden_cross",
    (col("sma_50") > col("sma_200")) & (col("prev_sma50") <= col("prev_sma200"))
)
df_feat = df_feat.withColumn(
    "death_cross",
    (col("sma_50") < col("sma_200")) & (col("prev_sma50") >= col("prev_sma200"))
)

# Step 3. Write to the curated layer
df_feat.write.mode("overwrite") \
    .partitionBy("ticker", "year") \
    .parquet("s3://databricks-stock-project-2025-10-02/curated/stocks_features/")

print("Feature engineering completed!")


✅ Feature engineering 完成！


In [0]:
# df_feat = spark.read.parquet("s3://databricks-stock-project-2025-10-02/curated/stocks_features/")
df_feat.select("ticker", "date", "close", "sma_20", "rsi", "macd", "bollinger_upper").show(10)


+------+----------+------------------+------------------+------------------+----+------------------+
|ticker|      date|             close|            sma_20|               rsi|macd|   bollinger_upper|
+------+----------+------------------+------------------+------------------+----+------------------+
|  AAPL|2015-01-02|24.261043548583984|24.261043548583984|              NULL| 0.0|              NULL|
|  AAPL|2015-01-05|23.577573776245117| 23.91930866241455|               0.0| 0.0|24.885880883928227|
|  AAPL|2015-01-06|23.579795837402344|23.806137720743816|0.3240611961057027| 0.0|24.594060860500885|
|  AAPL|2015-01-07|23.910436630249023|23.832212448120117| 32.75136951980947| 0.0|24.483948728093644|
|  AAPL|2015-01-08| 24.82912826538086|24.031595611572264| 64.67900759972795| 0.0|25.086888486246885|
|  AAPL|2015-01-09|24.855756759643555| 24.16895580291748| 65.15847311202747| 0.0|25.328154738848962|
|  AAPL|2015-01-12|24.243289947509766|24.179574966430664|49.655151973206486| 0.0|  25.23926

In [0]:
df_raw = spark.read.option("header", True).csv("s3://databricks-stock-project-2025-10-02/raw/stocks/")
df_raw.select("ticker").distinct().show()
df_raw.count()

+------+
|ticker|
+------+
|  AAPL|
|  META|
| GOOGL|
|  AVGO|
|  NVDA|
|  TSLA|
|  NFLX|
|   AMD|
|  AMZN|
|  MSFT|
+------+



27050

In [0]:
df_clean = spark.read.parquet("s3://databricks-stock-project-2025-10-02/processed/stocks/")
df_clean.printSchema()
df_clean.select("ticker", "year").distinct().orderBy("ticker", "year").show()

root
 |-- date: string (nullable = true)
 |-- open: double (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- close: double (nullable = true)
 |-- adj_close: string (nullable = true)
 |-- volume: long (nullable = true)
 |-- source: string (nullable = true)
 |-- interval: string (nullable = true)
 |-- ingestion_date: string (nullable = true)
 |-- is_valid: string (nullable = true)
 |-- ticker: string (nullable = true)
 |-- year: integer (nullable = true)

+------+----+
|ticker|year|
+------+----+
|  AAPL|2015|
|  AAPL|2016|
|  AAPL|2017|
|  AAPL|2018|
|  AAPL|2019|
|  AAPL|2020|
|  AAPL|2021|
|  AAPL|2022|
|  AAPL|2023|
|  AAPL|2024|
|  AAPL|2025|
|   AMD|2015|
|   AMD|2016|
|   AMD|2017|
|   AMD|2018|
|   AMD|2019|
|   AMD|2020|
|   AMD|2021|
|   AMD|2022|
|   AMD|2023|
+------+----+
only showing top 20 rows


In [0]:
df_feat = spark.read.parquet("s3://databricks-stock-project-2025-10-02/curated/stocks_features/")
df_feat.select("ticker", "date", "close", "sma_20", "rsi", "macd").show(10)

+------+----------+------------------+------------------+------------------+-------------------+
|ticker|      date|             close|            sma_20|               rsi|               macd|
+------+----------+------------------+------------------+------------------+-------------------+
|  NVDA|2024-12-31|134.26075744628906|137.11764602661134| 48.81242125525914| -2.345644730788024|
|  NVDA|2024-12-30|137.46005249023438| 137.3341209411621| 48.02278072636426| -2.567195403270233|
|  NVDA|2024-12-27|136.98016357421875|137.37163696289062| 42.56774682505629| -2.738182556934845|
|  NVDA|2024-12-26|139.89952087402344| 137.2876884460449| 42.92021698076249|-3.2839555984888307|
|  NVDA|2024-12-24|140.18946838378906| 137.1367500305176| 43.18243851496714| -3.385357196514434|
|  NVDA|2024-12-23| 139.6395721435547|136.92632675170898| 49.28066127859403| -3.267698434682984|
|  NVDA|2024-12-20|  134.670654296875|137.03981246948243|44.702019945773436| -3.090872251070465|
|  NVDA|2024-12-19|130.6515197

In [0]:
#Check RSI 
df_feat.select("rsi").summary().show()

+-------+------------------+
|summary|               rsi|
+-------+------------------+
|  count|             27033|
|   mean|54.790867296940874|
| stddev| 16.92780250252875|
|    min|               0.0|
|    25%|  42.6216142609664|
|    50%| 55.24307503002412|
|    75%| 67.22273955479875|
|    max| 99.42895135103366|
+-------+------------------+



In [0]:
#Check SMA
df_feat.filter(col("ticker")=="AAPL").select("date","close","sma_20","sma_50").orderBy("date").show(10)

+----------+------------------+------------------+------------------+
|      date|             close|            sma_20|            sma_50|
+----------+------------------+------------------+------------------+
|2015-01-02|24.261043548583984|24.261043548583984|24.261043548583984|
|2015-01-05|23.577573776245117| 23.91930866241455| 23.91930866241455|
|2015-01-06|23.579795837402344|23.806137720743816|23.806137720743816|
|2015-01-07|23.910436630249023|23.832212448120117|23.832212448120117|
|2015-01-08| 24.82912826538086|24.031595611572264|24.031595611572264|
|2015-01-09|24.855756759643555| 24.16895580291748| 24.16895580291748|
|2015-01-12|24.243289947509766|24.179574966430664|24.179574966430664|
|2015-01-13|24.458545684814453|24.214446306228638|24.214446306228638|
|2015-01-14|24.365345001220703|24.231212827894424|24.231212827894424|
|2015-01-15|23.704063415527344|24.178497886657716|24.178497886657716|
+----------+------------------+------------------+------------------+
only showing top 10 

In [0]:
#Check Golden/Dead Flag
df_feat.filter((col("ticker")=="NVDA") & (col("golden_cross")==True)).select("date","sma_50","sma_200").show(5)

+----------+------------------+------------------+
|      date|            sma_50|           sma_200|
+----------+------------------+------------------+
|2015-09-16|0.5182564294338227|0.5171246751305763|
|2015-03-17|0.5078988969326019| 0.507411434954288|
|2023-01-24|16.229695472717285| 16.22136088848114|
|2019-08-22| 4.014870138168335| 4.001718949079514|
|2025-06-27|129.54735260009767|129.17738998413085|
+----------+------------------+------------------+



In [0]:
#Check Boolinger
df_feat.filter(col("ticker")=="TSLA").select("date","close","bollinger_upper","bollinger_lower").show(10)

+----------+------------------+------------------+------------------+
|      date|             close|   bollinger_upper|   bollinger_lower|
+----------+------------------+------------------+------------------+
|2020-12-31|235.22332763671875|235.84033929569293|194.17399175899456|
|2020-12-30| 231.5933380126953|234.21830521190822|191.23436049854098|
|2020-12-29|221.99667358398438| 231.8583176650236| 189.9270140610506|
|2020-12-28|221.22999572753906|231.64359580282022|186.86206825967977|
|2020-12-24|220.58999633789062|230.43294439055396| 185.4750527407937|
|2020-12-23|   215.32666015625|229.31299167778323| 183.6693386444824|
|2020-12-22|213.44667053222656| 229.2894652499633| 180.6728653774781|
|2020-12-21| 216.6199951171875|230.61263578799228|175.40002748105067|
|2020-12-18| 231.6666717529297| 232.3759094698996| 168.2950881497293|
|2020-12-17|218.63333129882812|228.96740065261372|165.17926377609723|
+----------+------------------+------------------+------------------+
only showing top 10 

In [0]:
#Check total court
df_feat.groupBy("ticker").count().orderBy("ticker").show()

+------+-----+
|ticker|count|
+------+-----+
|  AAPL| 2705|
|   AMD| 2705|
|  AMZN| 2705|
|  AVGO| 2705|
| GOOGL| 2705|
|  META| 2705|
|  MSFT| 2705|
|  NFLX| 2705|
|  NVDA| 2705|
|  TSLA| 2705|
+------+-----+



In [0]:
#Check performance()
df_feat.select("ticker").distinct().count()

10

In [0]:
df_feat.filter(col("ticker")=="AAPL").count()

2705

In [0]:
#Check missing value
from pyspark.sql.functions import count, when, isnan
df_feat.select([count(when(col(c).isNull(), c)).alias(c) for c in df_feat.columns]).show()

+----+----+----+---+-----+---------+------+------+--------+--------------+--------+------+------+-------+----------+------+-------------+-----+----+----+--------+--------+---+---+-----+-----+----+-----------+------+------+---------------+---------------+---------+--------+---------+----------+-----------+------------+-----------+------+----+
|date|open|high|low|close|adj_close|volume|source|interval|ingestion_date|is_valid|sma_20|sma_50|sma_200|prev_close|return|volatility_20|delta|gain|loss|avg_gain|avg_loss| rs|rsi|ema12|ema26|macd|signal_line|bb_mid|bb_std|bollinger_upper|bollinger_lower|vol_ma_20|buy_flag|sell_flag|prev_sma50|prev_sma200|golden_cross|death_cross|ticker|year|
+----+----+----+---+-----+---------+------+------+--------+--------------+--------+------+------+-------+----------+------+-------------+-----+----+----+--------+--------+---+---+-----+-----+----+-----------+------+------+---------------+---------------+---------+--------+---------+----------+-----------+------

In [0]:
#Simple Visualization
display(df_feat.filter(col("ticker")=="NVDA").select("date","close","sma_20","sma_50"))

date,close,sma_20,sma_50
2024-12-31,134.26075744628906,137.11764602661134,139.8536813354492
2024-12-30,137.46005249023438,137.3341209411621,139.92767486572265
2024-12-27,136.98016357421875,137.37163696289062,139.91628875732422
2024-12-26,139.89952087402344,137.2876884460449,139.8903076171875
2024-12-24,140.18946838378906,137.1367500305176,139.7235629272461
2024-12-23,139.6395721435547,136.92632675170898,139.6803823852539
2024-12-20,134.670654296875,137.03981246948243,139.58281829833984
2024-12-19,130.65151977539062,137.63767776489257,139.58483215332032
2024-12-18,128.88192749023438,138.39751052856445,139.62404083251954
2024-12-17,130.36160278320312,139.301806640625,139.70344024658203
