In [0]:
%pip install s3fs

Collecting s3fs
  Downloading s3fs-2025.9.0-py3-none-any.whl.metadata (1.4 kB)
Collecting aiobotocore<3.0.0,>=2.5.4 (from s3fs)
  Downloading aiobotocore-2.25.0-py3-none-any.whl.metadata (25 kB)
Collecting fsspec==2025.9.0 (from s3fs)
  Downloading fsspec-2025.9.0-py3-none-any.whl.metadata (10 kB)
Collecting aioitertools<1.0.0,>=0.5.1 (from aiobotocore<3.0.0,>=2.5.4->s3fs)
  Downloading aioitertools-0.12.0-py3-none-any.whl.metadata (3.8 kB)
Collecting botocore<1.40.50,>=1.40.46 (from aiobotocore<3.0.0,>=2.5.4->s3fs)
  Downloading botocore-1.40.49-py3-none-any.whl.metadata (5.7 kB)
Downloading s3fs-2025.9.0-py3-none-any.whl (30 kB)
Downloading fsspec-2025.9.0-py3-none-any.whl (199 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/199.3 kB[0m [31m?[0m eta [36m-:--:--[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.3/199.3 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading aiobotocore-2.25.0-py3-none-any.whl (86 kB)
[?25l   [

In [0]:
dbutils.library.restartPython()

In [0]:
import boto3
import pandas as pd
import json
import time
from datetime import datetime, timezone
import s3fs

# ------------------------------------------------------------
# Configuration
# ------------------------------------------------------------
stream_name = "stock_stream"     # Name of the Kinesis Stream
region = "us-east-1"             # AWS region
base_s3_path = "s3://databricks-stock-project-2025-10-02/curated/stocks_features/ticker=AAPL/year=2024/"
partition_key = "AAPL"           # Partition key (ensures message order for the same key)
interval = 1.5                   # Time interval between records (in seconds)
loop_forever = True              # Whether to replay the dataset continuously
print_limit = 50                 # Print a progress message every N records
# ------------------------------------------------------------

# Initialize S3 filesystem
fs = s3fs.S3FileSystem(anon=False)

# 1. Load AAPL 2024 historical data
print(f"Loading 2024 data from {base_s3_path} ...")
df = pd.read_parquet(base_s3_path, storage_options={"anon": False}).sort_values("date").reset_index(drop=True)
print(f"Loaded {len(df)} records for AAPL (2024)")

# 2. Initialize Kinesis client
kinesis = boto3.client("kinesis", region_name=region)

# 3. Define a function to send each record
def send_row(row):
    # Current UTC system time (timezone-aware)
    now_utc = datetime.now(timezone.utc).isoformat()

    # Construct a JSON-formatted record
    record = {
        "ticker": str(row.get("ticker", "AAPL")),
        "date": str(row["date"]),       # Historical trade date
        "timestamp": now_utc,           # Current system timestamp (simulated real-time)
        "open": float(row["open"]),
        "high": float(row["high"]),
        "low": float(row["low"]),
        "close": float(row["close"]),
        "volume": int(row["volume"])
    }

    # Send record to Kinesis Stream
    kinesis.put_record(
        StreamName=stream_name,
        Data=json.dumps(record),
        PartitionKey=partition_key
    )
    return record

# 4. Start real-time data streaming
counter = 0
print("Starting real-time replay (AAPL 2024 data with live timestamps)...")

while True:
    for i, row in df.iterrows():
        record = send_row(row)
        counter += 1

        # Print progress every 'print_limit' records
        if counter % print_limit == 0:
            print(f"[{datetime.now(timezone.utc).isoformat()}] "
                  f"Sent {counter}/{len(df)} | date={record['date']} | close={record['close']}")
        time.sleep(interval)

    # Stop after one iteration if loop_forever is False
    if not loop_forever:
        break

    # Otherwise, restart the dataset replay from the beginning
    print("Finished dataset, restarting from beginning...")



Loading 2024 data from s3://databricks-stock-project-2025-10-02/curated/stocks_features/ticker=AAPL/year=2024/ ...
Loaded 252 records for AAPL (2024)
Starting real-time replay (AAPL 2024 data with live timestamps)...
[2025-10-11T21:12:01.028786+00:00] Sent 50/252 | date=2024-03-13 | close=169.90985107421875
[2025-10-11T21:13:16.599346+00:00] Sent 100/252 | date=2024-05-23 | close=185.79920959472656
[2025-10-11T21:14:32.907740+00:00] Sent 150/252 | date=2024-08-06 | close=206.0315399169922
[2025-10-11T21:15:48.977257+00:00] Sent 200/252 | date=2024-10-16 | close=230.70628356933594
[2025-10-11T21:17:04.572842+00:00] Sent 250/252 | date=2024-12-27 | close=254.68588256835938
Finished dataset, restarting from beginning...
[2025-10-11T21:18:20.207117+00:00] Sent 300/252 | date=2024-03-11 | close=171.518310546875
[2025-10-11T21:19:35.841170+00:00] Sent 350/252 | date=2024-05-21 | close=191.23757934570312
[2025-10-11T21:20:51.467943+00:00] Sent 400/252 | date=2024-08-02 | close=218.58848571777

com.databricks.backend.common.rpc.CommandCancelledException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$5(SequenceExecutionState.scala:132)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:132)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:129)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:129)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:715)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:435)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:435)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.can

In [0]:
# Replay historical S3 data into Kinesis Stream

# import boto3
# import pandas as pd
# import json
# import time

# stream_name = "stock_stream"
# region = "us-east-1"
# s3_path = "s3://databricks-stock-project-2025-10-02/curated/stocks_features/ticker=AAPL/year=2024/"
# send_limit = 200
# interval = 0.2
# partition_key = "AAPL"

# print(f" Loading data from {s3_path} ...")
# df = pd.read_parquet(s3_path, storage_options={"anon": False})

# if send_limit:
#     df = df.head(send_limit)

# print(f"Loaded {len(df)} records to replay.")
# kinesis = boto3.client("kinesis", region_name=region)

# print(f" Sending {len(df)} records to stream [{stream_name}] ...")
# for i, row in df.iterrows():
#     record = {
#         "ticker": str(row.get("ticker", "AAPL")),
#         "date": str(row["date"]),
#         "open": float(row["open"]),
#         "high": float(row["high"]),
#         "low": float(row["low"]),
#         "close": float(row["close"]),
#         "volume": int(row["volume"])
#     }
#     kinesis.put_record(StreamName=stream_name, Data=json.dumps(record), PartitionKey=partition_key)
#     if i % 50 == 0:
#         print(f"Sent {i}/{len(df)} | {record['date']} | close={record['close']}")
#     time.sleep(interval)

# print("All records sent successfully!")


com.databricks.backend.common.rpc.CommandSkippedException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:134)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:129)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:129)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:715)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:435)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:435)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.cancelExecution(ExecutionContextManagerV1.scala:473)
	at com.databricks.spark.chauffeur.ChauffeurState.$anonfun$process$1(ChauffeurState.scala:750)
	at com.data