In [0]:
%pip install faker

In [0]:
import random
from faker import Faker
import pandas as pd
import time

In [0]:
def reset_or_init_directories(reset_all):
  """
  Make sure the transaction and dimension output directories exist.

  Works on the notebook-level ``transaction_output`` and ``dimension_output``
  paths (defined outside this cell) through ``dbutils.fs``.

  For each path:
    * if listing it fails, the directory is treated as missing and is created;
    * if it can be listed and ``reset_all`` is truthy, it is removed
      recursively and recreated empty;
    * otherwise it is left untouched.

  Args:
    reset_all: when truthy, wipe and recreate directories that already exist.

  Raises:
    Whatever ``dbutils.fs.rm`` / ``dbutils.fs.mkdirs`` raise. Failures while
    deleting or recreating are no longer masked as a "making" message.
  """
  for directory in [transaction_output, dimension_output]:
    # The listing is only an existence probe, so it is the only call guarded.
    # `Exception` (not a bare except) lets KeyboardInterrupt / SystemExit
    # propagate instead of being mistaken for "directory is missing".
    try:
      dbutils.fs.ls(str(directory))
    except Exception:
      print(f"making {directory}")
      dbutils.fs.mkdirs(directory)
      continue

    if reset_all:
      print(f"deleting {directory}")
      dbutils.fs.rm(directory, True)  # True -> recursive delete
      print(f"recreating {directory}")
      dbutils.fs.mkdirs(directory)
    else:
      print("no data to delete")

In [0]:
def drop_tables_and_checkpoints(*,tables=[''],catalog='',schema='',checkpoint_dir=''):
  """
  Drop each listed table and recursively remove its checkpoint directory.

  Every table is addressed as ``<catalog>.<schema>.<table>`` and its checkpoint
  location as ``<checkpoint_dir>/<catalog>.<schema>.<table>``. Uses the
  notebook-level ``spark`` and ``dbutils`` objects.

  Args:
    tables: iterable of table names (keyword-only).
    catalog: catalog part of the qualified name.
    schema: schema part of the qualified name.
    checkpoint_dir: parent directory holding the per-table checkpoint folders.

  Any exception raised while handling a table is printed and the loop moves on
  to the next table; nothing is re-raised.
  """
  for table_name in tables:
    try:
      qualified_name = f"{catalog}.{schema}.{table_name}"
      spark.sql(f"DROP TABLE IF EXISTS {qualified_name}")
      print(f"successfully dropped {qualified_name}")

      checkpoint_path = f"{checkpoint_dir}/{qualified_name}"
      dbutils.fs.rm(checkpoint_path, True)  # True -> recursive delete
      print(checkpoint_path)
      print(f"successfully removed checkpoint directory for {qualified_name}")
    except Exception as err:
      print(f"Error occurred: {err}")

In [0]:
def write_stream_data(*,chunk_size=1000):
  """
  Write one CSV file of synthetic stock transactions and return its path.

  Each row holds a timestamp from Faker (between '-1d' and 'now'), a randomly
  chosen buyer, seller and ticker, a price drawn uniformly from [100, 1000]
  rounded to 2 decimals, and an integer volume in [100, 1000].

  The file is written under the notebook-level ``transaction_output`` path as
  ``transactions_<unix-seconds>.csv``, without the DataFrame index.

  Args:
    chunk_size: number of rows to generate (keyword-only).

  Returns:
    The path of the CSV file that was written.
  """
  generator = Faker()

  column_names = ["timestamp", "buyer", "seller", "ticker", "price", "volume"]

  # Stock tickers to sample from
  symbol_pool = ["AAPL", "GOOGL", "AMZN", "FB", "NFLX","NVDA","VOO"]

  buyer_pool = [
      "Vanguard", "BlackRock", "State Street Global Advisors", "Fidelity Investments",
      "Capital Research and Management Company", "The Vanguard Group",
      "T. Rowe Price", "Bank of New York Mellon", "JPMorgan Chase",
      "Goldman Sachs", "BNP Paribas Asset Management", "Northern Trust Corporation",
      "Bank of Andrew",
  ]

  seller_pool = [
      "JPMorgan Chase", "Bank of America", "Citigroup", "Wells Fargo", "Goldman Sachs",
      "Morgan Stanley", "Barclays", "HSBC Holdings", "BNP Paribas", "UBS Group",
      "Credit Suisse", "Deutsche Bank",
  ]

  def _random_trade():
    # Draw order is kept fixed (ticker, buyer, seller, timestamp, price, volume)
    # so a seeded `random` yields the same rows as before.
    symbol = random.choice(symbol_pool)
    buy_side = random.choice(buyer_pool)
    sell_side = random.choice(seller_pool)
    traded_at = generator.date_time_between_dates(datetime_start='-1d', datetime_end='now')
    unit_price = round(random.uniform(100, 1000), 2)
    share_count = random.randint(100, 1000)
    return (traded_at, buy_side, sell_side, symbol, unit_price, share_count)

  trades = [_random_trade() for _ in range(chunk_size)]
  frame = pd.DataFrame(trades, columns=column_names)

  # File name carries the current epoch time truncated to whole seconds
  destination = f"{transaction_output}/transactions_{int(time.time())}.csv"
  frame.to_csv(destination, index=False)

  return destination

In [0]:
def build_a_stream( *, total_rows_m=1, stream_length_m=5, interval_s=3, reset_data=False):
  """
  Write a series of CSV batches to the output directory over a given timeframe.

  The stream is split into ``stream_length_m * 60 // interval_s`` intervals.
  One file is written per interval via ``write_stream_data``, followed by a
  pause of ``interval_s`` seconds. Before writing, the output directories are
  prepared by ``reset_or_init_directories(reset_data)``.

  Args:
    total_rows_m: total number of rows to produce, in millions.
    stream_length_m: stream duration in minutes.
    interval_s: pause between files, in seconds.
    reset_data: forwarded to ``reset_or_init_directories``.

  Returns:
    dict with ``intervals_count`` (number of files written) and ``batch_size``
    (rows per file, truncated to an integer).

  Raises:
    ZeroDivisionError: if ``interval_s`` is 0, or if the stream is shorter than
      one interval so that the interval count is 0.
  """

  intervals_count = int(stream_length_m * 60 // interval_s)
  batch_size = int((total_rows_m/intervals_count)*1000000)

  output_log = {"intervals_count": intervals_count, "batch_size":batch_size}

  #considered for later date
  # input(f"this stream will take {stream_length_m} minutes, produce {intervals_count} files, and {batch_size} rows per csv every {interval_s} seconds: Continue(Y/N):")

  print(f"Stream Metrics:\n"
        f"----------------\n"
        f"Duration: {stream_length_m:.2f} minutes\n"
        f"Intervals: {intervals_count}\n"
        f"Batch Size: {batch_size} rows per csv\n"
        f"---BEGINNING STREAM----------------\n")

  reset_or_init_directories(reset_data)

  # 1-based counter running through intervals_count inclusive, so the number of
  # files written matches the reported interval count (the previous upper bound
  # stopped one file short).
  for file_number in range(1, intervals_count + 1):
      write_stream_data(chunk_size=batch_size)
      print(f"wrote: {file_number} of {intervals_count} files")
      time.sleep(interval_s)

  return output_log