In [1]:
import os
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

def generate_time_series(start_time, num_intervals):
    """Return ``num_intervals`` ascending timestamps that all precede ``start_time``.

    The series is built by walking backwards from ``start_time``: every step
    subtracts a random whole number of seconds drawn with
    ``np.random.randint(1, 4)``. ``start_time`` itself is not part of the result.

    Args:
        start_time: Datetime-like upper bound (exclusive) of the series.
        num_intervals: Number of timestamps to produce.

    Returns:
        list: Timestamps ordered from earliest to latest.
    """
    stamps = []
    cursor = start_time
    for _ in range(num_intervals):
        # One random draw per step, in the same order as the walk-back.
        cursor = cursor - timedelta(seconds=np.random.randint(1, 4))
        stamps.append(cursor)
    # Built newest-first; callers expect chronological order.
    stamps.reverse()
    return stamps

def generate_registers(start_time, transactions, captures):
    """Build the timestamp columns for a sequence of captures.

    One capture is simulated every 5 minutes starting at ``start_time``. Each
    capture contributes two independently generated batches of ``transactions``
    rows; both batches carry the capture time in ``File_Date``.

    Args:
        start_time: Datetime-like reference time of the first capture.
        transactions: Number of rows in each batch.
        captures: Number of captures to simulate.

    Returns:
        pandas.DataFrame: Columns ``File_Date``, ``Date`` and ``Time`` with a
        fresh 0..N-1 index, or an empty DataFrame when no capture is generated.
    """
    def generate_file_registers(ref_time):
        """Create one batch of rows whose times lead up to ``ref_time``."""
        temp_df = pd.DataFrame({
            'File_Date': [ref_time] * transactions,
            'Datetime': generate_time_series(ref_time, transactions)
        })
        # Split the full timestamp into separate date and time-of-day columns.
        temp_df['Date'] = temp_df['Datetime'].dt.date
        temp_df['Time'] = temp_df['Datetime'].dt.time
        return temp_df.drop(['Datetime'], axis=1)

    # Collect the batches and concatenate once at the end; concatenating onto a
    # growing frame inside the loop re-copies every earlier row each iteration.
    frames = []
    for i in range(captures):
        ref_time = start_time + timedelta(minutes=5 * i)
        # Two batches per capture, generated back to back with the same ref_time.
        frames.append(generate_file_registers(ref_time))
        frames.append(generate_file_registers(ref_time))

    if not frames:
        # pd.concat rejects an empty list; keep returning an empty frame instead.
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

def get_types_pattern(size):
    """Return ``size`` labels cycling through five "bids" then five "asks".

    Args:
        size: Number of labels wanted.

    Returns:
        numpy.ndarray: String array of length ``size`` that repeats the
        10-element bids/asks block and is cut off at ``size``.
    """
    block = np.array(["bids", "asks"]).repeat(5)
    # One repetition more than strictly needed so the slice is never short.
    repetitions = size // block.size + 1
    return np.tile(block, repetitions)[:size]

# Simulation parameters. generate_registers emits two batches per capture, so
# the total row count is 2 * captures * transactions.
captures_count = 2
transactions_per_capture = 5
size = 2 * captures_count * transactions_per_capture
# Truncate to whole seconds so the generated timestamps have no microseconds.
start_time = datetime.now().replace(microsecond=0)

df = generate_registers(
    start_time,
    transactions=transactions_per_capture,
    captures=captures_count,
)
# NOTE(review): get_types_pattern switches label every 5 rows, which lines up
# with the batches only while transactions_per_capture is 5.
df["Transaction_type"] = get_types_pattern(size)
print(df.head(11))

             File_Date        Date      Time Transaction_type
0  2024-06-08 01:39:07  2024-06-08  01:38:57             bids
1  2024-06-08 01:39:07  2024-06-08  01:38:58             bids
2  2024-06-08 01:39:07  2024-06-08  01:38:59             bids
3  2024-06-08 01:39:07  2024-06-08  01:39:02             bids
4  2024-06-08 01:39:07  2024-06-08  01:39:04             bids
5  2024-06-08 01:39:07  2024-06-08  01:38:56             asks
6  2024-06-08 01:39:07  2024-06-08  01:38:58             asks
7  2024-06-08 01:39:07  2024-06-08  01:39:01             asks
8  2024-06-08 01:39:07  2024-06-08  01:39:03             asks
9  2024-06-08 01:39:07  2024-06-08  01:39:05             asks
10 2024-06-08 01:44:07  2024-06-08  01:43:56             bids


In [2]:
# Constant 50 bound to a descriptive name and to the short alias SV.
# NOTE(review): neither name is read by the cells visible here - confirm it is still needed.
starting_value = 50
SV = starting_value

def get_stock_value(range_n, mul_n):
    """Draw one value from a range generator and pass it through a multiplier.

    Args:
        range_n: Index into the module-level ``stock_ranges`` list.
        mul_n: Index into the module-level ``stock_multipliers`` list.

    Returns:
        Whatever the selected multiplier returns for the value produced by the
        selected range generator.
    """
    apply_multiplier = stock_multipliers[mul_n]
    draw_from_range = stock_ranges[range_n]
    return apply_multiplier(draw_from_range())

def generate_stocks(size, probs, mul_probs):
    """Generate ``size`` random share counts.

    For every element a range generator is picked from ``stock_ranges`` with
    probabilities ``probs`` and a multiplier is picked from
    ``stock_multipliers`` with probabilities ``mul_probs``; the pair is then
    evaluated through ``get_stock_value``.

    Args:
        size: Number of values to generate; 0 yields an empty array.
        probs: One probability per entry of ``stock_ranges`` (must sum to 1).
        mul_probs: One probability per entry of ``stock_multipliers`` (must sum to 1).

    Returns:
        numpy.ndarray: Integer array with ``size`` share counts.
    """
    range_ids = np.random.choice(np.arange(len(stock_ranges)), size, p=probs)
    mul_ids = np.random.choice(np.arange(len(stock_multipliers)), size, p=mul_probs)
    # Declaring the output type up front lets np.vectorize handle size == 0
    # (it raises on empty input otherwise) and avoids its extra probing call on
    # the first element, which would waste one random draw.
    # Assumes every range/multiplier pair yields an integer, as the definitions
    # in this notebook do.
    get_stock = np.vectorize(get_stock_value, otypes=[int])
    return get_stock(range_ids, mul_ids)

def generate_sold_stocks(size):
    """Generate ``size`` share counts using the ``price_stock_*`` probability tables."""
    return generate_stocks(
        size,
        probs=price_stock_probs,
        mul_probs=price_stock_mul_probs,
    )

def generate_top_stocks(size):
    """Generate ``size`` share counts using the ``top_stock_*`` probability tables."""
    return generate_stocks(
        size,
        probs=top_stock_probs,
        mul_probs=top_stock_mul_probs,
    )

def remainder(n, mod):
    """Round ``n`` down to a multiple of ``mod`` when it exceeds ``mod``.

    Despite the name, this does not compute ``n % mod``: values that are not
    greater than ``mod`` are returned unchanged rather than collapsing to 0.

    Args:
        n: Value to round.
        mod: Step whose multiples are the rounding targets.

    Returns:
        ``(n // mod) * mod`` if ``n > mod``, otherwise ``n``.
    """
    if n > mod:
        whole_steps = n // mod
        return whole_steps * mod
    return n

def normalize(prob_array):
    """Scale a sequence of weights so that the result sums to 1.

    Args:
        prob_array: Array-like of numeric weights.

    Returns:
        numpy.ndarray: The weights divided by their total.
    """
    weights = np.asarray(prob_array)
    total = weights.sum()
    return weights / total

# Share-count buckets; each entry draws one value from its bucket when called.
# np.random.randint excludes its upper bound, so every bound is written as the
# first value of the next bucket. This keeps 9, 49, 99, 499, 999 and 9499
# reachable instead of leaving one-value gaps between buckets.
stock_ranges = [
  lambda: np.random.randint(1, 10),       # 1-9
  lambda: np.random.randint(10, 50),      # 10-49
  lambda: np.random.randint(50, 100),     # 50-99
  lambda: 100,                            # exactly 100
  lambda: np.random.randint(100, 500),    # 100-499
  lambda: np.random.randint(500, 1000),   # 500-999
  lambda: np.random.randint(1000, 9500),  # 1000-9499
]
# Bucket weights, one per stock_ranges entry, rescaled to sum to 1.
price_stock_probs = normalize([0.24, 0.15, 0.11, 0.45, 0.015, 0.03, 0.005])
top_stock_probs = normalize([0.01, 0.01, 0.01, 0.05, 0.30, 0.30, 0.32])

# Rounding rules applied to a drawn value. remainder() only rounds values that
# are greater than the step, so smaller values pass through unchanged.
stock_multipliers = [
  lambda n: remainder(n, 100),  # down to a multiple of 100
  lambda n: remainder(n, 50),   # down to a multiple of 50
  lambda n: remainder(n, 10),   # down to a multiple of 10
  lambda n: n,                  # keep the drawn value
]
# Rule weights, one per stock_multipliers entry; the 0 means sold-share counts
# never use the multiple-of-100 rule.
price_stock_mul_probs = normalize([0, 0.4, 0.2, 0.4])
top_stock_mul_probs = normalize([0.75, 0.04, 0.02, 0.19])

df["Number_of_Shares_Sold"] = generate_sold_stocks(size)
df["Shares_Top_of_Book"] = generate_top_stocks(size)
print(df.head(10))


            File_Date        Date      Time Transaction_type  \
0 2024-06-08 01:39:07  2024-06-08  01:38:57             bids   
1 2024-06-08 01:39:07  2024-06-08  01:38:58             bids   
2 2024-06-08 01:39:07  2024-06-08  01:38:59             bids   
3 2024-06-08 01:39:07  2024-06-08  01:39:02             bids   
4 2024-06-08 01:39:07  2024-06-08  01:39:04             bids   
5 2024-06-08 01:39:07  2024-06-08  01:38:56             asks   
6 2024-06-08 01:39:07  2024-06-08  01:38:58             asks   
7 2024-06-08 01:39:07  2024-06-08  01:39:01             asks   
8 2024-06-08 01:39:07  2024-06-08  01:39:03             asks   
9 2024-06-08 01:39:07  2024-06-08  01:39:05             asks   

   Number_of_Shares_Sold  Shares_Top_of_Book  
0                    100                 453  
1                    100                 400  
2                      7                 600  
3                      2                 400  
4                    100                 800  
5            

In [3]:
def get_prices(size, ref_value = 80.0):
    """Simulate ``size`` prices fluctuating around ``ref_value``.

    The result is ``ref_value`` plus three components, added in this order:
    a cosine wave of amplitude 0.5 that completes five periods over the series,
    a random walk (cumulative sum of normal steps with std 0.03), and
    independent normal noise with std 0.4.

    Args:
        size: Number of prices to generate.
        ref_value: Level the series is built around.

    Returns:
        numpy.ndarray: Array of ``size`` simulated prices.
    """
    phase = np.linspace(0, 1, size)
    wave = 0.5 * np.cos(np.pi * 10 * phase)
    # The walk is drawn before the independent noise; keep that order so a
    # seeded run reproduces the same numbers.
    walk = np.cumsum(np.random.normal(0, 0.03, size))
    jitter = np.random.normal(0, 0.4, size)

    prices = np.full(size, ref_value)
    prices = prices + wave
    prices = prices + walk
    prices = prices + jitter
    return prices

# Add one simulated price per row, built around a reference value of 80, then preview.
df["Price_of_One_Share"] = get_prices(size, ref_value=80)
print(df.head(10))

            File_Date        Date      Time Transaction_type  \
0 2024-06-08 01:39:07  2024-06-08  01:38:57             bids   
1 2024-06-08 01:39:07  2024-06-08  01:38:58             bids   
2 2024-06-08 01:39:07  2024-06-08  01:38:59             bids   
3 2024-06-08 01:39:07  2024-06-08  01:39:02             bids   
4 2024-06-08 01:39:07  2024-06-08  01:39:04             bids   
5 2024-06-08 01:39:07  2024-06-08  01:38:56             asks   
6 2024-06-08 01:39:07  2024-06-08  01:38:58             asks   
7 2024-06-08 01:39:07  2024-06-08  01:39:01             asks   
8 2024-06-08 01:39:07  2024-06-08  01:39:03             asks   
9 2024-06-08 01:39:07  2024-06-08  01:39:05             asks   

   Number_of_Shares_Sold  Shares_Top_of_Book  Price_of_One_Share  
0                    100                 453           81.030522  
1                    100                 400           80.225810  
2                      7                 600           80.228844  
3                      2   

In [4]:
from pandas.tseries.offsets import BDay

companies = ['KO', 'GM', 'C', 'AAPL', 'TWTR', 'JPM', 'F', 'GE', 'DAL', 'PFE', 'MS', 'BAC', 'AAL', 'NEM', 'NCLH', 'INTC', 'TSLA', 'MSFT', 'FCX', 'CSCO']

time_A = pd.to_datetime('2021-12-01 09:30:00')
time_B = pd.to_datetime('2021-12-01 09:40:00')

# Number of 5-minute captures from time_A to time_B, both ends included.
time_difference = time_B - time_A
five_min_intervals = time_difference // pd.Timedelta(5, unit="m") + 1
# generate_registers emits two batches of transactions_per_capture rows per capture.
size = five_min_intervals * transactions_per_capture * 2

ref_value = 50.0
ref_day = time_A
total_days = 10

# Collect the per-day frames and concatenate once at the end; concatenating onto
# a growing frame inside the loops re-copies every earlier row each iteration.
daily_frames = []
for day in range(total_days):
    company_frames = []
    for company in companies:
        comp_df = generate_registers(ref_day, transactions_per_capture, five_min_intervals)
        comp_df["Company"] = company
        comp_df["Transaction_type"] = get_types_pattern(size)
        comp_df["Number_of_Shares_Sold"] = generate_sold_stocks(size)
        comp_df["Shares_Top_of_Book"] = generate_top_stocks(size)
        # All companies share the same reference price on a given day.
        comp_df["Price_of_One_Share"] = get_prices(size, ref_value)
        comp_df["Prices_Top_of_Book"] = get_prices(size, ref_value)
        company_frames.append(comp_df)

    daily_df = pd.concat(company_frames, ignore_index=True)
    daily_frames.append(daily_df)

    # Move to the next business day and let the reference price take one
    # standard-normal step. Drawing a scalar keeps ref_value a plain float
    # instead of turning it into a 1-element array.
    ref_day += BDay()
    ref_value = ref_value + np.random.normal(0, 1)

mock_df = pd.concat(daily_frames, ignore_index=True)

output_path = os.path.join("data", "mocked_market_stocks.csv")
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# NOTE: to_csv returns None when it is given a path, so output_file is always
# None; the name is kept only in case other cells refer to it.
output_file = mock_df.to_csv(output_path, index=False)