# Create Synthetic 10,000 Asset Dataset

#### 1.1 Create 10,000 fake tickers

In [1]:
import random
import string
import itertools

n_stocks = 10_000

# Fixed seed so that a fresh "Restart & Run All" regenerates the same tickers.
TICKER_SEED = 42

# Enumerate every 4-letter uppercase string (26**4 candidates).
all_strings = ["".join(x) for x in itertools.product(string.ascii_uppercase, repeat=4)]

# Draw n_stocks distinct tickers (sampling without replacement). A dedicated
# seeded generator is used so the global `random` state is left untouched.
tickers = random.Random(TICKER_SEED).sample(all_strings, n_stocks)

#### 1.2 Create trading days

In [2]:
# Create trading days
import pandas as pd
import pandas_market_calendars as mcal
import datetime

# Look up the NYSE calendar and collect its valid trading days between
# 2012-01-01 and 2022-12-31 as plain date objects (via each entry's .date()).
nyse = mcal.get_calendar("NYSE")
trading_dates = list(
    map(
        lambda session: session.date(),
        nyse.valid_days(start_date="2012-01-01", end_date="2022-12-31"),
    )
)

#### 1.3 Create market data

In [3]:
import numpy as np

# Fixed seed so the synthetic market data is identical on every fresh run.
MARKET_SEED = 42
market_rng = np.random.default_rng(MARKET_SEED)

# One random draw per (asset, trading day) pair.
n_obs = len(trading_dates) * n_stocks

# Mean per-period return, chosen so that compounding the mean alone over 252
# periods gives 8%. The realized compounded return of a noisy path will be
# somewhat lower (a loss needs a larger gain to recover: volatility drag).
return_annualized = 0.08
return_per_period = (1 + return_annualized) ** (1 / 252) - 1
return_std_dev = 0.015

returns = market_rng.normal(return_per_period, return_std_dev, size=n_obs)

# Volumes: uniform integers on the half-open range [min_volume, max_volume),
# i.e. max_volume itself is never drawn.
min_volume = 100_000
max_volume = 1_000_000
volumes = market_rng.integers(min_volume, max_volume, size=n_obs)

# Spreads: uniform on [min_spread, max_spread), rounded to 7 decimal places.
min_spread = 1 / 20_000
max_spread = 1 / 4_000
spreads = np.around(
    market_rng.uniform(min_spread, max_spread, size=n_obs),
    decimals=7,
)

#### 1.4 Create dataframe

In [4]:
# Long-format frame with one row per (asset, trading day). Each asset's rows
# are contiguous and follow the order of `trading_dates`.
n_days = len(trading_dates)

df = pd.DataFrame(
    {
        # Each ticker repeated once per trading day: A, A, ..., B, B, ...
        # np.repeat avoids materialising a nested Python list of every cell.
        "asset": np.repeat(tickers, n_days),
        # The whole calendar repeated once per ticker; dtype=object keeps the
        # entries as the original date objects.
        "date": np.tile(np.array(trading_dates, dtype=object), len(tickers)),
        "return": returns,
        "volume": volumes,
        "spread": spreads,
    }
)

In [5]:
# Derive a price path per asset. Each row compounds the *previous* row's return
# within the same asset (the first row has no predecessor, so its factor is 1),
# which makes every asset start at a price of 100.
df = df.assign(
    price=lambda frame: 100
    * (
        frame.groupby("asset")["return"]
        .shift(1)
        .fillna(0)
        .add(1)
        .groupby(frame["asset"])
        .cumprod()
    )
)[["asset", "date", "price", "return", "volume", "spread"]]

#### 1.5 Write out dataframe to data dir (as 2.5k, 5k, 7.5k, and 10k assets)

In [6]:
# 10k assets
import os

dir_name = "./data/"

# to_parquet does not create missing parent directories, so make sure the
# output directory exists before the first write (no-op if it already does).
os.makedirs(dir_name, exist_ok=True)

df.to_parquet(dir_name + "10k_synthetic.parquet")

In [7]:
# 7.5k assets: keep the leading 75% of rows and write them out.
p = 0.75
rows_to_keep = int(p * len(df))

df_75p = df.head(rows_to_keep)
df_75p.to_parquet(f"{dir_name}7_5k_synthetic.parquet")

In [8]:
# 5k assets: keep the leading 50% of rows and write them out.
p = 0.5
rows_to_keep = int(p * len(df))

df_50p = df.head(rows_to_keep)
df_50p.to_parquet(f"{dir_name}5k_synthetic.parquet")

In [9]:
# 2.5k assets: keep the leading 25% of rows and write them out.
p = 0.25
rows_to_keep = int(p * len(df))

df_25p = df.head(rows_to_keep)
df_25p.to_parquet(f"{dir_name}2_5k_synthetic.parquet")

In [10]:
# Display the full 10k-asset frame (the rendered output is a truncated preview).
df

Unnamed: 0,asset,date,price,return,volume,spread
0,VDEM,2012-01-03,100.000000,0.000976,772050,0.000100
1,VDEM,2012-01-04,100.097589,-0.000350,633783,0.000238
2,VDEM,2012-01-05,100.062578,0.015726,849575,0.000205
3,VDEM,2012-01-06,101.636174,-0.001795,774688,0.000167
4,VDEM,2012-01-09,101.453781,0.009333,777982,0.000141
...,...,...,...,...,...,...
27679995,BUUP,2022-12-23,69.189483,0.005904,560596,0.000133
27679996,BUUP,2022-12-27,69.597946,-0.005376,770558,0.000100
27679997,BUUP,2022-12-28,69.223815,-0.011491,106948,0.000227
27679998,BUUP,2022-12-29,68.428397,0.037141,162551,0.000191


In [11]:
# Display the 25% subset to check where it ends (rendered as a truncated preview).
df_25p

Unnamed: 0,asset,date,price,return,volume,spread
0,VDEM,2012-01-03,100.000000,0.000976,772050,0.000100
1,VDEM,2012-01-04,100.097589,-0.000350,633783,0.000238
2,VDEM,2012-01-05,100.062578,0.015726,849575,0.000205
3,VDEM,2012-01-06,101.636174,-0.001795,774688,0.000167
4,VDEM,2012-01-09,101.453781,0.009333,777982,0.000141
...,...,...,...,...,...,...
6919995,SMTQ,2022-12-23,89.465507,0.012943,549176,0.000057
6919996,SMTQ,2022-12-27,90.623472,0.010199,748745,0.000137
6919997,SMTQ,2022-12-28,91.547709,0.011040,466528,0.000060
6919998,SMTQ,2022-12-29,92.558401,-0.007328,923612,0.000146
