In [11]:
import numpy as np
import pandas as pd
import datetime as dt

In [12]:
# Seed NumPy's global random generator so that a fresh "Restart & Run All"
# reproduces the same random draws. Note that end_date below is the current
# time, so the DATE column still depends on the day the notebook is run.
random_seed = 42
np.random.seed(random_seed)
# Number of data points to have in dataset
data_len = 1000
# Probability that a transaction is a "BUY" (otherwise it is a "SELL")
p_buy = 0.80
# Parameter for the size, modeled as exponential distribution: https://en.wikipedia.org/wiki/Exponential_distribution
# It is passed to np.random.exponential as the scale (i.e. the mean, 1 / rate)
exp_param = 8
# Tickers and their sectors; each entry is a single "TICKER SECTOR" string
# (not a tuple) that is split on whitespace after being sampled
tickers = [
    "AAPL TECH", "GOOGL TECH", "NVDA TECH",
    "BIIB BIOTECH", "APLS BIOTECH",
    "GM AUTOMOTIVE", "TSLA AUTOMOTIVE",
]
# Start and end date for data to be distributed in between
start_date = pd.to_datetime("January 1, 2018")
end_date = dt.datetime.now()

In [13]:
# Draw one "TICKER SECTOR" entry per data point and split it into [ticker, sector]
names = [
    np.random.choice(tickers).split()
    for _ in range(data_len)
]
# Separate the pairs into one list per column
t_names = [pair[0] for pair in names]
t_sectors = [pair[1] for pair in names]

In [14]:
# Direction: "BUY" with probability p_buy, "SELL" otherwise
t_directions = [
    "SELL" if np.random.rand() >= p_buy else "BUY"
    for _ in range(data_len)
]
# Size: exponential draw shifted by one and truncated, so it is always >= 1
t_sizes = [
    int(np.random.exponential(exp_param) + 1)
    for _ in range(data_len)
]
# Price: an integer number of cents in [1000, 7000) expressed in whole units
t_prices = [
    np.random.randint(1000, 7000) / 100
    for _ in range(data_len)
]
# Date: start_date plus a random whole number of days below the full span
span_days = (end_date - start_date).days
t_dates = [
    start_date + dt.timedelta(days=np.random.randint(span_days))
    for _ in range(data_len)
]

In [15]:
# Assemble the dataframe by pairing each column label with its list of values
df = pd.DataFrame(
    dict(
        zip(
            ("TICKER", "SECTOR", "DIRECTION", "SIZE", "PRICE", "DATE"),
            (t_names, t_sectors, t_directions, t_sizes, t_prices, t_dates),
        )
    )
)

In [16]:
# Sample up to 10 random rows to check that the data looks good. The sample
# size is capped at len(df) because DataFrame.sample raises ValueError when
# asked for more rows than exist (without replacement).
df.sample(min(10, len(df))).sort_index()

Unnamed: 0,TICKER,SECTOR,DIRECTION,SIZE,PRICE,DATE
28,NVDA,TECH,BUY,13,51.52,2020-07-31
132,TSLA,AUTOMOTIVE,BUY,2,39.4,2021-11-22
160,TSLA,AUTOMOTIVE,SELL,6,37.93,2021-09-13
239,APLS,BIOTECH,BUY,3,15.3,2020-10-01
309,GM,AUTOMOTIVE,BUY,1,23.82,2020-12-14
403,GM,AUTOMOTIVE,SELL,1,41.6,2019-04-17
451,TSLA,AUTOMOTIVE,BUY,5,28.55,2019-11-27
468,GM,AUTOMOTIVE,BUY,8,42.94,2019-09-04
527,BIIB,BIOTECH,BUY,18,42.89,2018-03-27
877,APLS,BIOTECH,BUY,6,38.02,2019-11-04


In [17]:
# Write the dataset to a CSV file in the working directory; the row index is
# written as the first (unnamed) column, which is the to_csv default
df.to_csv("transaction_data.csv", index=True)

Done!