**Purpose**

This Jupyter notebook generates a stock market prices CSV file with the following columns:

company- The company code

date- A random date between 1/1/2022 and 31/12/2024

close_price- The stock closing price of the company on that date

Each record represents the stock closing price of a company on a particular date

So the primary key (PK) of the dataset is the combination of the company and date columns

After the CSV is generated, it gets exported to the S3 bucket

When the CSV gets exported, the file is named after the timestamp of the export (format YYYYMMDDHHMMSS)

This jupyter notebook acts as the data source used for batch processing

In [1]:
import pandas as pd
from datetime import datetime, timedelta
from random import randrange
import s3fs
from pyathena import connect
from pandasql import sqldf

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
# Snapshot of the current stock prices of some companies on 25/01/2025.
# Kept as a flat ticker -> price table and expanded below into the nested
# {ticker: {'current_price': price}} layout the following cells work on.

_reference_prices = {
    'NVDA': 143,
    'AAPL': 223,
    'MSFT': 444,
    'AMZN': 234,
    'GOOGL': 200,
    'META': 647,
    'TSLA': 407,
    'WMT': 95,
    'JPM': 265,
    'V': 330,
    'ORCL': 184,
    'MA': 490,
    'XOM': 109,
    'NFLX': 978,
    'PG': 164,
    'SAP': 276,
}

companies_price = {
    ticker: {'current_price': price}
    for ticker, price in _reference_prices.items()
}

In [3]:
# Derive a price band for every company: 10% below and 10% above its current
# price. int() truncates the fractional part, so both bounds are whole numbers.

for quote in companies_price.values():
    current = quote['current_price']
    quote['min_price'] = int(current * 0.9)
    quote['max_price'] = int(current * 1.1)

In [4]:
# Pick one random date between 1/1/2022 and 31/12/2024, both ends included.
# The result, random_date, is a 'YYYY-MM-DD' string.

date_1 = datetime.strptime('01-01-2022', '%d-%m-%Y')
date_2 = datetime.strptime('31-12-2024', '%d-%m-%Y')

# Whole days separating the two bounds
days_diff_integer = (date_2 - date_1).days

# randrange(n) only yields 0 .. n-1, so add 1 to make an offset of
# days_diff_integer (i.e. date_2 itself) reachable.
random_days = randrange(days_diff_integer + 1)

# Format straight to a string so random_date is never a datetime in one place
# and a str in another.
random_date = (date_1 + timedelta(days=random_days)).strftime('%Y-%m-%d')

In [5]:
# Stamp every company with the random date picked earlier and draw a random
# closing price from its price band.
# Note: randrange excludes its upper bound, so close_price is always
# >= min_price and < max_price.

for quote in companies_price.values():
    low, high = quote['min_price'], quote['max_price']
    quote['date'] = random_date
    quote['close_price'] = randrange(low, high)

In [6]:
# Drop the helper keys that were only needed to compute close_price.
# del raises KeyError when a key is already gone, e.g. if this cell is run twice.

for quote in companies_price.values():
    for obsolete_key in ('min_price', 'max_price', 'current_price'):
        del quote[obsolete_key]

In [7]:
# Turn the nested dict into a table: one row per company, with the dict keys
# (company codes) moved out of the index into a regular 'company' column.
df = (
    pd.DataFrame.from_dict(companies_price, orient='index')
    .reset_index()
    .rename(columns={'index': 'company'})
)

In [8]:
# Build the export file name from the current local time, formatted as
# YYYYMMDDHHMMSS.
today = datetime.today()
file_name = today.strftime('%Y%m%d%H%M%S')

In [9]:
# Display the generated DataFrame for a visual check.
df

Unnamed: 0,company,date,close_price
0,NVDA,2024-05-20,134
1,AAPL,2024-05-20,244
2,MSFT,2024-05-20,463
3,AMZN,2024-05-20,234
4,GOOGL,2024-05-20,191
5,META,2024-05-20,620
6,TSLA,2024-05-20,381
7,WMT,2024-05-20,99
8,JPM,2024-05-20,264
9,V,2024-05-20,313


In [28]:
# Destination S3 object for this run's CSV; file_name makes the object key specific to the run.
path = f's3://stock-market-raw-data-us-east-1/stg_price_by_date/{file_name}.csv'

In [29]:
# Write the DataFrame as CSV to the S3 path, leaving out the index column.
# NOTE(review): lineterminator='\r' ends each row with a bare carriage return
# (no '\n') -- confirm that whatever reads this bucket expects that.
df.to_csv(path, index=False, lineterminator = '\r')