In [1]:
# Let's start with something easy: for each stock, get yesterday's volume and the average volume over the last 3, 5, 15 and 30 days.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
import multiprocessing
from multiprocessing import Pool


In [2]:
def get_top_volume(file, data_path=None):
    """Summarise recent trading volume for one symbol's daily CSV.

    Parameters
    ----------
    file : str
        CSV file name (e.g. ``'AAPL.csv'``). The symbol is the file name
        without its extension.
    data_path : str, optional
        Directory that contains ``file``. When omitted, the notebook-level
        ``Data_Path`` is used; it is looked up at call time, so this function
        may be defined before the configuration cell has run.

    Returns
    -------
    dict
        ``symbol`` plus the mean of the ``volume`` column over the most recent
        1, 3, 5, 15 and 30 rows (keys ``yesterday_volume``, ``volume_3_days``,
        ``volume_5_days``, ``volume_15_days``, ``volume_30_days``). When the
        file has fewer rows than a window, the mean covers the rows that
        exist. When the file has no data rows at all, every volume is NaN
        instead of raising, so one empty file cannot abort a batch run.
    """
    if data_path is None:
        data_path = Data_Path

    # Get symbol
    symbol = os.path.splitext(file)[0]
    df = pd.read_csv(os.path.join(data_path, file))

    # The file should already be ordered by datetime, but sort again to be
    # sure; newest row first so that iloc[:n] is "the last n days".
    # NOTE(review): 'datetime' is compared exactly as read from the CSV
    # (not parsed), so this assumes a sortable format such as ISO-8601 -- confirm.
    volume = df.sort_values(by='datetime', ascending=False)['volume']

    # Output key -> number of most recent rows to average.
    windows = {
        'yesterday_volume': 1,
        'volume_3_days': 3,
        'volume_5_days': 5,
        'volume_15_days': 15,
        'volume_30_days': 30,
    }

    has_rows = len(volume) > 0
    data = {'symbol': symbol}
    for key, days in windows.items():
        data[key] = volume.iloc[:days].mean() if has_rows else float('nan')
    return data

In [3]:
# The idea is simple: read each file and calculate the volume of yesterday and
# the average volume over 3, 5, 15 and 30 days. Each file is independent, so
# the work is spread over a multiprocessing pool.
# For reference, the production CSVs have the following columns:
# open,high,low,close,volume,timestamp,datetime

# This is Mac data path
#Data_Path = '/Users/Arthur/Developer/Trading/Data/Daily/Production'
# This is WSL2 data path
Data_Path = '/home/arthur/Developer/Trading/Data/Daily/Production'
Working_Path = os.getcwd()
# Get file list
file_list = [f for f in os.listdir(Data_Path) if f.endswith('.csv')]
# Get symbol list
symbol_list = [os.path.splitext(f)[0] for f in file_list]
pd.options.display.float_format = '{:,.2f}'.format
# Uncomment to try the pipeline on a handful of symbols only:
# file_list = ['AAPL.csv', 'SPY.csv', 'GOOG.csv', 'MSFT.csv']

# One worker per CPU. The context manager shuts the pool down even when a
# worker raises, so a bad file does not leave worker processes behind.
# imap_unordered yields results as they finish, so row order is arbitrary.
with Pool(processes=os.cpu_count()) as pool:
    result = list(tqdm(pool.imap_unordered(get_top_volume, file_list), total=len(file_list)))

# Build the frame once from the list of per-symbol dicts and persist it
# next to the notebook.
top_volume_list = pd.DataFrame(result)
top_volume_list.to_csv(os.path.join(Working_Path, 'top_volume_list.csv'), index=False)

100%|██████████| 12285/12285 [00:08<00:00, 1488.85it/s]


In [4]:
# Order the rows by yesterday_volume, largest first.
top_volume_list = top_volume_list.sort_values(by='yesterday_volume', ascending=False)

In [5]:
# Display the first 50 rows of the volume table.
top_volume_list.head(n=50)

Unnamed: 0,symbol,yesterday_volume,volume_3_days,volume_5_days,volume_15_days,volume_30_days
905,SQQQ,175486690.0,226450710.0,216061004.0,164090195.07,141813534.83
8186,AAPL,107496982.0,120753089.33,137746065.6,106655399.2,86615035.0
6567,LCID,103929805.0,85846501.33,85750082.6,109645393.93,107897845.4
11224,SPY,98977532.0,121315645.67,128998474.0,86893040.0,70903742.17
6120,SNDL,90743158.0,105116204.0,115108542.4,133072451.87,138326554.57
7757,F,88147408.0,100854947.67,106256316.2,90926335.4,108505553.2
11465,ISIG,84523423.0,28190311.33,16921668.6,5664154.13,2893196.0
10698,CCL,78131555.0,71402480.33,71404289.0,50312482.13,39027753.3
7160,CFVI,77075622.0,74332498.0,44615844.0,14909207.6,7475667.87
9782,TQQQ,76206983.0,98486218.67,89426791.2,56498919.13,46168195.5
