In [1]:
import os 
from glob import glob
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
# Timeframes to load; files are expected to be named "<company>_<timeframe>_...".
select_tf = ["minute"]
tables = []       # names of the loaded tables, in load order
final_data = {}   # "<company>_<timeframe>" -> DataFrame

for filename in tqdm(glob('stock_data/*')):
    names = filename.split(os.sep)[-1].split('_')

    # Entries without an underscore (stray files, sub-directories, ...) carry
    # no timeframe token; skip them instead of failing with IndexError.
    if len(names) < 2:
        continue

    company_name, tf = names[0], names[1]
    if tf not in select_tf:
        continue

    data = pd.read_csv(filename, index_col=0)
    variable_name = f"{company_name}_{tf}"

    # Also expose each table as a notebook-level variable (e.g. ACC_minute).
    # globals() is written explicitly: at cell top level it is the same
    # namespace locals() returned, but writing to it is well-defined.
    globals()[variable_name] = data
    tables.append(variable_name)
    final_data[variable_name] = data

100%|████████████████████████████████████████████████████████████████████████████████| 909/909 [00:52<00:00, 17.47it/s]


In [3]:
def custom_interval_creator(df, interval_minutes):
    """Aggregate OHLCV rows into fixed-size buckets of consecutive rows.

    The frame is sorted by ``'date'`` and split into consecutive,
    non-overlapping buckets of ``interval_minutes`` rows each. Bucketing is
    by row count, not by clock time, so a bucket only spans
    ``interval_minutes`` minutes when the input has exactly one row per
    minute with no gaps. Rows left over after the last full bucket are
    dropped. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'date', 'open', 'high', 'low', 'close'
        and 'volume'.
    interval_minutes : int
        Number of rows per bucket; must be at least 1.

    Returns
    -------
    pandas.DataFrame
        One row per full bucket with columns 'date' (first row's date),
        'open' (first row's open), 'high' (max), 'low' (min), 'close'
        (last row's close) and 'volume' (sum), on a fresh 0..n-1 index.
        If there is no full bucket, an empty frame with those columns.

    Raises
    ------
    ValueError
        If ``interval_minutes`` is smaller than 1.
    """
    if interval_minutes < 1:
        raise ValueError(
            f"interval_minutes must be a positive integer, got {interval_minutes!r}"
        )

    columns = ['date', 'open', 'high', 'low', 'close', 'volume']

    ordered = df.sort_values(by='date')
    num_intervals = len(ordered) // interval_minutes
    if num_intervals == 0:
        # Keep the schema so callers (and CSV headers) stay consistent.
        return pd.DataFrame(columns=columns)

    # Discard the trailing partial bucket, then label each row with its bucket.
    usable = ordered.iloc[:num_intervals * interval_minutes]
    bucket_ids = np.arange(len(usable)) // interval_minutes
    buckets = usable.groupby(bucket_ids)

    # First/last row of every bucket via strided selection. GroupBy.first()/
    # last() are avoided on purpose: they skip NaN, whereas positional access
    # must return the actual first/last value.
    first_rows = usable.iloc[::interval_minutes]
    last_rows = usable.iloc[interval_minutes - 1::interval_minutes]

    result_df = pd.DataFrame({
        'date': first_rows['date'].reset_index(drop=True),
        'open': first_rows['open'].reset_index(drop=True),
        'high': buckets['high'].max().reset_index(drop=True),
        'low': buckets['low'].min().reset_index(drop=True),
        'close': last_rows['close'].reset_index(drop=True),
        'volume': buckets['volume'].sum().reset_index(drop=True),
    })

    return result_df.reset_index(drop=True)

In [9]:
def custom_interval_creator_and_save(data_dict, interval_minutes, save_location):
    """Resample every frame in ``data_dict`` and write each result to a CSV.

    Parameters
    ----------
    data_dict : dict
        Maps a key to a DataFrame accepted by ``custom_interval_creator``.
        The text before the first underscore of the key is used as the
        company name in the output filename.
    interval_minutes : int
        Bucket size forwarded to ``custom_interval_creator``; also embedded
        in the output filename.
    save_location : str
        Directory for the output files. It is created if it does not exist.

    Each result is written to
    ``<save_location>/<company>_<interval_minutes>minute_data.csv`` with the
    row index as the first column. An existing file of the same name is
    overwritten.
    """
    # Make sure the target directory exists before the (slow) loop starts.
    if save_location:
        os.makedirs(save_location, exist_ok=True)

    for key, df in tqdm(data_dict.items(), desc="Processing companies", unit="company"):
        result_df = custom_interval_creator(df, interval_minutes)
        company_name = key.split('_')[0]  # Extract company name from dictionary key
        filename = f"{company_name}_{interval_minutes}minute_data.csv"
        filepath = os.path.join(save_location, filename)

        # to_csv opens the target in write mode, so an existing file is
        # replaced without having to delete it first.
        result_df.to_csv(filepath)


In [10]:
# Resample every loaded table into 38-row bars and write the CSVs next to
# the source files in stock_data/.
custom_interval_creator_and_save(
    final_data,
    interval_minutes=38,
    save_location='stock_data/',
)

Processing companies: 100%|█████████████████████████████████████████████████████| 101/101 [08:52<00:00,  5.27s/company]


In [4]:
# Look the table up in final_data rather than relying on the dynamically
# created ACC_minute variable; both refer to the same DataFrame.
df = final_data["ACC_minute"]
custom_interval_creator(df, interval_minutes=38)

Unnamed: 0,date,open,high,low,close,volume
0,2015-02-02 09:15:00+05:30,1554.90,1556.70,1536.05,1548.10,34335
1,2015-02-02 09:53:00+05:30,1548.10,1549.95,1540.35,1540.55,12323
2,2015-02-02 10:31:00+05:30,1541.00,1544.70,1538.10,1541.65,13197
3,2015-02-02 11:09:00+05:30,1541.65,1544.30,1539.00,1541.15,12779
4,2015-02-02 11:47:00+05:30,1541.15,1541.15,1523.85,1529.00,22168
...,...,...,...,...,...,...
17379,2022-10-21 13:16:00+05:30,2256.40,2257.70,2245.55,2247.05,21323
17380,2022-10-21 13:54:00+05:30,2248.10,2258.60,2241.05,2246.00,33306
17381,2022-10-21 14:32:00+05:30,2246.00,2260.00,2244.20,2258.65,39005
17382,2022-10-21 15:10:00+05:30,2260.00,2278.95,2254.30,2263.75,52389
