In [1]:
import os
import pandas as pd
import re
from config_parameters import MARKET_DATA_DICT, DATA_TABLE_FILE

# Root directory that generate_market_data_file() hands to
# extract_metadata_from_folder() for scanning.
data_folder = MARKET_DATA_DICT
# Destination path of the Excel summary written by generate_market_data_file().
output_excel_path = DATA_TABLE_FILE

# Matches an interval token such as "5m", "4h", "1D", "1w" or "1M".
_INTERVAL_RE = re.compile(r'(\d+)([mhdDwWM])')

# Rank of every unit the regex accepts, finest to coarsest. Case matters for
# "m" (ranked first) versus "M" (ranked last); "d"/"D" and "w"/"W" are
# treated as the same unit.
_UNIT_ORDER = {'m': 1, 'h': 2, 'd': 3, 'D': 3, 'w': 4, 'W': 4, 'M': 5}

# Columns that may hold the bar time, in order of preference.
_TIME_COLUMNS = ('timestamp', 'date')


def _clean_symbol_name(name: str) -> str:
    """Return the bare ticker: drop a '.SUFFIX' part, then a trailing 'USDT'."""
    # The suffix has to go first, otherwise 'BTCUSDT.BINANCE' never ends with
    # 'USDT' and the quote currency is left in the ticker.
    base = name.split('.')[0]
    return re.sub(r'USDT$', '', base)


def _interval_sort_key(file_name: str) -> tuple:
    """Return an (unit rank, count) key that orders intervals finest first."""
    match = _INTERVAL_RE.search(file_name)
    if match is None:
        # Names without an interval token sort after everything else.
        return (float('inf'), float('inf'))
    return (_UNIT_ORDER.get(match.group(2), float('inf')), int(match.group(1)))


def _is_time_column(column_name) -> bool:
    """Tell read_csv which columns are worth loading."""
    return column_name in _TIME_COLUMNS


def extract_metadata_from_folder(folder_path):
    """Summarise the market-data folders found under *folder_path*.

    Every directory that contains at least one CSV file whose name carries an
    interval token (e.g. ``5m.csv``, ``1D.csv``) produces one row. The date
    range is read from the CSV with the smallest interval, using its
    ``timestamp`` column or, failing that, its ``date`` column.

    Args:
        folder_path: Root directory to walk recursively.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Ticker``, ``Start``, ``End``,
        ``Min Interval``, ``File``, ``Category`` and ``Intervals``. ``Intervals``
        lists the extension-less names of all files (CSV or not) in the
        directory that carry an interval token, finest first. The frame has no
        columns when nothing qualifies.

    Files that cannot be read or whose time column cannot be parsed are
    reported on stdout and skipped; they do not abort the scan.
    """
    metadata_summary = []

    for root, dirs, files in os.walk(folder_path):
        # Sorting in place makes os.walk visit sub-folders in a stable order,
        # so the summary rows do not depend on filesystem enumeration order.
        dirs.sort()

        interval_files = [f for f in sorted(files) if _INTERVAL_RE.search(f)]
        sorted_csv_files = sorted(
            (f for f in interval_files if f.endswith('.csv')),
            key=lambda f: _interval_sort_key(os.path.splitext(f)[0]),
        )
        if not sorted_csv_files:
            continue

        sorted_intervals = sorted(
            (os.path.splitext(f)[0] for f in interval_files),
            key=_interval_sort_key,
        )

        # The finest-grained file gives the most precise start/end dates.
        smallest_interval_file = sorted_csv_files[0]
        file_path = os.path.join(root, smallest_interval_file)
        min_interval = _INTERVAL_RE.search(smallest_interval_file).group(0)

        try:
            # Only the time column is needed; skipping the price columns keeps
            # memory low for what is usually the largest file in the folder.
            df = pd.read_csv(file_path, usecols=_is_time_column)
        except Exception as e:
            # Best-effort scan: report the unreadable file and move on.
            print(f"⚠️ Error reading file {smallest_interval_file}: {e}")
            continue

        time_column = next((c for c in _TIME_COLUMNS if c in df.columns), None)
        if time_column is None:
            continue

        try:
            # NOTE(review): purely numeric values are interpreted by
            # pd.to_datetime as nanoseconds since the epoch by default;
            # confirm the CSVs store date strings rather than s/ms epochs.
            timestamps = pd.to_datetime(df[time_column])
        except (ValueError, TypeError, OverflowError) as e:
            print(f"⚠️ Error parsing '{time_column}' in {file_path}: {e}")
            continue

        start_date = timestamps.min().date()
        end_date = timestamps.max().date()

        raw_name = os.path.basename(root)
        base_name = raw_name.split('.')[0]
        # Check the suffix-less name as well so 'BTCUSDT.BINANCE' is tagged
        # consistently with the ticker derived from it.
        is_crypto = raw_name.endswith("USDT") or base_name.endswith("USDT")

        metadata_summary.append({
            'Ticker': _clean_symbol_name(raw_name),
            'Start': start_date,
            'End': end_date,
            'Min Interval': min_interval,
            'File': raw_name,
            'Category': "CRYPTO" if is_crypto else None,
            'Intervals': sorted_intervals,
        })

    return pd.DataFrame(metadata_summary)

#  Generate summary from folder structure

def generate_market_data_file():
    """Scan ``data_folder`` and write the resulting summary to Excel.

    The table returned by ``extract_metadata_from_folder`` is saved to
    ``output_excel_path`` without the index column, and a confirmation
    line is printed afterwards.
    """
    extract_metadata_from_folder(data_folder).to_excel(
        output_excel_path,
        index=False,
    )
    print(f"Saved summary to {output_excel_path}")


# Build and save the summary immediately; this call sits at module level, so
# it runs whenever this code is executed or imported.
generate_market_data_file()

Saved summary to Market_data2.xlsx
