# Dataset Overview


In [1]:
# Install required packages
!pip install pandas numpy ipython

Collecting pandas
  Downloading pandas-2.3.3-cp313-cp313-macosx_11_0_arm64.whl.metadata (91 kB)
Collecting numpy
  Downloading numpy-2.3.4-cp313-cp313-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.3.3-cp313-cp313-macosx_11_0_arm64.whl (10.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.7/10.7 MB[0m [31m66.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-2.3.4-cp313-cp313-macosx_14_0_arm64.whl (5.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.1/5.1 MB[0m [31m60.2 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached pytz-2025.2-py2.py3-none-any.whl (509 kB)
Using cached tzdata-2025.2-py2.py3-none-any.whl (347 kB)
Installing collected packages: pytz, tzdata, numpy, pandas
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Use this notebook to profile the available stock and ETF histories before building transformer models.
Configure the paths and run each cell top-to-bottom to regenerate the summary tables.


In [2]:
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, List

import pandas as pd
from IPython.display import display


In [3]:
@dataclass
class SymbolSummary:
    """Per-symbol summary statistics derived from one price-history file.

    Instances are created by ``summarize_symbol`` and later flattened into a
    DataFrame row by ``build_dataframe`` (one column per field).
    """

    # Stem of the source file name (``path.stem``).
    symbol: str
    # Name of the directory that contains the source file.
    category: str
    # Number of rows read from the file.
    rows: int
    # First and last ``Date`` values after sorting the rows by date.
    start_date: pd.Timestamp
    end_date: pd.Timestamp
    # Median, mean and population standard deviation (ddof=0) of ``Close``.
    median_close: float
    avg_close: float
    std_close: float
    # Mean of the ``Volume`` column.
    avg_volume: float
    # ``Close`` value of the most recent row.
    last_close: float


def summarize_symbol(path: Path) -> SymbolSummary | None:
    """Load one per-symbol price file and reduce it to a ``SymbolSummary``.

    The file is parsed as CSV and must provide ``Date``, ``Close`` and
    ``Volume`` columns. Problem files never abort the run: each one is
    reported with a ``[WARN]`` line on stdout and skipped.

    Args:
        path: Location of the CSV-formatted file. The file stem becomes the
            symbol and the parent directory name becomes the category.

    Returns:
        The summary for the file, or ``None`` when the file cannot be loaded,
        contains no rows, or lacks one of the required columns.
    """
    try:
        df = pd.read_csv(path, parse_dates=['Date'])
    except Exception as exc:
        # Deliberately broad: any unreadable or unparsable file is skipped so
        # a single bad symbol cannot stop the whole profiling pass.
        print(f'[WARN] Failed to load {path}: {exc}')
        return None

    if df.empty:
        print(f'[WARN] {path} is empty - skipping')
        return None

    # Without this guard a file lacking a price/volume column would raise a
    # KeyError below and take the entire run down with it.
    missing = [name for name in ('Date', 'Close', 'Volume') if name not in df.columns]
    if missing:
        print(f'[WARN] {path} is missing columns {missing} - skipping')
        return None

    # Chronological order makes the first/last rows the coverage boundaries.
    df = df.sort_values('Date')
    dates = df['Date']
    close = df['Close']
    return SymbolSummary(
        symbol=path.stem,
        category=path.parent.name,
        rows=len(df),
        start_date=dates.iloc[0],
        end_date=dates.iloc[-1],
        median_close=float(close.median()),
        avg_close=float(close.mean()),
        # Population standard deviation (ddof=0), not the sample estimate.
        std_close=float(close.std(ddof=0)),
        avg_volume=float(df['Volume'].mean()),
        last_close=float(close.iloc[-1]),
    )


def gather_summaries(files: Iterable[Path]) -> List[SymbolSummary]:
    """Summarize every file in ``files``, keeping only the successful results.

    Each path is handed to ``summarize_symbol`` in iteration order; paths for
    which it returns ``None`` are left out of the returned list.
    """
    attempts = (summarize_symbol(candidate) for candidate in files)
    return [result for result in attempts if result is not None]


def build_dataframe(summaries: Iterable[SymbolSummary]) -> pd.DataFrame:
    """Turn the summaries into a table with one row per symbol.

    Every attribute of a summary becomes a column, and a ``span_years``
    column (whole days between ``start_date`` and ``end_date`` divided by
    365.25) is appended.

    Raises:
        RuntimeError: If the resulting table is empty.
    """
    records = [item.__dict__ for item in summaries]
    table = pd.DataFrame(records)
    if table.empty:
        raise RuntimeError('No valid symbol data was found.')

    coverage = table['end_date'] - table['start_date']
    table['span_years'] = coverage.dt.days / 365.25
    return table


def print_headline_numbers(df: pd.DataFrame) -> None:
    """Print the dataset-wide headline figures to stdout.

    Reports the number of symbols, the symbol count per category, the
    earliest start date and latest end date (with the number of days in
    between), and the median row count per symbol.
    """
    symbol_count = len(df)
    per_category = df['category'].value_counts()
    first_day = df['start_date'].min().date()
    last_day = df['end_date'].max().date()
    typical_rows = int(df['rows'].median())

    # Assemble the full report first, then emit it line by line.
    report = ['=== Dataset Overview ===', f'Total symbols: {symbol_count}']
    report.extend(f'  - {category}: {count}' for category, count in per_category.items())
    report.append(f'Overall coverage: {first_day} → {last_day} ({(last_day - first_day).days} days)')
    report.append(f'Median observations per symbol: {typical_rows}')

    for line in report:
        print(line)


def display_top_lists(df: pd.DataFrame, limit: int) -> None:
    """Show three "top ``limit``" tables drawn from the summary frame.

    The tables rank symbols by coverage span, by most recent end date and by
    average volume, each in descending order. Every table is preceded by a
    printed heading and rendered with IPython's ``display``.
    """
    def ranked_by(column: str) -> pd.DataFrame:
        # Highest values first, truncated to the requested number of rows.
        return df.sort_values(column, ascending=False).head(limit)

    # All three rankings are computed before anything is printed.
    by_span = ranked_by('span_years')
    by_recency = ranked_by('end_date')
    by_volume = ranked_by('avg_volume')

    sections = (
        (
            f'=== Top {limit} Longest Coverage ===',
            by_span,
            ['symbol', 'category', 'span_years', 'start_date', 'end_date', 'rows'],
        ),
        (
            f'=== Top {limit} Latest Data ===',
            by_recency,
            ['symbol', 'category', 'end_date', 'last_close'],
        ),
        (
            f'=== Top {limit} Highest Volume (avg) ===',
            by_volume,
            ['symbol', 'category', 'avg_volume', 'avg_close'],
        ),
    )
    for heading, ranked, columns in sections:
        print(heading)
        display(ranked[columns])


## Configuration
Adjust the inputs below to point at a different dataset or control the reporting depth.


In [4]:
# Root directory of the dataset; the analysis cell looks for files matching
# '*/*.txt' beneath it, i.e. one sub-directory level below this path.
DATA_ROOT = Path('../stock_data')
# Number of rows shown in each "Top N" table.
LIMIT = 5
# Optional destination for the full summary table; leave as None to skip the CSV export.
OUTPUT_CSV = None  # e.g. Path('outputs/dataset_overview.csv')


## Run Analysis
Execute the cell below to generate summaries and optional CSV exports.


In [5]:
# Collect every '*.txt' file exactly one directory level below DATA_ROOT;
# sorting gives a deterministic processing order.
files = sorted(DATA_ROOT.glob('*/*.txt'))
if not files:
    raise RuntimeError(f'No symbol text files found in {DATA_ROOT}')

# Files that fail to load or contain no rows are reported with a [WARN] line
# and left out of `summaries`.
summaries = gather_summaries(files)
df = build_dataframe(summaries)

print_headline_numbers(df)
display_top_lists(df, LIMIT)

# The export only runs when OUTPUT_CSV is set; the destination directory is
# created on demand.
if OUTPUT_CSV:
    OUTPUT_CSV.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(OUTPUT_CSV, index=False)
    print(f'[INFO] Detailed summary saved to {OUTPUT_CSV}')


[WARN] Failed to load ../stock_data/Stocks/accp.us.txt: No columns to parse from file
[WARN] Failed to load ../stock_data/Stocks/amrh.us.txt: No columns to parse from file
[WARN] Failed to load ../stock_data/Stocks/amrhw.us.txt: No columns to parse from file
[WARN] Failed to load ../stock_data/Stocks/asns.us.txt: No columns to parse from file
[WARN] Failed to load ../stock_data/Stocks/bbrx.us.txt: No columns to parse from file
[WARN] Failed to load ../stock_data/Stocks/bolt.us.txt: No columns to parse from file
[WARN] Failed to load ../stock_data/Stocks/boxl.us.txt: No columns to parse from file
[WARN] Failed to load ../stock_data/Stocks/bxg.us.txt: No columns to parse from file
[WARN] Failed to load ../stock_data/Stocks/ehr.us.txt: No columns to parse from file
[WARN] Failed to load ../stock_data/Stocks/fmax.us.txt: No columns to parse from file
[WARN] Failed to load ../stock_data/Stocks/gnst.us.txt: No columns to parse from file
[WARN] Failed to load ../stock_data/Stocks/hayu.us.txt:

Unnamed: 0,symbol,category,span_years,start_date,end_date,rows
4607,ibm.us,Stocks,55.854894,1962-01-02,2017-11-10,14059
4041,ge.us,Stocks,55.854894,1962-01-02,2017-11-10,14058
4278,gt.us,Stocks,47.854894,1970-01-02,2017-11-10,12073
5649,mro.us,Stocks,47.854894,1970-01-02,2017-11-10,12073
5414,mcd.us,Stocks,47.854894,1970-01-02,2017-11-10,12075


=== Top 5 Latest Data ===


Unnamed: 0,symbol,category,end_date,last_close
0,aadr.us,ETFs,2017-11-10,56.4
5585,mmyt.us,Stocks,2017-11-10,26.9
5616,mosy.us,Stocks,2017-11-10,0.73
5615,mosc-u.us,Stocks,2017-11-10,10.15
5614,mos.us,Stocks,2017-11-10,22.84


=== Top 5 Highest Volume (avg) ===


Unnamed: 0,symbol,category,avg_volume,avg_close
1103,spy.us,ETFs,168053400.0,141.275289
1356,aapl.us,Stocks,106641600.0,22.281018
1305,xlf.us,ETFs,90567870.0,16.936153
936,qqq.us,ETFs,80543780.0,58.386467
5677,msft.us,Stocks,79458000.0,18.984698
