In [1]:
import os
import sys
import tempfile
from pathlib import Path

# Make the project packages importable from this notebook.
# NOTE: this assumes the kernel's working directory sits two levels below the
# project root; launching Jupyter from elsewhere resolves a different directory.
project_root = Path.cwd().parent.parent
# Guard the append so re-running this cell does not pile up duplicate entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

import pandas as pd
import logging
from core.data.collectors import YahooCollector
from core.data.processors import PriceProcessor
from core.data.storage import ParquetHandler

# Configure the root logger at INFO level so INFO messages emitted through the
# logging module appear in the cell output.
logging.basicConfig(level=logging.INFO)

## 1. Test Data Collection
Fetch daily price history for three sector ETFs (XLK, XLF, XLV) plus SPY, the S&P 500 ETF.

In [2]:
# Download price history for the test universe.
collector = YahooCollector()
tickers = ['XLK', 'XLF', 'XLV', 'SPY']
start_date = '2020-01-01'

# No end date is passed, so the number of rows grows every time this is re-run.
print(f"Fetching data for: {tickers}")
df_raw = collector.fetch_history(tickers, start_date=start_date)

# Fail fast on an empty or partial download instead of letting the problem
# surface later in the processing / storage cells.
assert not df_raw.empty, "fetch_history returned no rows"
# Assumes level 0 of the column index carries the ticker symbols, as in the
# rendered preview of this frame -- confirm if the collector's layout changes.
missing_tickers = sorted(set(tickers) - set(df_raw.columns.get_level_values(0)))
assert not missing_tickers, f"No data returned for: {missing_tickers}"

print("Raw Data Shape:", df_raw.shape)
df_raw.head()

INFO:core.data.collectors.yahoo_collector:Fetching 1d data for 4 tickers from 2020-01-01 to None


Fetching data for: ['XLK', 'XLF', 'XLV', 'SPY']
Raw Data Shape: (1505, 20)


Ticker,XLV,XLV,XLV,XLV,XLV,XLF,XLF,XLF,XLF,XLF,SPY,SPY,SPY,SPY,SPY,XLK,XLK,XLK,XLK,XLK
Price,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
2020-01-02,92.721864,92.83077,92.023073,92.685562,6277400,27.615207,27.785065,27.534748,27.785065,28843300,296.480193,297.717289,295.554657,297.698944,59151200,43.926602,44.349247,43.841123,44.349247,26567000
2020-01-03,91.614685,92.404235,91.160923,91.877869,8247500,27.427471,27.606267,27.347012,27.490049,51363600,294.299309,296.5719,294.244329,295.444763,77709700,43.703412,44.154552,43.698662,43.850624,30023600
2020-01-06,91.460409,92.47684,91.433185,92.449615,6441800,27.239732,27.490048,27.230792,27.472168,27956100,293.685332,296.654369,293.5662,296.571899,55653900,43.413732,44.002588,43.333003,43.955097,15630000
2020-01-07,92.195493,92.349778,91.750808,92.268097,6335300,27.409592,27.454291,27.293375,27.293375,39627500,296.003762,296.480289,295.289,295.738037,40496400,44.031082,44.154553,43.869621,43.936104,15363600
2020-01-08,92.313482,93.284534,92.27718,92.867073,7494700,27.32019,27.624146,27.32019,27.472168,47966600,295.930369,298.532838,295.682961,297.314087,68296000,43.983597,44.600942,43.902865,44.406242,23254400


## 2. Test Data Processing
Clean and process the data.

In [3]:
# Run the raw frame through the project's PriceProcessor; the result is bound to
# a new name so df_raw stays available for comparison.
processor = PriceProcessor()
df_processed = processor.process(df_raw)

# Report the processed frame's dimensions, then show its first rows as the cell output.
print("Processed Data Shape:", df_processed.shape)
df_processed.head()

Processed Data Shape: (1505, 20)


Ticker,XLV,XLV,XLV,XLV,XLV,XLF,XLF,XLF,XLF,XLF,SPY,SPY,SPY,SPY,SPY,XLK,XLK,XLK,XLK,XLK
Price,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
2020-01-02,92.721864,92.83077,92.023073,92.685562,6277400,27.615207,27.785065,27.534748,27.785065,28843300,296.480193,297.717289,295.554657,297.698944,59151200,43.926602,44.349247,43.841123,44.349247,26567000
2020-01-03,91.614685,92.404235,91.160923,91.877869,8247500,27.427471,27.606267,27.347012,27.490049,51363600,294.299309,296.5719,294.244329,295.444763,77709700,43.703412,44.154552,43.698662,43.850624,30023600
2020-01-06,91.460409,92.47684,91.433185,92.449615,6441800,27.239732,27.490048,27.230792,27.472168,27956100,293.685332,296.654369,293.5662,296.571899,55653900,43.413732,44.002588,43.333003,43.955097,15630000
2020-01-07,92.195493,92.349778,91.750808,92.268097,6335300,27.409592,27.454291,27.293375,27.293375,39627500,296.003762,296.480289,295.289,295.738037,40496400,44.031082,44.154553,43.869621,43.936104,15363600
2020-01-08,92.313482,93.284534,92.27718,92.867073,7494700,27.32019,27.624146,27.32019,27.472168,47966600,295.930369,298.532838,295.682961,297.314087,68296000,43.983597,44.600942,43.902865,44.406242,23254400


## 3. Test Data Storage
Save to Parquet, reload, and verify that the round trip returns an identical frame.

In [4]:
# Round-trip the processed frame through ParquetHandler inside a genuinely
# temporary directory, so the test leaves no artifact in the project's data tree.
# The directory and the file in it are removed when the `with` block exits.
with tempfile.TemporaryDirectory(prefix='parquet_roundtrip_') as tmp_dir:
    storage_path = Path(tmp_dir)
    handler = ParquetHandler(base_path=storage_path)

    filename = 'sector_prices_test.parquet'
    handler.save(df_processed, filename)

    # Load it back while the temporary file still exists
    df_loaded = handler.load(filename)

# The comparison runs on the in-memory frames, after the temp directory is gone.
print("Loaded Data Shape:", df_loaded.shape)
pd.testing.assert_frame_equal(df_processed, df_loaded)
print("Data integrity check passed!")

INFO:core.data.storage.parquet_handler:Saved data to /Users/Sakarias/QuantTrading/data/processed/test/sector_prices_test.parquet


Loaded Data Shape: (1505, 20)
Data integrity check passed!
