In [1]:
# NOTE(review): this handle is opened but never read or closed by any cell
# visible in this notebook; yield_data() opens the same path on its own inside
# a `with` block. Confirm whether this cell is still needed.
f = open(r"./data/monash/monash-df.pkl", "rb")

In [2]:
import pickle
import pandas as pd


def yield_data(pickle_file_path="./data/monash/monash-df.pkl"):
    """
    Lazily yield dataset records from a file of consecutively pickled dicts.

    The file is read as a stream of pickled objects written back to back.
    Each object must be a mapping of ``file_name -> value`` where ``value[0]``
    is the data frame and ``value[1]`` is the frequency label. Every entry of
    every mapping is yielded as its own record, so an empty mapping yields
    nothing and a mapping with several entries yields one record per entry.

    Security note: ``pickle.load`` can execute arbitrary code while
    deserialising. Only point this function at files you trust.

    Args:
        pickle_file_path (str): Path to the pickle file.

    Yields:
        dict: A dictionary with ``name`` (the mapping key truncated at its
        first ``.``), ``df`` and ``freq`` as keys.

    Raises:
        FileNotFoundError: If the pickle file doesn't exist.
        pickle.UnpicklingError: If the pickle file is corrupted.
    """
    try:
        with open(pickle_file_path, "rb") as f:
            while True:
                # Guard only the load: EOFError here means the stream is
                # exhausted, which is the normal way out of the loop.
                try:
                    obj = pickle.load(f)
                except EOFError:
                    break

                for file_name, payload in obj.items():
                    yield {
                        "name": file_name.split('.')[0],
                        "df": payload[0],
                        "freq": payload[1],
                    }
    except FileNotFoundError as err:
        # Chain explicitly so errno/filename of the original error stay visible.
        raise FileNotFoundError(f"Pickle file not found: {pickle_file_path}") from err
    except pickle.UnpicklingError as err:
        # Chain explicitly so the low-level reason for the failure is not lost.
        raise pickle.UnpicklingError("Corrupted pickle file.") from err

In [3]:
# Build the dataset generator. A generator can only be consumed once, which is
# why later cells call yield_data() again before iterating.
data_generator = yield_data()

In [4]:
def prepare_time_series(df, frequency):
    """
    Convert DataFrame with series_value lists to a time series DataFrame, handling varied frequencies.
    If series_value contains NaN, that series will be skipped.

    Args:
        df (pd.DataFrame): DataFrame with series_name, start_timestamp, series_value.
        frequency (str): Frequency of the series (e.g., '4_seconds', 'half_hourly', 'daily').

    Returns:
        pd.DataFrame: DataFrame with timestamps as index and series_name as columns.

    Raises:
        ValueError: If the frequency is unsupported.
    """
    freq_map = {
        '4_seconds': '4s',
        'minutely': 'min',
        'hourly': 'h',
        'half_hourly': '30min',
        'daily': 'D',
        'weekly': 'W',
        'monthly': 'ME',
        'quarterly': 'Q',
        'yearly': 'Y'
    }

    pandas_freq = freq_map.get(frequency)
    if pandas_freq is None:
        raise ValueError(f"Unsupported frequency: {frequency}")
    
    series_dict = {}

    for _, row in df.iterrows():
        series_name = row['series_name']
        
        try:
            start_time = pd.to_datetime(row['start_timestamp'])
        except KeyError:
            start_time = pd.Timestamp("2000-01-01 00:00:00")

        values = row['series_value']
        if any(pd.isna(v) for v in values):
            continue
        timestamps = pd.date_range(
            start=start_time, periods=len(values), freq=pandas_freq)
        
        series_dict[series_name] = pd.Series(values, index=timestamps)

    ts_df = pd.DataFrame(series_dict)
    return ts_df


In [40]:
# Walk every dataset once, listing its columns and recording which ones
# do not carry a 'start_timestamp' column.
missing_indices = []
data_generator = yield_data()

for idx, item in enumerate(data_generator):
    name, df = item['name'], item['df']

    # Show the schema of this dataset.
    column_names = df.columns.tolist()
    print(f"DataFrame {idx} ('{name}') columns:", column_names)

    # Remember the position of datasets that have no explicit start time.
    has_start = 'start_timestamp' in df.columns
    if not has_start:
        missing_indices.append(idx)

print("\nIndices of DataFrames missing 'start_timestamp':", missing_indices)


DataFrame 0 ('kdd_cup_2018_dataset_without_missing_values') columns: ['series_name', 'city', 'station', 'air_quality_measurement', 'start_timestamp', 'series_value']
DataFrame 1 ('solar_4_seconds_dataset') columns: ['series_name', 'start_timestamp', 'series_value']
DataFrame 2 ('pedestrian_counts_dataset') columns: ['series_name', 'start_timestamp', 'series_value']
DataFrame 3 ('traffic_hourly_dataset') columns: ['series_name', 'start_timestamp', 'series_value']
DataFrame 4 ('temperature_rain_dataset_without_missing_values') columns: ['series_name', 'station_id', 'obs_or_fcst', 'start_timestamp', 'series_value']
DataFrame 5 ('saugeenday_dataset') columns: ['series_name', 'start_timestamp', 'series_value']
DataFrame 6 ('tourism_monthly_dataset') columns: ['series_name', 'start_timestamp', 'series_value']
DataFrame 7 ('bitcoin_dataset_without_missing_values') columns: ['series_name', 'start_timestamp', 'series_value']
DataFrame 8 ('wind_farms_minutely_dataset_without_missing_values') col

In [44]:
from itertools import islice
# Start from a fresh generator so the skip below counts from the first record.
data_generator = yield_data()
i = 23  # zero-based position of the record to inspect
# islice lazily discards the first i records; next() then returns record i,
# which the notebook displays as this cell's output.
next(islice(data_generator, i, i + 1))


{'name': 'dominick_dataset',
 'df':        series_name                                       series_value
 0               T1  [41.83, 0.0, 0.0, 0.0, 41.83, 0.0, 0.0, 0.0, 0...
 1               T2  [68.85, 68.85, 0.0, 68.85, 0.0, 68.85, 0.0, 0....
 2               T3  [0.0, 0.0, 0.0, 62.62, 62.62, 62.62, 0.0, 62.6...
 3               T4  [67.99, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...
 4               T5  [0.0, 0.0, 0.0, 62.58, 62.58, 0.0, 0.0, 62.58,...
 ...            ...                                                ...
 115699     T115700  [0.0, 27.47, 33.54, 33.54, 33.56, 33.67, 33.67...
 115700     T115701  [0.0, 26.31, 31.49, 33.67, 33.67, 33.67, 33.67...
 115701     T115702  [0.0, 26.54, 33.26, 33.26, 33.26, 33.67, 33.67...
 115702     T115703  [0.0, 27.39, 31.48, 33.21, 33.38, 33.67, 33.67...
 115703     T115704  [0.0, 26.89, 0.0, 33.67, 33.67, 33.67, 33.67, ...
 
 [115704 rows x 2 columns],
 'freq': 'weekly'}

In [6]:
# Smoke test: convert every dataset and print its zero-based position as a
# progress marker. The converted frames are discarded; only failures matter.
data_generator = yield_data()

# enumerate() supplies the counter, so no manual increment is needed.
# After the loop, `i` holds the index of the last dataset processed.
for i, data in enumerate(data_generator):
    prepare_time_series(data['df'], data['freq'])
    print(i)

0
1
2
3
4
5
6


In [5]:
# Pull only the first record from a fresh generator and convert it; the
# resulting wide DataFrame is this cell's displayed output.
data_generator = yield_data()
data = next(data_generator)
prepare_time_series(data['df'], data['freq'])

Unnamed: 0,T1,T2,T3,T4,T5,T6,T7,T8,T9,T10,...,T261,T262,T263,T264,T265,T266,T267,T268,T269,T270
2017-01-01 00:00:01,,,,,,,,,,,...,55.0,22.0,23.3,16.1,7.0,23.3,31.7,23.2,21.3,
2017-01-01 01:00:01,,,,,,,,,,,...,26.0,27.0,17.5,10.7,9.4,17.5,29.9,23.1,25.9,
2017-01-01 02:00:01,,,,,,,,,,,...,24.4,22.0,16.6,8.9,7.8,16.6,21.8,13.9,22.6,
2017-01-01 03:00:01,,,,,,,,,,,...,15.8,18.6,18.6,8.5,5.4,18.6,19.1,12.8,23.4,
2017-01-01 04:00:01,,,,,,,,,,,...,22.4,16.3,20.4,8.3,5.4,20.4,23.7,13.8,22.6,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-03-31 19:00:01,,,,,,,,,,,...,9.8,23.6,10.2,5.9,1.4,4.5,44.8,18.7,16.9,6.0
2018-03-31 20:00:01,,,,,,,,,,,...,9.8,23.6,10.2,5.9,1.4,4.5,44.8,18.7,16.9,6.0
2018-03-31 21:00:01,,,,,,,,,,,...,9.8,23.6,10.2,5.9,1.4,4.5,44.8,18.7,16.9,6.0
2018-03-31 22:00:01,,,,,,,,,,,...,9.8,23.6,10.2,5.9,1.4,4.5,44.8,18.7,16.9,6.0
