In [None]:
import pickle
from pathlib import Path
from pprint import pprint

def explore_pickle_data(file_path):
    """
    Load and explore the structure of a pickle file
    """
    # Load pickle file
    with open(file_path, 'rb') as f:
        data = pickle.load(f)
    
    # Get all top-level keys
    print("\n=== Top Level Keys ===")
    print("-" * 40)
    for key in sorted(data.keys()):
        value = data[key]
        value_type = type(value).__name__
        
        # For simple types, show the value
        if isinstance(value, (str, int, float, bool)) or value is None:
            print(f"{key}: {value_type} = {value}")
        # For collections, show the type and size
        elif isinstance(value, dict):
            print(f"{key}: {value_type} with {len(value)} items")
        elif isinstance(value, list):
            print(f"{key}: {value_type} with {len(value)} elements")
        else:
            print(f"{key}: {value_type}")
    
    # Show a sample of nested data structures
    if 'data' in data:
        print("\n=== Data Field Structure ===")
        print("-" * 40)
        pprint(list(data['data'].keys()))
    
    if 'salesRanks' in data:
        print("\n=== Sales Ranks Categories ===")
        print("-" * 40)
        pprint(list(data['salesRanks'].keys()))

# Usage
if __name__ == "__main__":
    # Setup path - adjust this to your data location
    base_path = Path('/Users/takedownccp/Documents/Cursor/DDU/data')
    # pickle_files = list((base_path / 'raw_data').glob('*_raw.pkl'))
    # pickle_files = ['B0CHTZ6NCL_raw.pkl', 'B07N52NLC3_raw.pkl', 'B09MZ9T3KT_raw.pkl', 'B09MZBXNHP_raw.pkl', 'B09QL5K6LW_raw.pkl', 'B09ZLSR8PH_raw.pkl']
    
    pickle_files = ['B0CHTZ6NCL_raw.pkl']
    if pickle_files:
        # Load the first pickle file
        first_file = (base_path / 'raw_data' / pickle_files[0])
        print(f"Exploring file: {first_file.name}")
        explore_pickle_data(first_file)
    else:
        print("No pickle files found!")

## Extract data from "Data" column

In [10]:
import pickle
from pathlib import Path
import pandas as pd
from datetime import datetime
import pytz
import numpy as np

def keepa_to_est(keepa_time):
    """Convert Keepa time to EST datetime"""
    if isinstance(keepa_time, datetime):
        if keepa_time.tzinfo is None:
            keepa_time = pytz.UTC.localize(keepa_time)
        return keepa_time.astimezone(pytz.timezone('US/Eastern'))
    
    try:
        unix_time = (keepa_time + 21564000) * 60
        utc_time = datetime.fromtimestamp(unix_time, tz=pytz.UTC)
        return utc_time.astimezone(pytz.timezone('US/Eastern'))
    except TypeError as e:
        print(f"Error converting time: {keepa_time}, type: {type(keepa_time)}")
        raise e

def process_data_to_timeseries(file_path):
    """Convert nested data structure to time series DataFrame"""
    with open(file_path, 'rb') as f:
        data = pickle.load(f)
    
    nested_data = data.get('data', {})
    asin = data.get('asin', 'unknown')  # Get ASIN for identification
    time_series_dict = {}
    
    metrics = {
        'AMAZON': ['AMAZON_time', 'AMAZON'],
        'NEW': ['NEW_time', 'NEW'],
        'USED': ['USED_time', 'USED'],
        'SALES': ['SALES_time', 'SALES'],
        'LISTPRICE': ['LISTPRICE_time', 'LISTPRICE'],
        'NEW_FBA': ['NEW_FBA_time', 'NEW_FBA'],
        'COUNT_NEW': ['COUNT_NEW_time', 'COUNT_NEW'],
        'RATING': ['RATING_time', 'RATING'],
        'COUNT_REVIEWS': ['COUNT_REVIEWS_time', 'COUNT_REVIEWS'],
        'BUY_BOX_SHIPPING': ['BUY_BOX_SHIPPING_time', 'BUY_BOX_SHIPPING']
    }
    
    for metric_name, (time_key, value_key) in metrics.items():
        if time_key in nested_data and value_key in nested_data:
            times = nested_data[time_key]
            values = nested_data[value_key]
            
            if not isinstance(times, (list, np.ndarray)) or not isinstance(values, (list, np.ndarray)):
                continue
                
            for t, v in zip(times, values):
                try:
                    est_time = keepa_to_est(t)
                    
                    if est_time not in time_series_dict:
                        time_series_dict[est_time] = {}
                    
                    time_series_dict[est_time][metric_name] = np.nan if v == -1 else v
                except Exception as e:
                    print(f"Error processing {metric_name} for {asin}: {str(e)}")
                    continue
    
    df = pd.DataFrame.from_dict(time_series_dict, orient='index')
    df.sort_index(inplace=True)
    df = df.fillna(method='ffill')
    df = df.fillna(method='bfill')
    
    return df, asin

def process_multiple_files(pickle_files, base_path):
    """Process multiple pickle files and save their time series data"""
    results = {}
    
    for pickle_file in pickle_files:
        print(f"\nProcessing {pickle_file}")
        file_path = base_path / 'raw_data' / pickle_file
        
        try:
            df, asin = process_data_to_timeseries(file_path)
            results[asin] = df
            
            # Save individual file
            output_path = base_path / 'processed_data' / f'{asin}_timeseries.csv'
            df.to_csv(output_path)
            print(f"Saved individual file for {asin}")
            
            # Print basic statistics
            print(f"\nStatistics for {asin}:")
            print(f"Time range: {df.index.min()} to {df.index.max()}")
            print(f"Number of records: {len(df)}")
            print("Available metrics:", list(df.columns))
            
        except Exception as e:
            print(f"Error processing {pickle_file}: {str(e)}")
    
    return results

# Usage
if __name__ == "__main__":
    # Setup path
    base_path = Path('/Users/takedownccp/Documents/Cursor/DDU/data')
    
    # List of files to process
    pickle_files = [
        'B0CHTZ6NCL_raw.pkl', 
        'B07N52NLC3_raw.pkl', 
        'B09MZ9T3KT_raw.pkl', 
        'B09MZBXNHP_raw.pkl', 
        'B09QL5K6LW_raw.pkl', 
        'B09ZLSR8PH_raw.pkl'
    ]
    
    # Process all files
    results = process_multiple_files(pickle_files, base_path)
    
    # Optional: Create a combined report
    print("\n=== Summary Report ===")
    for asin, df in results.items():
        print(f"\nASIN: {asin}")
        print(f"Records: {len(df)}")
        print(f"Time range: {df.index.min()} to {df.index.max()}")
        print("Columns:", list(df.columns))
        print("-" * 50)
    
    # Optional: Save a combined dataset
    # Uncomment if you want to combine all data into one file
    """
    combined_df = pd.concat(results.values(), keys=results.keys(), names=['ASIN', 'Date'])
    combined_output_path = base_path / 'processed_data' / 'combined_timeseries.csv'
    combined_df.to_csv(combined_output_path)
    print(f"\nSaved combined dataset to {combined_output_path}")
    """


Processing B0CHTZ6NCL_raw.pkl
Saved individual file for B0CHTZ6NCL

Statistics for B0CHTZ6NCL:
Time range: 2023-09-26 10:16:00-04:00 to 2025-01-13 08:52:00-05:00
Number of records: 4168
Available metrics: ['AMAZON', 'NEW', 'USED', 'NEW_FBA', 'COUNT_NEW', 'BUY_BOX_SHIPPING', 'SALES', 'LISTPRICE', 'COUNT_REVIEWS', 'RATING']

Processing B07N52NLC3_raw.pkl


  df = df.fillna(method='ffill')
  df = df.fillna(method='bfill')
  df = df.fillna(method='ffill')
  df = df.fillna(method='bfill')
  df = df.fillna(method='ffill')
  df = df.fillna(method='bfill')


Saved individual file for B07N52NLC3

Statistics for B07N52NLC3:
Time range: 2019-06-20 16:28:00-04:00 to 2025-01-12 14:28:00-05:00
Number of records: 15374
Available metrics: ['AMAZON', 'NEW', 'USED', 'SALES', 'LISTPRICE', 'COUNT_NEW', 'NEW_FBA', 'BUY_BOX_SHIPPING', 'COUNT_REVIEWS', 'RATING']

Processing B09MZ9T3KT_raw.pkl
Saved individual file for B09MZ9T3KT

Statistics for B09MZ9T3KT:
Time range: 2021-12-09 23:36:00-05:00 to 2025-01-13 03:32:00-05:00
Number of records: 7476
Available metrics: ['AMAZON', 'NEW', 'USED', 'COUNT_NEW', 'BUY_BOX_SHIPPING', 'SALES', 'LISTPRICE', 'COUNT_REVIEWS', 'RATING', 'NEW_FBA']

Processing B09MZBXNHP_raw.pkl


  df = df.fillna(method='ffill')
  df = df.fillna(method='bfill')
  df = df.fillna(method='ffill')
  df = df.fillna(method='bfill')


Saved individual file for B09MZBXNHP

Statistics for B09MZBXNHP:
Time range: 2021-12-10 05:48:00-05:00 to 2025-01-12 13:34:00-05:00
Number of records: 6481
Available metrics: ['AMAZON', 'NEW', 'USED', 'COUNT_NEW', 'BUY_BOX_SHIPPING', 'SALES', 'LISTPRICE', 'COUNT_REVIEWS', 'NEW_FBA', 'RATING']

Processing B09QL5K6LW_raw.pkl
Saved individual file for B09QL5K6LW

Statistics for B09QL5K6LW:
Time range: 2022-02-21 14:28:00-05:00 to 2025-01-11 15:16:00-05:00
Number of records: 7289
Available metrics: ['AMAZON', 'NEW', 'USED', 'COUNT_NEW', 'BUY_BOX_SHIPPING', 'SALES', 'LISTPRICE', 'NEW_FBA', 'COUNT_REVIEWS', 'RATING']

Processing B09ZLSR8PH_raw.pkl
Saved individual file for B09ZLSR8PH

Statistics for B09ZLSR8PH:
Time range: 2022-05-05 14:08:00-04:00 to 2025-01-10 08:48:00-05:00
Number of records: 9237
Available metrics: ['AMAZON', 'NEW', 'USED', 'RATING', 'COUNT_REVIEWS', 'SALES', 'COUNT_NEW', 'BUY_BOX_SHIPPING', 'LISTPRICE', 'NEW_FBA']

=== Summary Report ===

ASIN: B0CHTZ6NCL
Records: 4168


  df = df.fillna(method='ffill')
  df = df.fillna(method='bfill')
