In [63]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import datetime

In [64]:
# Ticker symbol of the series to load; it is interpolated into the CSV filename below.
ticker = "AAPL"
data_path = f"../data/raw/{ticker}_data.csv"

# Read the raw CSV into a DataFrame and print its schema (columns, dtypes, non-null counts).
data = pd.read_csv(data_path)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5083 entries, 0 to 5082
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Date     5083 non-null   object 
 1   Open     5083 non-null   float64
 2   High     5083 non-null   float64
 3   Low      5083 non-null   float64
 4   Close    5083 non-null   float64
 5   Volume   5083 non-null   int64  
 6   item_id  5083 non-null   object 
dtypes: float64(4), int64(1), object(2)
memory usage: 278.1+ KB


## Pre-Processing the Time Series

#### Requirements for Amazon Chronos:
- No missing values.
- Normalized features.
- Time-based features (lags, moving averages, seasonality).
- Consistent datetime format.

In [65]:
def check_for_missing_values(data):
    """Print a short report on missing values in ``data``.

    Counts the null entries per column. When nothing is missing a single
    confirmation line is printed; otherwise a header is printed followed by
    the per-column counts, restricted to the columns that actually contain
    nulls.

    Args:
        data: Object exposing the pandas ``isnull().sum()`` API (a DataFrame
            in this notebook).

    Returns:
        None. The report is written to standard output.
    """
    nulls_per_column = data.isnull().sum()

    # Guard clause: nothing missing anywhere, so report success and stop.
    if not nulls_per_column.sum() > 0:
        print("✅ The data has no missing values.")
        return

    print("❌ The dataset has missing values:\n")
    # Show only the columns that have at least one null entry.
    print(nulls_per_column[nulls_per_column > 0])


# Report missing values for the frame loaded from the CSV.
check_for_missing_values(data)

✅ The data has no missing values.


In [66]:
from sklearn.preprocessing import MinMaxScaler

def normalize_data(data, columns=("Open", "High", "Low", "Close", "Volume")):
    """Return a copy of ``data`` with the selected columns min-max scaled.

    Each column in ``columns`` is scaled independently with scikit-learn's
    ``MinMaxScaler`` using its default settings. All other columns are
    carried over unchanged. The input frame is NOT modified, so the raw
    values stay available to the caller and re-running the cell always
    starts from the same input.

    Args:
        data: DataFrame containing at least the columns named in ``columns``.
        columns: Names of the numeric columns to scale. Defaults to the
            OHLCV columns used in this notebook.

    Returns:
        A new DataFrame with the same columns and index as ``data`` and the
        selected columns replaced by their scaled values.

    Raises:
        KeyError: If any name in ``columns`` is not a column of ``data``.
    """
    feature_columns = list(columns)
    scaler = MinMaxScaler()

    # Scale into a copy: assigning back into ``data`` would silently overwrite
    # the caller's raw prices (and turn the integer Volume column into floats).
    normalized = data.copy()
    normalized[feature_columns] = scaler.fit_transform(data[feature_columns])

    # NOTE(review): the scaler is fitted on every row passed in and is then
    # discarded, so the scaling cannot be inverted later. If this frame is
    # later split into train/test, the fit also sees the test period.
    # Confirm that both are acceptable for the downstream workflow.
    return normalized

# Scale the price and volume columns, then print the schema of the result.
data_normalized = normalize_data(data)
data_normalized.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5083 entries, 0 to 5082
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Date     5083 non-null   object 
 1   Open     5083 non-null   float64
 2   High     5083 non-null   float64
 3   Low      5083 non-null   float64
 4   Close    5083 non-null   float64
 5   Volume   5083 non-null   float64
 6   item_id  5083 non-null   object 
dtypes: float64(5), object(2)
memory usage: 278.1+ KB


In [67]:
def convert_to_chronos_format(data):
    """Return ``data`` reordered and renamed into the long format used for Chronos.

    The result has ``item_id`` and ``timestamp`` first, then ``target``,
    followed by the remaining covariates. ``Date`` is renamed to
    ``timestamp`` and ``Close`` to ``target``; values and dtypes are left
    exactly as they are in ``data`` (no datetime parsing is done here).

    Args:
        data: DataFrame with the columns ``item_id``, ``Date``, ``Close``,
            ``Open``, ``High``, ``Low`` and ``Volume``.

    Returns:
        A new DataFrame with columns ``item_id``, ``timestamp``, ``target``,
        ``Open``, ``High``, ``Low``, ``Volume``. ``data`` is not modified.

    Raises:
        KeyError: If any of the required columns is missing from ``data``.
    """
    # First id and timestamp, then the target, then the additional covariates.
    column_order = ["item_id", "Date", "Close", "Open", "High", "Low", "Volume"]

    # Chain a non-in-place rename: renaming a column subset with inplace=True
    # can emit SettingWithCopyWarning on pandas versions without copy-on-write.
    return data[column_order].rename(
        columns={"Date": "timestamp", "Close": "target"}
    )

#-------#
# Build the Chronos-style frame from the normalized data and print its schema.
chronos_data = convert_to_chronos_format(data_normalized)
chronos_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5083 entries, 0 to 5082
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   item_id    5083 non-null   object 
 1   timestamp  5083 non-null   object 
 2   target     5083 non-null   float64
 3   Open       5083 non-null   float64
 4   High       5083 non-null   float64
 5   Low        5083 non-null   float64
 6   Volume     5083 non-null   float64
dtypes: float64(5), object(2)
memory usage: 278.1+ KB
