In [14]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import datetime

In [15]:
stock_symbol = "AAPL"

data_path = r"../data/raw/AAPL_data.csv"
data = pd.read_csv(data_path)

data

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,2022-03-18 09:30:00-04:00,160.509995,161.919998,159.759995,161.395004,31435630
1,2022-03-18 10:30:00-04:00,161.389999,162.190002,160.820007,161.789993,10494244
2,2022-03-18 11:30:00-04:00,161.779999,162.610001,161.779999,162.580002,9003024
3,2022-03-18 12:30:00-04:00,162.585007,162.899994,162.360001,162.630005,9421564
4,2022-03-18 13:30:00-04:00,162.619995,163.054993,162.244995,162.960007,7858878
...,...,...,...,...,...,...
5080,2025-02-13 10:30:00-05:00,237.895996,241.610001,237.729996,240.479996,8946690
5081,2025-02-13 11:30:00-05:00,240.490295,240.839996,239.820007,240.345001,3794477
5082,2025-02-13 12:30:00-05:00,240.349899,241.350006,240.190002,241.029999,3189003
5083,2025-02-13 13:30:00-05:00,241.039993,242.070007,240.529999,241.769897,3762258


## Pre-Processing the Time Series

#### Requirements for Amazon Chronos:
- No missing values.
- Normalized features.
- Time-based features (lags, moving averages, seasonality).
- Consistent datetime format.

In [17]:
def check_for_missing_values(data):
    missing_values = data.isnull().sum() 
    total_missing = missing_values.sum() 

    if total_missing > 0:
        print("❌ The dataset has missing values:\n")
        print(missing_values[missing_values > 0])
    else:
        print("✅ The data has no missing values.")


check_for_missing_values(data)

✅ The data has no missing values.


In [18]:
from sklearn.preprocessing import MinMaxScaler

def normalize_data(data):
    scaler = MinMaxScaler()

    data[["Open", "High", "Low", "Close", "Volume"]] = scaler.fit_transform(
        data[["Open", "High", "Low", "Close", "Volume"]]
        )
    
    return data

data_normalized = normalize_data(data)

data_normalized

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,2022-03-18 09:30:00-04:00,0.267069,0.273622,0.263942,0.273710,0.550697
1,2022-03-18 10:30:00-04:00,0.273610,0.275620,0.271804,0.276646,0.183841
2,2022-03-18 11:30:00-04:00,0.276508,0.278727,0.278923,0.282518,0.157717
3,2022-03-18 12:30:00-04:00,0.282492,0.280873,0.283225,0.282890,0.165049
4,2022-03-18 13:30:00-04:00,0.282752,0.282020,0.282372,0.285343,0.137674
...,...,...,...,...,...,...
5080,2025-02-13 10:30:00-05:00,0.842250,0.863263,0.842183,0.861528,0.156730
5081,2025-02-13 11:30:00-05:00,0.861532,0.857566,0.857683,0.860525,0.066473
5082,2025-02-13 12:30:00-05:00,0.860489,0.861339,0.860427,0.865616,0.055866
5083,2025-02-13 13:30:00-05:00,0.865618,0.866667,0.862949,0.871116,0.065908


### Feature Engineering

In [19]:
# Moving averages
data_normalized["MA_3"] = data_normalized["Close"].rolling(window=3).mean()
data_normalized["MA_7"] = data_normalized["Close"].rolling(window=7).mean()

# Lag features
data_normalized["Close_Lag_1"] = data_normalized["Close"].shift(1)
data_normalized["Close_Lag_2"] = data_normalized["Close"].shift(2)

# Drop NaN values caused by rolling and lag features
data_normalized.dropna(inplace=True)

# Show processed data
data_normalized

Unnamed: 0,Date,Open,High,Low,Close,Volume,MA_3,MA_7,Close_Lag_1,Close_Lag_2
6,2022-03-18 15:30:00-04:00,0.293975,0.292564,0.291308,0.292629,0.239307,0.290658,0.283963,0.294002,0.285343
7,2022-03-21 09:30:00-04:00,0.293826,0.300777,0.288082,0.287312,0.320497,0.291314,0.285906,0.292629,0.294002
8,2022-03-21 10:30:00-04:00,0.287211,0.304994,0.289602,0.308235,0.265453,0.296059,0.290418,0.287312,0.292629
9,2022-03-21 11:30:00-04:00,0.308134,0.306400,0.305547,0.305077,0.174637,0.300208,0.293641,0.308235,0.287312
10,2022-03-21 12:30:00-04:00,0.305050,0.301184,0.289158,0.290735,0.177918,0.301349,0.294762,0.305077,0.308235
...,...,...,...,...,...,...,...,...,...,...
5080,2025-02-13 10:30:00-05:00,0.842250,0.863263,0.842183,0.861528,0.156730,0.846192,0.835338,0.842203,0.834845
5081,2025-02-13 11:30:00-05:00,0.861532,0.857566,0.857683,0.860525,0.066473,0.854752,0.840706,0.861528,0.842203
5082,2025-02-13 12:30:00-05:00,0.860489,0.861339,0.860427,0.865616,0.055866,0.862556,0.846599,0.860525,0.861528
5083,2025-02-13 13:30:00-05:00,0.865618,0.866667,0.862949,0.871116,0.065908,0.865752,0.852503,0.865616,0.860525


In [27]:
data_normalized.rename(columns={"Date": "timestamp"}, inplace=True)
chronos_data = data_normalized[["timestamp", "Close", "MA_3", "MA_7", "Close_Lag_1", "Close_Lag_2"]]

# Convert timestamp to string format (Chronos requirement)
chronos_data["timestamp"] = chronos_data["timestamp"].astype(str)

os.makedirs("../data/chronos", exist_ok=True)
chronos_data.to_csv(f"../data/chronos/chronos_{stock_symbol}_data.csv", index=False)

print("✅ Data formatted and saved for Chronos training.")

chronos_data

✅ Data formatted and saved for Chronos training.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chronos_data["timestamp"] = chronos_data["timestamp"].astype(str)


Unnamed: 0,timestamp,Close,MA_3,MA_7,Close_Lag_1,Close_Lag_2
6,2022-03-18 15:30:00-04:00,0.292629,0.290658,0.283963,0.294002,0.285343
7,2022-03-21 09:30:00-04:00,0.287312,0.291314,0.285906,0.292629,0.294002
8,2022-03-21 10:30:00-04:00,0.308235,0.296059,0.290418,0.287312,0.292629
9,2022-03-21 11:30:00-04:00,0.305077,0.300208,0.293641,0.308235,0.287312
10,2022-03-21 12:30:00-04:00,0.290735,0.301349,0.294762,0.305077,0.308235
...,...,...,...,...,...,...
5080,2025-02-13 10:30:00-05:00,0.861528,0.846192,0.835338,0.842203,0.834845
5081,2025-02-13 11:30:00-05:00,0.860525,0.854752,0.840706,0.861528,0.842203
5082,2025-02-13 12:30:00-05:00,0.865616,0.862556,0.846599,0.860525,0.861528
5083,2025-02-13 13:30:00-05:00,0.871116,0.865752,0.852503,0.865616,0.860525


@article{ansari2024chronos,
    title={Chronos: Learning the Language of Time Series},
    author={Ansari, Abdul Fatir and Stella, Lorenzo and Turkmen, Caner and Zhang, Xiyuan, and Mercado, Pedro and Shen, Huibin and Shchur, Oleksandr and Rangapuram, Syama Syndar and Pineda Arango, Sebastian and Kapoor, Shubham and Zschiegner, Jasper and Maddix, Danielle C. and Mahoney, Michael W. and Torkkola, Kari and Gordon Wilson, Andrew and Bohlke-Schneider, Michael and Wang, Yuyang},
    journal={Transactions on Machine Learning Research},
    issn={2835-8856},
    year={2024},
    url={https://openreview.net/forum?id=gerNCVqqtR}
}
