## Data Preprocessing for comparison between Amazon Chronos and AutoARIMA

### Step 1: Load the Dataset

In [1]:
import pandas as pd
import numpy as np

# Load the raw readings. '?' marks a missing measurement and becomes NaN.
# Date and Time are read as plain strings and combined explicitly below:
# the nested `parse_dates={...}` form and `infer_datetime_format` are
# deprecated in pandas 2.x and emit warnings on every run.
df = pd.read_csv(
    'household_power_consumption.txt',
    sep=';',
    na_values='?',
    dtype={'Date': str, 'Time': str},
    low_memory=False
)

# Build the datetime index with an explicit format so parsing is
# deterministic and never falls back to month-first guessing.
# NOTE(review): assumes the file stores dates as dd/mm/yyyy and times as
# HH:MM:SS (consistent with the parsed index shown in the output) —
# confirm against the raw file.
timestamps = pd.to_datetime(
    df.pop('Date') + ' ' + df.pop('Time'),
    format='%d/%m/%Y %H:%M:%S'
)
df.index = pd.Index(timestamps, name='datetime')

# Convert all remaining (numeric) columns to float32 and order chronologically
df = df.astype('float32').sort_index()
df

  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(


Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2006-12-16 17:24:00,4.216,0.418,234.839996,18.4,0.0,1.0,17.0
2006-12-16 17:25:00,5.360,0.436,233.630005,23.0,0.0,1.0,16.0
2006-12-16 17:26:00,5.374,0.498,233.289993,23.0,0.0,2.0,17.0
2006-12-16 17:27:00,5.388,0.502,233.740005,23.0,0.0,1.0,17.0
2006-12-16 17:28:00,3.666,0.528,235.679993,15.8,0.0,1.0,17.0
...,...,...,...,...,...,...,...
2010-11-26 20:58:00,0.946,0.000,240.429993,4.0,0.0,0.0,0.0
2010-11-26 20:59:00,0.944,0.000,240.000000,4.0,0.0,0.0,0.0
2010-11-26 21:00:00,0.938,0.000,239.820007,3.8,0.0,0.0,0.0
2010-11-26 21:01:00,0.934,0.000,239.699997,3.8,0.0,0.0,0.0


### Step 2: Resample to Hourly Frequency

We’ll resample the data to hourly frequency by taking the mean of all 1-minute readings per hour.


In [2]:
# Focus only on the target variable, then resample to hourly frequency by
# averaging the readings that fall within each hour.
# Selecting the column first avoids averaging columns that are discarded anyway.
# The lowercase 'h' alias is used because the uppercase 'H' alias is deprecated
# in recent pandas releases and raises a FutureWarning.
df_hourly = df[['Global_active_power']].resample('h').mean()


  df_hourly = df.resample('H').mean()


### Step 3: Handle Missing Values

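Although Chronos can handle missing data, the same cannot be said for AutoARIMA, which can produce unstable or incorrect forecasts when the series contains gaps. We therefore fill the missing values with linear interpolation and drop any rows that are still empty afterwards.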

In [3]:
# Linearly interpolate gaps in the target series, then discard any rows
# that are still NaN after interpolation.
df_hourly = (
    df_hourly
    .assign(
        Global_active_power=lambda frame: frame['Global_active_power'].interpolate(method='linear')
    )
    .dropna()
)


### Step 4: Train/Test Split
Let’s split using 80% train / 20% test, preserving time order.

In [4]:
# Chronological hold-out: the first 80% of rows form the training set and
# the remaining 20% form the test set (no shuffling, so time order is kept).
split_index = int(0.8 * len(df_hourly))
train, test = df_hourly.iloc[:split_index], df_hourly.iloc[split_index:]

# Target series for each partition
y_train, y_test = (part['Global_active_power'] for part in (train, test))


### Step 5: Final Preparation

In [5]:
# Build the long-format frame (item_id, timestamp, target_value) from the
# full hourly series: the datetime index becomes the 'timestamp' column and
# the single target column becomes 'target_value'.
chronos_df = (
    df_hourly
    .reset_index()
    .set_axis(['timestamp', 'target_value'], axis=1)
    .assign(item_id='household_1')
    .loc[:, ['item_id', 'timestamp', 'target_value']]
)

# Persist to disk for the HuggingFace Chronos interface or tokenization
chronos_df.to_csv('chronos_input.csv', index=False)
