In [3]:
import pandas as pd
import numpy as np

# Read ../data/cleaned.csv and replace its "date" column with parsed
# datetime values in one chained expression; later cells use the .dt
# accessor on this column, which requires a datetime dtype.
df = pd.read_csv("../data/cleaned.csv").assign(
    date=lambda frame: pd.to_datetime(frame["date"])
)

Create Total Dataset (Sum Arrivals per Date)

In [4]:
# One row per distinct date, holding the sum of "arrivals" over every source
# row with that date. as_index=False keeps "date" as a regular column.
# Note: pandas' sum() skips NaN by default, so a date whose arrivals are all
# missing totals 0.0 rather than NaN.
df_total = df.groupby("date", as_index=False)["arrivals"].sum()

# Order chronologically so the row-based lag/rolling features computed later
# look backwards in time.
df_total = df_total.sort_values("date")
df_total.head()

Unnamed: 0,date,arrivals
0,2019-01-01,85874.0
1,2019-02-01,98573.0
2,2019-03-01,89548.0
3,2019-04-01,64667.0
4,2019-05-01,10368.0


Extract Time Features

In [6]:
# Derive calendar components from the datetime "date" column. assign() builds
# a new frame instead of writing into the existing one column by column.
df_total = df_total.assign(
    year=df_total["date"].dt.year,
    month=df_total["date"].dt.month,
)

Add Seasonality Encoding 

In [7]:
# Project the month number onto a circle with a period of 12 so that month 12
# and month 1 end up adjacent; the sine/cosine pair together identifies the
# position on that circle.
month_angle = 2 * np.pi * df_total["month"] / 12
df_total["month_sin"] = np.sin(month_angle)
df_total["month_cos"] = np.cos(month_angle)

Add Lag Features

In [8]:
# Lagged copies of "arrivals": the value 1 row back and 12 rows back.
# shift() moves by ROW position, not by calendar time, so these equal
# "previous month" / "same month last year" only if the frame has exactly one
# row per consecutive month with no gaps -- TODO confirm against the data.
# The first 1 (resp. 12) rows get NaN because no earlier row exists.
arrivals = df_total["arrivals"]
df_total = df_total.assign(
    lag_1=arrivals.shift(periods=1),
    lag_12=arrivals.shift(periods=12),
)

Add Rolling Mean

In [9]:
# Mean of "arrivals" over the three rows immediately BEFORE the current row.
# The shift(1) is deliberate: a plain rolling(3).mean() window includes the
# current row's own "arrivals" value, which would leak the quantity being
# predicted into a feature that sits alongside lag_1 / lag_12.
# The first 3 rows are NaN because a full window of prior rows is required.
# NOTE(review): assumes "arrivals" is the forecasting target and that rows are
# in chronological order with one row per period -- confirm.
df_total["rolling_mean_3"] = (
    df_total["arrivals"]
    .shift(1)
    .rolling(3)
    .mean()
)

Drop Rows with Missing Values

In [10]:
# Remove every row that has a missing value in ANY column. Spelled out with
# the explicit defaults: operate on rows, drop when at least one value is NaN.
# The lag and rolling columns are NaN where no earlier rows exist, so those
# rows go; any row with a NaN in another column is removed as well.
df_total = df_total.dropna(axis="index", how="any")

Save Feature Engineered Dataset

In [11]:
# Write the feature table to disk as CSV. index=False omits the DataFrame's
# row index so the file contains only the data columns.
df_total.to_csv(
    path_or_buf="../data/total_features.csv",
    index=False,
)