In [30]:
pip install ucimlrepo

Note: you may need to restart the kernel to use updated packages.


In [31]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd
import numpy as np

In [32]:
# Fetch dataset 492 from the UCI ML repository (network access required).
metro_interstate_traffic_volume = fetch_ucirepo(id=492)

# Features and target as pandas DataFrames.
# Explicit copies are taken because later cells add and drop columns on X:
# working on independent frames keeps the fetched dataset object untouched and
# guards against pandas' SettingWithCopyWarning on column assignment.
X = metro_interstate_traffic_volume.data.features.copy()
y = metro_interstate_traffic_volume.data.targets.copy()

In [None]:
# Preview the first five rows of the raw feature frame.
X.head()

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time
0,,288.28,0.0,0.0,40,Clouds,scattered clouds,2012-10-02 09:00:00
1,,289.36,0.0,0.0,75,Clouds,broken clouds,2012-10-02 10:00:00
2,,289.58,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 11:00:00
3,,290.13,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 12:00:00
4,,291.14,0.0,0.0,75,Clouds,broken clouds,2012-10-02 13:00:00


In [None]:
# Show a bounded preview of the end of the frame (complements the head() above)
# instead of echoing the whole 48k-row DataFrame.
X.tail()

In [34]:
# date_time of the first row. The column is selected by name rather than by the
# positional index 7, which would silently point at another column if the
# frame's layout changed.
X["date_time"].iloc[0]

'2012-10-02 09:00:00'

In [35]:
# date_time of the last row. The column is selected by name rather than by the
# positional index 7, which would silently point at another column if the
# frame's layout changed.
X["date_time"].iloc[-1]

'2018-09-30 23:00:00'

In [36]:
# Column dtypes and non-null counts of the raw feature frame.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48204 entries, 0 to 48203
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   holiday              61 non-null     object 
 1   temp                 48204 non-null  float64
 2   rain_1h              48204 non-null  float64
 3   snow_1h              48204 non-null  float64
 4   clouds_all           48204 non-null  int64  
 5   weather_main         48204 non-null  object 
 6   weather_description  48204 non-null  object 
 7   date_time            48204 non-null  object 
dtypes: float64(3), int64(1), object(4)
memory usage: 2.9+ MB


In [37]:
# Preview the first five rows of the target frame (single column: traffic_volume).
y.head()

Unnamed: 0,traffic_volume
0,5545
1,4516
2,4767
3,5026
4,4918


In [38]:
# Dtype and non-null count of the target frame.
y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48204 entries, 0 to 48203
Data columns (total 1 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   traffic_volume  48204 non-null  int64
dtypes: int64(1)
memory usage: 376.7 KB


## Biến đổi `holiday` thành cờ nhị phân `is_holiday`

In [39]:
# List the distinct holiday labels; NaN marks rows that carry no label.
X['holiday'].unique()

array([nan, 'Columbus Day', 'Veterans Day', 'Thanksgiving Day',
       'Christmas Day', 'New Years Day', 'Washingtons Birthday',
       'Memorial Day', 'Independence Day', 'State Fair', 'Labor Day',
       'Martin Luther King Jr Day'], dtype=object)

In [40]:
# Only 61 of the 48,204 rows carry a holiday name, which indicates the label
# sits on a single row per holiday rather than on every hour of that day.
# Flagging only labelled rows would therefore mark one hour per holiday, so the
# flag is propagated to every row that shares a calendar date with a labelled row.
calendar_day = pd.to_datetime(X["date_time"]).dt.normalize()
holiday_days = calendar_day[X["holiday"].notna()].unique()

# 1. Create the 0/1 is_holiday column.
# 2. Drop the original holiday column.
# assign/drop return a new frame, so the frame X was bound to is not mutated in place.
X = X.assign(is_holiday=calendar_day.isin(holiday_days).astype(int)).drop(
    columns=["holiday"]
)

## Biến đổi `date_time`: đặc trưng thời gian, mã hóa chu kỳ, lag và rolling

In [41]:
# The lag/rolling features and the final row filter are only meaningful while X
# and y describe the same rows; fail fast if they have diverged (e.g. on a re-run).
assert X.index.equals(y.index), "X and y must share the same index before lagging"

# --- 1. Convert datetime ---
timestamps = pd.to_datetime(X["date_time"])

# Calendar components reused by the cyclical encodings below.
hour = timestamps.dt.hour
day_of_week = timestamps.dt.dayofweek
month = timestamps.dt.month

# Target as a Series so each derived feature is a plain column.
traffic = y["traffic_volume"]
# Rolling statistics are computed on the series shifted by one row so the
# current row's own target never leaks into its features.
past_traffic = traffic.shift(1)

# NOTE(review): shift() moves by rows, not by time. 48,204 rows between
# 2012-10-02 09:00 and 2018-09-30 23:00 is fewer than the ~52.5k hours in that
# span, so "lag24"/"lag168" are not guaranteed to be exactly 24 h / 168 h back.
# TODO confirm whether the series should be put on a regular hourly index first.
X = X.assign(
    date_time=timestamps,
    # --- 2. Base time features ---
    year=timestamps.dt.year,
    month=month,
    day=timestamps.dt.day,
    hour=hour,
    day_of_week=day_of_week,
    # --- 3. Cyclical encoding for hour ---
    hour_sin=np.sin(2 * np.pi * hour / 24),
    hour_cos=np.cos(2 * np.pi * hour / 24),
    # --- 4. Cyclical encoding for day_of_week ---
    dow_sin=np.sin(2 * np.pi * day_of_week / 7),
    dow_cos=np.cos(2 * np.pi * day_of_week / 7),
    # --- 5. Cyclical encoding for month ---
    month_sin=np.sin(2 * np.pi * month / 12),
    month_cos=np.cos(2 * np.pi * month / 12),
    # --- 6. Lag features (traffic_volume from y) ---
    traffic_volume_lag1=past_traffic,
    traffic_volume_lag24=traffic.shift(24),
    traffic_volume_lag168=traffic.shift(168),
    # --- 7. Rolling features ---
    traffic_roll_mean_3=past_traffic.rolling(window=3).mean(),
    traffic_roll_mean_24=past_traffic.rolling(window=24).mean(),
    traffic_roll_std_24=past_traffic.rolling(window=24).std(),
)

# --- 8. Drop rows left incomplete by lagging/rolling ---
# The same row mask is applied to X and y before re-indexing. Dropping rows from
# X alone would leave y longer than X and shift every feature row against its
# target. y is trimmed here, so later cells must not trim it again.
complete_rows = X.notna().all(axis=1)
X = X.loc[complete_rows].reset_index(drop=True)
y = y.loc[complete_rows].reset_index(drop=True)

In [42]:
# Drop the raw timestamp column. Rebinding the result is used instead of
# inplace=True, which offers no benefit and hides the mutation from the reader.
X = X.drop(columns=["date_time"])

In [43]:
# Preview the first five rows of the engineered feature frame.
X.head()

Unnamed: 0,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,is_holiday,year,month,day,...,dow_sin,dow_cos,month_sin,month_cos,traffic_volume_lag1,traffic_volume_lag24,traffic_volume_lag168,traffic_roll_mean_3,traffic_roll_mean_24,traffic_roll_std_24
0,288.86,0.0,0.0,75,Clouds,broken clouds,0,2012,10,9,...,0.781831,0.62349,-0.866025,0.5,4460.0,4259.0,5545.0,5645.333333,3728.25,2152.352409
1,287.36,0.0,0.0,90,Clouds,overcast clouds,0,2012,10,9,...,0.781831,0.62349,-0.866025,0.5,3418.0,3069.0,4516.0,4668.333333,3693.208333,2150.18068
2,285.11,0.0,0.0,90,Clouds,overcast clouds,0,2012,10,9,...,0.781831,0.62349,-0.866025,0.5,2775.0,2378.0,4767.0,3551.0,3680.958333,2154.724223
3,283.46,0.0,0.0,90,Clouds,overcast clouds,0,2012,10,9,...,0.781831,0.62349,-0.866025,0.5,2306.0,2030.0,5026.0,2833.0,3677.958333,2156.666439
4,282.45,0.0,0.0,90,Clouds,overcast clouds,0,2012,10,9,...,0.781831,0.62349,-0.866025,0.5,1846.0,1400.0,4918.0,2309.0,3670.291667,2163.096884


In [44]:
# Column dtypes and non-null counts after feature engineering.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48036 entries, 0 to 48035
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   temp                   48036 non-null  float64
 1   rain_1h                48036 non-null  float64
 2   snow_1h                48036 non-null  float64
 3   clouds_all             48036 non-null  int64  
 4   weather_main           48036 non-null  object 
 5   weather_description    48036 non-null  object 
 6   is_holiday             48036 non-null  int64  
 7   year                   48036 non-null  int32  
 8   month                  48036 non-null  int32  
 9   day                    48036 non-null  int32  
 10  hour                   48036 non-null  int32  
 11  day_of_week            48036 non-null  int32  
 12  hour_sin               48036 non-null  float64
 13  hour_cos               48036 non-null  float64
 14  dow_sin                48036 non-null  float64
 15  do