In [1]:
pip install ucimlrepo

Note: you may need to restart the kernel to use updated packages.


In [2]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [3]:
# fetch dataset 
metro_interstate_traffic_volume = fetch_ucirepo(id=492) 
  
# data (as pandas dataframes) 
X = metro_interstate_traffic_volume.data.features 
y = metro_interstate_traffic_volume.data.targets   

In [4]:
X.head()

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time
0,,288.28,0.0,0.0,40,Clouds,scattered clouds,2012-10-02 09:00:00
1,,289.36,0.0,0.0,75,Clouds,broken clouds,2012-10-02 10:00:00
2,,289.58,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 11:00:00
3,,290.13,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 12:00:00
4,,291.14,0.0,0.0,75,Clouds,broken clouds,2012-10-02 13:00:00


In [5]:
X

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time
0,,288.28,0.0,0.0,40,Clouds,scattered clouds,2012-10-02 09:00:00
1,,289.36,0.0,0.0,75,Clouds,broken clouds,2012-10-02 10:00:00
2,,289.58,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 11:00:00
3,,290.13,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 12:00:00
4,,291.14,0.0,0.0,75,Clouds,broken clouds,2012-10-02 13:00:00
...,...,...,...,...,...,...,...,...
48199,,283.45,0.0,0.0,75,Clouds,broken clouds,2018-09-30 19:00:00
48200,,282.76,0.0,0.0,90,Clouds,overcast clouds,2018-09-30 20:00:00
48201,,282.73,0.0,0.0,90,Thunderstorm,proximity thunderstorm,2018-09-30 21:00:00
48202,,282.09,0.0,0.0,90,Clouds,overcast clouds,2018-09-30 22:00:00


In [6]:
X.iloc[0,7]

'2012-10-02 09:00:00'

In [7]:
X.iloc[-1,7]

'2018-09-30 23:00:00'

In [8]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48204 entries, 0 to 48203
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   holiday              61 non-null     object 
 1   temp                 48204 non-null  float64
 2   rain_1h              48204 non-null  float64
 3   snow_1h              48204 non-null  float64
 4   clouds_all           48204 non-null  int64  
 5   weather_main         48204 non-null  object 
 6   weather_description  48204 non-null  object 
 7   date_time            48204 non-null  object 
dtypes: float64(3), int64(1), object(4)
memory usage: 2.9+ MB


In [9]:
y.head()

Unnamed: 0,traffic_volume
0,5545
1,4516
2,4767
3,5026
4,4918


In [10]:
y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48204 entries, 0 to 48203
Data columns (total 1 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   traffic_volume  48204 non-null  int64
dtypes: int64(1)
memory usage: 376.7 KB


biến đổi holiday

In [11]:
X['holiday'].unique()

array([nan, 'Columbus Day', 'Veterans Day', 'Thanksgiving Day',
       'Christmas Day', 'New Years Day', 'Washingtons Birthday',
       'Memorial Day', 'Independence Day', 'State Fair', 'Labor Day',
       'Martin Luther King Jr Day'], dtype=object)

In [12]:
# 1. Tạo cột is_holiday (0/1)
X["is_holiday"] = X["holiday"].notna().astype(int)

# 2. Xóa cột holiday gốc
X = X.drop(columns=["holiday"])

biến đổi datetime

In [13]:
# --- 1. Convert datetime ---
date_time_copy = pd.to_datetime(X["date_time"])

In [14]:
start = date_time_copy.min()
end = date_time_copy.max()
print(start, end)

2012-10-02 09:00:00 2018-09-30 23:00:00


In [15]:
start = date_time_copy.min()
end = date_time_copy.max()
print(start, end)

2012-10-02 09:00:00 2018-09-30 23:00:00


In [16]:
total_hours = int((end - start) / pd.Timedelta(hours=1)) + 1
print("Số giờ liên tục dự kiến:", total_hours)


Số giờ liên tục dự kiến: 52551


In [17]:
print("Số bản ghi trong dataset:", len(X))

Số bản ghi trong dataset: 48204


In [18]:
train = X[X["date_time"] < "2018-01-01"]
test  = X[X["date_time"] >= "2018-01-01"]

In [19]:
# --- 0. Convert datetime ---
X["date_time"] = pd.to_datetime(X["date_time"])

In [20]:
# --- Create train/test mask ---
train_mask = X["date_time"] < "2018-01-01"
test_mask  = X["date_time"] >= "2018-01-01"

In [21]:
# --- 1. Extract base time features ---
X["year"] = X["date_time"].dt.year
X["month"] = X["date_time"].dt.month
X["day"] = X["date_time"].dt.day
X["hour"] = X["date_time"].dt.hour
X["day_of_week"] = X["date_time"].dt.dayofweek

# --- 2. Cyclical encoding ---
# Hour
X["hour_sin"] = np.sin(2 * np.pi * X["hour"] / 24)
X["hour_cos"] = np.cos(2 * np.pi * X["hour"] / 24)
# Day of week
X["dow_sin"] = np.sin(2 * np.pi * X["day_of_week"] / 7)
X["dow_cos"] = np.cos(2 * np.pi * X["day_of_week"] / 7)
# Month
X["month_sin"] = np.sin(2 * np.pi * X["month"] / 12)
X["month_cos"] = np.cos(2 * np.pi * X["month"] / 12)

In [22]:
# Combine train rows only
train_full = X[train_mask].copy()
train_full["traffic_volume"] = y[train_mask].values

biến đổi weather và weather discription

In [23]:
X['weather_main'].unique()
X['weather_description'].unique()

array(['scattered clouds', 'broken clouds', 'overcast clouds',
       'sky is clear', 'few clouds', 'light rain',
       'light intensity drizzle', 'mist', 'haze', 'fog',
       'proximity shower rain', 'drizzle', 'moderate rain',
       'heavy intensity rain', 'proximity thunderstorm',
       'thunderstorm with light rain', 'proximity thunderstorm with rain',
       'heavy snow', 'heavy intensity drizzle', 'snow',
       'thunderstorm with heavy rain', 'freezing rain', 'shower snow',
       'light rain and snow', 'light intensity shower rain', 'SQUALLS',
       'thunderstorm with rain', 'proximity thunderstorm with drizzle',
       'thunderstorm', 'Sky is Clear', 'very heavy rain',
       'thunderstorm with light drizzle', 'light snow',
       'thunderstorm with drizzle', 'smoke', 'shower drizzle',
       'light shower snow', 'sleet'], dtype=object)

tính mean traffic volume cho từng mô tả dựa trên dataset và gán score theo đó (data-driven)

tính weather score và fit transform chuẩn hóa trên tập train

In [24]:
# 1. Tính score theo mean traffic_volume
# Compute category mean (score)
weather_score = train_full.groupby("weather_description")["traffic_volume"].mean()
weather_score_map = weather_score.to_dict()

# 3. Gán score vào dataframe
X["weather_score"] = X["weather_description"].map(weather_score_map)

In [25]:
scaler = MinMaxScaler()

# Fit scaler on TRAIN ONLY
scaler.fit(X.loc[train_mask, ["weather_score"]])

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False


In [26]:
# Transform both train & test
X.loc[train_mask, ["weather_score"]] = scaler.transform(X.loc[train_mask, ["weather_score"]])
X.loc[test_mask,  ["weather_score"]] = scaler.transform(X.loc[test_mask,  ["weather_score"]])

In [27]:
X = X.drop(["weather_description", "weather_main", "date_time"], axis=1)

In [28]:
train_X = X[train_mask].copy()
test_X  = X[test_mask].copy()

train_y = y[train_mask].copy()
test_y  = y[test_mask].copy()

áp dụng cho test 

Trong TEST có thể xuất hiện mô tả thời tiết chưa từng xuất hiện trong TRAIN => gán giá trị traffic trung bình của toàn bộ TRAIN

In [29]:
# --- 4. Tạo lag/rolling features trên toàn bộ dữ liệu (train+test) ---
combined_X = pd.concat([train_X, test_X], axis=0)
combined_y = pd.concat([train_y, test_y], axis=0)

# --- Lag features ---
combined_X["traffic_volume_lag1"]   = combined_y.shift(1)
combined_X["traffic_volume_lag24"]  = combined_y.shift(24)
combined_X["traffic_volume_lag168"] = combined_y.shift(168)

# --- Rolling features ---
combined_X["traffic_roll_mean_3"]  = combined_y.shift(1).rolling(window=3).mean()
combined_X["traffic_roll_mean_24"] = combined_y.shift(1).rolling(window=24).mean()
combined_X["traffic_roll_std_24"]  = combined_y.shift(1).rolling(window=24).std()

# --- Drop các hàng NaN do lag/rolling ---
combined_X = combined_X.dropna().reset_index(drop=True)
combined_y = combined_y.loc[combined_X.index].reset_index(drop=True)

train_mask_final = combined_X["year"] < 2018
test_mask_final  = combined_X["year"] >= 2018

X_train = combined_X[train_mask_final].copy()
X_test  = combined_X[test_mask_final].copy()
y_train = combined_y[train_mask_final].copy()
y_test  = combined_y[test_mask_final].copy()

In [30]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40087 entries, 0 to 40086
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   temp                   40087 non-null  float64
 1   rain_1h                40087 non-null  float64
 2   snow_1h                40087 non-null  float64
 3   clouds_all             40087 non-null  int64  
 4   is_holiday             40087 non-null  int64  
 5   year                   40087 non-null  int32  
 6   month                  40087 non-null  int32  
 7   day                    40087 non-null  int32  
 8   hour                   40087 non-null  int32  
 9   day_of_week            40087 non-null  int32  
 10  hour_sin               40087 non-null  float64
 11  hour_cos               40087 non-null  float64
 12  dow_sin                40087 non-null  float64
 13  dow_cos                40087 non-null  float64
 14  month_sin              40087 non-null  float64
 15  month_c

In [31]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7949 entries, 40087 to 48035
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   temp                   7949 non-null   float64
 1   rain_1h                7949 non-null   float64
 2   snow_1h                7949 non-null   float64
 3   clouds_all             7949 non-null   int64  
 4   is_holiday             7949 non-null   int64  
 5   year                   7949 non-null   int32  
 6   month                  7949 non-null   int32  
 7   day                    7949 non-null   int32  
 8   hour                   7949 non-null   int32  
 9   day_of_week            7949 non-null   int32  
 10  hour_sin               7949 non-null   float64
 11  hour_cos               7949 non-null   float64
 12  dow_sin                7949 non-null   float64
 13  dow_cos                7949 non-null   float64
 14  month_sin              7949 non-null   float64
 15  mont

In [32]:
y_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40087 entries, 0 to 40086
Data columns (total 1 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   traffic_volume  40087 non-null  int64
dtypes: int64(1)
memory usage: 626.4 KB


In [33]:
y_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7949 entries, 40087 to 48035
Data columns (total 1 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   traffic_volume  7949 non-null   int64
dtypes: int64(1)
memory usage: 124.2 KB


In [34]:
X_train.head()

Unnamed: 0,temp,rain_1h,snow_1h,clouds_all,is_holiday,year,month,day,hour,day_of_week,...,dow_cos,month_sin,month_cos,weather_score,traffic_volume_lag1,traffic_volume_lag24,traffic_volume_lag168,traffic_roll_mean_3,traffic_roll_mean_24,traffic_roll_std_24
0,288.86,0.0,0.0,75,0,2012,10,9,19,1,...,0.62349,-0.866025,0.5,0.421604,4460.0,4259.0,5545.0,5645.333333,3728.25,2152.352409
1,287.36,0.0,0.0,90,0,2012,10,9,20,1,...,0.62349,-0.866025,0.5,0.359073,3418.0,3069.0,4516.0,4668.333333,3693.208333,2150.18068
2,285.11,0.0,0.0,90,0,2012,10,9,21,1,...,0.62349,-0.866025,0.5,0.359073,2775.0,2378.0,4767.0,3551.0,3680.958333,2154.724223
3,283.46,0.0,0.0,90,0,2012,10,9,22,1,...,0.62349,-0.866025,0.5,0.359073,2306.0,2030.0,5026.0,2833.0,3677.958333,2156.666439
4,282.45,0.0,0.0,90,0,2012,10,9,23,1,...,0.62349,-0.866025,0.5,0.359073,1846.0,1400.0,4918.0,2309.0,3670.291667,2163.096884
