In [1]:
# importing basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time
from tqdm import tqdm

# importing libraries for data preprocessing
from sklearn.model_selection import train_test_split

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Load the data

In [3]:
data_path = '/kaggle/input/mn-traffic-processed/'
data = pd.read_csv(data_path + 'processed_data.csv')
data.head()

Unnamed: 0,station_id,dir_of_travel,lane_of_travel,date,hour,volume,Maximum Temperature degrees (F),Minimum Temperature degrees (F),Precipitation (inches),Snow (inches),Snow Depth (inches),wdir,wspd,pres,is_holiday
0,26,1,1,2019-01-01,1,47,11.0,0.0,0.0,0.0,0.0,305.0,15.3,1031.2,1
1,26,1,1,2019-01-02,1,59,26.0,2.0,0.0,0.0,0.0,207.0,17.6,1017.7,0
2,26,1,1,2019-01-03,1,79,41.0,20.0,0.0,0.0,0.0,223.0,13.8,1013.15,0
3,26,1,1,2019-01-04,1,74,47.0,28.0,0.0,0.0,0.0,239.0,10.0,1008.6,0
4,26,1,1,2019-01-05,1,97,47.0,27.0,0.0,0.0,0.0,286.0,17.2,1014.1,0


# Date Manipulation

In [4]:
# stations_needed = [10390, 11517, 11236, 70413, 11228, 11196, 11205, 10310, 11191, 42507, 11238, 10730, 10794, 10800, 10808, 11179, 11280, 10069, 11726, 11510, 11283, 11516, 10919, 10899, 11464, 11273, 10840, 11747, 11749, 10830, 10206, 10205, 405, 336, 425, 389, 301, 303, 464, 10398]
# len(stations_needed)

In [5]:
data.shape

(21409776, 15)

In [6]:
data['station_id'].nunique() 

155

In [7]:
grouped_station = data.groupby('station_id')['date'].min()

In [8]:
data.shape

(21409776, 15)

In [9]:
# A need to remove some stations, not enough data for them
stations_to_remove = list(grouped_station[grouped_station>='2021-01-01'].index)
mask_stations = data['station_id'].isin(stations_to_remove)
data = data[~mask_stations]
data.shape

(19738896, 15)

In [10]:
grouped_station_dir_lane = data.groupby(['station_id', 'dir_of_travel', 'lane_of_travel'])['date'].min()
grouped_station_dir_lane.sort_values(ascending=False)

station_id  dir_of_travel  lane_of_travel
45          3              2                 2023-01-01
            7              2                 2023-01-01
6461        3              3                 2022-10-01
            7              3                 2022-10-01
10398       7              3                 2022-03-01
11516       5              3                 2022-01-03
            1              3                 2022-01-03
11510       5              4                 2022-01-01
            1              4                 2022-01-01
9110        5              2                 2020-05-27
                           1                 2020-05-27
            1              2                 2020-05-27
                           1                 2020-05-27
3467        7              2                 2020-01-29
                           1                 2020-01-29
            3              2                 2020-01-29
                           1                 2020-01-29
2450  

In [11]:
# Same problem as before
unique_id_to_remove = list(grouped_station_dir_lane[grouped_station_dir_lane>='2021-01-01'].index)
df_ids_to_remove = pd.DataFrame(unique_id_to_remove, columns=['station_id', 'dir_of_travel', 'lane_of_travel'])
data = pd.merge(data, df_ids_to_remove, on=['station_id', 'dir_of_travel', 'lane_of_travel'], how='left', indicator=True)
data = data[data['_merge']=='left_only']
data.drop('_merge', axis=1, inplace=True)
data.shape

(19613352, 15)

In [12]:
# station_counts = data['station_id'].value_counts()

In [13]:
# # Remove the stations with more of the same data if they are same number of records 
# mask = station_counts.duplicated(keep='first') & station_counts.duplicated(keep='last')
# stations_to_keep = station_counts[~mask].index
# mask_stations = data['station_id'].isin(stations_to_keep)
# data = data[mask_stations]
# data.shape

In [14]:
stations_needed = [10390, 11517, 11236, 70413, 11228, 11196, 11205, 10310, 11191, 42507, 11238, 10730, 10794, 10800, 10808, 11179, 11280, 10069, 11726, 11510, 11283, 11516, 10919, 10899, 11464, 11273, 10840, 11747, 11749, 10830, 10206, 10205, 405, 336, 425, 389, 301, 303, 464, 10398]
len(stations_needed)
# Choose the stations within Minnepolis/St. Paul to avoid any RAM problems
mask_stations = data['station_id'].isin(stations_needed)
data = data[mask_stations]
data.shape

(8167560, 15)

In [15]:
# # Choose the stations within Minnepolis/St. Paul to avoid any RAM problems
# mask_stations = data['station_id'].isin(stations_needed)
# data = data[mask_stations]
# data.shape

In [16]:
mask = data['hour']==24
data[mask].head()

Unnamed: 0,station_id,dir_of_travel,lane_of_travel,date,hour,volume,Maximum Temperature degrees (F),Minimum Temperature degrees (F),Precipitation (inches),Snow (inches),Snow Depth (inches),wdir,wspd,pres,is_holiday
19141263,301,3,1,2020-01-02,24,315,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0
19141264,301,3,1,2020-01-03,24,385,36.0,24.0,0.0,0.0,4.0,238.0,12.6,1011.7,0
19141265,301,3,1,2020-01-04,24,404,30.0,24.0,0.0,0.0,4.0,226.0,15.9,1018.9,0
19141266,301,3,1,2020-01-05,24,221,38.0,24.0,0.01,0.0,4.0,274.0,28.5,1014.9,0
19141267,301,3,1,2020-01-06,24,279,36.0,22.0,0.0,0.0,3.0,231.0,18.2,1019.3,0


In [17]:
# Creating the day of the week, month, and year columns
data.loc[mask, 'hour'] = 0
data.loc[mask, 'date'] = pd.to_datetime(data.loc[mask, 'date']) + pd.DateOffset(days=1)

data['day_of_week'] = pd.to_datetime(data['date']).dt.dayofweek
data['day'] = pd.to_datetime(data['date']).dt.day
data['week'] = pd.to_datetime(data['date']).dt.isocalendar().week
data['month'] = pd.to_datetime(data['date']).dt.month
data['year'] = pd.to_datetime(data['date']).dt.year

data.head()

Unnamed: 0,station_id,dir_of_travel,lane_of_travel,date,hour,volume,Maximum Temperature degrees (F),Minimum Temperature degrees (F),Precipitation (inches),Snow (inches),Snow Depth (inches),wdir,wspd,pres,is_holiday,day_of_week,day,week,month,year
224821,301,3,1,2020-01-02,1,135,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020
224822,301,3,1,2020-01-03,1,164,36.0,24.0,0.0,0.0,4.0,238.0,12.6,1011.7,0,4,3,1,1,2020
224823,301,3,1,2020-01-04,1,291,30.0,24.0,0.0,0.0,4.0,226.0,15.9,1018.9,0,5,4,1,1,2020
224824,301,3,1,2020-01-05,1,281,38.0,24.0,0.01,0.0,4.0,274.0,28.5,1014.9,0,6,5,1,1,2020
224825,301,3,1,2020-01-06,1,146,36.0,22.0,0.0,0.0,3.0,231.0,18.2,1019.3,0,0,6,2,1,2020


In [18]:
# Prevent overload in RAM
years_remove = [2019]
mask = data['year'].isin(years_remove)
data = data[~mask]
data.shape

(8098256, 20)

In [19]:
# Calculate the final number of features
data['station_id'].nunique() + data['dir_of_travel'].nunique() + data['lane_of_travel'].nunique() 

49

In [20]:
mask = data['hour']==0
data[mask].head()

Unnamed: 0,station_id,dir_of_travel,lane_of_travel,date,hour,volume,Maximum Temperature degrees (F),Minimum Temperature degrees (F),Precipitation (inches),Snow (inches),Snow Depth (inches),wdir,wspd,pres,is_holiday,day_of_week,day,week,month,year
19141263,301,3,1,2020-01-03 00:00:00,0,315,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,4,3,1,1,2020
19141264,301,3,1,2020-01-04 00:00:00,0,385,36.0,24.0,0.0,0.0,4.0,238.0,12.6,1011.7,0,5,4,1,1,2020
19141265,301,3,1,2020-01-05 00:00:00,0,404,30.0,24.0,0.0,0.0,4.0,226.0,15.9,1018.9,0,6,5,1,1,2020
19141266,301,3,1,2020-01-06 00:00:00,0,221,38.0,24.0,0.01,0.0,4.0,274.0,28.5,1014.9,0,0,6,2,1,2020
19141267,301,3,1,2020-01-07 00:00:00,0,279,36.0,22.0,0.0,0.0,3.0,231.0,18.2,1019.3,0,1,7,2,1,2020


In [21]:
data['datetime'] = pd.to_datetime(data['date']) + pd.to_timedelta(data['hour'], unit='h')
data.head()

Unnamed: 0,station_id,dir_of_travel,lane_of_travel,date,hour,volume,Maximum Temperature degrees (F),Minimum Temperature degrees (F),Precipitation (inches),Snow (inches),Snow Depth (inches),wdir,wspd,pres,is_holiday,day_of_week,day,week,month,year,datetime
224821,301,3,1,2020-01-02,1,135,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020,2020-01-02 01:00:00
224822,301,3,1,2020-01-03,1,164,36.0,24.0,0.0,0.0,4.0,238.0,12.6,1011.7,0,4,3,1,1,2020,2020-01-03 01:00:00
224823,301,3,1,2020-01-04,1,291,30.0,24.0,0.0,0.0,4.0,226.0,15.9,1018.9,0,5,4,1,1,2020,2020-01-04 01:00:00
224824,301,3,1,2020-01-05,1,281,38.0,24.0,0.01,0.0,4.0,274.0,28.5,1014.9,0,6,5,1,1,2020,2020-01-05 01:00:00
224825,301,3,1,2020-01-06,1,146,36.0,22.0,0.0,0.0,3.0,231.0,18.2,1019.3,0,0,6,2,1,2020,2020-01-06 01:00:00


In [22]:
mask = data['hour']==5
data[mask].head()

Unnamed: 0,station_id,dir_of_travel,lane_of_travel,date,hour,volume,Maximum Temperature degrees (F),Minimum Temperature degrees (F),Precipitation (inches),Snow (inches),Snow Depth (inches),wdir,wspd,pres,is_holiday,day_of_week,day,week,month,year,datetime
3514637,301,3,1,2020-01-02,5,76,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020,2020-01-02 05:00:00
3514638,301,3,1,2020-01-03,5,80,36.0,24.0,0.0,0.0,4.0,238.0,12.6,1011.7,0,4,3,1,1,2020,2020-01-03 05:00:00
3514639,301,3,1,2020-01-04,5,60,30.0,24.0,0.0,0.0,4.0,226.0,15.9,1018.9,0,5,4,1,1,2020,2020-01-04 05:00:00
3514640,301,3,1,2020-01-05,5,70,38.0,24.0,0.01,0.0,4.0,274.0,28.5,1014.9,0,6,5,1,1,2020,2020-01-05 05:00:00
3514641,301,3,1,2020-01-06,5,71,36.0,22.0,0.0,0.0,3.0,231.0,18.2,1019.3,0,0,6,2,1,2020,2020-01-06 05:00:00


In [23]:
data['day_of_week'].value_counts().sort_index()

day_of_week
0    1153383
1    1153152
2    1157001
3    1158632
4    1158696
5    1158696
6    1158696
Name: count, dtype: int64

In [24]:
data['day'].value_counts().sort_index()

day
1     264648
2     266048
3     266112
4     266112
5     266112
6     266112
7     266112
8     266112
9     266112
10    266112
11    266112
12    266112
13    266112
14    266112
15    266112
16    266112
17    266112
18    266112
19    266112
20    266112
21    266112
22    266112
23    266112
24    266112
25    266112
26    266112
27    266112
28    266112
29    249480
30    243936
31    155232
Name: count, dtype: int64

In [25]:
data['year'].value_counts().sort_index()

year
2020    2027345
2021    2023560
2022    2023560
2023    2023560
2024        231
Name: count, dtype: int64

# Data Type Conversion

In [26]:
data.dtypes

station_id                                  int64
dir_of_travel                               int64
lane_of_travel                              int64
date                                       object
hour                                        int64
volume                                      int64
Maximum Temperature degrees (F)           float64
Minimum Temperature degrees (F)           float64
Precipitation (inches)                    float64
Snow (inches)                             float64
Snow Depth (inches)                       float64
wdir                                      float64
wspd                                      float64
pres                                      float64
is_holiday                                  int64
day_of_week                                 int32
day                                         int32
week                                       UInt32
month                                       int32
year                                        int32


In [27]:
data['station_id'] = data['station_id'].astype('category')
data['dir_of_travel'] = data['dir_of_travel'].astype('category')
data['lane_of_travel'] = data['lane_of_travel'].astype('category')

data.dtypes

station_id                               category
dir_of_travel                            category
lane_of_travel                           category
date                                       object
hour                                        int64
volume                                      int64
Maximum Temperature degrees (F)           float64
Minimum Temperature degrees (F)           float64
Precipitation (inches)                    float64
Snow (inches)                             float64
Snow Depth (inches)                       float64
wdir                                      float64
wspd                                      float64
pres                                      float64
is_holiday                                  int64
day_of_week                                 int32
day                                         int32
week                                       UInt32
month                                       int32
year                                        int32


In [28]:
data.head()

Unnamed: 0,station_id,dir_of_travel,lane_of_travel,date,hour,volume,Maximum Temperature degrees (F),Minimum Temperature degrees (F),Precipitation (inches),Snow (inches),Snow Depth (inches),wdir,wspd,pres,is_holiday,day_of_week,day,week,month,year,datetime
224821,301,3,1,2020-01-02,1,135,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020,2020-01-02 01:00:00
224822,301,3,1,2020-01-03,1,164,36.0,24.0,0.0,0.0,4.0,238.0,12.6,1011.7,0,4,3,1,1,2020,2020-01-03 01:00:00
224823,301,3,1,2020-01-04,1,291,30.0,24.0,0.0,0.0,4.0,226.0,15.9,1018.9,0,5,4,1,1,2020,2020-01-04 01:00:00
224824,301,3,1,2020-01-05,1,281,38.0,24.0,0.01,0.0,4.0,274.0,28.5,1014.9,0,6,5,1,1,2020,2020-01-05 01:00:00
224825,301,3,1,2020-01-06,1,146,36.0,22.0,0.0,0.0,3.0,231.0,18.2,1019.3,0,0,6,2,1,2020,2020-01-06 01:00:00


In [29]:
df = data.drop('date', axis=1)
df.head()

Unnamed: 0,station_id,dir_of_travel,lane_of_travel,hour,volume,Maximum Temperature degrees (F),Minimum Temperature degrees (F),Precipitation (inches),Snow (inches),Snow Depth (inches),wdir,wspd,pres,is_holiday,day_of_week,day,week,month,year,datetime
224821,301,3,1,1,135,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020,2020-01-02 01:00:00
224822,301,3,1,1,164,36.0,24.0,0.0,0.0,4.0,238.0,12.6,1011.7,0,4,3,1,1,2020,2020-01-03 01:00:00
224823,301,3,1,1,291,30.0,24.0,0.0,0.0,4.0,226.0,15.9,1018.9,0,5,4,1,1,2020,2020-01-04 01:00:00
224824,301,3,1,1,281,38.0,24.0,0.01,0.0,4.0,274.0,28.5,1014.9,0,6,5,1,1,2020,2020-01-05 01:00:00
224825,301,3,1,1,146,36.0,22.0,0.0,0.0,3.0,231.0,18.2,1019.3,0,0,6,2,1,2020,2020-01-06 01:00:00


# Data Preprocessing

In [30]:
# Checking for missing values
df.isnull().sum().sum()

0

In [31]:
df.sort_values(['station_id', 'dir_of_travel', 'lane_of_travel', 'datetime'], inplace=True)
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,station_id,dir_of_travel,lane_of_travel,hour,volume,Maximum Temperature degrees (F),Minimum Temperature degrees (F),Precipitation (inches),Snow (inches),Snow Depth (inches),wdir,wspd,pres,is_holiday,day_of_week,day,week,month,year,datetime
0,301,3,1,1,135,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020,2020-01-02 01:00:00
1,301,3,1,2,89,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020,2020-01-02 02:00:00
2,301,3,1,3,61,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020,2020-01-02 03:00:00
3,301,3,1,4,37,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020,2020-01-02 04:00:00
4,301,3,1,5,76,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020,2020-01-02 05:00:00


In [32]:
# Splitting the dataset into 3 sets: train, validation, and test
# mask_station = df['station_id'] == 10800
train_set = df[(df['year'] < 2023)]
valid_test = df[(df['year'] >= 2023)]
valid_set = valid_test[valid_test['month'] <= 6]
test_set = valid_test[valid_test['month'] > 6]

In [33]:
train_set.shape, valid_set.shape, test_set.shape

((6074465, 20), (1003695, 20), (1020096, 20))

In [34]:
mask_station = train_set['station_id'] == 301
mask_dir = train_set['dir_of_travel'] == 3
mask_lane = train_set['lane_of_travel'] == 1
train_set[mask_station & mask_dir & mask_lane].head()

Unnamed: 0,station_id,dir_of_travel,lane_of_travel,hour,volume,Maximum Temperature degrees (F),Minimum Temperature degrees (F),Precipitation (inches),Snow (inches),Snow Depth (inches),wdir,wspd,pres,is_holiday,day_of_week,day,week,month,year,datetime
0,301,3,1,1,135,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020,2020-01-02 01:00:00
1,301,3,1,2,89,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020,2020-01-02 02:00:00
2,301,3,1,3,61,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020,2020-01-02 03:00:00
3,301,3,1,4,37,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020,2020-01-02 04:00:00
4,301,3,1,5,76,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020,2020-01-02 05:00:00


In [35]:
data.columns

Index(['station_id', 'dir_of_travel', 'lane_of_travel', 'date', 'hour',
       'volume', 'Maximum Temperature degrees (F)',
       'Minimum Temperature degrees (F)', 'Precipitation (inches)',
       'Snow (inches)', 'Snow Depth (inches)', 'wdir', 'wspd', 'pres',
       'is_holiday', 'day_of_week', 'day', 'week', 'month', 'year',
       'datetime'],
      dtype='object')

# Model Building

In [36]:
# importing libraries for machine learning models
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score, mean_absolute_error

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, PredefinedSplit
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

### Preparing the data

In [37]:
# Creating lag features
train_set['volume_lag1'] = train_set.groupby(['station_id', 'dir_of_travel', 'lane_of_travel'])['volume'].shift(1)
train_set['volume_lag2'] = train_set.groupby(['station_id', 'dir_of_travel', 'lane_of_travel'])['volume'].shift(2)
train_set['volume_lag3'] = train_set.groupby(['station_id', 'dir_of_travel', 'lane_of_travel'])['volume'].shift(3)
train_set.head()

  train_set['volume_lag1'] = train_set.groupby(['station_id', 'dir_of_travel', 'lane_of_travel'])['volume'].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_set['volume_lag1'] = train_set.groupby(['station_id', 'dir_of_travel', 'lane_of_travel'])['volume'].shift(1)
  train_set['volume_lag2'] = train_set.groupby(['station_id', 'dir_of_travel', 'lane_of_travel'])['volume'].shift(2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_set['volume_lag2'] = train_set.groupby(['station_id', 'dir_of_travel', 'lane_of_travel'])['volume'].shift(2)
  trai

Unnamed: 0,station_id,dir_of_travel,lane_of_travel,hour,volume,Maximum Temperature degrees (F),Minimum Temperature degrees (F),Precipitation (inches),Snow (inches),Snow Depth (inches),wdir,wspd,pres,is_holiday,day_of_week,day,week,month,year,datetime,volume_lag1,volume_lag2,volume_lag3
0,301,3,1,1,135,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020,2020-01-02 01:00:00,,,
1,301,3,1,2,89,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020,2020-01-02 02:00:00,135.0,,
2,301,3,1,3,61,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020,2020-01-02 03:00:00,89.0,135.0,
3,301,3,1,4,37,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020,2020-01-02 04:00:00,61.0,89.0,135.0
4,301,3,1,5,76,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020,2020-01-02 05:00:00,37.0,61.0,89.0


In [38]:
valid_set['volume_lag1'] = valid_set.groupby(['station_id', 'dir_of_travel', 'lane_of_travel'])['volume'].shift(1)
valid_set['volume_lag2'] = valid_set.groupby(['station_id', 'dir_of_travel', 'lane_of_travel'])['volume'].shift(2)
valid_set['volume_lag3'] = valid_set.groupby(['station_id', 'dir_of_travel', 'lane_of_travel'])['volume'].shift(3)

test_set['volume_lag1'] = test_set.groupby(['station_id', 'dir_of_travel', 'lane_of_travel'])['volume'].shift(1)
test_set['volume_lag2'] = test_set.groupby(['station_id', 'dir_of_travel', 'lane_of_travel'])['volume'].shift(2)
test_set['volume_lag3'] = test_set.groupby(['station_id', 'dir_of_travel', 'lane_of_travel'])['volume'].shift(3)

  valid_set['volume_lag1'] = valid_set.groupby(['station_id', 'dir_of_travel', 'lane_of_travel'])['volume'].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_set['volume_lag1'] = valid_set.groupby(['station_id', 'dir_of_travel', 'lane_of_travel'])['volume'].shift(1)
  valid_set['volume_lag2'] = valid_set.groupby(['station_id', 'dir_of_travel', 'lane_of_travel'])['volume'].shift(2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_set['volume_lag2'] = valid_set.groupby(['station_id', 'dir_of_travel', 'lane_of_travel'])['volume'].shift(2)
  vali

In [39]:
# Readying the data
# Splitting the data to X and y
X_train = train_set.drop('volume', axis=1)
X_train = X_train.drop(['datetime'], axis=1)
y_train = train_set['volume']

X_valid = valid_set.drop('volume', axis=1)
X_valid = X_valid.drop(['datetime'], axis=1)
y_valid = valid_set['volume']

X_test = test_set.drop('volume', axis=1)
X_test = X_test.drop(['datetime'], axis=1)
y_test = test_set['volume']

In [40]:
train_set.shape, valid_set.shape, test_set.shape

((6074465, 23), (1003695, 23), (1020096, 23))

In [41]:
from sklearn.preprocessing import FunctionTransformer


def sin_transformer(period):
    return FunctionTransformer(lambda x: np.sin(x / period * 2 * np.pi))


def cos_transformer(period):
    return FunctionTransformer(lambda x: np.cos(x / period * 2 * np.pi))

### Helper Function for prediction

In [42]:
def predict_ml(pipeline_model, X):
    pred = []
    X['volume_lag1'] = 0
    X['volume_lag2'] = 0
    X['volume_lag3'] = 0
    
    for i in range(len(X)):
        predicition = pipeline_model.predict(X.iloc[[i]])[0]
        
        X.loc[i+1, 'volume_lag1'] = predicition
        pred.append(predicition)
        
        if i > 0:
            X.loc[i+1, 'volume_lag2'] = X.loc[i, 'volume_lag1']
        
        if i > 1:
            X.loc[i+1, 'volume_lag3'] = X.loc[i, 'volume_lag2']
        
    
    return pred

In [43]:
def calculate_mape(y_actual, y_pred, epsilon=1e-5):
    y_a = y_actual.reset_index(drop=True)[y_actual.reset_index(drop=True)!=0]
    y_p = y_pred.reset_index(drop=True)[y_actual.reset_index(drop=True)!=0]
    
    return mean_absolute_percentage_error(y_a, y_p)

## Machine Learning Models

In [44]:
results = {}

results['train'] = {}
results['valid'] = {}

results

{'train': {}, 'valid': {}}

### XGBoost Regressor

In [45]:
# # Defining the columns
# cat_cols = ['station_id', 'dir_of_travel', 'lane_of_travel']
# num_cols = ['Maximum Temperature degrees (F)', 'Minimum Temperature degrees (F)', 'Precipitation (inches)', 
#             'Snow (inches)', 'Snow Depth (inches)', 'wdir', 'wspd', 'pres', 'year', 'day']
# trig_cols = ['hour', 'day_of_week', 'week', 'month']
# pass_cols = ['is_holiday']
# # pass_cols = ['hour', 'day_of_week', 'week', 'month', 'is_holiday']
# target_col = 'volume'


# # Adding the lag features to the columns
# num_cols += ['volume_lag1', 'volume_lag2', 'volume_lag3']


# # Transforming the categorical data
# cat_transformer = Pipeline(steps=[
#     ('onehot', OneHotEncoder(handle_unknown='ignore'))
# ])

# # Transforming the numerical data
# num_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
#     ('scaler', StandardScaler())
# ])

# # Transforming the date cols to trig data
# trig_transformer = ColumnTransformer(transformers=[
#     ('hour_sin', sin_transformer(24), ['hour']),
#     ('hour_cos', cos_transformer(24), ['hour']),
#     ('day_of_week_sin', sin_transformer(7), ['day_of_week']),
#     ('day_of_week_cos', cos_transformer(7), ['day_of_week']),
#     ('week_sin', sin_transformer(53), ['week']),
#     ('week_cos', cos_transformer(53), ['week']),
#     ('month_sin', sin_transformer(12), ['month']),
#     ('month_cos', cos_transformer(12), ['month'])
# ])


# # Combining the transformers
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', num_transformer, num_cols),
#         ('cat', cat_transformer, cat_cols),
#         ('trig', trig_transformer, trig_cols),
#         ('pass', 'passthrough',  pass_cols)
#     ])

In [46]:
# # Defining the model
# xgb = XGBRegressor(n_estimators=100, seed=42, verbosity=2)

# # Defining the pipeline
# xgb_pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('model', xgb)
# ], verbose=True)

In [47]:
# # fitting the model
# xgb_pipeline.fit(X_train, y_train)

In [48]:
# # predicting the values
# y_train_pred = xgb_pipeline.predict(X_train)
# y_valid_pred = xgb_pipeline.predict(X_valid)

In [49]:
# y_train_pred = pd.DataFrame(y_train_pred)
# y_train_pred.head()

In [50]:
# y_train.head()

In [51]:
# y_valid_pred = pd.DataFrame(y_valid_pred)
# y_valid_pred.head()

In [52]:
# y_valid.head()

In [53]:
# # Calculating the metrics
# train_mae = mean_absolute_error(y_train, y_train_pred)
# train_rmse = mean_squared_error(y_train, y_train_pred, squared=False)
# train_mape = calculate_mape(y_train, y_train_pred)
# train_r2 = r2_score(y_train, y_train_pred)

# valid_mae = mean_absolute_error(y_valid, y_valid_pred)
# valid_rmse = mean_squared_error(y_valid, y_valid_pred, squared=False)
# valid_mape = calculate_mape(y_valid, y_valid_pred)
# valid_r2 = r2_score(y_valid, y_valid_pred)

# print(f'Train MAE: {train_mae}, Train RMSE: {train_rmse}, Train MAPE: {train_mape}, Train R2: {train_r2}')
# print(f'Validation MAE: {valid_mae}, Validation RMSE: {valid_rmse}, Validation MAPE: {valid_mape}, Validation R2: {valid_r2}')

In [54]:
# results['train']['rmse'] = [train_rmse]
# results['train']['mape'] = [train_rmse]
# results['train']['r2'] = [train_rmse]

# results['valid']['rmse'] = [train_rmse]
# results['valid']['mape'] = [train_rmse]
# results['valid']['r2'] = [train_rmse]

# results

In [55]:
# # Plotting the predictions
# plt.figure(figsize=(12, 6))
# plt.plot(y_valid.reset_index(drop=True), label='Actual')
# plt.plot(y_valid_pred, label='Predicted')
# plt.legend()
# plt.show()

In [56]:
# # Plotting for one station, and direction
# mask_station = X_valid.reset_index(drop=True)['station_id']==10800
# mask_dir = X_valid.reset_index(drop=True)['dir_of_travel']==3
# mask_lane = X_valid.reset_index(drop=True)['lane_of_travel']==1
# mask = mask_station & mask_dir & mask_lane

# plt.figure(figsize=(12, 6))
# plt.plot(y_valid.reset_index(drop=True)[mask], label='Actual')
# plt.plot(y_valid_pred[mask], label='Predicted')
# plt.legend()
# plt.show()

In [57]:
# # Plotting for one station, and direction
# mask_station = X_valid.reset_index(drop=True)['station_id']==10800
# mask_dir = X_valid.reset_index(drop=True)['dir_of_travel']==3
# mask_lane = X_valid.reset_index(drop=True)['lane_of_travel']==1
# mask_year = X_valid.reset_index(drop=True)['year']==2023
# mask_month = X_valid.reset_index(drop=True)['month']==5
# mask_day = X_valid.reset_index(drop=True)['day']==1
# mask = mask_station & mask_dir & mask_lane & mask_year & mask_month & mask_day

# plt.figure(figsize=(12, 6))
# plt.plot(y_valid.reset_index(drop=True)[mask], label='Actual')
# plt.plot(y_valid_pred[mask], label='Predicted')
# plt.legend()
# plt.show()

In [58]:
# X_valid.reset_index(drop=True)[mask].shape

In [59]:
# time.sleep(30)

### XGBoost Regressor with PCA

In [104]:
from sklearn.decomposition import PCA

In [103]:
# pca = PCA()
# weather_features = ['Maximum Temperature degrees (F)', 'Minimum Temperature degrees (F)', 'Precipitation (inches)', 
#                     'Snow (inches)', 'Snow Depth (inches)', 'wdir', 'wspd', 'pres']
# pca.fit(X_train[weather_features])
# var = pca.explained_variance_
# var

NameError: name 'PCA' is not defined

In [62]:
# plt.plot(var)
# plt.show()

In [63]:
# pca = PCA(n_components=3)
# X_train[['pca_1', 'pca_2', 'pca_3']] = pca.fit_transform(X_train[weather_features])

In [64]:
# X_valid[['pca_1', 'pca_2', 'pca_3']] = pca.transform(X_valid[weather_features])
# X_test[['pca_1', 'pca_2', 'pca_3']] = pca.transform(X_test[weather_features])

In [65]:
# X_train.shape

In [66]:
# X_train.head()

Now to try with PCA

In [105]:
# Defining the columns
cat_cols = ['station_id', 'dir_of_travel', 'lane_of_travel']
num_cols = ['year', 'day']
trig_cols = ['hour', 'day_of_week', 'week', 'month']
pass_cols = ['is_holiday']
weather_features = ['Maximum Temperature degrees (F)', 'Minimum Temperature degrees (F)', 'Precipitation (inches)', 
                    'Snow (inches)', 'Snow Depth (inches)', 'wdir', 'wspd', 'pres']
# pass_cols = ['hour', 'day_of_week', 'week', 'month', 'is_holiday']
target_col = 'volume'

num_cols += ['volume_lag1', 'volume_lag2', 'volume_lag3']

# Transforming the categorical data
cat_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Transforming the numerical data
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler())
])

# Transforming the date cols to trig data
trig_transformer = ColumnTransformer(transformers=[
    ('hour_sin', sin_transformer(24), ['hour']),
    ('hour_cos', cos_transformer(24), ['hour']),
    ('day_of_week_sin', sin_transformer(7), ['day_of_week']),
    ('day_of_week_cos', cos_transformer(7), ['day_of_week']),
    ('week_sin', sin_transformer(53), ['week']),
    ('week_cos', cos_transformer(53), ['week']),
    ('month_sin', sin_transformer(12), ['month']),
    ('month_cos', cos_transformer(12), ['month'])
])

# Transforming the weather features using pca
weather_transformer = Pipeline(steps=[
    ('pca', PCA(n_components=3)),
    ('scaler', StandardScaler())
])

# Combining the transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols),
        ('trig', trig_transformer, trig_cols),
        ('weather', weather_transformer, weather_features),
        ('pass', 'passthrough',  pass_cols)
    ])

In [68]:
# eval_metric=['rmse', 'mae']

In [69]:
# # Defining the model
# xgb = XGBRegressor(n_estimators=100, seed=42, verbosity=2, eval_metric=eval_metric)

# # Defining the pipeline
# xgb_pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('model', xgb)
# ], verbose=True)

In [70]:
# X_train_transformed = preprocessor.fit_transform(X_train)
# X_valid_transformed = preprocessor.transform(X_valid)

# eval_set = [(X_train_transformed, y_train)]

In [71]:
# # fitting the model
# xgb.fit(X_train_transformed, y_train, eval_set=eval_set)

In [72]:
# # predicting the values
# y_train_pred = xgb_pipeline.predict(X_train)
# y_train_pred = np.maximum(y_train_pred, 0)

In [73]:
# y_valid_pred = xgb_pipeline.predict(X_valid)
# y_valid_pred = np.maximum(y_valid_pred, 0)

In [74]:
# y_train_pred = pd.DataFrame(y_train_pred)
# y_train_pred.head()

In [75]:
# y_train.head()

In [76]:
# y_valid_pred = pd.DataFrame(y_valid_pred)
# y_valid_pred.head()

In [77]:
# y_valid.head()

In [78]:
# # Calculating the metrics
# train_mae = mean_absolute_error(y_train, y_train_pred)
# train_rmse = mean_squared_error(y_train, y_train_pred, squared=False)
# train_mape = calculate_mape(y_train, y_train_pred)
# train_r2 = r2_score(y_train, y_train_pred)

# valid_mae = mean_absolute_error(y_valid, y_valid_pred)
# valid_rmse = mean_squared_error(y_valid, y_valid_pred, squared=False)
# valid_mape = calculate_mape(y_valid, y_valid_pred)
# valid_r2 = r2_score(y_valid, y_valid_pred)

# print(f'Train MAE: {train_mae}, Train RMSE: {train_rmse}, Train MAPE: {train_mape}, Train R2: {train_r2}')
# print(f'Validation MAE: {valid_mae}, Validation RMSE: {valid_rmse}, Validation MAPE: {valid_mape}, Validation R2: {valid_r2}')

In [79]:
# results['train']['rmse'] = [train_rmse]
# results['train']['mape'] = [train_rmse]
# results['train']['r2'] = [train_rmse]

# results['valid']['rmse'] = [train_rmse]
# results['valid']['mape'] = [train_rmse]
# results['valid']['r2'] = [train_rmse]

# results

In [80]:
# # Plotting the predictions
# plt.figure(figsize=(12, 6))
# plt.plot(y_valid.reset_index(drop=True), label='Actual')
# plt.plot(y_valid_pred, label='Predicted')
# plt.legend()
# plt.show()

In [81]:
# # Plotting for one station, and direction
# mask_station = X_valid.reset_index(drop=True)['station_id']==10800
# mask_dir = X_valid.reset_index(drop=True)['dir_of_travel']==3
# mask_lane = X_valid.reset_index(drop=True)['lane_of_travel']==1
# mask = mask_station & mask_dir & mask_lane

# plt.figure(figsize=(12, 6))
# plt.plot(y_valid.reset_index(drop=True)[mask], label='Actual')
# plt.plot(y_valid_pred[mask], label='Predicted')
# plt.legend()
# plt.show()

In [82]:
# # Plotting for one station, and direction
# mask_station = X_valid.reset_index(drop=True)['station_id']==10800
# mask_dir = X_valid.reset_index(drop=True)['dir_of_travel']==3
# mask_lane = X_valid.reset_index(drop=True)['lane_of_travel']==1
# mask_year = X_valid.reset_index(drop=True)['year']==2023
# mask_month = X_valid.reset_index(drop=True)['month']==5
# mask_day = X_valid.reset_index(drop=True)['day']==1
# mask = mask_station & mask_dir & mask_lane & mask_year & mask_month & mask_day

# plt.figure(figsize=(12, 6))
# plt.plot(y_valid.reset_index(drop=True)[mask], label='Actual')
# plt.plot(y_valid_pred[mask], label='Predicted')
# plt.legend()
# plt.show()

In [83]:
# history = {}

In [84]:
# history['train'] = xgb.evals_result()['validation_0']

In [85]:
# time.sleep(30)

In [86]:
# # Defining the model
# xgb = XGBRegressor(n_estimators=100, seed=42, verbosity=2, eval_metric=eval_metric)

# # Defining the pipeline
# xgb_pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('model', xgb)
# ], verbose=True)

In [87]:
# X_train_transformed = preprocessor.fit_transform(X_train)
# X_valid_transformed = preprocessor.transform(X_valid)

# eval_set = [(X_valid_transformed, y_valid)]

In [88]:
# # fitting the model
# xgb.fit(X_train_transformed, y_train, eval_set=eval_set)

In [89]:
# # predicting the values
# y_train_pred = xgb_pipeline.predict(X_train)
# y_train_pred = np.maximum(y_train_pred, 0)

In [90]:
# y_valid_pred = xgb_pipeline.predict(X_valid)
# y_valid_pred = np.maximum(y_valid_pred, 0)

In [91]:
# y_train_pred = pd.DataFrame(y_train_pred)
# y_train_pred.head()

In [92]:
# y_valid_pred = pd.DataFrame(y_valid_pred)
# y_valid_pred.head()

In [93]:
# # Calculating the metrics
# train_mae = mean_absolute_error(y_train, y_train_pred)
# train_rmse = mean_squared_error(y_train, y_train_pred, squared=False)
# train_mape = calculate_mape(y_train, y_train_pred)
# train_r2 = r2_score(y_train, y_train_pred)

# valid_mae = mean_absolute_error(y_valid, y_valid_pred)
# valid_rmse = mean_squared_error(y_valid, y_valid_pred, squared=False)
# valid_mape = calculate_mape(y_valid, y_valid_pred)
# valid_r2 = r2_score(y_valid, y_valid_pred)

# print(f'Train MAE: {train_mae}, Train RMSE: {train_rmse}, Train MAPE: {train_mape}, Train R2: {train_r2}')
# print(f'Validation MAE: {valid_mae}, Validation RMSE: {valid_rmse}, Validation MAPE: {valid_mape}, Validation R2: {valid_r2}')

In [94]:
# history['validation'] = xgb.evals_result()['validation_0']

In [95]:
# history

In [96]:
# xgb.save_model("xgb.json")

In [97]:
# import pickle
# with open(f'/kaggle/working/xgboost', 'wb') as file_pi:
#     pickle.dump(history, file_pi)

In [98]:
# with open(f'/kaggle/working/xgboost', "rb") as file_pi:
#     h = pickle.load(file_pi)

In [99]:
# h

## Testing

In [106]:
# Defining the model
xgb = XGBRegressor(n_estimators=100, seed=42, verbosity=2)

# Defining the pipeline
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', xgb)
], verbose=True)

In [107]:
X_all = pd.concat([X_train, X_valid], axis=0)
y_all = pd.concat([y_train, y_valid], axis=0)

In [108]:
X_all_transformed = preprocessor.fit_transform(X_all)

In [109]:
# fitting the model
xgb.fit(X_all_transformed, y_all)

In [110]:
# predicting the values
y_train_pred = xgb_pipeline.predict(X_train)
y_train_pred = np.maximum(y_train_pred, 0)

In [111]:
y_test_pred = xgb_pipeline.predict(X_test)
y_test_pred = np.maximum(y_test_pred, 0)

In [112]:
y_train_pred = pd.DataFrame(y_train_pred)
y_train_pred.head()

Unnamed: 0,0
0,87.045364
1,161.150574
2,94.811539
3,61.757645
4,94.298271


In [113]:
y_test_pred = pd.DataFrame(y_test_pred)
y_test_pred.head()

Unnamed: 0,0
0,87.021935
1,433.438171
2,276.27478
3,185.074371
4,130.871063


In [114]:
# Calculating the metrics
train_mae = mean_absolute_error(y_train, y_train_pred)
train_rmse = mean_squared_error(y_train, y_train_pred, squared=False)
train_mape = calculate_mape(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)

test_mae = mean_absolute_error(y_test, y_test_pred)
test_rmse = mean_squared_error(y_test, y_test_pred, squared=False)
test_mape = calculate_mape(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(f'Train MAE: {train_mae}, Train RMSE: {train_rmse}, Train MAPE: {train_mape}, Train R2: {train_r2}')
print(f'Test MAE: {test_mae}, Test RMSE: {test_rmse}, Test MAPE: {test_mape}, Test R2: {test_r2}')

Train MAE: 46.906155875321424, Train RMSE: 75.92872853555359, Train MAPE: 0.2122544119472571, Train R2: 0.9737944709991847
Test MAE: 52.97771536108602, Test RMSE: 85.36264419352456, Test MAPE: 0.22303273649084024, Test R2: 0.9711870515043809


In [116]:
xgb.save_model("xgb_final.json")