In [1]:
# importing basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time
from tqdm import tqdm

import tensorflow as tf

2024-09-02 13:59:17.749800: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-02 13:59:17.749990: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-02 13:59:17.931568: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
pd.set_option('display.max_columns', None)

# Load the data

In [3]:
data_path = '/kaggle/input/mn-traffic-processed/'
data = pd.read_csv(data_path + 'processed_data.csv')
data.head()

Unnamed: 0,station_id,dir_of_travel,lane_of_travel,date,hour,volume,Maximum Temperature degrees (F),Minimum Temperature degrees (F),Precipitation (inches),Snow (inches),Snow Depth (inches),wdir,wspd,pres,is_holiday
0,26,1,1,2019-01-01,1,47,11.0,0.0,0.0,0.0,0.0,305.0,15.3,1031.2,1
1,26,1,1,2019-01-02,1,59,26.0,2.0,0.0,0.0,0.0,207.0,17.6,1017.7,0
2,26,1,1,2019-01-03,1,79,41.0,20.0,0.0,0.0,0.0,223.0,13.8,1013.15,0
3,26,1,1,2019-01-04,1,74,47.0,28.0,0.0,0.0,0.0,239.0,10.0,1008.6,0
4,26,1,1,2019-01-05,1,97,47.0,27.0,0.0,0.0,0.0,286.0,17.2,1014.1,0


# Date Manipulation

In [4]:
data.shape

(21409776, 15)

In [5]:
data['station_id'].nunique() 

155

In [6]:
grouped_station = data.groupby('station_id')['date'].min()

In [7]:
data.shape

(21409776, 15)

In [8]:
# A need to remove some stations, not enough data for them
stations_to_remove = list(grouped_station[grouped_station>='2021-01-01'].index)
mask_stations = data['station_id'].isin(stations_to_remove)
data = data[~mask_stations]
data.shape

(19738896, 15)

In [9]:
grouped_station_dir_lane = data.groupby(['station_id', 'dir_of_travel', 'lane_of_travel'])['date'].min()
grouped_station_dir_lane.sort_values(ascending=False)

station_id  dir_of_travel  lane_of_travel
45          3              2                 2023-01-01
            7              2                 2023-01-01
6461        3              3                 2022-10-01
            7              3                 2022-10-01
10398       7              3                 2022-03-01
                                                ...    
365         1              1                 2019-01-01
233         5              1                 2019-01-01
            1              1                 2019-01-01
232         1              1                 2019-01-01
26          1              1                 2019-01-01
Name: date, Length: 525, dtype: object

In [10]:
# Same problem as before
unique_id_to_remove = list(grouped_station_dir_lane[grouped_station_dir_lane>='2021-01-01'].index)
df_ids_to_remove = pd.DataFrame(unique_id_to_remove, columns=['station_id', 'dir_of_travel', 'lane_of_travel'])
data = pd.merge(data, df_ids_to_remove, on=['station_id', 'dir_of_travel', 'lane_of_travel'], how='left', indicator=True)
data = data[data['_merge']=='left_only']
data.drop('_merge', axis=1, inplace=True)
data.shape

(19613352, 15)

In [11]:
# station_counts = data['station_id'].value_counts()

In [12]:
# # Remove the stations with more of the same data if they are same number of records  to prevent RAM overload
# mask = station_counts.duplicated(keep='first') & station_counts.duplicated(keep='last')
# stations_to_keep = station_counts[~mask].index
# mask_stations = data['station_id'].isin(stations_to_keep)
# data = data[mask_stations]
# data.shape

In [13]:
stations_needed = [10390, 11517, 11236, 70413, 11228, 11196, 11205, 10310, 11191, 42507, 11238, 10730, 10794, 10800, 10808, 11179, 11280, 10069, 11726, 11510, 11283, 11516, 10919, 10899, 11464, 11273, 10840, 11747, 11749, 10830, 10206, 10205, 405, 336, 425, 389, 301, 303, 464, 10398]
len(stations_needed)

40

In [14]:
# Choose the stations within Minnepolis/St. Paul to avoid any RAM problems
mask_stations = data['station_id'].isin(stations_needed)
data = data[mask_stations]
data.shape

(8167560, 15)

In [15]:
mask = data['hour']==24
data[mask].head()

Unnamed: 0,station_id,dir_of_travel,lane_of_travel,date,hour,volume,Maximum Temperature degrees (F),Minimum Temperature degrees (F),Precipitation (inches),Snow (inches),Snow Depth (inches),wdir,wspd,pres,is_holiday
19141263,301,3,1,2020-01-02,24,315,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0
19141264,301,3,1,2020-01-03,24,385,36.0,24.0,0.0,0.0,4.0,238.0,12.6,1011.7,0
19141265,301,3,1,2020-01-04,24,404,30.0,24.0,0.0,0.0,4.0,226.0,15.9,1018.9,0
19141266,301,3,1,2020-01-05,24,221,38.0,24.0,0.01,0.0,4.0,274.0,28.5,1014.9,0
19141267,301,3,1,2020-01-06,24,279,36.0,22.0,0.0,0.0,3.0,231.0,18.2,1019.3,0


In [16]:
# Creating the day of the week, month, and year columns
data.loc[mask, 'hour'] = 0
data.loc[mask, 'date'] = pd.to_datetime(data.loc[mask, 'date']) + pd.DateOffset(days=1)

data['day_of_week'] = pd.to_datetime(data['date']).dt.dayofweek
data['day'] = pd.to_datetime(data['date']).dt.day
data['week'] = pd.to_datetime(data['date']).dt.isocalendar().week
data['month'] = pd.to_datetime(data['date']).dt.month
data['year'] = pd.to_datetime(data['date']).dt.year

data.head()

Unnamed: 0,station_id,dir_of_travel,lane_of_travel,date,hour,volume,Maximum Temperature degrees (F),Minimum Temperature degrees (F),Precipitation (inches),Snow (inches),Snow Depth (inches),wdir,wspd,pres,is_holiday,day_of_week,day,week,month,year
224821,301,3,1,2020-01-02,1,135,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020
224822,301,3,1,2020-01-03,1,164,36.0,24.0,0.0,0.0,4.0,238.0,12.6,1011.7,0,4,3,1,1,2020
224823,301,3,1,2020-01-04,1,291,30.0,24.0,0.0,0.0,4.0,226.0,15.9,1018.9,0,5,4,1,1,2020
224824,301,3,1,2020-01-05,1,281,38.0,24.0,0.01,0.0,4.0,274.0,28.5,1014.9,0,6,5,1,1,2020
224825,301,3,1,2020-01-06,1,146,36.0,22.0,0.0,0.0,3.0,231.0,18.2,1019.3,0,0,6,2,1,2020


In [17]:
# Prevent overload in RAM
years_remove = [2019]
mask = data['year'].isin(years_remove)
data = data[~mask]
data.shape

(8098256, 20)

In [18]:
# Calculate the final number of features
data['station_id'].nunique() + data['dir_of_travel'].nunique() + data['lane_of_travel'].nunique() + 12

61

In [19]:
mask = data['hour']==0
data[mask].head()

Unnamed: 0,station_id,dir_of_travel,lane_of_travel,date,hour,volume,Maximum Temperature degrees (F),Minimum Temperature degrees (F),Precipitation (inches),Snow (inches),Snow Depth (inches),wdir,wspd,pres,is_holiday,day_of_week,day,week,month,year
19141263,301,3,1,2020-01-03 00:00:00,0,315,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,4,3,1,1,2020
19141264,301,3,1,2020-01-04 00:00:00,0,385,36.0,24.0,0.0,0.0,4.0,238.0,12.6,1011.7,0,5,4,1,1,2020
19141265,301,3,1,2020-01-05 00:00:00,0,404,30.0,24.0,0.0,0.0,4.0,226.0,15.9,1018.9,0,6,5,1,1,2020
19141266,301,3,1,2020-01-06 00:00:00,0,221,38.0,24.0,0.01,0.0,4.0,274.0,28.5,1014.9,0,0,6,2,1,2020
19141267,301,3,1,2020-01-07 00:00:00,0,279,36.0,22.0,0.0,0.0,3.0,231.0,18.2,1019.3,0,1,7,2,1,2020


In [20]:
data['datetime'] = pd.to_datetime(data['date']) + pd.to_timedelta(data['hour'], unit='h')
data.head()

Unnamed: 0,station_id,dir_of_travel,lane_of_travel,date,hour,volume,Maximum Temperature degrees (F),Minimum Temperature degrees (F),Precipitation (inches),Snow (inches),Snow Depth (inches),wdir,wspd,pres,is_holiday,day_of_week,day,week,month,year,datetime
224821,301,3,1,2020-01-02,1,135,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020,2020-01-02 01:00:00
224822,301,3,1,2020-01-03,1,164,36.0,24.0,0.0,0.0,4.0,238.0,12.6,1011.7,0,4,3,1,1,2020,2020-01-03 01:00:00
224823,301,3,1,2020-01-04,1,291,30.0,24.0,0.0,0.0,4.0,226.0,15.9,1018.9,0,5,4,1,1,2020,2020-01-04 01:00:00
224824,301,3,1,2020-01-05,1,281,38.0,24.0,0.01,0.0,4.0,274.0,28.5,1014.9,0,6,5,1,1,2020,2020-01-05 01:00:00
224825,301,3,1,2020-01-06,1,146,36.0,22.0,0.0,0.0,3.0,231.0,18.2,1019.3,0,0,6,2,1,2020,2020-01-06 01:00:00


In [21]:
mask = data['hour']==5
data[mask].head()

Unnamed: 0,station_id,dir_of_travel,lane_of_travel,date,hour,volume,Maximum Temperature degrees (F),Minimum Temperature degrees (F),Precipitation (inches),Snow (inches),Snow Depth (inches),wdir,wspd,pres,is_holiday,day_of_week,day,week,month,year,datetime
3514637,301,3,1,2020-01-02,5,76,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020,2020-01-02 05:00:00
3514638,301,3,1,2020-01-03,5,80,36.0,24.0,0.0,0.0,4.0,238.0,12.6,1011.7,0,4,3,1,1,2020,2020-01-03 05:00:00
3514639,301,3,1,2020-01-04,5,60,30.0,24.0,0.0,0.0,4.0,226.0,15.9,1018.9,0,5,4,1,1,2020,2020-01-04 05:00:00
3514640,301,3,1,2020-01-05,5,70,38.0,24.0,0.01,0.0,4.0,274.0,28.5,1014.9,0,6,5,1,1,2020,2020-01-05 05:00:00
3514641,301,3,1,2020-01-06,5,71,36.0,22.0,0.0,0.0,3.0,231.0,18.2,1019.3,0,0,6,2,1,2020,2020-01-06 05:00:00


In [22]:
data['day_of_week'].value_counts().sort_index()

day_of_week
0    1153383
1    1153152
2    1157001
3    1158632
4    1158696
5    1158696
6    1158696
Name: count, dtype: int64

In [23]:
data['day'].value_counts().sort_index()

day
1     264648
2     266048
3     266112
4     266112
5     266112
6     266112
7     266112
8     266112
9     266112
10    266112
11    266112
12    266112
13    266112
14    266112
15    266112
16    266112
17    266112
18    266112
19    266112
20    266112
21    266112
22    266112
23    266112
24    266112
25    266112
26    266112
27    266112
28    266112
29    249480
30    243936
31    155232
Name: count, dtype: int64

In [24]:
data['week'].value_counts().sort_index()

week
1     142616
2     155232
3     155232
4     155232
5     155232
6     155232
7     155232
8     155232
9     155232
10    155232
11    155232
12    155232
13    155232
14    155232
15    155232
16    155232
17    155232
18    155232
19    155232
20    155232
21    155232
22    155232
23    155232
24    155232
25    155232
26    155232
27    155232
28    155232
29    155232
30    155232
31    155232
32    155232
33    155232
34    155232
35    155232
36    155232
37    155232
38    155232
39    155232
40    155232
41    155232
42    155232
43    155232
44    155232
45    155232
46    155232
47    155232
48    155232
49    155232
50    155232
51    155232
52    155232
53     38808
Name: count, dtype: Int64

In [25]:
data['month'].value_counts().sort_index()

month
1     685928
2     626472
3     687456
4     665280
5     687456
6     665280
7     687456
8     687456
9     665280
10    687456
11    665280
12    687456
Name: count, dtype: int64

In [26]:
data['year'].value_counts().sort_index()

year
2020    2027345
2021    2023560
2022    2023560
2023    2023560
2024        231
Name: count, dtype: int64

# Data Type Conversion

In [27]:
data.dtypes

station_id                                  int64
dir_of_travel                               int64
lane_of_travel                              int64
date                                       object
hour                                        int64
volume                                      int64
Maximum Temperature degrees (F)           float64
Minimum Temperature degrees (F)           float64
Precipitation (inches)                    float64
Snow (inches)                             float64
Snow Depth (inches)                       float64
wdir                                      float64
wspd                                      float64
pres                                      float64
is_holiday                                  int64
day_of_week                                 int32
day                                         int32
week                                       UInt32
month                                       int32
year                                        int32


In [28]:
data['station_id'] = data['station_id'].astype('category')
data['dir_of_travel'] = data['dir_of_travel'].astype('category')
data['lane_of_travel'] = data['lane_of_travel'].astype('category')

data.dtypes

station_id                               category
dir_of_travel                            category
lane_of_travel                           category
date                                       object
hour                                        int64
volume                                      int64
Maximum Temperature degrees (F)           float64
Minimum Temperature degrees (F)           float64
Precipitation (inches)                    float64
Snow (inches)                             float64
Snow Depth (inches)                       float64
wdir                                      float64
wspd                                      float64
pres                                      float64
is_holiday                                  int64
day_of_week                                 int32
day                                         int32
week                                       UInt32
month                                       int32
year                                        int32


In [29]:
data.head()

Unnamed: 0,station_id,dir_of_travel,lane_of_travel,date,hour,volume,Maximum Temperature degrees (F),Minimum Temperature degrees (F),Precipitation (inches),Snow (inches),Snow Depth (inches),wdir,wspd,pres,is_holiday,day_of_week,day,week,month,year,datetime
224821,301,3,1,2020-01-02,1,135,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020,2020-01-02 01:00:00
224822,301,3,1,2020-01-03,1,164,36.0,24.0,0.0,0.0,4.0,238.0,12.6,1011.7,0,4,3,1,1,2020,2020-01-03 01:00:00
224823,301,3,1,2020-01-04,1,291,30.0,24.0,0.0,0.0,4.0,226.0,15.9,1018.9,0,5,4,1,1,2020,2020-01-04 01:00:00
224824,301,3,1,2020-01-05,1,281,38.0,24.0,0.01,0.0,4.0,274.0,28.5,1014.9,0,6,5,1,1,2020,2020-01-05 01:00:00
224825,301,3,1,2020-01-06,1,146,36.0,22.0,0.0,0.0,3.0,231.0,18.2,1019.3,0,0,6,2,1,2020,2020-01-06 01:00:00


# Data Preprocessing

In [30]:
df = data.copy()

In [31]:
# Checking for missing values
df.isnull().sum().sum()

0

In [32]:
df.sort_values(['station_id', 'dir_of_travel', 'lane_of_travel', 'datetime', 'hour'], inplace=True)
df.head()

Unnamed: 0,station_id,dir_of_travel,lane_of_travel,date,hour,volume,Maximum Temperature degrees (F),Minimum Temperature degrees (F),Precipitation (inches),Snow (inches),Snow Depth (inches),wdir,wspd,pres,is_holiday,day_of_week,day,week,month,year,datetime
224821,301,3,1,2020-01-02,1,135,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020,2020-01-02 01:00:00
1047275,301,3,1,2020-01-02,2,89,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020,2020-01-02 02:00:00
1869729,301,3,1,2020-01-02,3,61,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020,2020-01-02 03:00:00
2692183,301,3,1,2020-01-02,4,37,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020,2020-01-02 04:00:00
3514637,301,3,1,2020-01-02,5,76,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020,2020-01-02 05:00:00


In [33]:
df = df.drop('date', axis=1)
df.head()

Unnamed: 0,station_id,dir_of_travel,lane_of_travel,hour,volume,Maximum Temperature degrees (F),Minimum Temperature degrees (F),Precipitation (inches),Snow (inches),Snow Depth (inches),wdir,wspd,pres,is_holiday,day_of_week,day,week,month,year,datetime
224821,301,3,1,1,135,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020,2020-01-02 01:00:00
1047275,301,3,1,2,89,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020,2020-01-02 02:00:00
1869729,301,3,1,3,61,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020,2020-01-02 03:00:00
2692183,301,3,1,4,37,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020,2020-01-02 04:00:00
3514637,301,3,1,5,76,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020,2020-01-02 05:00:00


In [34]:
# Splitting the dataset into 3 sets: train, validation, and test
# mask_station = df['station_id'] == 10800
train_set = df[(df['year'] < 2023)]
valid_test = df[(df['year'] >= 2023)]
valid_set = valid_test[valid_test['month'] <= 6]
test_set = valid_test[valid_test['month'] > 6]

In [35]:
train_set.shape, valid_set.shape, test_set.shape

((6074465, 20), (1003695, 20), (1020096, 20))

In [36]:
mask_station = train_set['station_id'] == 10800
mask_dir = train_set['dir_of_travel'] == 3
mask_lane = train_set['lane_of_travel'] == 1
train_set[mask_station & mask_dir & mask_lane].head()

Unnamed: 0,station_id,dir_of_travel,lane_of_travel,hour,volume,Maximum Temperature degrees (F),Minimum Temperature degrees (F),Precipitation (inches),Snow (inches),Snow Depth (inches),wdir,wspd,pres,is_holiday,day_of_week,day,week,month,year,datetime
581429,10800,3,1,1,85,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020,2020-01-02 01:00:00
1403883,10800,3,1,2,55,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020,2020-01-02 02:00:00
2226337,10800,3,1,3,47,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020,2020-01-02 03:00:00
3048791,10800,3,1,4,54,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020,2020-01-02 04:00:00
3871245,10800,3,1,5,126,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020,2020-01-02 05:00:00


### Helper Function

In [37]:
def lstm_inputs(X, y, n_timesteps, preprocessor, filename_X, filename_y, fit=True, save=True):
    X_reshaped, y_reshaped = [], []
    
    if fit:
        preprocessor.fit(X)
    
    X['unique_id'] = X['station_id'].astype(str) + '_' + X['dir_of_travel'].astype(str) + '_' + X['lane_of_travel'].astype(str)
    
    for category in tqdm(X['unique_id'].unique()):
        
        mask = X['unique_id']==category
        X_adj = X[mask]
        y_adj = y[mask].values.astype(np.float32)
        
        X_preprocessed = preprocessor.transform(X_adj).astype(np.float32)
        
        if hasattr(X_preprocessed, 'toarray'):
            X_preprocessed = X_preprocessed.toarray()
        
        if len(y_adj) < n_timesteps:
            continue
            
        for i in range(len(y_adj) - n_timesteps + 1):
            seq_x = X_preprocessed[i:i + n_timesteps]
            seq_y = y_adj[i + n_timesteps - 1] 
            
#             if len(seq_x) < n_timesteps:
#                 padding = np.zeros((n_timesteps - len(seq_x), X_preprocessed.shape[1]), dtype=np.float32)
#                 seq_x = np.vstack((padding, seq_x))
            
            X_reshaped.append(seq_x)
            y_reshaped.append(seq_y)
            
    X_numpy, y_numpy = np.array(X_reshaped), np.array(y_reshaped)
        
    if save:
        np.save(filename_X, X_numpy)
        np.save(filename_y, y_numpy)
            
    return tf.convert_to_tensor(X_numpy, dtype=tf.float32), tf.convert_to_tensor(y_numpy, dtype=tf.float32)

### Preparing the data

In [38]:
# importing libraries for preprocessing
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [39]:
# Creating lag features
train_set['volume_lag1'] = train_set.groupby(['station_id', 'dir_of_travel', 'lane_of_travel'])['volume'].shift(1)
train_set['volume_lag2'] = train_set.groupby(['station_id', 'dir_of_travel', 'lane_of_travel'])['volume'].shift(2)
train_set['volume_lag3'] = train_set.groupby(['station_id', 'dir_of_travel', 'lane_of_travel'])['volume'].shift(3)
train_set.head()

  train_set['volume_lag1'] = train_set.groupby(['station_id', 'dir_of_travel', 'lane_of_travel'])['volume'].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_set['volume_lag1'] = train_set.groupby(['station_id', 'dir_of_travel', 'lane_of_travel'])['volume'].shift(1)
  train_set['volume_lag2'] = train_set.groupby(['station_id', 'dir_of_travel', 'lane_of_travel'])['volume'].shift(2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_set['volume_lag2'] = train_set.groupby(['station_id', 'dir_of_travel', 'lane_of_travel'])['volume'].shift(2)
  trai

Unnamed: 0,station_id,dir_of_travel,lane_of_travel,hour,volume,Maximum Temperature degrees (F),Minimum Temperature degrees (F),Precipitation (inches),Snow (inches),Snow Depth (inches),wdir,wspd,pres,is_holiday,day_of_week,day,week,month,year,datetime,volume_lag1,volume_lag2,volume_lag3
224821,301,3,1,1,135,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020,2020-01-02 01:00:00,,,
1047275,301,3,1,2,89,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020,2020-01-02 02:00:00,135.0,,
1869729,301,3,1,3,61,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020,2020-01-02 03:00:00,89.0,135.0,
2692183,301,3,1,4,37,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020,2020-01-02 04:00:00,61.0,89.0,135.0
3514637,301,3,1,5,76,37.0,31.0,0.0,0.0,5.0,272.0,19.1,1000.8,0,3,2,1,1,2020,2020-01-02 05:00:00,37.0,61.0,89.0


In [40]:
valid_set['volume_lag1'] = valid_set.groupby(['station_id', 'dir_of_travel', 'lane_of_travel'])['volume'].shift(1)
valid_set['volume_lag2'] = valid_set.groupby(['station_id', 'dir_of_travel', 'lane_of_travel'])['volume'].shift(2)
valid_set['volume_lag3'] = valid_set.groupby(['station_id', 'dir_of_travel', 'lane_of_travel'])['volume'].shift(3)

test_set['volume_lag1'] = test_set.groupby(['station_id', 'dir_of_travel', 'lane_of_travel'])['volume'].shift(1)
test_set['volume_lag2'] = test_set.groupby(['station_id', 'dir_of_travel', 'lane_of_travel'])['volume'].shift(2)
test_set['volume_lag3'] = test_set.groupby(['station_id', 'dir_of_travel', 'lane_of_travel'])['volume'].shift(3)

  valid_set['volume_lag1'] = valid_set.groupby(['station_id', 'dir_of_travel', 'lane_of_travel'])['volume'].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_set['volume_lag1'] = valid_set.groupby(['station_id', 'dir_of_travel', 'lane_of_travel'])['volume'].shift(1)
  valid_set['volume_lag2'] = valid_set.groupby(['station_id', 'dir_of_travel', 'lane_of_travel'])['volume'].shift(2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_set['volume_lag2'] = valid_set.groupby(['station_id', 'dir_of_travel', 'lane_of_travel'])['volume'].shift(2)
  vali

In [41]:
# Readying the data
# Defining the columns
# Defining the columns
cat_cols = ['station_id', 'dir_of_travel', 'lane_of_travel']
num_cols = ['Maximum Temperature degrees (F)', 'Minimum Temperature degrees (F)', 'Precipitation (inches)', 
            'Snow (inches)', 'Snow Depth (inches)', 'wdir', 'wspd', 'pres', 'year', 'day', 'volume_lag1']
trig_cols = ['hour', 'day_of_week', 'week', 'month']
pass_cols = ['is_holiday']
# pass_cols = ['hour', 'day_of_week', 'week', 'month', 'is_holiday']
target_col = 'volume'

# Splitting the data to X and y
X_train = train_set.drop(target_col, axis=1)
X_train = X_train.drop(['datetime'], axis=1)
X_train = X_train.drop(['volume_lag2', 'volume_lag3'], axis=1) # optional
y_train = train_set[target_col]

X_valid = valid_set.drop(target_col, axis=1)
X_valid = X_valid.drop(['datetime'], axis=1)
X_valid = X_valid.drop(['volume_lag2', 'volume_lag3'], axis=1) # optional
y_valid = valid_set[target_col]

X_test = test_set.drop(target_col, axis=1)
X_test = X_test.drop(['datetime'], axis=1)
X_test = X_test.drop(['volume_lag2', 'volume_lag3'], axis=1) # optional
y_test = test_set[target_col]

# Adding volume_lag2 and volume_lag3 optional
# num_cols += ['volume_lag2', 'volume_lag3']

In [42]:
from sklearn.preprocessing import FunctionTransformer


def sin_transformer(period):
    return FunctionTransformer(lambda x: np.sin(x / period * 2 * np.pi))


def cos_transformer(period):
    return FunctionTransformer(lambda x: np.cos(x / period * 2 * np.pi))

In [43]:
# Transforming the categorical data
cat_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Transforming the numerical data
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler())
])

# Transforming the date cols to trig data
trig_transformer = ColumnTransformer(transformers=[
    ('hour_sin', sin_transformer(24), ['hour']),
    ('hour_cos', cos_transformer(24), ['hour']),
    ('day_of_week_sin', sin_transformer(7), ['day_of_week']),
    ('day_of_week_cos', cos_transformer(7), ['day_of_week']),
    ('week_sin', sin_transformer(53), ['week']),
    ('week_cos', cos_transformer(53), ['week']),
    ('month_sin', sin_transformer(12), ['month']),
    ('month_cos', cos_transformer(12), ['month'])
])


# Combining the transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols),
        ('trig', trig_transformer, trig_cols),
        ('pass', 'passthrough',  pass_cols)
    ])

In [44]:
# Reshaping the input
!mkdir train
n_timesteps = 3
filename_X = f'/kaggle/working/train/X_train_{n_timesteps}timesteps_lag1only'
filename_y = f'/kaggle/working/train/y_train_{n_timesteps}timesteps_lag1only'
X_train_tensor, y_train_tensor = lstm_inputs(X_train, y_train, n_timesteps, preprocessor, filename_X=filename_X, filename_y=filename_y)
time.sleep(10)

100%|██████████| 231/231 [04:53<00:00,  1.27s/it]


In [45]:
# Reshaping the input
!mkdir valid
filename_X = f'/kaggle/working/valid/X_valid_{n_timesteps}timesteps_lag1only'
filename_y = f'/kaggle/working/valid/y_valid_{n_timesteps}timesteps_lag1only'
X_valid_tensor, y_valid_tensor = lstm_inputs(X_valid, y_valid, n_timesteps, preprocessor, filename_X=filename_X, filename_y=filename_y)
time.sleep(10)

100%|██████████| 231/231 [00:52<00:00,  4.39it/s]


In [46]:
# Reshaping the input
!mkdir test
filename_X = f'/kaggle/working/test/X_test_{n_timesteps}timesteps_lag1only'
filename_y = f'/kaggle/working/test/y_test_{n_timesteps}timesteps_lag1only'
X_test_tensor, y_test_tensor = lstm_inputs(X_test, y_test, n_timesteps, preprocessor, filename_X=filename_X, filename_y=filename_y)
time.sleep(10)

100%|██████████| 231/231 [00:53<00:00,  4.34it/s]


In [47]:
X_train_tensor.shape, y_train_tensor.shape

(TensorShape([6074234, 2, 70]), TensorShape([6074234]))

In [48]:
X_valid_tensor.shape, y_valid_tensor.shape

(TensorShape([1003464, 2, 70]), TensorShape([1003464]))

In [49]:
X_train_tensor[0]

<tf.Tensor: shape=(2, 70), dtype=float32, numpy=
array([[-0.8088384 , -0.36274067, -0.33453488, -0.18419866,  1.2734506 ,
         0.69969475,  0.29899016, -2.0042899 , -1.2237915 , -1.5611751 ,
        -1.1824385 , -1.1823847 ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.25881904,  0.9659258 ,  0.43388373, -0.90096885,
  

In [50]:
y_train_tensor[0], y_train_tensor[1]

(<tf.Tensor: shape=(), dtype=float32, numpy=89.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=61.0>)