# Info:
Our aim in this competition is to predict energy consumption of buildings.

There are 4 types of energy to predict:

0: electricity <br>
1: chilledwater <br>
2: steam <br>
3: hotwater <br>


https://www.kaggle.com/code/corochann/optuna-tutorial-for-hyperparameter-optimization
https://www.kaggle.com/code/corochann/ashrae-training-lgbm-by-meter-type/input

# Libs: 

In [1]:
import gc
import os
from pathlib import Path
import random
import sys

from tqdm import tqdm_notebook as tqdm
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

from IPython.core.display import display, HTML

# --- plotly ---
from plotly import tools, subplots
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff

# --- models ---
from sklearn import preprocessing
from sklearn.model_selection import KFold
import lightgbm as lgb
import xgboost as xgb
import catboost as cb

# Load Data Reduce Memory Usage:
https://www.kaggle.com/gemartin/load-data-reduce-memory-usage

In [2]:
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            # skip datetime type or categorical type
            continue
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [3]:
%%time

# Read data...
root = '../Optuna/'
train_df = pd.read_csv(os.path.join(root, 'train.csv'))
weather_train_df = pd.read_csv(os.path.join(root, 'weather_train.csv'))
test_df = pd.read_csv(os.path.join(root, 'test.csv'))
weather_test_df = pd.read_csv(os.path.join(root, 'weather_test.csv'))
building_meta_df = pd.read_csv(os.path.join(root, 'building_metadata.csv'))
sample_submission = pd.read_csv(os.path.join(root, 'sample_submission.csv'))

Wall time: 38.6 s


In [4]:
train_df = reduce_mem_usage(train_df)
weather_train_df = reduce_mem_usage(weather_train_df)
test_df = reduce_mem_usage(test_df)
weather_test_df = reduce_mem_usage(weather_test_df)
building_meta_df = reduce_mem_usage(building_meta_df)
sample_submission = reduce_mem_usage(sample_submission)

Memory usage of dataframe is 616.95 MB
Memory usage after optimization is: 173.84 MB
Decreased by 71.8%
Memory usage of dataframe is 9.60 MB
Memory usage after optimization is: 4.45 MB
Decreased by 53.6%
Memory usage of dataframe is 1272.51 MB
Memory usage after optimization is: 358.53 MB
Decreased by 71.8%
Memory usage of dataframe is 19.04 MB
Memory usage after optimization is: 8.83 MB
Decreased by 53.6%
Memory usage of dataframe is 0.07 MB
Memory usage after optimization is: 0.02 MB
Decreased by 65.6%
Memory usage of dataframe is 636.26 MB
Memory usage after optimization is: 198.83 MB
Decreased by 68.7%


# EDA

In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20216100 entries, 0 to 20216099
Data columns (total 4 columns):
 #   Column         Dtype   
---  ------         -----   
 0   building_id    int16   
 1   meter          int8    
 2   timestamp      category
 3   meter_reading  float32 
dtypes: category(1), float32(1), int16(1), int8(1)
memory usage: 173.8 MB


In [6]:
train_df.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading
0,0,0,2016-01-01 00:00:00,0.0
1,1,0,2016-01-01 00:00:00,0.0
2,2,0,2016-01-01 00:00:00,0.0
3,3,0,2016-01-01 00:00:00,0.0
4,4,0,2016-01-01 00:00:00,0.0


# Preprocessing:

In [7]:
train_df['date'] = pd.to_datetime(train_df['timestamp'])
train_df['meter_reading_log1p'] = np.log1p(train_df['meter_reading'])
np.sum(train_df['meter_reading_log1p'].values < 0)

0

In [8]:
def plot_date_usage(train_df, meter=0, building_id=0):
    train_temp_df = train_df[train_df['meter'] == meter]
    train_temp_df = train_temp_df[train_temp_df['building_id'] == building_id]    
    train_temp_df_meter = train_temp_df.groupby('date')['meter_reading_log1p'].sum()
    train_temp_df_meter = train_temp_df_meter.to_frame().reset_index()
    fig = px.line(train_temp_df_meter, x='date', y='meter_reading_log1p')
    fig.show()

In [9]:
plot_date_usage(train_df, meter=0, building_id=0)
# All electricity meter is 0 until May 20 for site_id == 0, so we are removing this data from the training
# data

In [10]:
# It corresponds to building_id <= 104
building_meta_df[building_meta_df.site_id == 0]

Unnamed: 0,site_id,building_id,primary_use,square_feet,year_built,floor_count
0,0,0,Education,7432,2008.0,
1,0,1,Education,2720,2004.0,
2,0,2,Education,5376,1991.0,
3,0,3,Education,23685,2002.0,
4,0,4,Education,116607,1975.0,
...,...,...,...,...,...,...
100,0,100,Lodging/residential,24456,1968.0,
101,0,101,Office,18860,1986.0,
102,0,102,Office,15876,1983.0,
103,0,103,Education,21657,2016.0,


In [11]:
def preprocess(df):
    df["hour"] = df["date"].dt.hour
    df["day"] = df["date"].dt.day
    df["month"] =  df["date"].dt.month
    df["year"] =  df["date"].dt.year
    df["dayofweek"] =  df["date"].dt.dayofweek
    df["weekend"] =  df["dayofweek"] >= 5
    return df

In [12]:
train_df = preprocess(train_df)

In [13]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20216100 entries, 0 to 20216099
Data columns (total 12 columns):
 #   Column               Dtype   
---  ------               -----   
 0   building_id          int16   
 1   meter                int8    
 2   timestamp            category
 3   meter_reading        float32 
 4   date                 category
 5   meter_reading_log1p  float32 
 6   hour                 int64   
 7   day                  int64   
 8   month                int64   
 9   year                 int64   
 10  dayofweek            int64   
 11  weekend              bool    
dtypes: bool(1), category(2), float32(2), int16(1), int64(5), int8(1)
memory usage: 1.1 GB


In [14]:
#Removing rows where eletricity meter is 0 until 20/05/2020 and site_id ==0:
train_df = train_df.query("not (building_id <= 104 & meter == 0 & (day <= 20 & month <= 5 & year <= 2020))")

In [15]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19968871 entries, 103 to 20216099
Data columns (total 12 columns):
 #   Column               Dtype   
---  ------               -----   
 0   building_id          int16   
 1   meter                int8    
 2   timestamp            category
 3   meter_reading        float32 
 4   date                 category
 5   meter_reading_log1p  float32 
 6   hour                 int64   
 7   day                  int64   
 8   month                int64   
 9   year                 int64   
 10  dayofweek            int64   
 11  weekend              bool    
dtypes: bool(1), category(2), float32(2), int16(1), int64(5), int8(1)
memory usage: 1.2 GB


In [16]:
# Calculate max, mean, min, std of the meter_reading_log1p by specific building:
df_group = train_df.groupby(['building_id', 'meter'])['meter_reading_log1p']
building_mean = df_group.mean().astype(np.float16)
building_median = df_group.median().astype(np.float16)
building_min = df_group.min().astype(np.float16)
building_max = df_group.max().astype(np.float16)
building_std = df_group.std().astype(np.float16)
building_stats_df = pd.concat([building_mean, building_median, building_min, building_max, building_std], axis=1,
                              keys=['building_mean', 'building_median', 'building_min', 'building_max', 'building_std']).reset_index()
building_stats_df.head()

Unnamed: 0,building_id,meter,building_mean,building_median,building_min,building_max,building_std
0,0,0,4.609375,5.472656,0.0,5.804688,1.978516
1,1,0,4.058594,4.875,0.0,5.390625,1.710938
2,2,0,2.544922,2.923828,0.0,4.234375,1.24707
3,3,0,4.984375,5.863281,0.0,6.488281,2.15625
4,4,0,6.238281,7.359375,0.0,7.570312,2.646484


In [17]:
# Merge with original df to bring as stats features:
train_df = pd.merge(train_df, building_stats_df, on=['building_id', 'meter'], how='left', copy=False)
train_df.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,date,meter_reading_log1p,hour,day,month,year,dayofweek,weekend,building_mean,building_median,building_min,building_max,building_std
0,105,0,2016-01-01 00:00:00,23.3036,2016-01-01,3.190624,0,1,1,2016,4,False,4.316406,4.332031,3.191406,5.164062,0.318115
1,106,0,2016-01-01 00:00:00,0.3746,2016-01-01,0.318163,0,1,1,2016,4,False,0.751953,0.559082,0.0,2.890625,0.478516
2,106,3,2016-01-01 00:00:00,0.0,2016-01-01,0.0,0,1,1,2016,4,False,1.023438,0.0,0.0,3.712891,1.268555
3,107,0,2016-01-01 00:00:00,175.184006,2016-01-01,5.171529,0,1,1,2016,4,False,4.570312,5.78125,0.039703,6.382812,2.009766
4,108,0,2016-01-01 00:00:00,91.265297,2016-01-01,4.524668,0,1,1,2016,4,False,5.457031,5.449219,4.417969,6.113281,0.216187


# Filling Nan values by interpolation - weather_train:

In [18]:
weather_train_df.head()

Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,2016-01-01 00:00:00,25.0,6.0,20.0,,1019.700012,0.0,0.0
1,0,2016-01-01 01:00:00,24.4,,21.1,-1.0,1020.200012,70.0,1.5
2,0,2016-01-01 02:00:00,22.799999,2.0,21.1,0.0,1020.200012,0.0,0.0
3,0,2016-01-01 03:00:00,21.1,2.0,20.6,0.0,1020.099976,0.0,0.0
4,0,2016-01-01 04:00:00,20.0,2.0,20.0,-1.0,1020.0,250.0,2.6


In [19]:
weather_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139773 entries, 0 to 139772
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype   
---  ------              --------------   -----   
 0   site_id             139773 non-null  int8    
 1   timestamp           139773 non-null  category
 2   air_temperature     139718 non-null  float32 
 3   cloud_coverage      70600 non-null   float32 
 4   dew_temperature     139660 non-null  float32 
 5   precip_depth_1_hr   89484 non-null   float32 
 6   sea_level_pressure  129155 non-null  float32 
 7   wind_direction      133505 non-null  float32 
 8   wind_speed          139469 non-null  float32 
dtypes: category(1), float32(7), int8(1)
memory usage: 4.5 MB


In [20]:
weather_train_df.isna().sum()

site_id                   0
timestamp                 0
air_temperature          55
cloud_coverage        69173
dew_temperature         113
precip_depth_1_hr     50289
sea_level_pressure    10618
wind_direction         6268
wind_speed              304
dtype: int64

In [21]:
weather_train_df['date'] = pd.to_datetime(weather_train_df['timestamp'])
weather_test_df['date'] = pd.to_datetime(weather_test_df['timestamp'])

weather = pd.concat([weather_train_df, weather_test_df],ignore_index=True)
#del weather_test_df
weather_key = ['site_id', 'date']

In [22]:
temp_skeleton = weather[weather_key + ['air_temperature']].drop_duplicates(subset=weather_key).sort_values(by=weather_key).copy()
# calculate ranks of hourly temperatures within date/site_id chunks
temp_skeleton['temp_rank'] = temp_skeleton.groupby(['site_id', temp_skeleton.date])['air_temperature'].rank('average')
temp_skeleton.head(20)

Unnamed: 0,site_id,date,air_temperature,temp_rank
0,0,2016-01-01 00:00:00,25.0,1.0
1,0,2016-01-01 01:00:00,24.4,1.0
2,0,2016-01-01 02:00:00,22.799999,1.0
3,0,2016-01-01 03:00:00,21.1,1.0
4,0,2016-01-01 04:00:00,20.0,1.0
5,0,2016-01-01 05:00:00,19.4,1.0
6,0,2016-01-01 06:00:00,21.1,1.0
7,0,2016-01-01 07:00:00,21.1,1.0
8,0,2016-01-01 08:00:00,20.6,1.0
9,0,2016-01-01 09:00:00,21.1,1.0


In [23]:
# create a dataframe of site_ids (0-16) x mean hour rank of temperature within day (0-23)
df_2d = temp_skeleton.groupby(['site_id', temp_skeleton.date.dt.hour])['temp_rank'].mean().unstack(level=1)
df_2d.head()

date,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
site_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [24]:
# Subtract the columnID of temperature peak by 14, getting the timestamp alignment gap.
site_ids_offsets = pd.Series(df_2d.values.argmax(axis=1) - 14)
site_ids_offsets.index.name = 'site_id'

In [25]:
def timestamp_align(df):
    df['offset'] = df.site_id.map(site_ids_offsets)
    df['timestamp_aligned'] = (df.date - pd.to_timedelta(df.offset, unit='H'))
    df['date'] = df['timestamp_aligned']
    #del df['timestamp_aligned']
    return df

In [26]:
#del weather
#del temp_skeleton
gc.collect()

835

In [27]:
weather_train_df.head()

Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,date
0,0,2016-01-01 00:00:00,25.0,6.0,20.0,,1019.700012,0.0,0.0,2016-01-01 00:00:00
1,0,2016-01-01 01:00:00,24.4,,21.1,-1.0,1020.200012,70.0,1.5,2016-01-01 01:00:00
2,0,2016-01-01 02:00:00,22.799999,2.0,21.1,0.0,1020.200012,0.0,0.0,2016-01-01 02:00:00
3,0,2016-01-01 03:00:00,21.1,2.0,20.6,0.0,1020.099976,0.0,0.0,2016-01-01 03:00:00
4,0,2016-01-01 04:00:00,20.0,2.0,20.0,-1.0,1020.0,250.0,2.6,2016-01-01 04:00:00


In [28]:
weather_train_df = timestamp_align(weather_train_df)
weather_train_df.head()

Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,date,offset,timestamp_aligned
0,0,2016-01-01 00:00:00,25.0,6.0,20.0,,1019.700012,0.0,0.0,2016-01-01 14:00:00,-14,2016-01-01 14:00:00
1,0,2016-01-01 01:00:00,24.4,,21.1,-1.0,1020.200012,70.0,1.5,2016-01-01 15:00:00,-14,2016-01-01 15:00:00
2,0,2016-01-01 02:00:00,22.799999,2.0,21.1,0.0,1020.200012,0.0,0.0,2016-01-01 16:00:00,-14,2016-01-01 16:00:00
3,0,2016-01-01 03:00:00,21.1,2.0,20.6,0.0,1020.099976,0.0,0.0,2016-01-01 17:00:00,-14,2016-01-01 17:00:00
4,0,2016-01-01 04:00:00,20.0,2.0,20.0,-1.0,1020.0,250.0,2.6,2016-01-01 18:00:00,-14,2016-01-01 18:00:00


In [29]:
weather_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139773 entries, 0 to 139772
Data columns (total 12 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   site_id             139773 non-null  int8          
 1   timestamp           139773 non-null  category      
 2   air_temperature     139718 non-null  float32       
 3   cloud_coverage      70600 non-null   float32       
 4   dew_temperature     139660 non-null  float32       
 5   precip_depth_1_hr   89484 non-null   float32       
 6   sea_level_pressure  129155 non-null  float32       
 7   wind_direction      133505 non-null  float32       
 8   wind_speed          139469 non-null  float32       
 9   date                139773 non-null  datetime64[ns]
 10  offset              139773 non-null  int64         
 11  timestamp_aligned   139773 non-null  datetime64[ns]
dtypes: category(1), datetime64[ns](2), float32(7), int64(1), int8(1)
memory usage: 7.7 MB


In [30]:
# Interpolate do not work with datetime64[ns] (date,timestamp_aligned), category (timestamp) or
# int64 (offset) columns, so we create an aux df:
# Create an index for df since we will not have the timestamp to merge them:
weather_train_df['index'] = weather_train_df.index
weather_train_df_aux = weather_train_df
# Create an index for df since we will not have de timestamp to merge them:
weather_train_df_aux['offset']= weather_train_df_aux['offset'].astype(float)
weather_train_df_aux = weather_train_df_aux.drop(['date','timestamp_aligned', 'timestamp'], axis=1) 
weather_train_df_aux = weather_train_df_aux.groupby('site_id').apply(lambda group: group.interpolate(limit_direction='both'))
weather_train_df_aux.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 139773 entries, 0 to 139772
Data columns (total 10 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   site_id             139773 non-null  int8   
 1   air_temperature     139773 non-null  float32
 2   cloud_coverage      122545 non-null  float32
 3   dew_temperature     139773 non-null  float32
 4   precip_depth_1_hr   113500 non-null  float32
 5   sea_level_pressure  131018 non-null  float32
 6   wind_direction      139773 non-null  float32
 7   wind_speed          139773 non-null  float32
 8   offset              139773 non-null  float64
 9   index               139773 non-null  int64  
dtypes: float32(7), float64(1), int64(1), int8(1)
memory usage: 7.1 MB


In [31]:
# Nulls before interpolation:
weather_train_df.isna().sum()

site_id                   0
timestamp                 0
air_temperature          55
cloud_coverage        69173
dew_temperature         113
precip_depth_1_hr     50289
sea_level_pressure    10618
wind_direction         6268
wind_speed              304
date                      0
offset                    0
timestamp_aligned         0
index                     0
dtype: int64

In [32]:
# Nulls after interpolation:
weather_train_df_aux.isna().sum()

site_id                   0
air_temperature           0
cloud_coverage        17228
dew_temperature           0
precip_depth_1_hr     26273
sea_level_pressure     8755
wind_direction            0
wind_speed                0
offset                    0
index                     0
dtype: int64

In [33]:
weather_train_df.head()

Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,date,offset,timestamp_aligned,index
0,0,2016-01-01 00:00:00,25.0,6.0,20.0,,1019.700012,0.0,0.0,2016-01-01 14:00:00,-14.0,2016-01-01 14:00:00,0
1,0,2016-01-01 01:00:00,24.4,,21.1,-1.0,1020.200012,70.0,1.5,2016-01-01 15:00:00,-14.0,2016-01-01 15:00:00,1
2,0,2016-01-01 02:00:00,22.799999,2.0,21.1,0.0,1020.200012,0.0,0.0,2016-01-01 16:00:00,-14.0,2016-01-01 16:00:00,2
3,0,2016-01-01 03:00:00,21.1,2.0,20.6,0.0,1020.099976,0.0,0.0,2016-01-01 17:00:00,-14.0,2016-01-01 17:00:00,3
4,0,2016-01-01 04:00:00,20.0,2.0,20.0,-1.0,1020.0,250.0,2.6,2016-01-01 18:00:00,-14.0,2016-01-01 18:00:00,4


In [34]:
# Merge to bring back original features:
weather_train_df = pd.merge(weather_train_df_aux, weather_train_df, on=['index', 'site_id'], 
                            how='left', copy=False,suffixes=('', '_y'))
# Drop duplicates columns:
weather_train_df.drop(weather_train_df.filter(regex='_y$').columns, axis=1, inplace=True)
weather_train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 139773 entries, 0 to 139772
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   site_id             139773 non-null  int8          
 1   air_temperature     139773 non-null  float32       
 2   cloud_coverage      122545 non-null  float32       
 3   dew_temperature     139773 non-null  float32       
 4   precip_depth_1_hr   113500 non-null  float32       
 5   sea_level_pressure  131018 non-null  float32       
 6   wind_direction      139773 non-null  float32       
 7   wind_speed          139773 non-null  float32       
 8   offset              139773 non-null  float64       
 9   index               139773 non-null  int64         
 10  timestamp           139773 non-null  category      
 11  date                139773 non-null  datetime64[ns]
 12  timestamp_aligned   139773 non-null  datetime64[ns]
dtypes: category(1), datetime64[ns

In [35]:
weather_train_df.head()

Unnamed: 0,site_id,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,offset,index,timestamp,date,timestamp_aligned
0,0,25.0,6.0,20.0,-1.0,1019.700012,0.0,0.0,-14.0,0,2016-01-01 00:00:00,2016-01-01 14:00:00,2016-01-01 14:00:00
1,0,24.4,4.0,21.1,-1.0,1020.200012,70.0,1.5,-14.0,1,2016-01-01 01:00:00,2016-01-01 15:00:00,2016-01-01 15:00:00
2,0,22.799999,2.0,21.1,0.0,1020.200012,0.0,0.0,-14.0,2,2016-01-01 02:00:00,2016-01-01 16:00:00,2016-01-01 16:00:00
3,0,21.1,2.0,20.6,0.0,1020.099976,0.0,0.0,-14.0,3,2016-01-01 03:00:00,2016-01-01 17:00:00,2016-01-01 17:00:00
4,0,20.0,2.0,20.0,-1.0,1020.0,250.0,2.6,-14.0,4,2016-01-01 04:00:00,2016-01-01 18:00:00,2016-01-01 18:00:00
