In [1]:
from os.path import join as pjoin
import numpy as np
import pandas as pd
from datetime import datetime
from pytz import timezone
import pytz

In [2]:
RAW_DATA_DIR = "../data/raw"

print('Loading init weather data...')

# Explicit column dtypes shared by both weather CSVs: an 8-bit unsigned
# site id and single-precision floats for every measurement column.
weather_dtypes = {'site_id': np.uint8}
weather_dtypes.update(
    (measurement, np.float32)
    for measurement in (
        'air_temperature',
        'cloud_coverage',
        'dew_temperature',
        'precip_depth_1_hr',
        'sea_level_pressure',
        'wind_direction',
        'wind_speed',
    )
)

# Read the train file, then the test file, and stack them into one frame
# with a fresh 0..n-1 index. The per-file frames are never bound to a
# name, so there is nothing left over to delete afterwards.
weather = pd.concat(
    [
        pd.read_csv(
            pjoin(RAW_DATA_DIR, csv_name),
            dtype=weather_dtypes,
            parse_dates=['timestamp'],
        )
        for csv_name in ('weather_train.csv', 'weather_test.csv')
    ],
    ignore_index=True,
)

Loading init weather data...


In [3]:
# Preview the first five rows of the combined train + test weather frame.
weather.head(n=5)

Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,2016-01-01 00:00:00,25.0,6.0,20.0,,1019.700012,0.0,0.0
1,0,2016-01-01 01:00:00,24.4,,21.1,-1.0,1020.200012,70.0,1.5
2,0,2016-01-01 02:00:00,22.799999,2.0,21.1,0.0,1020.200012,0.0,0.0
3,0,2016-01-01 03:00:00,21.1,2.0,20.6,0.0,1020.099976,0.0,0.0
4,0,2016-01-01 04:00:00,20.0,2.0,20.0,-1.0,1020.0,250.0,2.6


In [7]:
# Per-site lookup table (timezone, country_code, location); the file is
# semicolon-separated rather than comma-separated.
time_zones = pd.read_csv('../data/time_zones.csv', sep=";")
time_zones.head(n=5)

Unnamed: 0,site_id,timezone,country_code,location
0,0,US/Eastern,US,"Orlando, FL"
1,1,Europe/London,UK,"UK, Southhampton"
2,2,US/Mountain,US,"Tempe, AZ"
3,3,US/Eastern,US,"Washington, WA"
4,4,US/Pacific,US,"San Francisco, CA"


In [8]:
# Attach each site's timezone, country_code and location to every weather row.
# how="left" keeps every weather row even if its site_id is absent from
# time_zones. validate="many_to_one" makes pandas raise MergeError if
# time_zones ever lists a site_id more than once, which would otherwise
# silently duplicate weather rows.
weather_tz = weather.merge(
    time_zones,
    on="site_id",
    how="left",
    validate="many_to_one",
)

In [9]:
# Preview the merged frame: the weather columns plus the per-site
# timezone, country_code and location columns.
weather_tz.head(n=5)

Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,timezone,country_code,location
0,0,2016-01-01 00:00:00,25.000000,6.0,20.0,,1019.700012,0.0,0.0,US/Eastern,US,"Orlando, FL"
1,0,2016-01-01 01:00:00,24.400000,,21.1,-1.0,1020.200012,70.0,1.5,US/Eastern,US,"Orlando, FL"
2,0,2016-01-01 02:00:00,22.799999,2.0,21.1,0.0,1020.200012,0.0,0.0,US/Eastern,US,"Orlando, FL"
3,0,2016-01-01 03:00:00,21.100000,2.0,20.6,0.0,1020.099976,0.0,0.0,US/Eastern,US,"Orlando, FL"
4,0,2016-01-01 04:00:00,20.000000,2.0,20.0,-1.0,1020.000000,250.0,2.6,US/Eastern,US,"Orlando, FL"
...,...,...,...,...,...,...,...,...,...,...,...,...
417011,15,2018-12-31 19:00:00,3.300000,,1.7,,1018.299988,150.0,7.7,US/Eastern,US,"Pittsburgh, PA"
417012,15,2018-12-31 20:00:00,2.800000,,1.1,,1017.799988,140.0,5.1,US/Eastern,US,"Pittsburgh, PA"
417013,15,2018-12-31 21:00:00,2.800000,,1.7,-1.0,1017.200012,140.0,6.2,US/Eastern,US,"Pittsburgh, PA"
417014,15,2018-12-31 22:00:00,2.800000,,2.2,8.0,1016.099976,140.0,5.1,US/Eastern,US,"Pittsburgh, PA"


In [10]:
# pytz's UTC tzinfo object, used to localize the naive weather timestamps
utc = pytz.UTC
# strftime pattern: date, time, then timezone abbreviation (%Z) and UTC offset (%z)
fmt = '%Y-%m-%d %H:%M:%S %Z%z'

In [11]:
# Spot check on a single value: localize the timestamp at index label 1 to UTC.
utc.localize(weather_tz.loc[1, 'timestamp'])

Timestamp('2016-01-01 01:00:00+0000', tz='UTC')

In [12]:
# Mark the naive timestamps as UTC in one vectorized call instead of a
# per-element Python apply. This only attaches the timezone; the clock
# values are not shifted.
# NOTE(review): assumes the raw weather timestamps are recorded in UTC - confirm.
# The guard makes the cell safe to re-run: localizing values that are
# already tz-aware would raise.
if weather_tz['timestamp'].dt.tz is None:
    weather_tz['timestamp'] = weather_tz['timestamp'].dt.tz_localize(utc)


In [13]:
# Preview the frame after the timestamp column has been localized to UTC.
weather_tz.head(n=5)

Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,timezone,country_code,location
0,0,2016-01-01 00:00:00+00:00,25.0,6.0,20.0,,1019.700012,0.0,0.0,US/Eastern,US,"Orlando, FL"
1,0,2016-01-01 01:00:00+00:00,24.4,,21.1,-1.0,1020.200012,70.0,1.5,US/Eastern,US,"Orlando, FL"
2,0,2016-01-01 02:00:00+00:00,22.799999,2.0,21.1,0.0,1020.200012,0.0,0.0,US/Eastern,US,"Orlando, FL"
3,0,2016-01-01 03:00:00+00:00,21.1,2.0,20.6,0.0,1020.099976,0.0,0.0,US/Eastern,US,"Orlando, FL"
4,0,2016-01-01 04:00:00+00:00,20.0,2.0,20.0,-1.0,1020.0,250.0,2.6,US/Eastern,US,"Orlando, FL"


In [14]:
# Swap each timezone name in the column for the pytz tzinfo object that
# pytz.timezone() returns for it.
weather_tz['timezone'] = weather_tz['timezone'].map(timezone)

In [18]:
# Convert every UTC timestamp to the local time of its own site.
# Zipping the two columns that are actually needed avoids
# DataFrame.apply(axis=1), which builds a full Series object for every
# row. Each row has its own timezone, so the values are individual
# tz-aware Timestamps.
weather_tz['timestamp_local'] = [
    utc_time.astimezone(site_tz)
    for utc_time, site_tz in zip(weather_tz['timestamp'], weather_tz['timezone'])
]

In [19]:
# Preview the frame with the new timestamp_local column.
weather_tz.head(n=5)

Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,timezone,country_code,location,timestamp_local
0,0,2016-01-01 00:00:00+00:00,25.0,6.0,20.0,,1019.700012,0.0,0.0,US/Eastern,US,"Orlando, FL",2015-12-31 19:00:00-05:00
1,0,2016-01-01 01:00:00+00:00,24.4,,21.1,-1.0,1020.200012,70.0,1.5,US/Eastern,US,"Orlando, FL",2015-12-31 20:00:00-05:00
2,0,2016-01-01 02:00:00+00:00,22.799999,2.0,21.1,0.0,1020.200012,0.0,0.0,US/Eastern,US,"Orlando, FL",2015-12-31 21:00:00-05:00
3,0,2016-01-01 03:00:00+00:00,21.1,2.0,20.6,0.0,1020.099976,0.0,0.0,US/Eastern,US,"Orlando, FL",2015-12-31 22:00:00-05:00
4,0,2016-01-01 04:00:00+00:00,20.0,2.0,20.0,-1.0,1020.0,250.0,2.6,US/Eastern,US,"Orlando, FL",2015-12-31 23:00:00-05:00


In [20]:
# Check one converted value against the expected US/Eastern wall-clock time.
weather_tz.loc[1, 'timestamp_local'] == pd.Timestamp('2015-12-31 20:00:00-0500', tz='US/Eastern')

True

In [21]:
# Same comparison applied element-wise to the whole column; yields a boolean Series.
weather_tz['timestamp_local'].eq(pd.Timestamp('2015-12-31 20:00:00-0500', tz='US/Eastern'))

0         False
1          True
2         False
3         False
4         False
          ...  
417011    False
417012    False
417013    False
417014    False
417015    False
Name: timestamp_local, Length: 417016, dtype: bool

In [22]:
# Boolean mask of rows whose local timestamp equals 03:00 US/Eastern on
# 2016-03-13. Tz-aware timestamps compare by instant, not by wall-clock
# reading, so rows from sites in other timezones at that same moment
# match as well.
k = weather_tz['timestamp_local'].eq(pd.Timestamp('2016-03-13 03:00:00-0400', tz='US/Eastern'))

In [23]:
# Show every row selected by the boolean mask k.
weather_tz.loc[k]

Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,timezone,country_code,location,timestamp_local
1735,0,2016-03-13 07:00:00+00:00,20.0,,16.700001,0.0,1017.299988,160.0,3.1,US/Eastern,US,"Orlando, FL",2016-03-13 03:00:00-04:00
10508,1,2016-03-13 07:00:00+00:00,3.4,,3.1,,1034.699951,30.0,1.5,Europe/London,UK,"UK, Southhampton",2016-03-13 07:00:00+00:00
19282,2,2016-03-13 07:00:00+00:00,15.6,0.0,0.0,0.0,1016.599976,80.0,3.6,US/Mountain,US,"Tempe, AZ",2016-03-13 00:00:00-07:00
28065,3,2016-03-13 07:00:00+00:00,12.8,,8.9,0.0,1016.799988,0.0,0.0,US/Eastern,US,"Washington, WA",2016-03-13 03:00:00-04:00
36844,4,2016-03-13 07:00:00+00:00,13.9,,11.7,,1019.400024,170.0,6.2,US/Pacific,US,"San Francisco, CA",2016-03-12 23:00:00-08:00
45614,5,2016-03-13 07:00:00+00:00,7.0,,4.0,,,50.0,2.1,Europe/London,UK,"UK, London",2016-03-13 07:00:00+00:00
54382,6,2016-03-13 07:00:00+00:00,15.0,0.0,10.6,0.0,1015.700012,0.0,0.0,US/Eastern,US,Philadelphia,2016-03-13 03:00:00-04:00
63060,7,2016-03-13 07:00:00+00:00,4.2,,0.3,,1017.299988,310.0,3.6,Canada/Eastern,CA,Montreal/Ottawa,2016-03-13 03:00:00-04:00
71779,8,2016-03-13 07:00:00+00:00,20.0,,16.700001,0.0,1017.299988,160.0,3.1,US/Eastern,US,"Orlando, FL",2016-03-13 03:00:00-04:00
80562,9,2016-03-13 07:00:00+00:00,14.4,0.0,12.2,0.0,1009.0,190.0,1.5,US/Central,US,"Austin, TX",2016-03-13 01:00:00-06:00


In [25]:
# Inspect the pytz tzinfo object behind the "US/Mountain" zone name.
# NOTE(review): time_zones maps Tempe, AZ to this zone - confirm its
# daylight-saving rules match that site.
pytz.timezone("US/Mountain")

<DstTzInfo 'US/Mountain' LMT-1 day, 17:00:00 STD>