In [1]:
import re
import string
import pandas as pd
import numpy as np
from datetime import datetime

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Relative path to the input CSV file.
pcsvpath = '../pollen/datafile.csv'
# Read the whole file into a DataFrame using read_csv's default settings.
sample = pd.read_csv(pcsvpath)

In [3]:
# Show the loaded frame to inspect its columns and a few rows.
sample

Unnamed: 0,date,lat,lon,t,2t
0,2000-01-01,-10.0,113.0,302.2,300.6
1,2000-01-01,-10.0,114.0,302.2,300.8
2,2000-01-01,-10.0,115.0,302.2,301.3
3,2000-01-01,-10.0,116.0,302.4,301.9
4,2000-01-01,-10.0,117.0,302.5,302.3
...,...,...,...,...,...
177067,2000-01-31 18:00:00,-43.0,150.0,289.4,287.9
177068,2000-01-31 18:00:00,-43.0,151.0,289.1,287.3
177069,2000-01-31 18:00:00,-43.0,152.0,288.9,287.4
177070,2000-01-31 18:00:00,-43.0,153.0,288.6,287.4


t: surface-level temperature (Kelvin) <br>
2t: temperature at the 2 m level (Kelvin) <br>

In [4]:
# Check the column dtypes as loaded from the CSV.
sample.dtypes

date     object
lat     float64
lon     float64
t       float64
2t      float64
dtype: object

## Preprocessing

In [5]:
def kelvinToCelsius(kelvin):
    """Convert a temperature from Kelvin to degrees Celsius.

    Works on anything that supports subtraction with a float (a plain
    number or a pandas Series, for example) and returns the same kind of
    object shifted by -273.15.
    """
    zero_celsius_in_kelvin = 273.15
    celsius = kelvin - zero_celsius_in_kelvin
    return celsius

### 1. Filter to Melbourne
Coordinates for Melbourne (lat, lon): -37.814, 144.96332

In [6]:
# Select the grid cell closest to Melbourne by rounding its coordinates to
# whole degrees.
# NOTE(review): this assumes the grid is on whole-degree points, as the
# displayed lat/lon values suggest -- confirm for other input files.

melbourne = [-37.814, 144.96332]
lat = float(round(melbourne[0])) # -38
lon = float(round(melbourne[1])) # 145

# Take an explicit copy: later cells add columns to `sample_mel`, and doing
# that on a boolean-mask slice of `sample` is chained assignment (pandas
# flags it with SettingWithCopyWarning, which this notebook silences).
sample_mel = sample[(sample['lat'] == lat) & (sample['lon'] == lon)].copy()

### 2. Convert unit

In [7]:
# Convert both temperature columns from Kelvin to Celsius.
# `assign` returns a new frame instead of writing columns onto a filtered
# slice, and the helper is called on the whole Series (plain vectorised
# subtraction) rather than element by element through `.apply`.
# '2t_C' is not a valid Python identifier, so it is passed via dict unpacking.
sample_mel = sample_mel.assign(
    t_C=kelvinToCelsius(sample_mel['t']),
    **{'2t_C': kelvinToCelsius(sample_mel['2t'])},
)

### 3. Datetime

In [8]:
# Parse the timestamp strings into naive datetimes. No time-zone conversion
# happens in this cell: the values stay in whatever zone the file uses.
# NOTE(review): the cell was originally labelled "utc->aest"; presumably the
# source timestamps are UTC -- confirm, and convert explicitly if local-day
# aggregates are wanted downstream.
raw_dates = sample_mel['date']

# The 'date' column mixes bare dates ('2000-01-01') for midnight rows with
# full timestamps ('2000-01-31 18:00:00'). Pad the bare dates so every string
# matches the explicit format below; recent pandas releases are expected to
# apply `format` strictly and reject the short form.
is_date_only = raw_dates.str.len() == len('2000-01-01')
padded_dates = raw_dates.where(~is_date_only, raw_dates + ' 00:00:00')

# Build a new frame rather than adding a column onto a filtered slice, then
# drop the raw inputs that are no longer needed.
sample_mel = (
    sample_mel
    .assign(datetime=pd.to_datetime(padded_dates, format='%Y-%m-%d %H:%M:%S'))
    .drop(['date', 't', '2t', 'lat', 'lon'], axis=1)
)

sample_mel

Unnamed: 0,t_C,2t_C,datetime
1208,15.450000,15.15,2000-01-01 00:00:00
2636,19.150000,18.25,2000-01-02 00:00:00
4064,25.150000,24.75,2000-01-03 00:00:00
5492,11.350000,11.45,2000-01-04 00:00:00
6920,11.250000,11.35,2000-01-05 00:00:00
...,...,...,...
171140,15.050000,13.25,2000-01-27 18:00:00
172568,14.550024,12.95,2000-01-28 18:00:00
173996,14.250000,12.25,2000-01-29 18:00:00
175424,13.450000,11.15,2000-01-30 18:00:00


In [9]:
from dateutil import tz

# Convert one UTC timestamp to Melbourne local time.
# The target zone is named explicitly instead of using tz.tzlocal(), so the
# result no longer depends on the time zone of the machine running the
# notebook.
from_zone = tz.tzutc()
to_zone = tz.gettz('Australia/Melbourne')

utc = datetime.strptime('2000-01-31 18:00:00', '%Y-%m-%d %H:%M:%S')

# strptime returns a naive datetime; mark it as UTC before converting.
utc = utc.replace(tzinfo=from_zone)

# Convert to the target zone and format it back to a string.
local = utc.astimezone(to_zone).strftime('%Y-%m-%d %H:%M:%S')

print(local)

2000-02-01 05:00:00


## Handling different features

In [10]:
# Build a separate frame carrying two derived columns:
#   date - the calendar date as a 'YYYY-MM-DD' string
#   hour - the hour of day of each reading
df_new = sample_mel.assign(
    date=sample_mel['datetime'].dt.strftime('%Y-%m-%d'),
    hour=sample_mel['datetime'].dt.hour,
)

# List the distinct reading hours present in the data.
df_new['hour'].value_counts().index.to_list()

[0, 6, 12, 18]

In [11]:
# df_new[df_new['date'] == '2000-01-01']

### Create new columns for each of the 4 reading hours (0, 6, 12, 18)

In [12]:
def new_hour_df(df, hour):
    """Return the readings taken at `hour`, indexed by date.

    Parameters
    ----------
    df : DataFrame with at least the columns 'date', 'hour', 'datetime',
        't_C' and '2t_C'.
    hour : value compared against the 'hour' column to select rows.

    Returns
    -------
    A new DataFrame indexed by 'date' in which 't_C' / '2t_C' are renamed to
    't_<hour>_1d' / '2t_<hour>_1d' and the 'hour' and 'datetime' columns are
    removed. The input frame is left unchanged.
    """
    hour_tag = str(hour)
    renamed_columns = {
        't_C': 't_' + hour_tag + '_1d',
        '2t_C': '2t_' + hour_tag + '_1d',
    }

    return (
        df.loc[df['hour'] == hour]
        .set_index('date')
        .rename(columns=renamed_columns)
        .drop(columns=['hour', 'datetime'])
    )

In [13]:
# One date-indexed frame per reading hour (0, 6, 12, 18).
df_0, df_6, df_12, df_18 = (
    new_hour_df(df_new, reading_hour) for reading_hour in (0, 6, 12, 18)
)

In [14]:
# Spot-check one of the per-hour frames (the hour-12 readings).
df_12

Unnamed: 0_level_0,t_12_1d,2t_12_1d
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2000-01-01,10.85,10.85
2000-01-02,15.65,15.55
2000-01-03,13.95,13.45
2000-01-04,8.85,9.05
2000-01-05,10.25,10.45
2000-01-06,10.95,11.05
2000-01-07,13.85,13.95
2000-01-08,19.85,19.75
2000-01-09,21.85,21.55
2000-01-10,22.65,22.55


### Calculate the daily mean/min/max over the 4 readings of each day

In [15]:
# Daily mean of both temperature columns, taken over all readings that share
# the same 'date' value.
df_mean = (
    df_new
    .groupby('date')[['t_C', '2t_C']]
    .mean()
    .rename(columns={'t_C':'t_mean_1d', '2t_C':'2t_mean_1d'})
)

df_mean

Unnamed: 0_level_0,t_mean_1d,2t_mean_1d
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2000-01-01,12.725,12.6
2000-01-02,18.575,18.275
2000-01-03,18.45,18.15
2000-01-04,10.425,10.525
2000-01-05,11.15,11.2
2000-01-06,12.875,12.8
2000-01-07,15.35,15.125
2000-01-08,21.325,20.95
2000-01-09,23.850006,23.525
2000-01-10,23.5,23.2


In [16]:
# Daily minimum of both temperature columns, taken over all readings that
# share the same 'date' value.
df_min = (
    df_new
    .groupby('date')[['t_C', '2t_C']]
    .min()
    .rename(columns={'t_C':'t_min_1d', '2t_C':'2t_min_1d'})
)

df_min

Unnamed: 0_level_0,t_min_1d,2t_min_1d
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2000-01-01,6.95,7.25
2000-01-02,14.65,14.95
2000-01-03,9.35,9.45
2000-01-04,8.85,9.05
2000-01-05,9.85,9.95
2000-01-06,9.85,9.95
2000-01-07,10.05,10.05
2000-01-08,16.05,16.05
2000-01-09,19.55,19.45
2000-01-10,14.65,14.65


In [17]:
# Daily maximum of both temperature columns, taken over all readings that
# share the same 'date' value.
df_max = (
    df_new
    .groupby('date')[['t_C', '2t_C']]
    .max()
    .rename(columns={'t_C':'t_max_1d', '2t_C':'2t_max_1d'})
)

df_max

Unnamed: 0_level_0,t_max_1d,2t_max_1d
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2000-01-01,17.65,17.15
2000-01-02,24.85,24.35
2000-01-03,25.35,24.95
2000-01-04,12.55,12.35
2000-01-05,13.25,13.05
2000-01-06,15.85,15.65
2000-01-07,20.45,19.95
2000-01-08,27.05,26.55
2000-01-09,28.25,27.85
2000-01-10,30.65,29.95


In [18]:
# Assemble the daily feature table: the four per-hour frames followed by the
# daily mean / min / max, joined on the shared 'date' index. Each join keeps
# the rows of the frame on its left, so df_0's dates define the result.
df = (
    df_0
    .join(df_6)
    .join(df_12)
    .join(df_18)
    .join(df_mean)
    .join(df_min)
    .join(df_max)
)
df

Unnamed: 0_level_0,t_0_1d,2t_0_1d,t_6_1d,2t_6_1d,t_12_1d,2t_12_1d,t_18_1d,2t_18_1d,t_mean_1d,2t_mean_1d,t_min_1d,2t_min_1d,t_max_1d,2t_max_1d
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2000-01-01,15.45,15.15,17.65,17.15,10.85,10.85,6.95,7.25,12.725,12.6,6.95,7.25,17.65,17.15
2000-01-02,19.15,18.25,24.85,24.35,15.65,15.55,14.65,14.95,18.575,18.275,14.65,14.95,24.85,24.35
2000-01-03,25.15,24.75,25.35,24.95,13.95,13.45,9.35,9.45,18.45,18.15,9.35,9.45,25.35,24.95
2000-01-04,11.35,11.45,12.55,12.35,8.85,9.05,8.95,9.25,10.425,10.525,8.85,9.05,12.55,12.35
2000-01-05,11.25,11.35,13.25,13.05,10.25,10.45,9.85,9.95,11.15,11.2,9.85,9.95,13.25,13.05
2000-01-06,14.85,14.55,15.85,15.65,10.95,11.05,9.85,9.95,12.875,12.8,9.85,9.95,15.85,15.65
2000-01-07,17.05,16.55,20.45,19.95,13.85,13.95,10.05,10.05,15.35,15.125,10.05,10.05,20.45,19.95
2000-01-08,22.35,21.45,27.05,26.55,19.85,19.75,16.05,16.05,21.325,20.95,16.05,16.05,27.05,26.55
2000-01-09,25.750024,25.25,28.25,27.85,21.85,21.55,19.55,19.45,23.850006,23.525,19.55,19.45,28.25,27.85
2000-01-10,26.05,25.65,30.65,29.95,22.65,22.55,14.65,14.65,23.5,23.2,14.65,14.65,30.65,29.95


In [19]:
# making variables for t: surface level temp

#df['t_mean_3d'] = df['t_mean_1d'].transform(lambda x: x.rolling(3, center=False).mean())
#df['t_max_3d'] = df['t_max_1d'].transform(lambda x: x.rolling(3, center=False).max())
#df['t_min_3d'] = df['t_min_1d'].transform(lambda x: x.rolling(3, center=False).min())

In [20]:
# check t_mean_3d: (12.725+18.575+18.450)/3 = 16.583333333333332 -> 2000-01-03

In [21]:
# making variables for 2t: temp at 2m level

#df['2t_mean_3d'] = df['2t_mean_1d'].transform(lambda x: x.rolling(3, center=False).mean())
#df['2t_max_3d'] = df['2t_max_1d'].transform(lambda x: x.rolling(3, center=False).max())
#df['2t_min_3d'] = df['2t_min_1d'].transform(lambda x: x.rolling(3, center=False).min())

In [22]:
# write in function 

def make_variables_past_days(var_list, day_list, df):
    """Add trailing-window mean/max/min columns to `df`, in place.

    For every prefix ``var`` in `var_list` and every window length ``i`` in
    `day_list`, three columns are written onto `df`:

    * ``<var>_mean_<i>d`` - rolling mean of ``<var>_mean_1d``
    * ``<var>_max_<i>d``  - rolling max of ``<var>_max_1d``
    * ``<var>_min_<i>d``  - rolling min of ``<var>_min_1d``

    Parameters
    ----------
    var_list : iterable of str
        Column-name prefixes; `df` must already hold the three ``*_1d``
        base columns for each prefix.
    day_list : iterable of int
        Window lengths. A window spans that many consecutive ROWS, so it
        only equals a number of days when `df` has exactly one row per day
        with no gaps.
    df : pandas.DataFrame
        Frame to extend. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same `df` object that was passed in.

    Raises
    ------
    KeyError
        If one of the required ``*_1d`` base columns is missing.

    Notes
    -----
    With the default ``min_periods`` the first ``i - 1`` rows of every new
    column are NaN.
    """
    for var in var_list:
        for i in day_list:
            suffix = '_' + str(i) + 'd'

            # Call rolling() on the column directly; wrapping it in
            # Series.transform(lambda ...) adds nothing to the result.
            df[var + '_mean' + suffix] = df[var + '_mean_1d'].rolling(i, center=False).mean()
            df[var + '_max' + suffix] = df[var + '_max_1d'].rolling(i, center=False).max()
            df[var + '_min' + suffix] = df[var + '_min_1d'].rolling(i, center=False).min()

    return df

In [23]:
# Add the 3/5/7/30-row trailing features for both temperature variables.
# The helper writes the new columns onto `df` itself and returns that same
# object, so rebinding `df` here just makes the mutation visible.
df = make_variables_past_days(['t', '2t'], [3,5,7,30], df)
display(df)

Unnamed: 0_level_0,t_0_1d,2t_0_1d,t_6_1d,2t_6_1d,t_12_1d,2t_12_1d,t_18_1d,2t_18_1d,t_mean_1d,2t_mean_1d,...,2t_min_3d,2t_mean_5d,2t_max_5d,2t_min_5d,2t_mean_7d,2t_max_7d,2t_min_7d,2t_mean_30d,2t_max_30d,2t_min_30d
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-01-01,15.45,15.15,17.65,17.15,10.85,10.85,6.95,7.25,12.725,12.6,...,,,,,,,,,,
2000-01-02,19.15,18.25,24.85,24.35,15.65,15.55,14.65,14.95,18.575,18.275,...,,,,,,,,,,
2000-01-03,25.15,24.75,25.35,24.95,13.95,13.45,9.35,9.45,18.45,18.15,...,7.25,,,,,,,,,
2000-01-04,11.35,11.45,12.55,12.35,8.85,9.05,8.95,9.25,10.425,10.525,...,9.05,,,,,,,,,
2000-01-05,11.25,11.35,13.25,13.05,10.25,10.45,9.85,9.95,11.15,11.2,...,9.05,14.15,24.95,7.25,,,,,,
2000-01-06,14.85,14.55,15.85,15.65,10.95,11.05,9.85,9.95,12.875,12.8,...,9.05,14.19,24.95,9.05,,,,,,
2000-01-07,17.05,16.55,20.45,19.95,13.85,13.95,10.05,10.05,15.35,15.125,...,9.95,13.56,24.95,9.05,14.096429,24.95,7.25,,,
2000-01-08,22.35,21.45,27.05,26.55,19.85,19.75,16.05,16.05,21.325,20.95,...,9.95,14.12,26.55,9.05,15.289286,26.55,9.05,,,
2000-01-09,25.750024,25.25,28.25,27.85,21.85,21.55,19.55,19.45,23.850006,23.525,...,10.05,16.72,27.85,9.95,16.039286,27.85,9.05,,,
2000-01-10,26.05,25.65,30.65,29.95,22.65,22.55,14.65,14.65,23.5,23.2,...,14.65,19.12,29.95,9.95,16.760714,29.95,9.05,,,


In [24]:
# (rows, columns) of the feature table after the rolling columns were added.
df.shape

(31, 38)