In [1]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

In [2]:
# Load the raw dataset; pandas infers gzip compression from the '.gz' suffix.
df = pd.read_csv(filepath_or_buffer='dataset.gz')

In [3]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,Year,Month,Day,Hour,Dual Fuel,Hydro,Natural Gas,Nuclear,Other Fossil Fuels,Other Renewables,Wind,Price
0,2016,1,1,0,2633.0,2667.0,1271.0,5422.0,0.0,311.0,417.0,15.55904
1,2016,1,1,1,2614.0,2341.0,1197.0,5423.0,0.0,310.0,471.0,14.43644
2,2016,1,1,2,2281.0,1929.0,1192.0,5422.0,0.0,308.0,469.0,13.2404
3,2016,1,1,3,2204.0,1910.0,1194.0,5422.0,0.0,310.0,437.0,13.4922
4,2016,1,1,4,2380.0,1999.0,1210.0,5422.0,0.0,310.0,413.0,12.67385


In [4]:
# Preview the last five rows to see where the data ends.
df.tail(n=5)

Unnamed: 0,Year,Month,Day,Hour,Dual Fuel,Hydro,Natural Gas,Nuclear,Other Fossil Fuels,Other Renewables,Wind,Price
21923,2018,7,2,18,10360.0,4340.0,5529.0,5265.0,1162.0,235.0,452.0,84.65012
21924,2018,7,2,19,10385.0,3803.0,5355.0,5265.0,1135.0,237.0,539.0,64.70675
21925,2018,7,2,20,10519.0,3469.0,5145.0,5264.0,1062.0,235.0,559.0,58.27039
21926,2018,7,2,21,9866.0,3321.0,4954.0,5266.0,945.0,237.0,645.0,50.50309
21927,2018,7,2,22,8049.0,2802.0,4840.0,5272.0,885.0,240.0,718.0,45.59523


## Create Timestamp

In [5]:
def parse_timestamp(row: pd.Series) -> pd.Timestamp:
    """
    Build a single timestamp from a row of date/time fields.

    Args:
        row: a row with the following date time fields in the index:
        - Year
        - Month
        - Day
        - Hour

    Returns:
        the timestamp at the start of the given hour (minutes and seconds
        are zero)

    Raises:
        KeyError: if one of the four fields is absent from the row
        ValueError: if a field is NaN or the fields do not form a valid date

    """
    # Construct the timestamp directly from its components instead of
    # formatting a 'month/day/year hour:00' string and parsing it back: this
    # removes any dependence on date-format inference. int() also accepts
    # whole-number floats such as 2016.0, which the string form could not parse.
    return pd.Timestamp(
        year=int(row['Year']),
        month=int(row['Month']),
        day=int(row['Day']),
        hour=int(row['Hour']),
    )

In [6]:
# Combine the four date/time columns into a single 'Date' column, one row at
# a time (progress_apply displays a tqdm progress bar while it runs).
df['Date'] = (
    df.loc[:, ['Year', 'Month', 'Day', 'Hour']]
    .progress_apply(parse_timestamp, axis=1)
)

100%|██████████| 21928/21928 [00:07<00:00, 2769.62it/s]


In [7]:
# Check that the new 'Date' column lines up with the first rows' fields.
df.head(n=5)

Unnamed: 0,Year,Month,Day,Hour,Dual Fuel,Hydro,Natural Gas,Nuclear,Other Fossil Fuels,Other Renewables,Wind,Price,Date
0,2016,1,1,0,2633.0,2667.0,1271.0,5422.0,0.0,311.0,417.0,15.55904,2016-01-01 00:00:00
1,2016,1,1,1,2614.0,2341.0,1197.0,5423.0,0.0,310.0,471.0,14.43644,2016-01-01 01:00:00
2,2016,1,1,2,2281.0,1929.0,1192.0,5422.0,0.0,308.0,469.0,13.2404,2016-01-01 02:00:00
3,2016,1,1,3,2204.0,1910.0,1194.0,5422.0,0.0,310.0,437.0,13.4922,2016-01-01 03:00:00
4,2016,1,1,4,2380.0,1999.0,1210.0,5422.0,0.0,310.0,413.0,12.67385,2016-01-01 04:00:00


In [8]:
# Check that the new 'Date' column lines up with the last rows' fields.
df.tail(n=5)

Unnamed: 0,Year,Month,Day,Hour,Dual Fuel,Hydro,Natural Gas,Nuclear,Other Fossil Fuels,Other Renewables,Wind,Price,Date
21923,2018,7,2,18,10360.0,4340.0,5529.0,5265.0,1162.0,235.0,452.0,84.65012,2018-07-02 18:00:00
21924,2018,7,2,19,10385.0,3803.0,5355.0,5265.0,1135.0,237.0,539.0,64.70675,2018-07-02 19:00:00
21925,2018,7,2,20,10519.0,3469.0,5145.0,5264.0,1062.0,235.0,559.0,58.27039,2018-07-02 20:00:00
21926,2018,7,2,21,9866.0,3321.0,4954.0,5266.0,945.0,237.0,645.0,50.50309,2018-07-02 21:00:00
21927,2018,7,2,22,8049.0,2802.0,4840.0,5272.0,885.0,240.0,718.0,45.59523,2018-07-02 22:00:00


# Split

In [9]:
# Chronological split at 2018-06-01: every hour before that date is used for
# training and everything from that date onward is held out for testing.
# With the data shown above (2016-01-01 through 2018-07-02) that is roughly
# 29 months of training data against about one month of test data.
# NOTE(review): this cell was previously described as a split along the year
# 2018 (two years for training, half a year for testing, 4/5 to 1/5). The
# cutoff below does not produce that split -- confirm which one is intended.
split_date = pd.Timestamp('2018-06-01')  # ISO form of the date written as '06/01/2018'
train = df[df.Date < split_date]
test = df[df.Date >= split_date]

In [10]:
# The helper 'Date' column was only needed to make the split; remove it from
# both partitions so they keep the original column layout.
for part in (train, test):
    del part['Date']

## Train

In [11]:
# First rows of the training partition.
train.head(n=5)

Unnamed: 0,Year,Month,Day,Hour,Dual Fuel,Hydro,Natural Gas,Nuclear,Other Fossil Fuels,Other Renewables,Wind,Price
0,2016,1,1,0,2633.0,2667.0,1271.0,5422.0,0.0,311.0,417.0,15.55904
1,2016,1,1,1,2614.0,2341.0,1197.0,5423.0,0.0,310.0,471.0,14.43644
2,2016,1,1,2,2281.0,1929.0,1192.0,5422.0,0.0,308.0,469.0,13.2404
3,2016,1,1,3,2204.0,1910.0,1194.0,5422.0,0.0,310.0,437.0,13.4922
4,2016,1,1,4,2380.0,1999.0,1210.0,5422.0,0.0,310.0,413.0,12.67385


In [12]:
# Last rows of the training partition (should end just before the cutoff).
train.tail(n=5)

Unnamed: 0,Year,Month,Day,Hour,Dual Fuel,Hydro,Natural Gas,Nuclear,Other Fossil Fuels,Other Renewables,Wind,Price
21156,2018,5,31,19,3967.0,3571.0,3466.0,5357.0,0.0,265.0,541.0,32.47095
21157,2018,5,31,20,3961.0,3624.0,3323.0,5356.0,0.0,266.0,576.0,32.55496
21158,2018,5,31,21,3606.0,3569.0,3201.0,5355.0,0.0,265.0,712.0,29.19445
21159,2018,5,31,22,2881.0,3466.0,2851.0,5352.0,0.0,267.0,657.0,25.50839
21160,2018,5,31,23,2220.0,3432.0,2727.0,5351.0,0.0,266.0,618.0,23.37656


In [13]:
# Write the training partition as a gzip-compressed CSV. index=False keeps the
# DataFrame's row index out of the file; the existing 'Unnamed: 0' column is
# an ordinary column and is still written.
train.to_csv('train.gz', compression='gzip', index=False)

## Test

In [14]:
# First rows of the test partition (should start at the cutoff).
test.head(n=5)

Unnamed: 0,Year,Month,Day,Hour,Dual Fuel,Hydro,Natural Gas,Nuclear,Other Fossil Fuels,Other Renewables,Wind,Price
21161,2018,6,1,0,1909.0,3414.0,2337.0,5350.0,0.0,270.0,634.0,25.84154
21162,2018,6,1,1,1821.0,3432.0,2100.0,5335.0,0.0,269.0,561.0,22.78535
21163,2018,6,1,2,1665.0,3368.0,1971.0,5352.0,0.0,268.0,577.0,20.7444
21164,2018,6,1,3,1728.0,3426.0,2013.0,5358.0,0.0,268.0,451.0,20.22108
21165,2018,6,1,4,1830.0,3386.0,2042.0,5354.0,0.0,266.0,396.0,20.54554


In [15]:
# Last rows of the test partition.
test.tail(n=5)

Unnamed: 0,Year,Month,Day,Hour,Dual Fuel,Hydro,Natural Gas,Nuclear,Other Fossil Fuels,Other Renewables,Wind,Price
21923,2018,7,2,18,10360.0,4340.0,5529.0,5265.0,1162.0,235.0,452.0,84.65012
21924,2018,7,2,19,10385.0,3803.0,5355.0,5265.0,1135.0,237.0,539.0,64.70675
21925,2018,7,2,20,10519.0,3469.0,5145.0,5264.0,1062.0,235.0,559.0,58.27039
21926,2018,7,2,21,9866.0,3321.0,4954.0,5266.0,945.0,237.0,645.0,50.50309
21927,2018,7,2,22,8049.0,2802.0,4840.0,5272.0,885.0,240.0,718.0,45.59523


In [16]:
# Write the test partition as a gzip-compressed CSV. index=False keeps the
# DataFrame's row index out of the file; the existing 'Unnamed: 0' column is
# an ordinary column and is still written.
test.to_csv('test.gz', compression='gzip', index=False)