In [13]:
import numpy as np
import pandas as pd
import datetime

# 1. Functions

In [23]:
def add_day_of_year_column(df_src, column_name='date'):
    df_out = df_src.copy()
    df_out['day_of_year'] = df_out[column_name].apply(lambda d: date_to_nth_day(str(d)))
    return df_out

def date_to_nth_day(date, format='%Y%m%d'):
    date = datetime.datetime.strptime(date, format)
    new_year_day = datetime.datetime(year=date.year, month=1, day=1)
    return (date - new_year_day).days + 1 + (365 * (date.year - 2014))

# 2. Generating Cluster File

*Idea:* _Cluster the stations based on the existing training data and add the cluster groups as additional input for predictions_

In [31]:
df_time_path = '../data/tmp/df_time.csv'
df_time = pd.read_csv(df_time_path)
df_time = df_time.drop(['Unnamed: 0'], axis=1)

In [32]:
df_time.shape

(18683824, 3)

In [33]:
df_time.head()

Unnamed: 0,station,date,TMIN
0,AE000041196,20140101,128
1,AE000041196,20140102,145
2,AE000041196,20140103,140
3,AE000041196,20140106,162
4,AE000041196,20140109,115


In [84]:
df_test = df_time.copy()
df_test.head()
df_test['date'] = df_test['date'].apply(lambda d: date_to_nth_day(str(d)))

# create pivot table
df_test = df_test.drop_duplicates(['station','date'])
df_pivot = df_test.pivot(index='station', columns='date', values='TMIN')

#fill NaN value forward
df_pivot = df_pivot.fillna(method='ffill', axis=1)
df_pivot = df_pivot.groupby('station').min()
df_flattened = pd.DataFrame(df_pivot.to_records())

In [81]:
df_test.head()

Unnamed: 0,station,date,TMIN
0,AE000041196,1,128
1,AE000041196,2,145
2,AE000041196,3,140
3,AE000041196,6,162
4,AE000041196,9,115


In [82]:
df_pivot.head()

date,1,2,3,4,5,6,7,8,9,10,...,1451,1452,1453,1454,1455,1456,1457,1458,1459,1460
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AE000041196,128.0,145.0,140.0,140.0,140.0,162.0,162.0,162.0,115.0,115.0,...,126.0,126.0,126.0,126.0,126.0,150.0,132.0,114.0,113.0,114.0
AEM00041194,159.0,159.0,160.0,156.0,150.0,186.0,186.0,186.0,148.0,148.0,...,177.0,186.0,186.0,199.0,162.0,162.0,189.0,189.0,176.0,176.0
AEM00041217,136.0,128.0,136.0,136.0,112.0,141.0,141.0,141.0,122.0,126.0,...,150.0,164.0,164.0,164.0,164.0,155.0,186.0,186.0,186.0,126.0
AEM00041218,131.0,137.0,137.0,137.0,106.0,164.0,164.0,164.0,164.0,124.0,...,146.0,146.0,146.0,129.0,152.0,139.0,139.0,124.0,124.0,124.0
AG000060390,25.0,25.0,25.0,68.0,85.0,58.0,50.0,46.0,48.0,48.0,...,49.0,49.0,49.0,29.0,36.0,36.0,104.0,104.0,79.0,52.0


In [83]:
df_flattened.head()

Unnamed: 0,station,1,2,3,4,5,6,7,8,9,...,1451,1452,1453,1454,1455,1456,1457,1458,1459,1460
0,AE000041196,128.0,145.0,140.0,140.0,140.0,162.0,162.0,162.0,115.0,...,126.0,126.0,126.0,126.0,126.0,150.0,132.0,114.0,113.0,114.0
1,AEM00041194,159.0,159.0,160.0,156.0,150.0,186.0,186.0,186.0,148.0,...,177.0,186.0,186.0,199.0,162.0,162.0,189.0,189.0,176.0,176.0
2,AEM00041217,136.0,128.0,136.0,136.0,112.0,141.0,141.0,141.0,122.0,...,150.0,164.0,164.0,164.0,164.0,155.0,186.0,186.0,186.0,126.0
3,AEM00041218,131.0,137.0,137.0,137.0,106.0,164.0,164.0,164.0,164.0,...,146.0,146.0,146.0,129.0,152.0,139.0,139.0,124.0,124.0,124.0
4,AG000060390,25.0,25.0,25.0,68.0,85.0,58.0,50.0,46.0,48.0,...,49.0,49.0,49.0,29.0,36.0,36.0,104.0,104.0,79.0,52.0
