In [None]:
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd

import sys
sys.path.append("../")
import KNMIRequest as KR
import HolidayRequest as HR

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the raw training CSV (first column becomes the index). `df` stays as the
# untouched original; `df_train` is the working copy used by later cells.
df = pd.read_csv("../input/train.csv", parse_dates=True, index_col=0)
df_train = df.copy()
df_train.head()

In [None]:
# Read the raw test CSV; the first column becomes the index.
df_test = pd.read_csv("../input/test.csv", parse_dates=True, index_col=0)
df_test.head()

In [None]:
# Fetch the KNMI weather variables, index the result by "Date" and replace the
# KNMI variable codes with readable column names.
# NOTE: 'hightest hourly gust' (sic) is kept verbatim because it is a runtime
# column name that other code may rely on.
df_KNMI = (
    KR.get_KNMI_DD(
        start=20190101,
        end=20191209,
        vars_=["FG", "FHX", "TN", "TX", "TG", "DR", "RH", "RHX", "VVN"],
    )
    .set_index("Date")
    .rename(
        columns={
            "FG": "mean windspeed",
            "FHX": "hightest hourly gust",
            "TN": "min temp",
            "TX": "max temp",
            "TG": "mean temp",
            "DR": "duration rain",
            "RH": "sum rain",
            "RHX": "highest hourly rain",
            "VVN": "minimum visibility",
        }
    )
)

# Calendar inputs for the feature functions: the values behind the
# "School holiday" flag and the table behind the "National holiday" flag.
ser_holidays = HR.get_holiday_range()
df_nat_hol = pd.read_csv("../input/Nationale Feestdagen.csv")
# NOTE(review): if 'Datum' is stored day-first (dd-mm-yyyy) this parse needs
# dayfirst=True - confirm against the CSV.
df_nat_hol["date"] = pd.to_datetime(df_nat_hol["Datum"])
df_KNMI

In [None]:
# Start this cell from a fresh read of the training CSV so that `df` and
# `df_train` hold the raw file contents again.
df = pd.read_csv("../input/train.csv", parse_dates=True, index_col=0)
df_train = df.copy()
df_train.head()

def df_prep(df, epoch, knmi=None, school_holidays=None, national_holidays=None):
    """Collapse raw records to one row per day and add calendar and weather features.

    The input frame is left unmodified.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw records with a ``date`` column that ``pd.to_datetime`` can parse.
        Every row counts as one observation for its calendar day.
    epoch : datetime-like
        Reference timestamp. ``Days from epoch`` is the whole-day difference
        between each date and this value, plus one.
    knmi : pandas.DataFrame, optional
        Weather frame whose index holds the dates to join on and which has a
        ``Station`` column (dropped from the result). Defaults to the
        notebook-level ``df_KNMI``.
    school_holidays : list-like, optional
        Values tested for membership to build ``School holiday``. Defaults to
        the notebook-level ``ser_holidays``.
    national_holidays : list-like of datetime-like, optional
        Dates used to build ``National holiday``. Defaults to the notebook-level
        ``df_nat_hol['date']``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct day, sorted by date, with the columns ``date``,
        ``count``, ``Friday``, ``Weekend``, ``School holiday``,
        ``National holiday``, ``Days from epoch`` followed by the columns of
        ``knmi`` except ``Station``.

    Raises
    ------
    KeyError
        If ``df`` has no ``date`` column or ``knmi`` has no ``Station`` column.
    """
    # Fall back to the notebook globals only when the caller passes nothing, so
    # existing two-argument calls keep working unchanged.
    if knmi is None:
        knmi = df_KNMI
    if school_holidays is None:
        school_holidays = ser_holidays
    if national_holidays is None:
        national_holidays = df_nat_hol['date']

    # Work on a derived Series; the caller's frame is never written to.
    days = pd.to_datetime(df['date']).dt.normalize()

    # Build the (date, count) table explicitly instead of relying on the column
    # names produced by value_counts().reset_index(), which changed in pandas 2.0.
    counts = days.value_counts().sort_index()
    daily = pd.DataFrame({'date': counts.index, 'count': counts.to_numpy()})

    weekday = daily['date'].dt.dayofweek  # Monday=0 ... Sunday=6
    daily['Friday'] = weekday == 4
    daily['Weekend'] = weekday >= 5
    daily['School holiday'] = daily['date'].isin(school_holidays)

    # Compare datetime64 with datetime64; matching against datetime.date
    # objects is version-dependent and deprecated in recent pandas.
    holiday_days = pd.DatetimeIndex(pd.to_datetime(national_holidays)).normalize()
    daily['National holiday'] = daily['date'].isin(holiday_days)

    daily['Days from epoch'] = (daily['date'] - pd.Timestamp(epoch)).dt.days + 1

    # rename_axis keeps the key column called "date" whatever name the join
    # assigns to the resulting index.
    return (
        daily.set_index('date')
        .join(knmi)
        .rename_axis('date')
        .reset_index()
        .drop(columns=['Station'])
    )

#     df['Days from epoch'] = (df['start_time'] - df['start_time'].min()).dt.days
# df_train = df_prep(df_train)
# df_train.head()

In [None]:
# Display the renamed KNMI weather frame, i.e. the table that df_prep joins
# onto the daily counts.
df_KNMI

In [None]:
# Reference day for the "Days from epoch" feature: the earliest date found in
# either dataset. The raw `date` columns are parsed *before* taking the minimum
# so the comparison is chronological (a minimum over unparsed strings is only
# correct for ISO-formatted text), and the result is snapped to midnight so the
# whole-day differences computed from it are not shifted by a time of day.
epoch = min(
    pd.to_datetime(df_train["date"]).min(),
    pd.to_datetime(df_test["date"]).min(),
).normalize()

In [None]:
# Build the training features from a fresh copy of the raw frame `df` rather
# than from `df_train` itself: the cell can then be re-run safely, because the
# already-aggregated result is never fed back into df_prep. On a first run
# `df_train` is still an unmodified copy of `df`, so the outcome is identical.
df_train = df_prep(df.copy(), epoch)
df_train.head()

In [None]:
# Run the same daily aggregation / feature step on the test set, using the
# shared epoch. NOTE: `df_test` is rebound to the aggregated result, so this
# cell is meant to run once after the test CSV has been loaded.
df_test = df_prep(df=df_test, epoch=epoch)
df_test.head()

In [None]:
# Persist the prepared train and test frames next to the input CSVs.
df_train.to_pickle(path="../input/train.pkl")
df_test.to_pickle(path="../input/test.pkl")

## Validation data prep

In [None]:
# Load the sample submission with its first column as a date-parsed index, then
# move that index back into a regular column and rename 'id' to 'date'.
df_validation = (
    pd.read_csv("../input/sampleSubmission.csv", parse_dates=True, index_col=0)
    .reset_index()
    .rename(columns={"id": "date"})
)

df_validation.head()

In [None]:
# Integer yyyymmdd bounds for the KNMI request, taken from the first and last
# validation dates.
min_date = int(df_validation["date"].dt.date.min().strftime("%Y%m%d"))
max_date = int(df_validation["date"].dt.date.max().strftime("%Y%m%d"))

# Fetch the weather for the validation period, give the KNMI codes the same
# readable column names as the training weather frame, and drop 'Station'.
df_val_KNMI = (
    KR.get_KNMI_DD(
        start=min_date,
        end=max_date,
        vars_=["FG", "FHX", "TN", "TX", "TG", "DR", "RH", "RHX", "VVN"],
    )
    .rename(
        columns={
            "FG": "mean windspeed",
            "FHX": "hightest hourly gust",
            "TN": "min temp",
            "TX": "max temp",
            "TG": "mean temp",
            "DR": "duration rain",
            "RH": "sum rain",
            "RHX": "highest hourly rain",
            "VVN": "minimum visibility",
        }
    )
    .drop(columns="Station")
)
df_val_KNMI.head()

In [None]:
def data_prep_subm(df, reference_day=None, school_holidays=None, national_holidays=None):
    """Add the calendar feature columns to a submission-style frame, in place.

    ``df['date']`` is converted to datetime and the columns ``Friday``,
    ``Weekend``, ``School holiday``, ``National holiday`` and
    ``Days from epoch`` are added to ``df`` itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``date`` column that ``pd.to_datetime`` can parse. It is
        modified in place.
    reference_day : datetime-like, optional
        Reference timestamp for ``Days from epoch`` (whole-day difference plus
        one). Defaults to the notebook-level ``epoch``.
    school_holidays : list-like, optional
        Values tested for membership to build ``School holiday``. Defaults to
        the notebook-level ``ser_holidays``.
    national_holidays : list-like of datetime-like, optional
        Dates used to build ``National holiday``. Defaults to the notebook-level
        ``df_nat_hol['date']``.

    Returns
    -------
    None
        The result is the mutated ``df``.
    """
    # Fall back to the notebook globals only when nothing is passed, so the
    # existing one-argument call keeps working unchanged.
    if reference_day is None:
        reference_day = epoch
    if school_holidays is None:
        school_holidays = ser_holidays
    if national_holidays is None:
        national_holidays = df_nat_hol['date']

    df["date"] = pd.to_datetime(df["date"])

    # Kept as a local Series so no temporary 'weekday' column has to be added
    # to the caller's frame and dropped again.
    weekday = df["date"].dt.dayofweek  # Monday=0 ... Sunday=6
    df['Friday'] = weekday == 4
    df['Weekend'] = weekday >= 5
    df['School holiday'] = df['date'].isin(school_holidays)

    # Compare calendar days as datetime64 on both sides; matching a datetime64
    # column against datetime.date objects is version-dependent and deprecated
    # in recent pandas.
    holiday_days = pd.DatetimeIndex(pd.to_datetime(national_holidays)).normalize()
    df['National holiday'] = df['date'].dt.normalize().isin(holiday_days)

    df['Days from epoch'] = (df['date'] - pd.Timestamp(reference_day)).dt.days + 1

In [None]:
# Add the calendar features to df_validation in place (the function returns None).
data_prep_subm(df_validation)

# Attach the KNMI weather columns to the validation dataset. A left merge keeps
# every submission row even when no weather record exists for its date; the
# default inner merge would silently drop such rows. The right-hand key column
# 'Date' duplicates 'date' and is removed afterwards.
df_validation = df_validation.merge(
    df_val_KNMI, how='left', left_on='date', right_on='Date'
).drop(columns='Date')

df_validation.head()

In [None]:
# Persist the prepared validation frame next to the other inputs.
df_validation.to_pickle(path="../input/validation.pkl")