In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
#import mpld3
#mpld3.enable_notebook()

# Import local packages
from src.data_processing import load_csv_from_zip as lcfz

# Disable display truncation: with None, pandas shows every column and every row.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
%matplotlib inline

In [None]:
# Read the train and test CSVs out of the zip archive via the project helper.
# NOTE(review): presumably returns one DataFrame per requested file, in the
# order given — confirm in src.data_processing.
train, test = lcfz.read_csv_from_zip('./../data/input/bike-sharing-demand.zip', ['train.csv', 'test.csv'])

## Define all the helper functions

In [None]:
def make_datetime_index(df):
    """Replace the ``datetime`` column with a parsed datetime index.

    The ``datetime`` column is removed from ``df``, converted with
    ``pd.to_datetime`` and installed as the index. ``df`` is modified in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``datetime`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now indexed by the parsed timestamps.
    """
    timestamps = pd.to_datetime(df.pop('datetime'))
    df.set_index(timestamps, inplace=True)
    return df

In [None]:
def make_datetime_features(df):
    """Derive calendar columns from the datetime index.

    Adds ``month``, ``year``, ``day`` and ``hour`` columns (in that order) to
    ``df`` in place. Note that ``day`` holds the day of the week
    (Monday=0 ... Sunday=6), not the day of the month.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index exposes ``month``, ``year``, ``dayofweek`` and
        ``hour`` (i.e. a datetime index).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the four columns added.
    """
    stamps = df.index
    calendar_parts = (
        ('month', stamps.month),
        ('year', stamps.year),
        ('day', stamps.dayofweek),  # weekday number, not day of month
        ('hour', stamps.hour),
    )
    for column, values in calendar_parts:
        df[column] = values
    return df

In [None]:
def make_vacation_feature(df):
    """Add a binary ``vacations`` column based on hard-coded date ranges.

    Every row starts at 0; rows whose index falls inside one of the 2011-2012
    periods listed below are set to 1. The periods are applied with label-based
    ``.loc`` slicing on date strings, so both the first and the last day are
    included. ``df`` is modified in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by timestamps.
        NOTE(review): the date-string slices presumably need a sorted
        datetime index — confirm against the caller.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the ``vacations`` column added.
    """
    vacation_periods = (
        ('2011-04-15', '2011-04-25'),
        ('2011-06-25', '2011-08-21'),
        ('2011-12-22', '2012-01-02'),
        ('2012-03-31', '2012-04-09'),
        ('2012-06-23', '2012-08-26'),
        ('2012-12-22', '2012-12-31'),
    )

    df['vacations'] = 0
    for first_day, last_day in vacation_periods:
        df.loc[first_day:last_day, 'vacations'] = 1
    return df

In [None]:
def select_features(df):
    """Drop the columns that are not kept as features.

    Currently only ``temp`` is removed. ``df`` is modified in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``temp`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object without the ``temp`` column.

    Raises
    ------
    KeyError
        If ``temp`` is not a column of ``df``.
    """
    dropped_columns = ['temp']
    df.drop(dropped_columns, axis='columns', inplace=True)
    return df

In [None]:
def add_missing_rows(df):
    """Locate short gaps in the datetime index; row insertion is not implemented yet.

    A "short gap" is a step between consecutive index values that is longer
    than one hour but shorter than one day. The gaps are only detected here:
    no rows are added and ``df`` is returned unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by timestamps (e.g. the output of
        ``make_datetime_index``).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, unmodified.
    """
    # Step from each row to the previous one. The first row has no
    # predecessor, so its NaT is replaced by a zero-length step.
    delta_t = df.index.to_series().diff().fillna(pd.Timedelta('0 hour'))
    is_short_gap = (delta_t > pd.Timedelta('1 hours')) & (delta_t < pd.Timedelta('1 days'))
    short_gaps = delta_t[is_short_gap]  # labelled by the first timestamp after each gap
    # TODO: build the missing timestamps from `short_gaps` and add them to `df`.
    return df

In [None]:
def remove_outlier(df):
    """Drop outlier rows from ``df`` in place.

    A row is treated as an outlier when either

    * ``temp`` is above 20 while ``atemp`` is below 15, or
    * ``weather`` equals 4.

    The original code called ``Index.append`` without keeping its result;
    ``Index.append`` returns a new Index, so the ``weather == 4`` rows were
    never dropped. Both conditions are now combined into one mask.

    Rows are removed by index label, so ``df`` needs unique index labels for
    only the flagged rows to be dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``temp``, ``atemp`` and ``weather`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the outlier rows removed.
    """
    inconsistent_temp = (df['temp'] > 20) & (df['atemp'] < 15)
    weather_code_4 = df['weather'] == 4
    outlier_mask = inconsistent_temp | weather_code_4
    outlier_labels = df.index[outlier_mask.to_numpy()]
    df.drop(index=outlier_labels, inplace=True)
    return df

In [None]:
def target_to_log(df):
    """Apply ``log(1 + x)`` to the target columns.

    ``casual``, ``registered`` and ``count`` are each replaced by
    ``np.log1p`` of their values. ``df`` is modified in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``casual``, ``registered`` and ``count`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the three columns transformed.
    """
    for target in ('casual', 'registered', 'count'):
        df[target] = np.log1p(df[target])
    return df

In [None]:
def hour_to_cos(df):
    """Encode ``hour`` as cosine terms with 24-hour and 12-hour periods.

    Adds ``second_harm`` = cos(4*pi*hour/24) and then overwrites ``hour`` with
    cos(2*pi*hour/24). Both are computed from the original ``hour`` values.
    ``df`` is modified in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a numeric ``hour`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``hour`` replaced and ``second_harm`` set.
    """
    # Keep a handle on the raw hours: the 'hour' column is overwritten below.
    raw_hour = df['hour']
    df['second_harm'] = np.cos(4.0 * np.pi * raw_hour / 24.0)
    df['hour'] = np.cos(2.0 * np.pi * raw_hour / 24.0)
    return df

In [None]:
def hour_to_sin(df):
    """Encode ``hour`` as four phase-shifted sine harmonics.

    Adds ``fourth_harm``, ``third_harm`` and ``second_harm`` (in that order)
    and then overwrites ``hour``. Each term is
    ``sin(k * pi * hour / 24 + p * pi / 12)``, computed from the original
    ``hour`` values. ``df`` is modified in place.

    NOTE(review): ``second_harm`` uses a phase numerator of 4 while the other
    three terms use 3 — confirm this is intentional.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a numeric ``hour`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``hour`` replaced and the harmonic
        columns set.
    """
    # Keep a handle on the raw hours: the 'hour' column is overwritten last.
    raw_hour = df['hour']
    # (column, frequency multiplier k, phase numerator p)
    harmonics = (
        ('fourth_harm', 8.0, 3.0),
        ('third_harm', 6.0, 3.0),
        ('second_harm', 4.0, 4.0),
        ('hour', 2.0, 3.0),
    )
    for column, freq, phase in harmonics:
        df[column] = np.sin(freq * np.pi * raw_hour / 24.0 + phase * np.pi / 12.0)
    return df

## Make wrapper functions for easier calls

In [None]:
def train_prep(df):
    """Run the full preparation pipeline for the training set.

    Steps, in order: datetime index, calendar features, vacation flag,
    outlier removal, feature selection, log-transform of the targets.
    ``hour_to_sin`` and ``add_missing_rows`` are currently not part of the
    pipeline. The helpers modify ``df`` in place, so pass a copy to keep the
    raw frame intact.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw training frame.

    Returns
    -------
    pandas.DataFrame
        The prepared training frame.
    """
    steps = (
        make_datetime_index,
        make_datetime_features,
        make_vacation_feature,
        remove_outlier,  # reads 'temp', so it has to run before select_features drops it
        select_features,
        target_to_log,  # specific to the training set
    )
    for step in steps:
        df = step(df)
    return df

In [None]:
def test_prep(df):
    """Run the preparation pipeline for the test set.

    Steps, in order: datetime index, calendar features, vacation flag,
    feature selection. Unlike ``train_prep`` there is no outlier removal and
    no target transform. ``hour_to_sin`` is currently not part of the
    pipeline. The helpers modify ``df`` in place, so pass a copy to keep the
    raw frame intact.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw test frame.

    Returns
    -------
    pandas.DataFrame
        The prepared test frame.
    """
    steps = (
        make_datetime_index,
        make_datetime_features,
        make_vacation_feature,
        select_features,
    )
    for step in steps:
        df = step(df)
    return df

## Call the wrapper functions to create the "prepared" datasets

In [None]:
# Pass copies because the prep helpers modify their argument in place;
# this keeps the raw `train` / `test` frames untouched so the cell can be re-run.
train_prepared = train_prep(train.copy())
test_prepared = test_prep(test.copy())

In [None]:
# Preview the first rows of the prepared training set.
train_prepared.head()

In [None]:
# Preview the first rows of the prepared test set.
test_prepared.head()

## Save new datasets

In [None]:
# Write both prepared frames next to the raw input data.
# to_csv keeps the index by default, so the datetime index is written as the first column.
train_prepared.to_csv("./../data/input/train_prepared.csv")
test_prepared.to_csv("./../data/input/test_prepared.csv")

In [None]:
# Sanity check: column dtypes and non-null counts of the prepared training set.
train_prepared.info()

In [None]:
# Summary statistics of the prepared training set (targets are on the log1p scale here).
train_prepared.describe()