In [None]:
!pip install skforecast
import pandas as pd
from datetime import datetime
import math
from sklearn.metrics.pairwise import manhattan_distances # manhattan_distances(X, Y=None, *, sum_over_features=True)[source]
from datetime import datetime
import math
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.ForecasterAutoregCustom import ForecasterAutoregCustom
from skforecast.ForecasterAutoregDirect import ForecasterAutoregDirect
from skforecast.model_selection import grid_search_forecaster
from skforecast.model_selection import backtesting_forecaster
from skforecast.utils import save_forecaster
from skforecast.utils import load_forecaster
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import shutil
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
import os
import warnings
warnings.filterwarnings('ignore', category=UserWarning)

def mkdir(d):
    """Create directory `d`, including missing parents, if it does not exist yet.

    Args:
        d: path of the directory to create.

    Raises:
        FileExistsError: if `d` exists but is not a directory.

    `exist_ok=True` replaces the separate existence check, so there is no
    window between "check" and "create" in which another process could create
    the directory and make the call fail.
    """
    os.makedirs(d, exist_ok=True)
        
# Dates flagged by the `holiday` feature, as date-only "YYYY-MM-DD" strings in
# ascending order.
# NOTE(review): several entries (e.g. 2020-09-22, 2020-12-21, 2021-03-20,
# 2021-06-20/21, 12-24, 12-26, 12-31) are not official French public holidays;
# presumably they are deliberate "special day" markers - confirm the source.
french_holidays = [
    # 2020
    "2020-07-14", "2020-08-15", "2020-09-22", "2020-11-01", "2020-11-11",
    "2020-12-21", "2020-12-24", "2020-12-25", "2020-12-26", "2020-12-31",
    # 2021
    "2021-01-01", "2021-03-20", "2021-04-02", "2021-04-04", "2021-04-05",
    "2021-05-01", "2021-05-08", "2021-05-13", "2021-05-23", "2021-05-24",
    "2021-05-30", "2021-06-20", "2021-06-21", "2021-07-14", "2021-08-15",
    "2021-09-22", "2021-11-01", "2021-11-11", "2021-12-21", "2021-12-24",
    "2021-12-25", "2021-12-26", "2021-12-31",
]



In [None]:
# Locations of the input CSV files
train_path = '../input/estationdata/public_data/train.csv'
test_path = '../input/estationdata/public_data/test.csv'

# Only observations strictly after this timestamp are kept for training
# (data before the corona restrictions is removed).
corona_date = datetime.strptime("2020-10-18 00:00:00", '%Y-%m-%d %H:%M:%S')

# Train: parse the timestamps, drop the unused Postcode column, keep recent rows
train = pd.read_csv(train_path, sep=",")
train = train.assign(date=pd.to_datetime(train['date'])).drop(columns=['Postcode'])
train = train.loc[train['date'] > corona_date]

# Test: same parsing and column drop, no date filter
test = pd.read_csv(test_path, sep=",")
test = test.assign(date=pd.to_datetime(test['date'])).drop(columns=['Postcode'])

In [None]:
stations = train['Station'].unique()


def _regularise_station(station_df, station):
    """Put one station's rows on a gap-free 15-minute grid.

    Rows inserted by `asfreq` get the station name and the station's most
    frequent area / latitude / longitude; every row that contained at least
    one missing value before that filling is flagged with `imputed = 1`.
    """
    gridded = station_df.set_index('date').asfreq('15min')
    # Evaluated before any filling, so it marks rows that had a gap in any column
    has_gap = gridded.isnull().any(axis=1)

    # Station-level constants: the most frequent observed value fills the holes
    fill_values = {'Station': station}
    for column in ('area', 'Latitude', 'Longitude'):
        fill_values[column] = gridded[column].value_counts().index.tolist()[0]
    for column, value in fill_values.items():
        gridded[column] = gridded[column].fillna(value)

    gridded["imputed"] = np.where(has_gap, 1, 0)
    # Bring the timestamp back as a regular column (the index is discarded on concat)
    gridded['date'] = gridded.index
    return gridded


# Let's add all the missing datapoints back into the dataframe and fill the known nan's (station, area...)
dfs = [_regularise_station(train[train['Station'] == station], station) for station in stations]

train = pd.concat(dfs, ignore_index=True).sort_values(by=['date'])

# Parse the date-only holiday strings once so they can be compared with calendar days.
holiday_days = pd.to_datetime(french_holidays)

for df in [train, test]:
    # Position of the timestamp within its day, in 15-minute slots (96 per day)
    df['tod'] = (df.date.dt.hour * 60 + df.date.dt.minute) / 15
    df['dow'] = df.date.dt.dayofweek
    df['month'] = df.date.dt.month
    df['dayofyear'] = df.date.dt.dayofyear

    # Years are encoded as small integers; any other year is left unchanged
    df['year'] = df.date.dt.year.replace({2020: 0, 2021: 1, 2022: 2})

    # Cyclical encodings so that the end of a period sits next to its start
    df['tod_sin'] = np.sin(2 * np.pi * df['tod']/96.0)
    df['tod_cos'] = np.cos(2 * np.pi * df['tod']/96.0)

    df['dow_sin'] = np.sin(2 * np.pi * df['dow']/7.0)
    df['dow_cos'] = np.cos(2 * np.pi * df['dow']/7.0)

    df['month_sin'] = np.sin(2 * np.pi * df['month']/12.0)
    df['month_cos'] = np.cos(2 * np.pi * df['month']/12.0)

    df['dayofyear_sin'] = np.sin(2 * np.pi * df['dayofyear']/365.0)
    df['dayofyear_cos'] = np.cos(2 * np.pi * df['dayofyear']/365.0)

    # Compare the calendar day, not the full timestamp: matching raw timestamps
    # against date-only strings can only ever flag the 00:00 row of a holiday.
    # NOTE(review): assumes `date` is timezone-naive - confirm against the CSVs.
    df['holiday'] = df['date'].dt.normalize().isin(holiday_days).astype(int)


# Test rows are never gap-filled, so none of them is marked as imputed
test['imputed'] = 0
train.head()

In [None]:
# The four plug states predicted for every station and timestamp
targets = ["Available", "Charging", "Passive", "Other"]

# Features that vary over time, used by the per-station forecasters
exog_variables_local = ['tod_sin', 'tod_cos',
                        'dow_sin', 'dow_cos',
                        'month_sin', 'month_cos',
                        'dayofyear_sin', 'dayofyear_cos',
                        'year', 'imputed', 'holiday',
                        'trend']

# Features that are constant per station
exog_variables_global = ['Latitude', 'Longitude']

val_local = False # we can set this flag to true to split our dataset into a train and validation set for local evaluation
if val_local: # train val split
    split_date = datetime.strptime("2021-02-14 00:00:00", '%Y-%m-%d %H:%M:%S')
    # .copy() gives the validation frame its own data, so the column edits
    # below do not act on a slice of `train` (SettingWithCopyWarning).
    test = train[train['date'] > split_date].copy()
    train = train[train['date'] <= split_date]
    # Keep the ground truth under "<target>_GT" and remove the raw targets so
    # the validation frame looks like the real test set.
    for target in targets:
        test[target + '_GT'] = test[target]
    test = test.drop(targets, axis=1)
    

### Impute

In [None]:
# impute train
# Boolean masks separating original rows from rows flagged as gap-filled
where_not_null = train['imputed'].eq(0)
where_null = train['imputed'].eq(1)

# Let's drop na, it works better than any imputation method
train = train.dropna()

'''
# Methods that did not work for imputation

#train[targets] = train[targets].fillna(train.groupby(['date', 'area'])[targets].transform('mean')) # Fill by mean
#train[targets] = train[targets].fillna(train.groupby(['date'])[targets].transform('median')) # Fill by mean

#for station in stations[:]:
#    at_train_station = train['Station'] == station
#    train[at_train_station] = train[at_train_station].fillna(method='bfill')
    #train[at_train_station] = train[at_train_station].interpolate(method='spline', order=5)
    
#train = train.fillna(method='ffill')
#train = train.interpolate(method='nearest')
#train = train.interpolate(method='spline', order=5)
#train = train.interpolate(method='time')
'''

# Sanity check: every column should report zero missing values now
train.isna().sum()

### Train & Predict

#### XGBRegressor

In [None]:
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestClassifier
from statsmodels.tsa.arima.model import ARIMA

def train_predict_for_station(train_df_single_station, predict_df_single_station, num, val_local,
                              smoothing_window=10, lags=20, arima_order=(2, 1, 1)):
    """Fit one XGBoost forecaster and one ARIMA model per target for a single station.

    Each target series is smoothed with a rolling mean before fitting. The
    forecaster additionally receives the columns listed in the module-level
    `exog_variables_local` as exogenous features; ARIMA sees only the smoothed
    series.

    Args:
        train_df_single_station: training rows of one station; must contain the
            four target columns and the `exog_variables_local` columns.
        predict_df_single_station: rows to predict for the same station; must
            contain the `exog_variables_local` columns. Its length sets the
            forecast horizon.
        num: unused; kept for backward compatibility with existing callers.
        val_local: unused; kept for backward compatibility with existing callers.
        smoothing_window: rolling-mean window; the same number of leading rows
            is dropped from the series and the exogenous features (default 10).
        lags: number of lags given to `ForecasterAutoreg` (default 20).
        arima_order: `(p, d, q)` order passed to `ARIMA` (default `(2, 1, 1)`).

    Returns:
        Tuple `(predictions_xgb_regressor, predictions_arima)`: two lists with
        one forecast per target, in the order Available, Charging, Passive, Other.
    """
    prediction_interval_length = len(predict_df_single_station)
    regressor = xgb.XGBRegressor(n_estimators=1)
    forecaster = ForecasterAutoreg(regressor=regressor, lags=lags)

    # Exogenous features are the same for every target, so slice them once.
    train_exog = train_df_single_station[exog_variables_local].iloc[smoothing_window:]
    predict_exog = predict_df_single_station[exog_variables_local]

    predictions_xgb_regressor = []
    predictions_arima = []

    for target in ['Available', "Charging", "Passive", "Other"]:
        # Smooth once and reuse for both models; the leading rows are dropped
        # so the series stays aligned with `train_exog`.
        smoothed = train_df_single_station[target].rolling(smoothing_window).mean().iloc[smoothing_window:]

        # predict with xgboost
        forecaster.fit(smoothed, exog=train_exog)
        predictions_xgb_regressor.append(
            forecaster.predict(prediction_interval_length, exog=predict_exog))

        # predict with arima (fitted on a plain 0..n-1 index)
        fitted_arima = ARIMA(smoothed.reset_index(drop=True), order=arima_order).fit()
        predictions_arima.append(fitted_arima.forecast(prediction_interval_length))

    return predictions_xgb_regressor, predictions_arima

In [None]:
for i, station in enumerate(stations):
    at_train_station = train['Station'] == station
    at_test_station = test['Station'] == station

    predictions_xgb_regressor, predictions_arima = train_predict_for_station(
        train[at_train_station], test[at_test_station], i, val_local)

    # put preds into dataframe: one "<target>_<model>" column per forecast,
    # re-indexed onto this station's rows of `test`
    index = test[at_test_station].index
    for suffix, per_target_forecasts in (('xgb_regressor', predictions_xgb_regressor),
                                         ('arima', predictions_arima)):
        for name, forecast in zip(('Available', 'Charging', 'Passive', 'Other'), per_target_forecasts):
            test.loc[at_test_station, f'{name}_{suffix}'] = pd.Series(forecast.values, index=index)

    print("Done training station", station)


#### XGB Classifier

In [None]:
# Create Classes from targets
from tqdm.notebook import tqdm
tqdm.pandas()

# Encode every training row as a 4-character string of its plug counts,
# in the order Available, Charging, Passive, Other (e.g. "2100").
count_digits = [train[name].astype(int).astype(str)
                for name in ('Available', 'Charging', 'Passive', 'Other')]
concat = count_digits[0] + count_digits[1] + count_digits[2] + count_digits[3]

# Keep only combinations whose counts add up to 3
unique_combos = [combo for combo in concat.unique()
                 if sum(int(digit) for digit in combo) == 3]

# All 20 ways of splitting 3 over the four states must occur in the data
assert len(unique_combos) == 20

# Descending lexicographic order defines the class index of each combination
unique_combos = sorted(unique_combos, reverse=True)
class_dict = dict(zip(unique_combos, range(len(unique_combos))))
class_dict

def targets_to_class(target_values):
    """Return the class index of a sequence of per-target counts.

    The counts are concatenated into a string key (e.g. [2, 1, 0, 0] -> "2100")
    and looked up in the module-level `class_dict`; an unknown key raises
    `KeyError`.
    """
    return class_dict[''.join(map(str, target_values))]
    
# `concat` already holds each row's combination string (same index as `train`),
# so the class is a single vectorised dictionary lookup rather than a Python
# call per row. The result equals mapping every row through `targets_to_class`.
train['class'] = concat.map(class_dict)

# Keep the failure loud: a row-wise lookup would have raised KeyError here.
unknown_combos = concat[train['class'].isna()].unique()
if len(unknown_combos) > 0:
    raise KeyError(f"target combinations without a class: {sorted(unknown_combos)}")

In [None]:
def _one_hot_keep_original(frame, column, expected_columns):
    """One-hot encode `column` of `frame` while keeping the original column.

    Args:
        frame: DataFrame to encode (not modified).
        column: name of the categorical column.
        expected_columns: dummy column names that must exist in the result.

    Returns:
        A new DataFrame with the dummy columns, the original `column`
        re-attached, and an all-zero column for every expected dummy whose
        category does not occur in `frame`.
    """
    original_values = frame[column]
    encoded = pd.get_dummies(frame, columns=[column])
    encoded[column] = original_values

    # A category absent from this frame produces no dummy column; add it as
    # zeros so selecting `expected_columns` cannot fail with a KeyError.
    missing = [name for name in expected_columns if name not in encoded.columns]
    if missing:
        zero_block = pd.DataFrame(0, index=encoded.index, columns=missing)
        encoded = pd.concat([encoded, zero_block], axis=1)
    return encoded


# one-hot-encode stations
station_columns = [f'Station_{station}' for station in stations]
train = _one_hot_keep_original(train, 'Station', station_columns)
test = _one_hot_keep_original(test, 'Station', station_columns)

# one-hot-encode areas
area_columns = [f'area_{area}' for area in train['area'].unique()]
train = _one_hot_keep_original(train, 'area', area_columns)
test = _one_hot_keep_original(test, 'area', area_columns)

# Full feature list used by the classifier
exog_variables = exog_variables_local + exog_variables_global + station_columns + area_columns

train.head()

In [None]:
# Feature matrix and labels; the training data doubles as the evaluation set
train_features = train[exog_variables]
train_labels = train['class']

classifier = xgb.XGBClassifier(n_estimators=1)
classifier.fit(train_features, train_labels, eval_set=[(train_features, train_labels)])

# One predicted class index per test row
predictions = classifier.predict(test[exog_variables])
test['predicted_class'] = predictions

def class_to_target(clazz: int, target: str) -> int:
    """Return the count of `target` encoded in class index `clazz`.

    Args:
        clazz: class index, i.e. a value of the module-level `class_dict`.
        target: one of the names in the module-level `targets` list.

    Returns:
        The digit of the class's combination string at the position of `target`.

    Raises:
        ValueError: if `clazz` is not a value of `class_dict` or `target` is
            not in `targets`.
    """
    for combo, class_index in class_dict.items():
        if class_index == clazz:
            return int(combo[targets.index(target)])
    raise ValueError(f"{clazz} is not a known class index")


# Decode once per class instead of once per row: build, for every target, a
# small class-index -> count table and apply it with a vectorised map.
for position, t in enumerate(targets):
    count_by_class = {class_index: int(combo[position]) for combo, class_index in class_dict.items()}
    test[t + "_xgb_classifier"] = test['predicted_class'].map(count_by_class)

### Ensembling the predictions and scaling them

In [None]:
# Per-model prediction columns, each in the order Available, Charging, Passive, Other
xgbr_preds = [f'{name}_xgb_regressor' for name in ('Available', 'Charging', 'Passive', 'Other')]
arima_preds = [f'{name}_arima' for name in ('Available', 'Charging', 'Passive', 'Other')]
xgbc_preds = [f'{name}_xgb_classifier' for name in ('Available', 'Charging', 'Passive', 'Other')]

def scale_nums_to_sum(nums, station, value=3):
    """Rescale `nums` proportionally so that they add up to `value`.

    Args:
        nums: sequence of numbers to rescale.
        station: unused; kept so callers that pass the station keep working.
        value: total the rescaled numbers should add up to (default 3).

    Returns:
        A list with one rescaled number per entry of `nums`. When `nums` sums
        to zero there is nothing to scale and a list of zeros of the same
        length is returned.
    """
    total = np.sum(nums)  # computed once instead of once per element
    if total == 0:
        return [0] * len(nums)
    return [num / total * value for num in nums]


def _clamp_columns(df, cols, lower=0, upper=3):
    """Clamp the columns `cols` of `df` in place to the range [lower, upper]."""
    for col in cols:
        # Masks come from `df` itself, so the function works on any frame
        # passed in rather than only on a module-level `test` frame.
        df.loc[df[col] < lower, col] = lower
        df.loc[df[col] > upper, col] = upper


def _rescale_rows(df, cols):
    """Rescale `cols` of every row of `df` in place with `scale_nums_to_sum`."""
    if df.empty:
        return  # nothing to rescale; avoids indexing an empty result array
    scaled = df.apply(lambda row: scale_nums_to_sum([row[col] for col in cols],
                                                    row["Station"]), axis=1)
    scaled = np.array(scaled.to_list())
    for position, col in enumerate(cols):
        df[col] = scaled[:, position]


def round_and_rescale(df, cols):
    """Clamp `cols` to [0, 3], round them, then rescale each row to sum to 3.

    Because rescaling happens last, the results are generally not integers.

    Args:
        df: DataFrame holding `cols` and a "Station" column; modified in place.
        cols: names of the columns to process.

    Returns:
        The same `df` object, for convenient reassignment.
    """
    cols = list(cols)
    _clamp_columns(df, cols)
    df[cols] = df[cols].apply(np.round)
    _rescale_rows(df, cols)
    return df


def rescale_and_round(df, cols):
    """Clamp `cols` to [0, 3], rescale each row to sum to 3, then round.

    Because rounding happens last, the results are whole numbers, but a row is
    not guaranteed to sum to exactly 3 afterwards.

    Args:
        df: DataFrame holding `cols` and a "Station" column; modified in place.
        cols: names of the columns to process.

    Returns:
        The same `df` object, for convenient reassignment.
    """
    cols = list(cols)
    _clamp_columns(df, cols)
    _rescale_rows(df, cols)
    df[cols] = df[cols].apply(np.round)
    return df

# Bring both regression-style forecasts onto the 0..3 plug scale first
test = rescale_and_round(test, xgbr_preds)
test = round_and_rescale(test, arima_preds)

# Weighted blend of the three models: ARIMA 0.4, XGB regressor 0.35, XGB classifier 0.25
for position, target in enumerate(targets):
    arima_part = test[arima_preds[position]] * 0.4
    xgbr_part = test[xgbr_preds[position]] * 0.35
    xgbc_part = test[xgbc_preds[position]] * 0.25
    test[target] = arima_part + xgbr_part + xgbc_part

# Final pass over the blended targets: rescale each row, then round
test = rescale_and_round(test, targets)

In [None]:
from sklearn.metrics import mean_absolute_error
import os

if val_local: # Let's check how we did on the validation set
    # Score only complete rows. The station filter is built on the filtered
    # frame, so the mask and the frame always share the same index.
    scored = test.dropna()

    station_maes = []
    for station in stations:
        station_rows = scored[scored['Station'] == station]
        target_maes = [
            mean_absolute_error(station_rows[target], station_rows[target + '_GT'])
            for target in targets
        ]
        # A station's score is the sum of its four per-target errors
        station_maes.append(np.sum(target_maes))

    print("All stations mea:", np.mean(station_maes))
    plt.figure(figsize=(15,20))
    plt.barh(stations, station_maes)
    plt.show()

else: # Let's make a submission!
    # Area-level and global predictions are the sums of the station-level ones
    pred_area = test.groupby(['date', 'area'], as_index=False)[targets].sum()
    pred_global = test.groupby('date', as_index=False)[targets].sum()

    mkdir("result")

    test[["date","area","Station","Available","Charging","Passive","Other"]].reset_index(drop=True).to_csv("./result/station.csv", index=False)
    # Write the aggregated frames: writing `test` here would put one row per
    # station into the area and global files.
    pred_area[["date","area","Available","Charging","Passive","Other"]].to_csv("./result/area.csv", index=False)
    pred_global[["date","Available","Charging","Passive","Other"]].to_csv("./result/global.csv", index=False)
    shutil.make_archive("./result",'zip', "./result")
