In [1]:
import pandas as pd
import numpy as np
from causalimpact import CausalImpact

In [2]:
def custom_sort_key(s):
    """Return the second underscore-separated field of ``s`` as an integer.

    Intended as a ``key=`` function for sorting labels numerically, e.g.
    ``'sim_10_60'`` maps to ``10``.

    Raises:
        IndexError: if ``s`` contains no underscore.
        ValueError: if the second field is not an integer literal.
    """
    return int(s.split('_')[1])

In [2]:
def causalimpact_eval(dataset_name, dataset_type, forecast_horizon):
    """Fit one CausalImpact model per series and score its hold-out forecasts.

    Every column of the loaded data is modelled in turn: the column is moved
    to the front of the frame (CausalImpact takes the first column as the
    response) and all other columns are passed alongside it. The first
    ``length_of_series - forecast_horizon`` rows are the pre-period and the
    last ``forecast_horizon`` rows the post-period. The ``preds`` column of
    ``ci.inferences`` over the post-period is kept as the forecast.

    Side effects:
        * writes all forecasts to
          ``../results/benchmarks/predicted/<dataset_name>_causalimpact.csv``
          (one row per series, no header, no index);
        * writes per-series SMAPE and MASE values to
          ``../results/benchmarks/errors/all_{smape,mase}_errors_<dataset_name>_causalimpact.txt``;
        * writes the mean SMAPE and mean MASE, one per line, to
          ``../results/benchmarks/errors/mean_median_<dataset_name>_causalimpact.txt``;
        * prints the mean SMAPE and mean MASE.

    Args:
        dataset_name: base name (no extension) of the CSV file to load.
        dataset_type: ``"sim"`` or ``"calls911"``; selects the input layout.
        forecast_horizon: number of trailing rows held out as the forecast
            window. It is also used as the lag of the in-sample differences
            that scale the MASE.

    Returns:
        None.

    Raises:
        ValueError: if ``dataset_type`` is neither ``"sim"`` nor ``"calls911"``.
    """
    if dataset_type == "sim":
        sim_dir = "../datasets/text_data/sim/"
        data_row = pd.read_csv(sim_dir + dataset_name + ".csv")
        # The companion "_for_errors" file supplies the values used for scoring;
        # its first column is dropped before use.
        data_row_for_errors = pd.read_csv(
            sim_dir + dataset_name + "_for_errors.csv").iloc[:, 1:]
    elif dataset_type == "calls911":
        # Only these columns are scored; models are still fitted for every column.
        control = ["BRIDGEPORT", "BRYN ATHYN", "DOUGLASS", "HATBORO", "HATFIELD BORO",
                   "LOWER FREDERICK", "NEW HANOVER", "NORRISTOWN", "NORTH WALES", "SALFORD",
                   "SPRINGFIELD", "TRAPPE"]
        data_row = pd.read_csv(
            '../datasets/text_data/' + dataset_type + '/' + dataset_name + '.csv'
        ).iloc[:, 1:]
        data_row_for_errors = data_row.loc[:, control]
    else:
        # Fail early with a clear message instead of an unbound-variable error later.
        raise ValueError(
            "dataset_type must be 'sim' or 'calls911', got " + repr(dataset_type))

    length_of_series = len(data_row.index)
    split_point = length_of_series - forecast_horizon
    # Transposed so that each row is one series: A = hold-out window, B = history.
    data_row_A = data_row_for_errors.iloc[split_point:, :].T
    data_row_B = data_row_for_errors.iloc[:split_point, :].T

    errors_directory = '../results/benchmarks/errors/'
    errors_file_full_name_mean_median = (
        errors_directory + 'mean_median_' + dataset_name + '_causalimpact' + '.txt')
    SMAPE_file_full_name_all_errors = (
        errors_directory + 'all_smape_errors_' + dataset_name + '_causalimpact')
    MASE_file_full_name_all_errors = (
        errors_directory + 'all_mase_errors_' + dataset_name + '_causalimpact')
    output = '../results/benchmarks/predicted/' + dataset_name + '_causalimpact.csv'

    # Fit one model per column and collect its post-period predictions.
    y_pred_list = []
    for target in data_row.columns:
        ordered_columns = [target] + [col for col in data_row.columns if col != target]
        ci = CausalImpact(data_row.loc[:, ordered_columns],
                          [0, split_point - 1],
                          [split_point, length_of_series - 1])
        # .loc slicing is label-based and includes both end points.
        y_pred = ci.inferences.loc[split_point:(length_of_series - 1), 'preds']
        y_pred_list.append(y_pred)
    y_pred_df = pd.DataFrame(y_pred_list)
    y_pred_df.to_csv(output, index=False, header=False)

    y_pred_for_errors = y_pred_df.copy()
    if dataset_type == "calls911":
        # Label each prediction row with its source column, then keep only the
        # scored columns, in the order given by `control`.
        y_pred_for_errors['names'] = data_row.columns
        y_pred_for_errors = y_pred_for_errors.set_index('names').loc[control, :]

    no_of_series = len(data_row_B.index)

    # SMAPE: mean over the horizon of 2*|pred - actual| / (|pred| + |actual|).
    time_series_wise_SMAPE = 2 * np.abs(y_pred_for_errors - np.array(data_row_A)) /\
        (np.abs(y_pred_for_errors) + np.abs(np.array(data_row_A)))
    SMAPEPerSeries = np.mean(time_series_wise_SMAPE, axis=1)
    mean_SMAPE = np.mean(SMAPEPerSeries)
    mean_SMAPE_str = f"mean_SMAPE:{mean_SMAPE}"
    print(mean_SMAPE_str)
    np.savetxt(SMAPE_file_full_name_all_errors + '.txt', SMAPEPerSeries,
               delimiter=",", fmt='%f')

    # MASE: absolute forecast error scaled by the mean absolute in-sample
    # difference at lag `forecast_horizon`.
    history_values = np.asarray(data_row_B)
    actual_values = np.asarray(data_row_A)
    predicted_values = np.asarray(y_pred_for_errors)
    n_history = history_values.shape[1]
    lag_stop = max(n_history - forecast_horizon, 0)
    mase_vector = []
    for row in range(no_of_series):
        lagged_diff = (history_values[row, forecast_horizon:]
                       - history_values[row, :lag_stop])
        scale = np.mean(np.abs(lagged_diff))
        mase_vector.append(
            np.mean(np.abs(actual_values[row] - predicted_values[row]) / scale))

    mean_MASE = np.mean(mase_vector)
    mean_MASE_str = f"mean_MASE:{mean_MASE}"
    print(mean_MASE_str)

    np.savetxt(MASE_file_full_name_all_errors + '.txt', mase_vector,
               delimiter=",", fmt='%f')

    # Write both summary metrics in one pass, one per line. Writing them with a
    # separate 'w' and 'a' open and no newline fused them onto a single line.
    with open(errors_file_full_name_mean_median, 'w') as f:
        f.write('\n'.join([mean_SMAPE_str, mean_MASE_str]))


In [3]:
# Evaluate CausalImpact on the 911-calls benchmark, holding out the last 7 rows.
dataset_name, dataset_type, forecast_horizon = 'calls911_benchmarks', 'calls911', 7
causalimpact_eval(
    dataset_name=dataset_name,
    dataset_type=dataset_type,
    forecast_horizon=forecast_horizon,
)



mean_SMAPE:0.4605758534262359
mean_MASE:1.66116492063318


In [4]:
# Dataset names have the form 'sim_<a>_<b>_<l|nl>_<he|ho>'. Every combination of
# the four fields is enumerated, with the last field varying fastest (24 names).
# NOTE(review): presumably <a>/<b> are dataset sizes, l/nl = linear/non-linear and
# he/ho = heterogeneous/homogeneous - confirm against the data generator.
dataset_name_test = [
    f'sim_{first_size}_{second_size}_{form_tag}_{variant_tag}'
    for first_size in (10, 101, 500)
    for second_size in (60, 222)
    for form_tag in ('l', 'nl')
    for variant_tag in ('he', 'ho')
]
dataset_type = 'sim'
forecast_horizon = 12
for i in dataset_name_test:
    # Print the dataset name first so the metrics printed by the call are attributable.
    print(i)
    causalimpact_eval(dataset_name=i,
                      dataset_type=dataset_type,
                      forecast_horizon=forecast_horizon)

sim_10_60_l_he




mean_SMAPE:0.27958661942690627
mean_MASE:0.8977582814708288
sim_10_60_l_ho




mean_SMAPE:0.38903406371272575
mean_MASE:1.2867213326157017
sim_10_60_nl_he




mean_SMAPE:0.675987810719893
mean_MASE:1.2960567559265397
sim_10_60_nl_ho




mean_SMAPE:0.7255808673557371
mean_MASE:0.924315700099919
sim_10_222_l_he




mean_SMAPE:0.24873956441982722
mean_MASE:0.8806062199518229
sim_10_222_l_ho




mean_SMAPE:0.3274668434989577
mean_MASE:1.3546820103672685
sim_10_222_nl_he




mean_SMAPE:0.6565759689599837
mean_MASE:1.411481185972155
sim_10_222_nl_ho




In [3]:
# The index must be re-sorted before running the placebo test:
# sort first, then run the placebo test.
def custom_sort_key(s):
    """Return the second underscore-separated field of ``s`` as an integer.

    Intended as a ``key=`` function for sorting labels numerically, e.g.
    ``'sim_10_60'`` maps to ``10``.

    Raises:
        IndexError: if ``s`` contains no underscore.
        ValueError: if the second field is not an integer literal.
    """
    return int(s.split('_')[1])

def transform_sim(dataset_name, dataset_type):
    """Re-order saved CausalImpact predictions and write them to a new file.

    Reads ``<dataset_name>_test_actual.csv`` from
    ``../datasets/text_data/<dataset_type>/`` and the header-less prediction
    file ``<dataset_name>_causalimpact.txt`` from
    ``../results/benchmarks/predicted/``. The index of the actuals is copied
    onto the predictions, the rows are sorted with ``custom_sort_key``, and the
    result is saved comma-delimited as ``<dataset_name>_T_causalimpact.txt`` in
    the same predictions directory.

    Args:
        dataset_name: base name shared by the input and output files.
        dataset_type: sub-directory of ``../datasets/text_data/`` holding the actuals.

    Returns:
        None.
    """
    predicted_dir = '../results/benchmarks/predicted/'

    actual_path = "../datasets/text_data/" + dataset_type + "/" + dataset_name + "_test_actual.csv"
    y_true_df_A = pd.read_csv(actual_path)

    # NOTE(review): causalimpact_eval in this notebook saves its predictions as
    # '<dataset_name>_causalimpact.csv', not '.txt' - confirm which file is intended.
    prediction_path = predicted_dir + dataset_name + '_causalimpact.txt'
    y_pred_df = pd.read_csv(prediction_path, header=None)

    # NOTE(review): custom_sort_key calls .split('_') on each label, so this
    # presumably expects string labels such as 'x_<number>' - verify that the
    # actuals file really yields such an index.
    y_pred_df.index = y_true_df_A.index
    ordered_labels = sorted(y_pred_df.index, key=custom_sort_key)
    sorted_predictions = y_pred_df.loc[ordered_labels, :]

    sorted_path = predicted_dir + dataset_name + '_T_causalimpact.txt'
    np.savetxt(sorted_path, pd.DataFrame(sorted_predictions), delimiter=',')


In [1]:
# NOTE(review): leftover debugging magic; it only prints the session's input history.
%history

%history


In [6]:
# NOTE(review): the recorded run of this cell failed with FileNotFoundError because
# '../datasets/text_data/calls911/calls911_benchmarks_test_actual.csv' does not exist.
transform_sim(dataset_name='calls911_benchmarks', dataset_type='calls911')

FileNotFoundError: [Errno 2] No such file or directory: '../datasets/text_data/calls911/calls911_benchmarks_test_actual.csv'

mean_SMAPE:0.3700299471410633


In [42]:
# MASE


mean_MASE:1.6336656457292509
