In [32]:
import pickle

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import scipy.stats as st

INPUT_FILE = 'pickle/df_arrivals_aggregated.pkl'

In [33]:
# Time parameters
STEP_IN_SECONDS = 30
CUSTOMER_SPEED_PER_STEP = 0.5
NUMBER_OF_HOURS = 8


def number_of_steps_in_hour(step_in_seconds):
    """Return how many steps of ``step_in_seconds`` seconds fit in one hour.

    The result is a float (true division); callers truncate it as needed.
    """
    seconds_in_hour = 60 * 60
    return seconds_in_hour / step_in_seconds


# Truncated to an int because it is later used as a sample count.
steps_in_hour = int(number_of_steps_in_hour(STEP_IN_SECONDS))

In [34]:
def read_input_customer_distribution(input_file=INPUT_FILE):
    """Unpickle and return the object stored at ``input_file``.

    Args:
        input_file: Path of the pickle file to read; defaults to ``INPUT_FILE``.

    Returns:
        Whatever object was pickled (the rest of the notebook uses it as a
        DataFrame with a "value" column).
    """
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    with open(input_file, 'rb') as pickle_handle:
        return pickle.load(pickle_handle)

In [62]:
def cast_to_discrete_dist(dist):
    """Turn real-valued samples into non-negative integer counts.

    Every value that is not strictly positive becomes 0; the remaining values
    are rounded with the built-in ``round`` (ties go to the even integer).

    Args:
        dist: Iterable of numbers.

    Returns:
        A new list with one count per input value.
    """
    return [round(sample) if sample > 0 else 0 for sample in dist]


def generate_customers_dist(df, sigma, steps_in_hour, rng=None):
    """Sample a discrete per-step customer count for every row of ``df["value"]``.

    For each value, ``steps_in_hour`` samples are drawn from a normal
    distribution centred on ``value / steps_in_hour`` with standard deviation
    ``sigma``, then converted to non-negative integers with
    ``cast_to_discrete_dist``.

    Args:
        df: Object whose "value" column yields one customer total per row.
        sigma: Standard deviation of the per-step normal distribution.
        steps_in_hour: Number of samples drawn per row (must be an int).
        rng: Optional ``numpy.random.Generator`` for reproducible draws. When
            omitted, NumPy's global random state is used, as before.

    Returns:
        A tuple ``(bias_aggregated, customers_distribution)`` where
        ``bias_aggregated`` holds the sum of the discretised samples of each
        row and ``customers_distribution`` is the flat list of all per-step
        counts, in row order.
    """
    # Fall back to the global RNG so existing np.random.seed() calls still apply.
    draw_normal = np.random.normal if rng is None else rng.normal

    bias_aggregated = []
    customers_distribution = []

    for customers_in_hour in df["value"]:
        # Mean of the per-step distribution: spread the total evenly over the steps.
        mu = customers_in_hour / steps_in_hour
        bias_customers_in_hour = cast_to_discrete_dist(
            draw_normal(mu, sigma, steps_in_hour)
        )
        customers_distribution.extend(bias_customers_in_hour)
        bias_aggregated.append(sum(bias_customers_in_hour))

    return bias_aggregated, customers_distribution

In [36]:
def generate_multiple_customers_dist(df, runs, sigma, steps_in_hour, start_hour=8):
    """Repeat ``generate_customers_dist`` ``runs`` times and stack the results.

    Args:
        df: Input passed through to ``generate_customers_dist``.
        runs: Number of independent synthetic runs to generate.
        sigma: Standard deviation passed through to ``generate_customers_dist``.
        steps_in_hour: Number of steps per hour, passed through as well.
        start_hour: Hour label given to the first aggregated value of each
            run; subsequent values get consecutive labels. Defaults to 8,
            the value that was previously hard-coded.

    Returns:
        A DataFrame with columns "value" (aggregated customers), "run"
        (run number) and "hour", holding the rows of all runs concatenated.
        Each run keeps its own 0-based index, so index labels repeat.
    """
    df_step_list = []

    for run in range(runs):
        sample_customer, _ = generate_customers_dist(df, sigma, steps_in_hour)
        df_step = pd.DataFrame()
        df_step["value"] = sample_customer
        df_step["run"] = run
        df_step["hour"] = [start_hour + offset for offset in range(len(sample_customer))]
        df_step_list.append(df_step)

    return pd.concat(df_step_list)

In [37]:
def compute_confidence_interval(df_step_out, confidence=0.95):
    """Compute a Student-t confidence interval of the mean "value" per hour.

    The hours are taken from ``df_step_out`` itself (not from any notebook
    global) and processed in ascending order, which is the same order that
    ``groupby("hour")`` produces when the runs are averaged.

    Args:
        df_step_out: DataFrame with at least the columns "hour" and "value".
        confidence: Confidence level of the interval (default 0.95).

    Returns:
        A tuple ``(l_list, u_list)`` with the lower and upper bound of the
        interval for each hour.
    """
    l_list, u_list = [], []
    for _, values in df_step_out.groupby("hour")["value"]:
        mean = np.mean(values)
        sem = st.sem(values)
        if sem == 0:
            # All runs agree: scipy would return NaN for a zero scale, so
            # report a degenerate interval collapsed onto the mean instead.
            l, u = mean, mean
        else:
            l, u = st.t.interval(confidence, len(values) - 1, loc=mean, scale=sem)
        l_list.append(l)
        u_list.append(u)
    return l_list, u_list

In [38]:
def aggregate_runs(df_step_out, df):
    """Summarise the synthetic runs per hour and compare them with ``df``.

    Args:
        df_step_out: Stacked runs with columns "hour", "value" and "run".
        df: Reference data; its "value" column is copied into "real"
            (aligned on index labels, as pandas does for Series assignment).

    Returns:
        A DataFrame with the columns "hour", "step_avg" (mean over runs),
        "real", "lower" / "upper" (confidence bounds) and "error"
        (absolute difference between "step_avg" and "real" divided by "real").
    """
    hourly_mean = df_step_out.groupby(by=["hour"]).mean().reset_index()
    lower_bounds, upper_bounds = compute_confidence_interval(df_step_out)

    df_full = pd.DataFrame({
        "hour": hourly_mean["hour"],
        "step_avg": hourly_mean["value"],
    })
    df_full["real"] = df["value"]
    df_full["lower"] = lower_bounds
    df_full["upper"] = upper_bounds

    absolute_gap = (df_full["step_avg"] - df_full["real"]).abs()
    df_full["error"] = absolute_gap / df_full["real"]
    return df_full


In [39]:
# Create traces
def plot_distributions(df_full):
    """Build a line chart of the real vs. the averaged synthetic distribution.

    Args:
        df_full: DataFrame with the columns "hour", "real", "step_avg",
            "lower" and "upper".

    Returns:
        A plotly ``Figure`` with two line traces; the synthetic trace carries
        asymmetric error bars spanning "lower" to "upper".
    """
    hours = df_full["hour"]
    synthetic_mean = df_full["step_avg"]

    # Error bars are expressed as distances from the plotted mean.
    distance_above = np.array(df_full["upper"]) - np.array(synthetic_mean)
    distance_below = np.array(synthetic_mean) - np.array(df_full["lower"])

    real_trace = go.Scatter(
        x=hours,
        y=df_full["real"],
        mode='lines',
        name='True distribution',
    )
    synthetic_trace = go.Scatter(
        x=hours,
        y=synthetic_mean,
        mode='lines',
        name='Synthetic step distribution',
        error_y=dict(
            type='data',
            symmetric=False,
            array=distance_above,
            arrayminus=distance_below,
        ),
    )

    title_lines = [
        "Real customer distribution",
        "vs",
        "Synthetic aggregated per-step biased distribution",
    ]
    centered_title = {
        'text': '<br>'.join(title_lines),
        'y': 0.95,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
    }

    fig = go.Figure()
    fig.add_trace(real_trace)
    fig.add_trace(synthetic_trace)
    fig.update_layout(
        xaxis=dict(tickmode='linear', tick0=0, dtick=1),
        yaxis=dict(tickmode='linear', tick0=0, dtick=15),
        title=centered_title,
        xaxis_title="Hour",
        yaxis_title="Number of incoming customers",
        legend_title="Distribution",
    )
    return fig

In [120]:
# Fixed seed so that Restart & Run All reproduces the same synthetic runs.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

sigma = 0.4  # standard deviation of the per-step normal distribution
n_runs = 30  # number of independent synthetic runs to average

# Load the reference data, simulate the runs, aggregate them and plot the comparison.
df = read_input_customer_distribution()
df_step_out = generate_multiple_customers_dist(df, n_runs, sigma, steps_in_hour)
df_full = aggregate_runs(df_step_out, df)
fig = plot_distributions(df_full)
fig.show()

In [41]:
def plot_distribution_error(df_full):
    """Build a bar chart of the "error" column of ``df_full`` per hour.

    Args:
        df_full: DataFrame with the columns "hour" and "error".

    Returns:
        A plotly ``Figure`` containing a single bar trace.
    """
    error_bars = go.Bar(x=df_full["hour"], y=df_full["error"])

    centered_title = {
        'text': 'Relative error for customer distribution',
        'y': 0.95,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
    }

    fig = go.Figure()
    fig.add_trace(error_bars)
    fig.update_layout(
        xaxis=dict(tickmode='linear', tick0=0, dtick=1),
        yaxis=dict(tickmode='linear', tick0=0, dtick=0.1),
        title=centered_title,
        xaxis_title="Hour",
        yaxis_title="Relative error",
        legend_title="Store",
    )
    return fig

In [42]:
# Render the per-hour relative error of the aggregated runs (rebinds the notebook-level `fig`).
fig = plot_distribution_error(df_full)
fig.show()



In [43]:
# One extra synthetic draw: discard the per-row sums and keep only the flat per-step counts.
# Uses whatever `df` and `sigma` are currently bound in the kernel.
_, customers_distribution = generate_customers_dist(df, sigma, steps_in_hour)
# NOTE: the name is rebound from a list to a one-column DataFrame here.
customers_distribution = pd.DataFrame(customers_distribution)
customers_distribution

Unnamed: 0,0
0,1
1,1
2,1
3,1
4,1
...,...
1795,1
1796,1
1797,1
1798,0


In [121]:
# NOTE(review): .1416... equals 17 / 120 -- presumably 17 customers spread over
# steps_in_hour = 120 steps; TODO confirm where this value comes from.
mu = .14166666666666666
# Rebinds the notebook-level `sigma` (set to 0.4 in an earlier cell).
sigma = 0.3

# Draw one hour of continuous per-step samples, then discretise them.
real_distribution = np.random.normal(mu, sigma, steps_in_hour)
discrete_distribution = cast_to_discrete_dist(real_distribution)

In [122]:
# Histogram of the discretised (non-negative integer) per-step counts.
fig = px.histogram(x=discrete_distribution)
fig.update_layout(bargap=0.1)
fig.show()

In [123]:
# Histogram of the raw (continuous) normal samples before discretisation.
# `x` is a standalone array, so no data frame is passed: the notebook's `df`
# is unrelated to these samples and has a different length.
fig = px.histogram(x=real_distribution, nbins=30)
fig.update_layout(
    xaxis=dict(
        tickmode='linear',
        tick0=0,
        dtick=0.1,
    ),
)

