In [1]:
import os
import sys

current_dir =  os.getcwd()
parent_dir = os.path.dirname(current_dir)
sys.path.insert(0, parent_dir)

import math
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import torch
import pyro
from collab import foraging_toolkit as ft
import torch.nn.functional as F
import logging
import time
import dill
import copy

from scipy.signal import find_peaks

import plotly.io as pio
from plotly import express as px, graph_objects as go, figure_factory as ff
from pyro.nn import PyroModule
import pyro.distributions as dist
from pyro.infer.autoguide import (
    AutoNormal,
    AutoDiagonalNormal,
    AutoMultivariateNormal,
    init_to_mean,
    init_to_value,
)


from pyro.contrib.autoguide import AutoLaplaceApproximation
from pyro.infer import SVI, Trace_ELBO, MCMC, NUTS
from pyro.optim import Adam
import pyro.optim as optim
from pyro.infer import Predictive
from pyro.infer import MCMC, NUTS


In [2]:
path = "../data/central_park_birds_cleaned_2022/central_park_objects.pkl"
with open(path, "rb") as file:
    central_park_objects = dill.load(file)


In [3]:
def cp_prep_data_for_iference(obj):
    df = obj.how_farDF.copy()
    print("Initial dataset size:", len(df))
    df.dropna(inplace=True)
    print("After dropping NAs:", len(df))
    
    columns_to_normalize = [
        "distance",
        "proximity_standardized",
    ]

    for column in columns_to_normalize:
        df[column] = ft.normalize(df[column])
    
    return torch.tensor(df['distance'].values), torch.tensor(df['proximity_standardized'].values), torch.tensor(df['how_far_squared_scaled'].values), 
    

In [4]:
def model_sigmavar_proximity(distance, proximity, how_far):
    d = pyro.sample("d", dist.Normal(0, .6))
    p = pyro.sample("p", dist.Normal(0, .6))
    b = pyro.sample("b", dist.Normal(.5, .6))

    ds = pyro.sample("ds", dist.Normal(0, .6))
    ps = pyro.sample("ps", dist.Normal(0, .6))
    bs = pyro.sample("bs", dist.Normal(.2, .6))

    sigmaRaw = bs + ds * distance +  ps * proximity
    sigma = pyro.deterministic("sigma", F.softplus(sigmaRaw))
    mean = b + d * distance +  p * proximity

    with pyro.plate("data", len(how_far)):
        pyro.sample("obs", dist.Normal(mean, sigma), obs=how_far)

In [5]:
def get_samples(distance, proximity, how_far, model = model_sigmavar_proximity,
                num_svi_iters = 1000,
                num_mcmc_samples = 200, 
                num_samples = 1000):
    
    guide = AutoMultivariateNormal(model, init_loc_fn=init_to_mean)
    svi = SVI(model_sigmavar_proximity,
        guide,
        optim.Adam({"lr": .01}),
        loss=Trace_ELBO())

    iterations = []
    losses = []
    
    logging.info(f"Starting SVI inference with {num_svi_iters} iterations.")
    start_time = time.time()
    pyro.clear_param_store()
    for i in range(num_svi_iters):
        elbo = svi.step(distance, proximity, how_far)
        iterations.append(i)
        losses.append(elbo)
        if i % 200 == 0:
            logging.info("Elbo loss: {}".format(elbo))
    end_time = time.time()
    elapsed_time = end_time - start_time
    logging.info("SVI inference completed in %.2f seconds.", elapsed_time)

    fig = px.line(x=iterations, y=losses, title="ELBO loss", template="presentation")
    labels={"iterations": "iteration", "losses": "loss"}
    fig.update_xaxes(showgrid=False, title_text=labels["iterations"])
    fig.update_yaxes(showgrid=False, title_text=labels["losses"])
    fig.update_layout(width=700)
    fig.show()
        
    predictive = Predictive(model, guide=guide, 
                        num_samples=num_samples)
    
    proximity_svi = {k: v.flatten().reshape(num_samples, -1).detach().cpu().numpy()
            for k, v in predictive(distance, proximity, how_far).items()
            if k != "obs"}


    print ("SVI-based coefficient marginals:")
    for site, values in ft.summary(proximity_svi, ["d", "p"]).items():
            print("Site: {}".format(site))
            print(values, "\n")
            
    return {"svi_samples": proximity_svi, "svi_guide": guide, "svi_predictive": predictive}

In [6]:
ducks_objects = central_park_objects[0]
keys = [19, 46, 85]

duck_outcomes = {}

for key in keys:
    obj = ducks_objects[key]
    print (f"Working on ducks with optimal={key}")
    distance, proximity, how_far= cp_prep_data_for_iference(obj)
    ft.visualise_bird_predictors(distance, proximity, how_far, vis_sampling_rate=.05, titles = [f"Distance (ducks)", f"Proximity (ducks, optimal={key})"],
                                 x_axis_labels = ["distance", "proximity"])

    duck_outcomes[key] = get_samples(distance, proximity, how_far)
    

Working on ducks with optimal=19
Initial dataset size: 202152
After dropping NAs: 199001


2023-09-28 15:00:56,063:  Starting SVI inference with 1000 iterations.
2023-09-28 15:00:56,143:  Elbo loss: 147328.7742858756
2023-09-28 15:01:02,156:  Elbo loss: -196784.60144905013
2023-09-28 15:01:07,981:  Elbo loss: -208510.23886370676
2023-09-28 15:01:14,169:  Elbo loss: -208817.0505722433
2023-09-28 15:01:20,992:  Elbo loss: -207455.6592630716
2023-09-28 15:01:28,115:  SVI inference completed in 32.05 seconds.


SVI-based coefficient marginals:
Site: d
       mean       std        5%       25%       50%       75%       95%
0 -0.336413  0.022209 -0.371625 -0.351945 -0.336485 -0.321156 -0.299304 

Site: p
      mean       std        5%       25%       50%       75%       95%
0  0.11194  0.031015  0.062056  0.090972  0.111599  0.133411  0.161668 

Working on ducks with optimal=46
Initial dataset size: 202152
After dropping NAs: 199001


2023-09-28 15:01:39,056:  Starting SVI inference with 1000 iterations.
2023-09-28 15:01:39,193:  Elbo loss: 185342.0515621787
2023-09-28 15:01:46,643:  Elbo loss: -188210.98658590036
2023-09-28 15:01:53,328:  Elbo loss: -215860.05554586896
2023-09-28 15:02:00,336:  Elbo loss: -210315.40348184784
2023-09-28 15:02:06,963:  Elbo loss: -214664.42444593905
2023-09-28 15:02:14,185:  SVI inference completed in 35.13 seconds.


SVI-based coefficient marginals:
Site: d
       mean       std       5%       25%       50%       75%       95%
0 -0.341049  0.029336 -0.39121 -0.360307 -0.340574 -0.320565 -0.292205 

Site: p
       mean      std        5%       25%       50%       75%       95%
0  0.163708  0.02657  0.120099  0.146819  0.163627  0.180959  0.206197 

Working on ducks with optimal=85
Initial dataset size: 202152
After dropping NAs: 199001


2023-09-28 15:02:26,650:  Starting SVI inference with 1000 iterations.
2023-09-28 15:02:26,881:  Elbo loss: 169314.06077309969
2023-09-28 15:02:34,205:  Elbo loss: -141772.24683104135
2023-09-28 15:02:43,778:  Elbo loss: -191896.12427678925
2023-09-28 15:02:51,252:  Elbo loss: -191605.86757902236
2023-09-28 15:02:58,962:  Elbo loss: -203859.1234043407
2023-09-28 15:03:06,795:  SVI inference completed in 40.14 seconds.


SVI-based coefficient marginals:
Site: d
       mean       std        5%       25%       50%       75%       95%
0 -0.324351  0.029483 -0.373866 -0.344714 -0.323557 -0.304695 -0.277549 

Site: p
       mean       std       5%       25%       50%       75%       95%
0 -0.136242  0.029695 -0.18296 -0.157051 -0.136015 -0.116364 -0.087984 



In [7]:
def plot_coefs(outcomes, title, ann_start_y = 100, ann_break_y = 50, generate_object = False):

    keys = [19, 46, 85]
    samples = {}

    for key in keys:
        samples[key] =  outcomes[key]["svi_samples"]["p"].flatten()
    
    samples_df = pd.DataFrame(samples)
    samples_df_medians = samples_df.median(axis = 0 ).tolist()

    fig_coefs = px.histogram(samples_df, template = "presentation", 
                opacity = .4,
                labels={"variable": "optimal proximity", "value": "proximity coefficient"},
                        width  = 700,
                        title  = title,
                        marginal="rug"
                        )


    for i, color in enumerate(['#1f77b4', '#ff7f0e', '#2ca02c']):
            fig_coefs.add_vline(x=samples_df_medians[i], line_dash="dash", line_color=color, name=f"Median ({samples_df_medians[i]})")
        
        
            fig_coefs.add_annotation(
            x=samples_df_medians[i],
            y= ann_start_y + ann_break_y * i,  # Adjust the vertical position of the label
            text=f"{samples_df_medians[i]:.2f}",
            bgcolor="white",
            showarrow=False,
            opacity=0.8,
            )

    fig_coefs.update_layout(barmode='overlay', yaxis=dict(showticklabels=False, title=None, showgrid=False)) 

    if generate_object:
        return fig_coefs
    else:
        fig_coefs.show()


In [8]:
ducks_coefs_plot = plot_coefs(duck_outcomes, "Ducks: proximity coefficients", ann_start_y = 200, ann_break_y = 80, generate_object = True)

ducks_coefs_plot.show()

pio.write_image(ducks_coefs_plot, 'exported_figures/duck_coefs_plot.png', 
               engine = "kaleido", width=600, height=600, scale=5)


In [9]:


def calculate_R_squared_prox(distance, proximity,
                how_far, guide, subsample_size = 1000) :
    predictive = pyro.infer.Predictive(model_sigmavar_proximity, guide=guide, num_samples=1000)
    
    random_indices = np.random.choice(len(distance), size=subsample_size, replace=False)
    distance_sub = distance[random_indices]
    proximity_sub = proximity[random_indices]
    how_far_sub = how_far[random_indices]
    
    predictions = predictive(distance_sub, proximity_sub, how_far_sub)

    simulated_outcome = ( predictions['b'] + predictions['p'] * proximity +
                      predictions['d'] * distance )

    mean_sim_outcome = simulated_outcome.mean(0).detach().cpu().numpy()

    observed_mean = torch.mean(how_far)

    tss = torch.sum((how_far - observed_mean) ** 2)
    rss = torch.sum((how_far - mean_sim_outcome) ** 2)

    r_squared = 1 - (rss / tss)

    return r_squared.float().item()

    


In [10]:
for key in keys:
    guide = duck_outcomes[key]["svi_guide"]
    print (f"R^2 for ducks with optimal={key}:", calculate_R_squared_prox(distance, proximity, how_far, guide))
    
# interestingly, knowing where they won't go is useful

R^2 for ducks with optimal=19: 0.15786112844944
R^2 for ducks with optimal=46: 0.22453591227531433
R^2 for ducks with optimal=85: 0.3238811492919922


In [11]:
sps_objects = central_park_objects[1]
keys = [19, 46, 85]

sps_outcomes = {}

for key in keys:
    obj = sps_objects[key]
    print (f"Working on sparrows et al. with optimal={key}")
    distance, proximity, how_far= cp_prep_data_for_iference(obj)
    ft.visualise_bird_predictors(distance, proximity, how_far, vis_sampling_rate=.05, titles = [f"Distance (sparrows et al.)", f"Proximity (sparrows et al., optimal={key})"],
                                 x_axis_labels = ["distance", "proximity"])

    sps_outcomes[key] = get_samples(distance, proximity, how_far)
    

Working on sparrows et al. with optimal=19
Initial dataset size: 120078
After dropping NAs: 119050


2023-09-28 15:03:49,197:  Starting SVI inference with 1000 iterations.
2023-09-28 15:03:49,236:  Elbo loss: 100896.25243931587
2023-09-28 15:03:53,741:  Elbo loss: -145912.89913494096
2023-09-28 15:03:59,403:  Elbo loss: -157884.7682005655
2023-09-28 15:04:03,565:  Elbo loss: -157960.38447851944
2023-09-28 15:04:08,653:  Elbo loss: -166776.85282585162
2023-09-28 15:04:12,741:  SVI inference completed in 23.54 seconds.


SVI-based coefficient marginals:
Site: d
       mean       std        5%       25%       50%       75%       95%
0 -0.161207  0.022055 -0.198322 -0.175222 -0.161521 -0.145886 -0.126405 

Site: p
       mean       std        5%      25%       50%       75%       95%
0  0.067309  0.024498  0.026339  0.05139  0.067215  0.083687  0.107249 

Working on sparrows et al. with optimal=46
Initial dataset size: 120078
After dropping NAs: 119050


2023-09-28 15:04:20,820:  Starting SVI inference with 1000 iterations.
2023-09-28 15:04:20,859:  Elbo loss: 123671.8352714399
2023-09-28 15:04:25,384:  Elbo loss: -135786.6402176347
2023-09-28 15:04:29,845:  Elbo loss: -160007.4599943592
2023-09-28 15:04:34,211:  Elbo loss: -164399.90990289336
2023-09-28 15:04:38,683:  Elbo loss: -169161.04101027222
2023-09-28 15:04:44,051:  SVI inference completed in 23.23 seconds.


SVI-based coefficient marginals:
Site: d
       mean       std        5%       25%       50%       75%       95%
0 -0.161426  0.027701 -0.205749 -0.180731 -0.161801 -0.140798 -0.116616 

Site: p
       mean       std        5%       25%      50%       75%       95%
0  0.035524  0.021297  0.000863  0.020903  0.03506  0.050764  0.070176 

Working on sparrows et al. with optimal=85
Initial dataset size: 120078
After dropping NAs: 119050


2023-09-28 15:04:52,844:  Starting SVI inference with 1000 iterations.
2023-09-28 15:04:52,886:  Elbo loss: 106492.88385005113
2023-09-28 15:04:57,296:  Elbo loss: -142253.4569033924
2023-09-28 15:05:01,883:  Elbo loss: -153419.4287715507
2023-09-28 15:05:06,968:  Elbo loss: -151694.36204596015
2023-09-28 15:05:12,716:  Elbo loss: -166125.3260322371
2023-09-28 15:05:17,818:  SVI inference completed in 24.97 seconds.


SVI-based coefficient marginals:
Site: d
       mean       std        5%       25%       50%       75%      95%
0 -0.141483  0.028304 -0.187283 -0.161065 -0.141862 -0.122277 -0.09445 

Site: p
       mean       std        5%     25%       50%       75%       95%
0 -0.111827  0.022111 -0.148409 -0.1271 -0.112995 -0.096663 -0.075075 



In [12]:
sps_coefs_plot = plot_coefs(sps_outcomes, "Sparrows et al.: proximity coefficients", ann_start_y = 200, ann_break_y = 80, generate_object = True)

sps_coefs_plot.show()

pio.write_image(sps_coefs_plot, 'exported_figures/sps_coefs_plot.png', 
               engine = "kaleido", width=600, height=600, scale=5)