In [None]:
import sys

sys.path.insert(0, "..")

import random
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.utils import resample

import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
import dill
import random
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import torch
import pyro
import foraging_toolkit as ft


import torch.nn.functional as F
import pyro.distributions as dist
import pyro.optim as optim
from pyro.nn import PyroModule
from pyro.infer.autoguide import (
    AutoNormal,
    AutoDiagonalNormal,
    AutoMultivariateNormal,
    init_to_mean,
    init_to_value,
)
from pyro.contrib.autoguide import AutoLaplaceApproximation
from pyro.infer import SVI, Trace_ELBO
from pyro.optim import Adam
from pyro.infer import Predictive
from pyro.infer import MCMC, NUTS

import os
import logging

# Log bare messages (no level/name prefix) at INFO and above.
logging.basicConfig(level=logging.INFO, format="%(message)s")

# True when the CI environment variable is set (any value).
smoke_test = os.environ.get("CI") is not None

In [None]:
# Run once and dill the result (see the comments in the following cells).
# Bounds of the subset handed to the loader; subset_starts is reused later
# for the time_shift argument of ft.derive_predictors.
subset_starts = 420
subset_ends = 480

# 1800 seconds overall

locust = ft.load_and_clean_locust(
    path="locust_data/15EQ20191202_tracked.csv",
    desired_frames=900,
    grid_size=45,
    rewards_x=[0.68074, -0.69292],
    rewards_y=[-0.03068, -0.03068],
    subset_starts=subset_starts,
    subset_ends=subset_ends,
)

# The loader's result is indexed by "subset" and "all_frames".
loc_subset, loc_all = locust["subset"], locust["all_frames"]

# Displayed as the cell output.
loc_all.birdsDF.shape

In [None]:
# Run once and dill (next cells) to get a re-usable processed dataset;
# see further comments below.
# NOTE: loc_all is rebound to the result, so re-running this cell feeds the
# already-processed object back into ft.derive_predictors.
loc_all = ft.derive_predictors(
    loc_all,
    rewards_decay=0.4,
    visibility_range=90,
    getting_worse=0.001,
    optimal=2.11,
    proximity_decay=3,
    generate_communicates=True,
    info_time_decay=10,
    info_spatial_decay=0.1,
    finders_tolerance=2,
    time_shift=subset_starts - 1,
    sampling_rate=0.1,
    restrict_to_invisible=False,
)

In [None]:
# Data check: print the shape of the derived frame, then show its first rows
# as the cell output.
print(loc_all.derivedDF.shape)
loc_all.derivedDF.head()

In [None]:
# Serializing will take a bit.
# dill is already imported in the first cell, so it is not imported again here.
with open("locust_data/derived_all.pkl", "wb") as f:
    dill.dump(loc_all, f)

# Instead of processing anew, you can load the preprocessed data
# once you have created this file.
# (The file size does not allow for regular GitHub storage.)
# (WARNING: remember to add the file to .gitignore to avoid errors.)



In [None]:
# Once you have run the above cells once,
# you should be using this instead.
# The path is the same relative path the dump cell writes to, so the notebook
# is not tied to a single user's home directory.
# NOTE: dill.load can execute arbitrary code; only load files you produced yourself.

with open("locust_data/derived_all.pkl", "rb") as f:
    loc_all = dill.load(f)

In [None]:
# Print the derived frame's shape before and after the robust-inference prep
# so the change in size is visible in the cell output.
# NOTE(review): loc_all is rebound to the prepared object, so re-running this
# cell applies the prep to already-prepared data.
print(loc_all.derivedDF.shape)
loc_all = ft.prep_data_for_robust_inference(loc_all, gridsize= 9)
print(loc_all.derivedDF.shape)

In [None]:
# Pull the individual predictors out of the tensorized data, one per line.
data = ft.get_tensorized_data(loc_all)

proximity = data["proximity_standardized"]
trace = data["trace_standardized"]
visibility = data["visibility"]
communicate = data["communicate_standardized"]
how_far = data["how_far"]

ft.visualise_bird_predictors(
    trace,
    proximity,
    how_far,
    com=communicate,
    vis_sampling_rate=0.1,
)
# TODO: histogram versions of scatter plots. I suggest modifying ft.visualize_bird_predictors 

In [None]:
# From here on `locust` names the derived DataFrame (earlier it held the
# loader's result). The column assignments below write into that frame.
locust = loc_all.derivedDF

# Integer category codes for each *_cat column, stored as the matching *_id column.
for _predictor in ("proximity", "trace", "communicate"):
    locust[f"{_predictor}_id"] = locust[f"{_predictor}_cat"].astype("category").cat.codes

# Outcome column used by the summaries and plots below.
locust["how_far"] = locust["how_far_squared_scaled"]



In [None]:
# Run training once, then dill and reuse the results,
# making sure you add the output file to .gitignore.
# ft.get_svi_results returns three results, unpacked here as the
# proximity (p), trace (t) and communicate (c) fits used by the cells below.

svi_result_p, svi_result_t, svi_result_c = ft.get_svi_results(locust)

In [None]:
# Same drill: we dill the results to avoid repeated computation.
# Make sure you add the file to .gitignore, and comment out the training
# and pickling cells after running them once.

# The three results are stored together as a single tuple.
with open("locust_data/svi_results.pkl", "wb") as f:
    dill.dump((svi_result_p, svi_result_t, svi_result_c), f)


In [None]:
# Loading: this works once the training and dilling cells above have been run once.
# The file holds the (p, t, c) tuple written by the dump cell.
# NOTE: dill.load can execute arbitrary code; only load files you produced yourself.
with open("locust_data/svi_results.pkl", "rb") as f:
    svi_result_p, svi_result_t, svi_result_c = dill.load(f)


In [None]:
# Cleanup of extreme values resulting from empty cells.
#
# For each predictor (p = proximity, t = trace, c = communicate) collect:
#   id_*     – the sorted distinct category ids,
#   params_* – the fitted 'auto_loc' values with the trailing entry dropped,
#   std_*    – the per-id standard deviation of how_far.
# Trace additionally drops its last id / std and one more trailing parameter.
summary = {
    "id_p": sorted(locust["proximity_id"].unique()),
    "params_p": svi_result_p.params["auto_loc"][:-1],
    "std_p": locust.groupby("proximity_id")["how_far"].std(),
    "id_t": sorted(locust["trace_id"].unique())[:-1],
    "params_t": svi_result_t.params["auto_loc"][:-2],
    "std_t": locust.groupby("trace_id")["how_far"].std()[:-1],
    "id_c": sorted(locust["communicate_id"].unique()),
    "params_c": svi_result_c.params["auto_loc"][:-1],
    "std_c": locust.groupby("communicate_id")["how_far"].std(),
}

# The three per-predictor sequences must line up one-to-one.
for _coef in ("p", "t", "c"):
    assert (
        len(summary[f"id_{_coef}"])
        == len(summary[f"params_{_coef}"])
        == len(summary[f"std_{_coef}"])
    ), f"Lengths of id_{_coef}, params_{_coef}, and std_{_coef} do not match."



In [None]:
# Category ids as single-column design matrices.
X_p = np.array(summary["id_p"]).reshape(-1, 1)
X_t = np.array(summary["id_t"]).reshape(-1, 1)
X_c = np.array(summary["id_c"]).reshape(-1, 1)

# One linear fit per predictor, weighting each id by the inverse of the
# how_far standard deviation observed for it.
lr_p = LinearRegression().fit(X_p, summary["params_p"], sample_weight=1 / summary["std_p"])
lr_t = LinearRegression().fit(X_t, summary["params_t"], sample_weight=1 / summary["std_t"])
lr_c = LinearRegression().fit(X_c, summary["params_c"], sample_weight=1 / summary["std_c"])

# Dense grid (step 0.1 starting at 0) on which the fitted lines are evaluated.
X_new = np.arange(0, 8.1, 0.1).reshape(-1, 1)
p_pred, t_pred, c_pred = (fitted.predict(X_new) for fitted in (lr_p, lr_t, lr_c))

In [None]:

def sample_and_plot_coef(coef, input, model):
    """Bootstrap the slope of a linear fit and show its histogram.

    Draws 1000 bootstrap resamples of ``input`` paired with
    ``summary[f"params_{coef}"]``, fits a fresh unweighted copy of ``model``
    to each resample, and plots the distribution of the first fitted
    coefficient with plotly.

    Args:
        coef: Suffix selecting the targets in the notebook-level ``summary``
            dict; the notebook calls this with "p", "t" and "c".
        input: Predictor matrix that is resampled together with the targets.
        model: scikit-learn estimator used as a template. It is cloned for
            every bootstrap fit, so the estimator passed in keeps whatever
            fit (e.g. the weighted one) it already has.

    Returns:
        None. The figure is displayed via ``fig.show()``.
    """
    # Local import: sklearn is already a dependency of this notebook.
    from sklearn.base import clone

    targets = summary[f"params_{coef}"]

    coef_samples = []
    for _ in range(1000):
        # No explicit random_state: resample then draws from NumPy's global
        # RNG, giving independent resamples that still follow np.random.seed.
        # (Seeding each draw from randint(1000) repeats seeds, and therefore
        # whole resamples, across the 1000 iterations.)
        X_resampled, y_resampled = resample(input, targets)

        # Fit an unfitted copy so the caller's estimator is not overwritten.
        bootstrap_model = clone(model)
        bootstrap_model.fit(X_resampled, y_resampled)
        coef_samples.append(bootstrap_model.coef_[0])

    histogram_trace = go.Histogram(
        x=coef_samples,
        marker=dict(color='blue'),
    )

    layout = go.Layout(
        title=f'Histogram of coef_samples_{coef}',
        xaxis=dict(title='Coefficient Value'),
        yaxis=dict(title='Frequency'),
        paper_bgcolor='black',
        plot_bgcolor='black',
        font=dict(color='white'),
    )

    fig = go.Figure(data=[histogram_trace], layout=layout)
    fig.show()

# Bootstrap histogram for each predictor: proximity, trace, communicate.
for _coef, _design, _estimator in (("p", X_p, lr_p), ("t", X_t, lr_t), ("c", X_c, lr_c)):
    sample_and_plot_coef(_coef, _design, _estimator)

In [None]:
def plot_summaries(coef, model, pred, title, raw_predictor, ylim=(0, 1.2), xlim=(0, 8), vis_sampling_rate=1):
    """Plot estimated means, the fitted line and a sample of the raw data.

    Three layers are drawn on one plotly figure:
      * the per-id estimates ``summary[f"params_{coef}"]`` with error bars of
        half the per-id standard deviation ``summary[f"std_{coef}"]``,
      * the regression line ``pred`` evaluated on the notebook-level ``X_new``,
      * a random subsample of raw ``(raw_predictor, how_far)`` rows from the
        notebook-level ``locust`` frame.

    Args:
        coef: Suffix selecting the ``summary`` entries ("p", "t" or "c" in
            this notebook).
        model: Fitted estimator; only ``model.coef_`` is read, for the title.
        pred: Predictions of the fitted line on ``X_new``.
        title: Prefix of the figure title.
        raw_predictor: Name of the ``locust`` column plotted as raw data.
        ylim: y-axis range, or a falsy value to leave the axis unset.
        xlim: x-axis range, also used to stretch the raw predictor values;
            a falsy value leaves both the axis and the raw values untouched.
        vis_sampling_rate: Fraction of ``locust`` rows shown as raw points.

    Returns:
        None. The figure is displayed via ``fig.show()``.
    """
    fig = px.scatter(
        x=summary[f"id_{coef}"],
        y=summary[f"params_{coef}"],
        error_y=summary[f"std_{coef}"] / 2,
        opacity=0.9,
        template="plotly_dark",
    )

    fig.add_scatter(
        x=X_new.flatten(),
        y=pred,
        mode='lines',
        line=dict(color='red', width=2),
        showlegend=False,
    )

    # Sample row positions once and use them for BOTH axes, so every plotted
    # point is a real (predictor, how_far) pair from a single row.
    n_rows = len(locust)
    sample_size = min(n_rows, int(vis_sampling_rate * n_rows))
    sampled_rows = np.random.choice(n_rows, size=sample_size, replace=False)

    raw_x = locust[raw_predictor].to_numpy()[sampled_rows]
    raw_y = locust["how_far"].to_numpy()[sampled_rows]

    if xlim:
        # Stretch the predictor onto the plotted x-range.
        # NOTE(review): for xlim[0] == 0 (every call in this notebook) this is
        # value * xlim[1]; confirm the intended mapping before using a
        # non-zero lower bound.
        raw_x = (raw_x - xlim[0]) * (xlim[1] - xlim[0]) + xlim[0]

    fig.add_scatter(
        x=raw_x,
        y=raw_y,
        mode='markers',
        opacity=0.2,
        marker=dict(color='magenta', size=3),
        showlegend=False,
    )

    fig.update_layout(
        title=f"{title}: estimated means, weighted linear model (w = {np.round(model.coef_, decimals=3)}) and raw data variances",
        xaxis_title="predictor",
        yaxis_title="how far score",
    )

    if ylim:
        fig.update_yaxes(range=ylim)
    if xlim:
        fig.update_xaxes(range=xlim)

    fig.show()


# One summary figure per predictor; each shows 10% of the raw rows.
plot_summaries(
    "p",
    lr_p,
    p_pred,
    title="Proximity",
    raw_predictor="proximity_standardized",
    ylim=(0, 1),
    vis_sampling_rate=0.1,
)
plot_summaries(
    "t",
    lr_t,
    t_pred,
    title="Trace",
    raw_predictor="trace_standardized",
    ylim=(0, 1),
    xlim=(0, 6),
    vis_sampling_rate=0.1,
)
plot_summaries(
    "c",
    lr_c,
    c_pred,
    title="Communicate",
    raw_predictor="communicate_standardized",
    ylim=(0, 1),
    vis_sampling_rate=0.1,
)