# Post Sim Run

This takes the simulation output and does some maths to get it to look exactly like the GPS output. This is intended to allow both sim and GPS datasets to run through the same code so that 1:1 comparisons can be made.

In [None]:
# Import some libs

import pandas as pd
import os
import numpy as np
import plotly.express as px
import geopandas as gpd
import yaml 
import pyarrow as pa
import pyarrow.parquet as pq
from math import radians, sin, cos, sqrt, asin

import plotly.graph_objects as go

## Config and Functions

In [None]:
# Control Variables
# NOTE(review): `resample` and `sample_size` are not read anywhere in the visible
# part of this notebook -- presumably left over from a down-sampling step; confirm.
resample = False
sample_size = 10000

# CRS used when building point geometries from the simulation / GPS coordinates
sim_projection = 'EPSG:5070'
gps_projection = 'EPSG:4326'

# Common CRS that both datasets are reprojected to before they are compared
chosen_projection = sim_projection

In [None]:
# Input
##########################
config_file = '../config/local_deer_config.yaml'
random_csv = '../output/random_vs_disease.csv'
# NOTE(review): `hmm_csv` is assigned twice; the second assignment wins, so the
# HMM_vs_disease.csv path on the next line is never used. Confirm which is intended.
hmm_csv = '../output/HMM_vs_disease.csv'
hmm_csv = '../output/LocalRun_1.csv'
dld_csv = '../output/DLD_vs_disease.csv' 

# The GPS file that is being used to check against.
gps_file = './output-data/filtered_gps.parq'

# Output
##########################
# Intermediate parquet copy of a simulation CSV, and the point layer of agent locations
sim_parq = '../output/sim.parq' 
agent_gpkg = '../output/agent.gpkg'

# Monthly convex-hull home ranges for the simulated agents and the GPS-collared deer
sim_hr_gpkg = '../output/sim_homeranges.gpkg' 
gps_hr_gpkg = '../output/gps_homeranges.gpkg'


In [None]:
# Load the run configuration into `params`.
# A config that cannot be parsed is reported and the error is re-raised: swallowing
# it would leave `params` undefined (or stale from an earlier run of this cell) and
# only fail later, far away from the real cause.
with open(config_file) as stream:
    try:
        params = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        print(f"Could not parse {config_file}: {exc}")
        raise

In [None]:
def csv_to_gpkg(params, crs='EPSG:5070'):
    '''
    Convert the repast csv logfile into two GeoPackage files.

    The CSV named by params['logging']['agent_log_file'] is read once and written
    out twice, each time with all of its columns plus a point geometry:

      * params['logging']['gpkg_log_file'], layer 'agent_location':
        points built from the 'x' / 'y' columns
      * params['logging']['centroid_log_file'], layer 'centroid_location':
        points built from the 'centroid_x' / 'centroid_y' columns

    Args:
        params (dict): Parsed config; must contain the three 'logging' keys above.
        crs (str, optional): CRS assigned to the point geometries.
            Defaults to 'EPSG:5070'.

    Returns:
        None
    '''
    log_cfg = params['logging']
    in_csv = log_cfg['agent_log_file']
    agent_gpkg = log_cfg['gpkg_log_file']
    centroid_gpkg = log_cfg['centroid_log_file']

    # Load the CSV file
    df = pd.read_csv(in_csv)

    # (output file, layer name, x column, y column) for each layer to write
    layers = (
        (agent_gpkg, 'agent_location', 'x', 'y'),
        (centroid_gpkg, 'centroid_location', 'centroid_x', 'centroid_y'),
    )
    for out_path, layer_name, x_col, y_col in layers:
        # Convert to GeoDataFrame and save to a GeoPackage file
        gdf = gpd.GeoDataFrame(
            df,
            geometry=gpd.points_from_xy(df[x_col], df[y_col], crs=crs)
        )
        gdf.to_file(out_path, layer=layer_name, driver='GPKG')

In [None]:
def csv_to_parquet_chunked(csv_filepath, parquet_filepath, chunksize=100000):
    """
    Converts a large CSV file to Parquet format in chunks.

    The Parquet schema is taken from the first chunk and reused for every later
    chunk, so only one chunk has to be held in memory at a time.
    NOTE(review): a later chunk whose inferred dtypes differ from the first one
    (e.g. a column that only gains missing values further down the file) may fail
    to convert to that schema -- confirm against real simulation output.

    Args:
        csv_filepath (str): Path to the input CSV file.
        parquet_filepath (str): Path to the output Parquet file (overwritten).
        chunksize (int, optional): Number of rows to read per chunk. Defaults to 100000.
    """
    csv_stream = pd.read_csv(csv_filepath, chunksize=chunksize)
    try:
        # Get the schema from the first chunk
        first_chunk = next(csv_stream)
        parquet_schema = pa.Table.from_pandas(first_chunk).schema

        # Initialize Parquet writer
        with pq.ParquetWriter(parquet_filepath, schema=parquet_schema) as writer:
            # Write the first chunk
            writer.write_table(pa.Table.from_pandas(first_chunk, schema=parquet_schema))

            # Process remaining chunks
            for chunk in csv_stream:
                writer.write_table(pa.Table.from_pandas(chunk, schema=parquet_schema))
    finally:
        # Release the CSV file handle even when conversion or writing fails part-way
        csv_stream.close()

# Example usage:
# csv_to_parquet_chunked(random_csv, sim_parq)

## Read Sim Output

Read several CSV files that have been created using the "make benchmark" command. These files are created using a couple of different config files. The goal here is to compare the different parameters in the benchmark config files.

For this experiment one month of data was generated using different "movement methods" for each run. The different movement methods are:

  - Random Movement: No behaviour states, no landscape interaction, just basic step and turn movement
  - HMM Movement: Multiple behaviour states that set the step and turn parameters. Transitions between states are a result of the landscape data.
  - DLD movement: Complex movement influenced by behaviour states, home ranges, landscape info etc

The csv's are read into a dataframe in chunks and then saved to a parquet file. This should allow larger simulations to be processed on a smaller machine (like my laptop) without running into memory errors... up to a point.

In [None]:
csvs = [random_csv, hmm_csv, dld_csv]
# csvs = [ hmm_csv]

# Convert each run to parquet, read it back and tag its rows with the run name.
# NOTE: every run is written to the same `sim_parq` path, so after the loop that
# file only holds the last CSV in `csvs`.
sim_frames = []
for sim_csv in csvs:
    csv_to_parquet_chunked(sim_csv, sim_parq)
    df = pd.read_parquet(sim_parq).rename(
        columns={'Timestamp': 'timestamp', 'UUID': 'ID'}
    )

    # The run name is the CSV file name without directory or extension
    filename = os.path.splitext(os.path.basename(sim_csv))[0]
    df['filename'] = filename
    sim_frames.append(df)

# Concatenate once (instead of growing a frame inside the loop) and build a fresh
# index so that row labels are unique across runs.
df_sim = pd.concat(sim_frames, ignore_index=True)

Convert the parquet file into a geopandas dataframe and eventually a geopackage. This will allow it to be easily added to a QGIS project

In [None]:
# Build point geometries from the 'x_proj' / 'y_proj' columns, tagged with the sim CRS
sim_points = gpd.points_from_xy(df_sim['x_proj'], df_sim['y_proj'], crs=sim_projection)

# Attach them to the simulation table and reproject to the chosen project projection
gdf_sim = gpd.GeoDataFrame(df_sim, geometry=sim_points).to_crs(chosen_projection)

# Write the points to the 'agent_location' layer of the agent GeoPackage
gdf_sim.to_file(agent_gpkg, layer='agent_location', driver='GPKG')

In [None]:
# Quick look at the first rows of the combined simulation GeoDataFrame
gdf_sim.head()

## Read GPS Data
Once all the preprocessing has been done, let's read the smaller/compressed parquet file into memory, do some analysis, and start plotting it. 


In [None]:
# Read the GPS fixes
df_gps = pd.read_parquet(gps_file)

# Build point geometries from the 'lon' / 'lat' columns, tagged with the GPS CRS
gps_points = gpd.points_from_xy(df_gps['lon'], df_gps['lat'], crs=gps_projection)

# Attach them to the GPS table and reproject to the chosen project projection
gdf_gps = gpd.GeoDataFrame(df_gps, geometry=gps_points).to_crs(chosen_projection)

## Calculate HomeRange Timeseries

How much area is covered every week/month/year? Is it similar between the sim and GPS data?

Let's take both the GPS data and SIM data and group it by Agent ID, Month and, in the case of the simulation data, the Filename of the data. This is in order to keep the different benchmark results separate.  


In [None]:
# Parse the timestamps and label every fix with its calendar month
for fixes in (gdf_sim, gdf_gps):
    fixes['timestamp'] = pd.to_datetime(fixes['timestamp'])
    fixes['time_group'] = fixes['timestamp'].dt.to_period('M')

# One home range per agent, month and (for the simulation) benchmark run
sim_keys = ['ID', 'time_group', 'filename']
gps_keys = ['Deer ID', 'time_group']

# Simulation: convex hull of each group's points (the hull geometry ends up in
# column 0), joined with the number of fixes that went into it
sim_hulls = gdf_sim.dissolve(sim_keys).convex_hull.reset_index().set_geometry(0)
sim_time_counter = gdf_sim.groupby(sim_keys)['timestamp'].count().reset_index()
sim_homeranges = pd.merge(
    left=sim_hulls,
    right=sim_time_counter,
    how='left',
    on=sim_keys,
).rename(columns={'timestamp': 'samples_in_group'})

# GPS: same hull + fix count, then the sex of each deer is attached
gps_hulls = gdf_gps.dissolve(gps_keys).convex_hull.reset_index().set_geometry(0)
gps_time_counter = gdf_gps.groupby(gps_keys)['timestamp'].count().reset_index()
deer_info = gdf_gps[['Deer ID', 'sex']].drop_duplicates().reset_index(drop=True)

gps_with_counts = pd.merge(
    left=gps_hulls,
    right=gps_time_counter,
    how='left',
    on=gps_keys,
)
gps_homeranges = pd.merge(
    left=gps_with_counts,
    right=deer_info[['sex', 'Deer ID']],
    how='left',
    on=['Deer ID'],
).rename(columns={'timestamp': 'samples_in_group'})

In [None]:
# Area of every hull polygon (geometry column 0), divided by 10**6.
# Square km's -- assuming the CRS units are metres (TODO confirm for chosen_projection)
for homeranges in (sim_homeranges, gps_homeranges):
    homeranges["area"] = homeranges[0].area / 10**6

In [None]:
# Write the monthly home-range polygons to their GeoPackage files.
# NOTE(review): both files use the layer name 'agent_location' although they hold
# hull polygons rather than agent points -- confirm before renaming, since existing
# QGIS projects may reference this layer name.
gps_homeranges.to_file(gps_hr_gpkg, layer='agent_location', driver='GPKG')
sim_homeranges.to_file(sim_hr_gpkg, layer='agent_location', driver='GPKG') 

So now that the homeranges are written to geopackage files, they can be visualised in QGIS. Here's a quick example:



## Homerange sizes
Are the monthly home range areas similar to the GPS data? Let's plot the monthly home range area (convex hull of GPS/Simulated points per agent, per month). This homerange area is not specified in the model parameters and so "should" be similar if the model is behaving as expected. 

In [None]:
# Display the simulated monthly home-range table
sim_homeranges

In [None]:
# Histogram of how many fixes went into each monthly home range
fig = go.Figure()

fig.add_trace(go.Histogram(
    x=gps_homeranges["samples_in_group"],
    histnorm='probability density',
    name='GPS Data',
))

# One trace per distinct `filename` value in the simulated home ranges
for run_name in sim_homeranges['filename'].unique():
    run_rows = sim_homeranges[sim_homeranges['filename'] == run_name]
    fig.add_trace(go.Histogram(
        x=run_rows["samples_in_group"],
        histnorm='probability density',
        name=f'{run_name} Data',
    ))

fig.update_layout(
    title_text="Monthly Number of Rows/Points in HomeRange",
    yaxis_title_text="Normalised Points Per Month",
    xaxis_title_text="Number of Points used for HomeRange per month",
    legend_title_text="Movement Model",
)
fig.show()

The above figure shows that the simulated data is, obviously, more regular than the GPS data. There are a few GPS devices that have far fewer samples in a given month than expected. These few GPS points would alter the expected monthly home range size. Let's drop all rows that have a points/month that is too low.

In [None]:
# Do Some data filtering
# Keep only GPS home ranges with more than 600 fixes in the month and an area below 40
has_enough_samples = gps_homeranges['samples_in_group'] > 600
has_plausible_area = gps_homeranges['area'] < 40

gps_homeranges = gps_homeranges[has_enough_samples & has_plausible_area]


In [None]:
# Histogram of monthly home-range area: GPS data against every simulation run
fig = go.Figure()

fig.add_trace(go.Histogram(
    x=gps_homeranges["area"],
    histnorm='probability density',
    name='GPS Data',
))

# One trace per distinct `filename` value in the simulated home ranges
for run_name in sim_homeranges['filename'].unique():
    run_rows = sim_homeranges[sim_homeranges['filename'] == run_name]
    fig.add_trace(go.Histogram(
        x=run_rows["area"],
        histnorm='probability density',
        name=f'{run_name} Data',
    ))

fig.update_layout(
    title_text="Monthly Home Range Area Histogram",
    yaxis_title_text="Normalised Monthly Home Ranges",
    xaxis_title_text="Home Range Area [km^2]",
    legend_title_text="Data Source",
    barmode='overlay',  # draw the histograms on top of each other
)
fig.update_traces(opacity=0.75)  # semi-transparent so overlapping bars stay visible

fig.show()

The above figure shows that the GPS data home ranges are much smaller than any of the modelled home ranges... Not a good result...

In [None]:
# Histogram of monthly GPS home-range area, one trace per value of the 'sex' column
fig = go.Figure()

for sex_code, trace_name in (('f', 'Female GPS Data'), ('m', 'Male GPS Data')):
    sex_rows = gps_homeranges[gps_homeranges['sex'] == sex_code]
    fig.add_trace(go.Histogram(
        x=sex_rows["area"],
        histnorm='probability density',
        name=trace_name,
    ))

fig.update_layout(
    title_text="Monthly Home Range Area Histogram",
    yaxis_title_text="Normalised Monthly Home Ranges",
    xaxis_title_text="Home Range Area [km^2]",
    legend_title_text="Data Source",
    barmode='overlay',  # draw the histograms on top of each other
)
fig.update_traces(opacity=0.75)  # semi-transparent so overlapping bars stay visible

fig.show()

The above figure shows that the male and female home ranges in the GPS data are significantly different. It would therefore make sense to model the male/female movement parameters differently.

## Those same step and turn calcs again

Below is the method used to calculate step and turn values from a timeseries of points. It must be known that the values calculated are often for the previous points, so when looking at it with agents mixed up it'll just look plain wrong 

In [None]:
def calc_heading(lon_1, lat_1,
                 lon_2, lat_2,
                 lon_3, lat_3):

    '''
    Turning angle at point 2 for the path point 1 -> point 2 -> point 3.

    Each leg is converted to a heading with arctan2(dx, dy), i.e. 0 along the
    +lat axis and increasing towards the +lon axis. The result is the change from
    the incoming heading (1 -> 2) to the outgoing heading (2 -> 3), in radians,
    wrapped into [-pi, pi); a full reversal is returned as -pi.

    Per the original note: passing t-1, t, t+1 gives Theta_T, and passing the
    centroid as point 3 should give Theta_Ct.
    '''
    def _heading(x_from, y_from, x_to, y_to):
        # Heading of the leg from (x_from, y_from) to (x_to, y_to)
        return np.arctan2(x_to - x_from, y_to - y_from)

    heading_in = _heading(lon_1, lat_1, lon_2, lat_2)
    heading_out = _heading(lon_2, lat_2, lon_3, lat_3)

    full_circle = 2 * np.pi

    # Difference of the two headings, first brought into [0, 2*pi) ...
    raw_turn = np.mod(heading_out - heading_in, full_circle)

    # ... then shifted into the signed range [-pi, pi)
    return np.mod(raw_turn + np.pi, full_circle) - np.pi

def calc_step_and_turn(gdf_sim, step_scale=30):
    '''
    Add per-point step distance and turn angle columns to a GeoDataFrame of fixes.

    For every row the previous and next point of the same track are looked up; a
    track is one 'ID' (within one 'filename' when that column exists, so equal IDs
    in different runs are not chained together). Rows without both neighbours,
    i.e. the first and last row of each track, are dropped.
    NOTE(review): rows are used in their existing order -- this assumes the input
    is already ordered by time within each track; confirm.

    Args:
        gdf_sim (GeoDataFrame): Point fixes with an 'ID' column and a CRS set.
        step_scale (float, optional): Factor applied to the point-to-next-point
            distance. Defaults to 30, the value that was previously hard-coded.
            NOTE(review): the meaning of 30 is not visible here -- confirm.

    Returns:
        GeoDataFrame: A new frame in `sim_projection` with the added columns
        'lon', 'lat' (x / y coordinates in EPSG:5070), 'geometry_prev',
        'geometry_next', 'step_distance' and 'turn_angle' (radians, see calc_heading).
    '''
    # Fully qualified authority string instead of the bare '5070'
    gdf = gdf_sim.to_crs('EPSG:5070')
    gdf['lon'] = gdf.geometry.x
    gdf['lat'] = gdf.geometry.y
    gdf = gdf.to_crs(sim_projection)

    # Neighbouring points along each track
    track_cols = ['filename', 'ID'] if 'filename' in gdf.columns else ['ID']
    track_points = gdf.groupby(track_cols)['geometry']
    prev_points = track_points.shift(1)
    next_points = track_points.shift(-1)
    gdf['geometry_prev'] = prev_points
    gdf['geometry_next'] = next_points

    # Copy so the column assignments below write to an independent frame
    gdf = gdf.dropna(subset=['geometry_prev', 'geometry_next']).copy()

    gdf['step_distance'] = gdf.distance(gdf['geometry_next']) * step_scale

    # calc_heading only uses numpy element-wise operations, so whole coordinate
    # arrays can be passed at once; this also works when no rows are left.
    before = gpd.GeoSeries(gdf['geometry_prev'])
    after = gpd.GeoSeries(gdf['geometry_next'])
    gdf['turn_angle'] = calc_heading(
        before.x.to_numpy(), before.y.to_numpy(),
        gdf.geometry.x.to_numpy(), gdf.geometry.y.to_numpy(),
        after.x.to_numpy(), after.y.to_numpy(),
    )

    return gdf

## Look at Movement Params

Similar to the initial data analysis, let's take a look at the step and turn distributions and then some other statistical measures like home range size.

In [None]:
# Add the step/turn columns to the simulation data.
# NOTE: this overwrites gdf_sim, and calc_step_and_turn drops rows that have no
# previous or next point, so re-running this cell removes further rows each time.
gdf_sim = calc_step_and_turn(gdf_sim)

In [None]:

# Histogram of turn angles: GPS data against every simulation run
fig = go.Figure()

fig.add_trace(go.Histogram(
    x=gdf_gps["turn_angle"],
    histnorm='probability density',
    name='GPS Turn Angles',
))

# One trace per distinct `filename` value in the simulation data
for run_name in gdf_sim['filename'].unique():
    run_rows = gdf_sim[gdf_sim['filename'] == run_name]
    fig.add_trace(go.Histogram(
        x=run_rows["turn_angle"],
        histnorm='probability density',
        name=f'{run_name} Turn Angles',
    ))

# Same bins for every trace (bins used for histogram), drawn semi-transparent
turn_bins = dict(start=-3.15, end=3.15, size=0.05)
fig.update_traces(xbins=turn_bins, opacity=0.5)

fig.update_layout(
    title_text="Normalised Histogram of Turn Angles",
    yaxis_title_text="Normalised Count",
    xaxis_title_text="Turn Angle [Rad]",
    legend_title_text="Data Source",
    barmode='overlay',  # draw the histograms on top of each other
)

fig.show()

In [None]:
# Histogram of step distances: GPS data against every simulation run
fig = go.Figure()

fig.add_trace(go.Histogram(
    x=gdf_gps["step_distance"],
    histnorm='probability density',
    name='GPS Data',
))

# One trace per distinct `filename` value in the simulation data.
# NOTE(review): the simulation traces plot the 'step' column while the GPS trace
# plots 'step_distance' -- confirm this is the intended comparison.
for run_name in gdf_sim['filename'].unique():
    run_rows = gdf_sim[gdf_sim['filename'] == run_name]
    fig.add_trace(go.Histogram(
        x=run_rows["step"],
        histnorm='probability density',
        name=f'{run_name}',
    ))

# Same bins for every trace (bins used for histogram), drawn semi-transparent
step_bins = dict(start=0, end=1000, size=10)
fig.update_traces(xbins=step_bins, opacity=0.5)

fig.update_layout(
    title_text="Normalised Histogram of Step Distances",
    yaxis_title_text="Normalised Count",
    xaxis_title_text="Step Distance [meter]",
    legend_title_text="Data Source",
    barmode='overlay',  # draw the histograms on top of each other
)
fig.show()

## HMM Step and Turns vs Behavioural State
The step and turn histograms for HMM don't look right... There seems to be something off with them. 

In [None]:
# Select the rows of the HMM run. The 'filename' column holds each CSV's base name
# without extension, so derive the run name from `hmm_csv` the same way instead of
# hard-coding it (the literal 'LocalRun' does not match '../output/LocalRun_1.csv').
hmm_run_name = os.path.splitext(os.path.basename(hmm_csv))[0]
hmm_sim = gdf_sim[gdf_sim['filename'] == hmm_run_name]

# Logged 'turn' / 'step' columns next to the recalculated 'turn_angle' / 'step_distance'
hmm_sim[['filename','turn','turn_angle','step','step_distance']]

In [None]:
# Display the HMM-run subset of the simulation data
hmm_sim

In [None]:
# Histogram of turn angles: GPS data against the HMM run, split by behaviour state
fig = go.Figure()

fig.add_trace(go.Histogram(x=gdf_gps["turn_angle"],
                           histnorm='probability density',
                           name = 'GPS Data'))

# One trace per distinct value of the 'Behaviour State' column
states = hmm_sim['Behaviour State'].unique()
for behaviour_state in states:
    df = hmm_sim[hmm_sim['Behaviour State'] == behaviour_state]
    fig.add_trace(go.Histogram(x=df["turn_angle"],
                               histnorm='probability density',
                               name = f'HMM -- {behaviour_state}'))

fig.update_traces(xbins=dict( # bins used for histogram
        start=-3.15,
        end=3.15,
        size=0.05
    ))

# The plotted quantity is the turn angle, so the title and x axis are labelled
# accordingly (they previously read "Step Distances" / "Step Distance [meter]").
fig.update_layout(
    title=dict(
        text="Normalised Histogram of Turn Angles by HMM Behaviour State"
    ),
    yaxis=dict(
        title=dict(
            text="Normalised Count"
        )
    ),
    xaxis=dict(
        title=dict(
            text="Turn Angle [Rad]"
        )
    ),
    legend=dict(
        title=dict(
            text="Data Source"
        )
    ),
)
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.75)
fig.show()

## Timeseries Plots

In [None]:
# Show the 'step' column next to the recalculated 'step_distance' column
gdf_sim[['step','step_distance']]

In [None]:

# Number of rows per timestamp and disease state.
# NOTE: 'filename' is not part of the grouping, so the counts cover all runs together.
disease_counts = (
    gdf_sim
    .groupby(['timestamp', 'Disease State'])
    .size()
    .reset_index(name='count')
)

axis_labels = {'timestamp': 'Timestamp', 'count': 'Count', 'category': 'Category'}
fig = px.line(
    disease_counts,
    x='timestamp',
    y='count',
    color='Disease State',
    title='Count of Categories Over Time',
    labels=axis_labels,
)

fig.show()

In [None]:
# Line per run and behaviour state: number of rows in that state at each timestamp
fig = go.Figure()

for run_name in gdf_sim['filename'].unique():
    run_rows = gdf_sim[gdf_sim['filename'] == run_name]
    state_counts = (
        run_rows
        .groupby(['timestamp', 'Behaviour State'])
        .size()
        .reset_index(name='count')
    )

    for state_name in state_counts['Behaviour State'].unique():
        one_state = state_counts[state_counts['Behaviour State'] == state_name]
        fig.add_trace(go.Scatter(
            x=one_state["timestamp"],
            y=one_state['count'],
            name=f'{run_name} -- {state_name}',
        ))

fig.update_layout(
    title_text="Timeseries of Behaviour State Counts",
    yaxis_title_text="Agent Count",
    xaxis_title_text="Timestamp",
    legend_title_text="Data Source",
    barmode='overlay',
)
fig.update_traces(opacity=0.75)
fig.show()

In [None]:

# Mean of 'step_distance' over all rows sharing a timestamp (the value column is
# named 'count' although it holds the mean)
mean_step = (
    gdf_sim
    .groupby(['timestamp'])['step_distance']
    .mean()
    .reset_index(name='count')
)

axis_labels = {'timestamp': 'Timestamp', 'count': 'Average Step Distance', 'category': 'Category'}
fig = px.line(
    mean_step,
    x='timestamp',
    y='count',
    # color='Disease State',
    title='Average Step Distance Over Time',
    labels=axis_labels,
)

fig.show()

In [None]:

# Mean of 'turn_angle' over all rows sharing a timestamp (the value column is named
# 'count' although it holds the mean).
# NOTE(review): this is a plain arithmetic mean of angles in [-pi, pi); angles near
# +pi and -pi cancel out, so a circular mean may be more appropriate -- confirm.
df_grouped = gdf_sim.groupby(['timestamp'])['turn_angle'].mean().reset_index(name='count')

# The y axis shows the turn angle (it was previously labelled 'Average Step Distance')
fig = px.line(df_grouped, x='timestamp', y='count',
              # color='Disease State',
              title='Average Turn Angle Over Time',
              labels={'timestamp': 'Timestamp', 'count': 'Average Turn Angle', 'category': 'Category'})

fig.show()