In [1]:
import json
import math

import igraph as ig
import numpy as np
import pandas as pd
# from datetime import datetime, timedelta
import plotly.express as px
import plotly.graph_objects as go
from geopy.distance import geodesic

from utils import *

In [25]:
# Full travel dataset: one CSV extract per period, stacked into a single frame.
cali_csvs = [
    '../data/california_jul_nov_2019.csv',
    '../data/california_may_jul_20.csv',
    '../data/california_jul_sep_20.csv',
    '../data/california_sep_nov_20.csv',
]
data_frames = [pd.read_csv(csv_path, engine='pyarrow') for csv_path in cali_csvs]
# Rows are ordered by the raw `date` column first; the column is converted to
# datetime only afterwards, so the sort runs on whatever dtype the reader produced.
travel_df = (
    pd.concat(data_frames, ignore_index=True)
    .sort_values(by='date')
    .assign(date=lambda flows: pd.to_datetime(flows.date))
)

# Wildfire dataset, with the acquisition date parsed to datetime.
wildfire_df = (
    pd.read_csv('../data/ca_daily_fire_2000_03252022.csv', engine='pyarrow')
    .assign(acq_date=lambda fires: pd.to_datetime(fires.acq_date))
)

In [26]:
# Sanity check on the combined flows frame: row count, per-column dtypes and non-null counts, memory use.
travel_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 881385 entries, 0 to 881384
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   geoid_o        881385 non-null  int64         
 1   geoid_d        881385 non-null  int64         
 2   lng_o          881385 non-null  float64       
 3   lat_o          881385 non-null  float64       
 4   lng_d          881385 non-null  float64       
 5   lat_d          881385 non-null  float64       
 6   date           881385 non-null  datetime64[ns]
 7   visitor_flows  881385 non-null  int64         
 8   pop_flows      881385 non-null  float64       
dtypes: datetime64[ns](1), float64(5), int64(3)
memory usage: 67.2 MB


For each fire, split the dataset into two windows: (1) the 30 days before the fire started and (2) the period during the fire, from its start date to its containment date.

| Fire    | Started              | Contained            | County | Lat       | Long        |
|---------|----------------------|----------------------|--------|-----------|-------------|
| Kincade (`kincaid` in the code) | 10/23/2019 9:27 PM   | 11/06/2019 7:00 PM   | Sonoma | 38.792458 | -122.780053 |
| CZU | 08/16/2020 8:00 AM | 09/22/2020 7:53 PM | Santa Cruz, San Mateo | 37.17162 | -122.22275 |
| August | 08/16/2020 8:37 PM | 11/11/2020 10:21 AM | Mendocino, Humboldt, Trinity, Tehama, Glenn, Lake, Colusa | 39.776 | -122.673 |

In [27]:
# Key dates per fire as 'YYYYMMDD' strings, in the order
# (start of the pre-fire window, fire start, fire contained).
kincaid_pre_date, kincaid_start_date, kincaid_contained_date = '20190923', '20191023', '20191106'
czu_pre_date, czu_start_date, czu_contained_date = '20200716', '20200816', '20200922'
august_pre_date, august_start_date, august_contained_date = '20200716', '20200816', '20201111'

# Fire locations as (latitude, longitude) pairs -- latitude first.
kincaid_coords = (38.792458, -122.780053)
czu_coords = (37.17162, -122.22275)
august_coords = (39.776, -122.673)

# Size of the area treated as affected around each fire.
# NOTE(review): the `_km` suffix suggests kilometres (the previous comment said
# "sq miles") -- confirm the unit against how utils consumes these values.
kincaid_affected_km = czu_affected_km = 150
august_affected_km = 300

In [28]:
# Keep only wildfire rows dated between the earliest fire start (Kincaid) and the
# latest containment date (August); both bounds are inclusive.
in_study_window = wildfire_df.acq_date.between(kincaid_start_date, august_contained_date)
wildfires = wildfire_df[in_study_window]

I group by origin-destination pairs, and sum up the total flows between them. I then divide by the number of days to get the average daily traffic between these nodes over the time period.

In [29]:
# Pre-fire flows, one frame per fire. Only the fire start date is passed in.
# NOTE(review): the `*_pre_date` constants are not used here -- presumably the helper
# derives the pre-fire window from the start date; confirm in utils.
pre_fire_kincaid, pre_fire_czu, pre_fire_august = (
    process_pre_fire_data(travel_df, fire_start)
    for fire_start in (kincaid_start_date, czu_start_date, august_start_date)
)

# During-fire flows, one frame per fire, bounded by the start and containment dates.
during_fire_kincaid, during_fire_czu, during_fire_august = (
    process_during_fire_data(travel_df, fire_start, fire_contained)
    for fire_start, fire_contained in (
        (kincaid_start_date, kincaid_contained_date),
        (czu_start_date, czu_contained_date),
        (august_start_date, august_contained_date),
    )
)

#### Plots
Plots for each fire. These may take a while to load.

In [None]:
# kincaid: density map for the Kincaid fire, using its (lat, long) coordinates.
# NOTE(review): this passes the full `wildfire_df`, not the date-filtered `wildfires` frame -- confirm that is intended.
plot_density_map(wildfire_df, 'kincaid', kincaid_coords)

In [None]:
# czu: density map for the CZU fire, using its (lat, long) coordinates.
# NOTE(review): this passes the full `wildfire_df`, not the date-filtered `wildfires` frame -- confirm that is intended.
plot_density_map(wildfire_df, 'czu', czu_coords)

In [None]:
# august: density map for the August fire, using its (lat, long) coordinates.
# NOTE(review): this passes the full `wildfire_df`, not the date-filtered `wildfires` frame -- confirm that is intended.
plot_density_map(wildfire_df, 'august', august_coords)

Each wildfire took place in a specific region. We therefore keep only the flows whose coordinates (latitude/longitude) fall within a chosen radius of the fire, and then calculate the eigenvector centrality of each county in that filtered network.

In [33]:
# For every fire, run the same centrality step on the pre-fire and the during-fire flows,
# using that fire's coordinates and affected-area size. Each call returns a pair that is
# bound to the matching `*_filtered` and `*_centrality_df` names.
(
    (pre_kincaid_filtered, pre_kincaid_centrality_df),
    (during_kincaid_filtered, during_kincaid_centrality_df),
) = (
    get_eigenvector_centrality(flows, kincaid_coords, kincaid_affected_km)
    for flows in (pre_fire_kincaid, during_fire_kincaid)
)

(
    (pre_czu_filtered, pre_czu_centrality_df),
    (during_czu_filtered, during_czu_centrality_df),
) = (
    get_eigenvector_centrality(flows, czu_coords, czu_affected_km)
    for flows in (pre_fire_czu, during_fire_czu)
)

(
    (pre_august_filtered, pre_august_centrality_df),
    (during_august_filtered, during_august_centrality_df),
) = (
    get_eigenvector_centrality(flows, august_coords, august_affected_km)
    for flows in (pre_fire_august, during_fire_august)
)


Weighted directed graph in eigenvector centrality at src/centrality/eigenvector.c:303


Weighted directed graph in eigenvector centrality at src/centrality/eigenvector.c:303


Weighted directed graph in eigenvector centrality at src/centrality/eigenvector.c:303


Weighted directed graph in eigenvector centrality at src/centrality/eigenvector.c:303


Weighted directed graph in eigenvector centrality at src/centrality/eigenvector.c:303


Weighted directed graph in eigenvector centrality at src/centrality/eigenvector.c:303



In [34]:
# Distinct origin geoids remaining in the Kincaid pre-fire flows after filtering.
pre_kincaid_filtered.geoid_o.unique()

array([6001, 6007, 6011, 6013, 6021, 6033, 6041, 6045, 6055, 6067, 6095,
       6097, 6101, 6113, 6115])

#### Getting population data
Let's add population to the dataset. The county population estimates come from the [U.S. Census Bureau county population totals](https://www.census.gov/data/tables/time-series/demo/popest/2020s-counties-total.html).

In [35]:
# County populations from the census spreadsheet, reduced to two columns: county name and 2020 population.
county_pop = (
    pd.read_excel('../data/california_county_populations.xlsx', header=3)  # header is on row index 3
    .rename(columns={'Unnamed: 0': 'county', 2020: 'pop'})
    .dropna()
    .loc[:, ['county', 'pop']]
    .iloc[1:]  # skip the first remaining row -- presumably the statewide total; verify against the sheet
    .reset_index(drop=True)
    # Drop the first character of each name and keep only the text before the first comma.
    .assign(county=lambda pops: pops.county.str[1:].str.split(',').str[0])
)

Get the county names of our flows dataset.

In [36]:
# One (geoid, lat, lng) row per geoid seen as an origin ...
origin_coords = (
    travel_df.drop_duplicates(['geoid_o'])[['geoid_o', 'lat_o', 'lng_o']]
    .rename({'geoid_o': 'geoid', 'lat_o': 'lat', 'lng_o': 'lng'}, axis=1)
)
# ... and one per geoid seen as a destination, under the same column names.
destination_coords = (
    travel_df.drop_duplicates(['geoid_d'])[['geoid_d', 'lat_d', 'lng_d']]
    .rename({'geoid_d': 'geoid', 'lat_d': 'lat', 'lng_d': 'lng'}, axis=1)
)
# Stack both, drop exact duplicate rows, and store each row as a plain
# (geoid, lat, lng) tuple in a Series.
unique_coords = (
    pd.concat([origin_coords, destination_coords], axis=0)
    .drop_duplicates()
    .reset_index(drop=True)
)
cali_county_coordinates = pd.Series(unique_coords.itertuples(name=None, index=False))

# The lookup was built once from `cali_county_coordinates` with the commented-out code
# below and cached as JSON, so a normal run only reads the cached file back.
# NOTE(review): regenerating presumably calls `lat_long_to_location` for every county -- confirm in utils.
# cali_counties_lat_long_dict = create_converter_dict(cali_county_coordinates, lat_long_to_location, batch=True)

#with open('../data/cali_counties_lat_long_dict.json', 'w') as fp:
#    json.dump(cali_counties_lat_long_dict, fp)

# Read the cached lookup with the stdlib `json` module (imported in the first cell).
# The encoding is stated explicitly so the read does not depend on the platform default.
with open('../data/cali_counties_lat_long_dict.json', encoding='utf-8') as json_file:
    cali_counties_lat_long_dict = json.load(json_file)

For each fire, merge the county name data with the eigencentrality of each county.

In [37]:
# For every fire, merge the filtered flows with their centrality table, the county
# populations and the county lookup -- once for the pre-fire window, once for the during-fire window.
pre_kincaid_merged_df, during_kincaid_merged_df = (
    merge_centrality(filtered, centrality, county_pop, cali_counties_lat_long_dict)
    for filtered, centrality in (
        (pre_kincaid_filtered, pre_kincaid_centrality_df),
        (during_kincaid_filtered, during_kincaid_centrality_df),
    )
)

pre_czu_merged_df, during_czu_merged_df = (
    merge_centrality(filtered, centrality, county_pop, cali_counties_lat_long_dict)
    for filtered, centrality in (
        (pre_czu_filtered, pre_czu_centrality_df),
        (during_czu_filtered, during_czu_centrality_df),
    )
)

pre_august_merged_df, during_august_merged_df = (
    merge_centrality(filtered, centrality, county_pop, cali_counties_lat_long_dict)
    for filtered, centrality in (
        (pre_august_filtered, pre_august_centrality_df),
        (during_august_filtered, during_august_centrality_df),
    )
)

# Write every merged frame to ../data/clean without the index column.
# The pre-fire frames are written first, then the during-fire frames.
clean_outputs = {
    '../data/clean/pre_kincaid_merged.csv': pre_kincaid_merged_df,
    '../data/clean/pre_czu_merged.csv': pre_czu_merged_df,
    '../data/clean/pre_august_merged.csv': pre_august_merged_df,
    '../data/clean/during_kincaid_merged.csv': during_kincaid_merged_df,
    '../data/clean/during_czu_merged.csv': during_czu_merged_df,
    '../data/clean/during_august_merged.csv': during_august_merged_df,
}
for csv_path, merged_df in clean_outputs.items():
    merged_df.to_csv(csv_path, index=False)


Network graph.

In [38]:
# Number of distinct origin counties in the pre-fire August frame.
# Uses the built-in len() instead of calling the __len__ dunder directly; the value is the same.
len(pre_august_merged_df.county_o.unique())

35

In [15]:
# Centrality plot for the Kincaid fire, drawn from the pre-fire merged frame.
plot_centrality(pre_kincaid_merged_df, 'kincaid')

In [13]:
# Centrality plot for the CZU fire, drawn from the pre-fire merged frame.
plot_centrality(pre_czu_merged_df, 'czu')

In [39]:
# Centrality plot for the August fire, drawn from the pre-fire merged frame.
plot_centrality(pre_august_merged_df, 'august')

In [18]:
# import plotly.express as px

# # Plot the scatter plot
# fig = px.scatter(kincaid_merged_df, x='eigen_centrality_o', y='eigen_centrality_d', size='pop_flows',
#                  hover_data=['county_o', 'county_d'],
#                  labels={'eigen_centrality_o': 'Origin Eigen Centrality', 'eigen_centrality_d': 'Destination Eigen Centrality'},
#                  title='Eigen Centrality of Origin and Destination Counties',
#                  opacity=0.7)

# # Customize the layout
# fig.update_layout(
#     xaxis_title='Origin Eigen Centrality',
#     yaxis_title='Destination Eigen Centrality',
#     legend_title='Population Flows',
#     width=800,
#     height=600
# )

# # Show the plot
# fig.show()
