In [None]:
import pandas as pd
import numpy as np
import csv
from utils import *

In [None]:
# Read the stage-2 CSV export into `df`, the table every later cell works on.
# NOTE(review): load_csv_export_dataset is presumably supplied by the star
# import from `utils`; its parsing rules (dtypes, date handling) are not
# visible here — confirm before relying on them.
df = load_csv_export_dataset("../export_dataframe_stage2.csv")

Filter the data to concentrate on the recent period (events after 2012-01-01).

In [None]:
# Keep only events dated strictly after 1 January 2012.
is_recent = df["event_date"] > "2012-01-01"
df = df[is_recent]

The core fields are air_temp, water_temp, ph1, ph2, DissolvedOxygen1, DissolvedOxygen2 and Conductivity. For the fields that have two readings, we combine them into a single value by averaging.

In [None]:
def getMean(v1, v2, populate_NA = True):
    """Combine two scalar readings of the same quantity into one value.

    Parameters
    ----------
    v1, v2 : scalar
        The two readings; either may be missing (None / NaN).
    populate_NA : bool, default True
        When exactly one reading is missing, return the other one instead
        of NaN.

    Returns
    -------
    The arithmetic mean when both readings are present; the single
    available reading when only one is present and ``populate_NA`` is true;
    ``np.nan`` in every other case.
    """
    has_v1 = pd.notnull(v1)
    has_v2 = pd.notnull(v2)

    if has_v1 and has_v2:
        return (v1+v2)/2
    # Exactly one reading is available: fall back to it only if allowed.
    if populate_NA and has_v1 != has_v2:
        return v1 if has_v1 else v2
    return np.nan

# Collapse each duplicated core measurement into one averaged column
# (via getMean), then discard the raw pair columns.
core_pairs = {
    "mean_ph": ("ph1", "ph2"),
    "mean_DissolvedOxygen": ("DissolvedOxygen1", "DissolvedOxygen2"),
}
for merged_col, (first_col, second_col) in core_pairs.items():
    df[merged_col] = df.apply(
        lambda row: getMean(row[first_col], row[second_col]), axis=1
    )
df.drop(columns=[col for pair in core_pairs.values() for col in pair], inplace=True)

There are also some semi-core fields: SecchiDisk1, SecchiDisk2, ChlorophyllA, Salinity1 and Salinity2.

In [None]:
# Same treatment for the semi-core measurements that come in pairs: average
# each pair with getMean, drop the raw columns, then preview the result.
semi_core_pairs = {
    "mean_SecciDisk": ("SecchiDisk1", "SecchiDisk2"),
    "mean_Salinity": ("Salinity1", "Salinity2"),
}
for merged_col, (first_col, second_col) in semi_core_pairs.items():
    df[merged_col] = df.apply(
        lambda row: getMean(row[first_col], row[second_col]), axis=1
    )
df.drop(columns=[col for pair in semi_core_pairs.values() for col in pair], inplace=True)
df.head()

There are also two fields that represent the same data: air_temp and air_temperature.  
Merge them together.

In [None]:
def _merge_air_temp(row):
    """Combine the two air-temperature readings of one row via getMean."""
    return getMean(row["air_temp"], row["air_temperature"])

# Overwrite air_temp with the merged value; the duplicate column is then redundant.
df['air_temp'] = df.apply(_merge_air_temp, axis=1)
df.drop(columns=["air_temperature"], inplace=True)

Here, I propose a strategy to categorize data into 3 groups: core fields, semi-core fields and other fields. To measure the quality of an entry, we add three columns to count the number of missing fields in each group.

In [None]:
# define the groups
core_group = ["air_temp", "water_temp", "mean_ph", "mean_DissolvedOxygen", "Conductivity"]
semi_core_group = ["mean_SecciDisk", "mean_Salinity", "ChlorophyllA"]
# Every remaining column, listed in the DataFrame's own column order so the
# list is identical from one kernel run to the next (iterating a set of
# strings is not order-stable across interpreter sessions).
_grouped_cols = set(core_group) | set(semi_core_group)
others = [col for col in df.columns if col not in _grouped_cols]

In [None]:
def get_missing_count(row, group):
    """Return how many of the fields named in ``group`` are null in ``row``.

    ``row`` is anything indexable by field name (e.g. a DataFrame row);
    ``group`` is an iterable of field names. An empty group yields 0.
    """
    missing = 0
    for col in group:
        # pd.isnull gives a boolean, which adds as 0 or 1.
        missing += pd.isnull(row[col])
    return missing

# Per-row number of missing values in each field group. The vectorised
# isna().sum(axis=1) yields the same counts as calling a Python function on
# every row with apply(axis=1), without the per-row overhead.
df["core_missing_count"] = df[core_group].isna().sum(axis=1)
df["semi_core_missing_count"] = df[semi_core_group].isna().sum(axis=1)
df["others_missing_count"] = df[others].isna().sum(axis=1)

In [None]:
# Preview the first rows to check the newly added missing-count columns.
df.head()

Once we have the count of missing fields per entry, we can aggregate the counts at the site level (wbd).

In [None]:
count_cols = ["core_missing_count", "semi_core_missing_count", "others_missing_count"]
# Total the per-entry counts for every site and list the most complete
# sites first (sorted by core, then semi-core, then other counts).
missing_count = (
    df.groupby(['wbd'])[count_cols]
    .sum()
    .sort_values(by=count_cols, ascending=True)
    .reset_index()
)
missing_count.head()

However, counting missing fields alone doesn't tell us how the entries are distributed over time. We need to ensure that each site has at least 1 entry every 6 months in order to show a trend.

There are 496 sites in total. We first select a cutoff based solely on the missing count.

In [None]:
# Exclusive upper bounds on a site's summed missing-value counts. Named here
# so the cutoff can be tuned in one place instead of inside the query text.
MAX_CORE_MISSING = 20
MAX_SEMI_CORE_MISSING = 30
missing_count = missing_count.query(
    "core_missing_count < @MAX_CORE_MISSING"
    " and semi_core_missing_count < @MAX_SEMI_CORE_MISSING"
)

In [None]:
# (rows, columns) of the filtered table; the row count is the number of
# sites that passed the cutoff.
missing_count.shape

Then we need to aggregate the original data again to find the sites that have at least 1 entry every 6 months.

In [None]:
# Step 1: per site and per six-month window, count the non-null values of
# each column, keeping only the event_rid count.
# NOTE(review): newer pandas releases prefer the 'ME' alias over 'M' for
# month-end frequencies — confirm before upgrading pandas.
half_year_counts = (
    df.groupby(['wbd', pd.Grouper(key='event_date', freq='6M')])
    .count()
    .reset_index()[['wbd', 'event_date', 'event_rid']]
)

# Step 2: express the covered period as a number of half-years
# (2 * days / 365); this is the minimum number of windows a site must appear in.
time_span = df["event_date"].max() - df["event_date"].min()
time = 2*time_span.days/365

# Step 3: count, per site, the windows holding at least one entry and keep
# the sites whose window count reaches the threshold.
sites = (
    half_year_counts.query("event_rid >= 1")
    .groupby(['wbd'])
    .count()
    .reset_index()
    .query("event_rid >= @time")
)
site_names = sites['wbd'].tolist()
site_names

Let's plot these sites on a map.

In [None]:
import numpy as np
import geopandas as gpd
import shapely
from shapely.geometry import shape, Point
# pip install pyshp
import shapefile
# pip install geopy
from geopy import distance
import plotly.figure_factory as ff
import plotly.io as pio
import plotly as plt
# Make "jupyterlab" the default plotly renderer for figures shown in this notebook.
pio.renderers.default = "jupyterlab"
import plotly.express as px
import plotly.graph_objects as go

In [None]:
# Keep only the entries that carry a site location, then take the
# name/location columns from that filtered table.
df = df[df['SiteLocation'].notna()]
locs = df[['SiteName', 'SiteLocation']]

# NOTE(review): get_loc_objects_from_series, fetch_geo_locs and get_state_map
# are presumably helpers from `utils`; what they return is not visible here.
loc_pairs, geo_locs, gdf = get_loc_objects_from_series(locs['SiteLocation'])
loc_lookup = fetch_geo_locs()

# Forward slashes work on every platform and avoid the invalid "\h" / "\w"
# escape sequences that the backslash-separated, non-raw literal contained.
WBD_SHAPEFILE = "../../geodata/hydrologic_units_WBDHU12_ga_3975106_02/hydrologic_units/wbdhu12_a_ga.shp"
# Read the GeoJSON-style mapping inside a context manager so the shapefile's
# underlying files are closed once the data is in memory.
with shapefile.Reader(WBD_SHAPEFILE) as wbd_reader:
    WBD_gj = wbd_reader.__geo_interface__
GA_map = get_state_map()

In [None]:
# Shade every selected site (one row of `sites` per wbd) on a map of the
# watershed boundaries, matching rows to shapes by the boundary's `name`.
# NOTE(review): no `color` column is passed, so the continuous colour scale
# probably has no visible effect — confirm whether one was intended.
macon_center = {"lat": 32.8407, "lon": -83.6324}  # macon

fig = px.choropleth_mapbox(
    sites,  # array/object with wbd names in the same order as the json file
    geojson=WBD_gj,
    locations="wbd",
    featureidkey="properties.name",
    center=macon_center,
    mapbox_style="stamen-terrain",  # alternative: "carto-positron"
    color_continuous_scale="Jet",  # alternatives: Sunsetdark, Jet
    zoom=6,
    opacity=0.7,
)

# Remove the margins so the map fills the whole output area.
fig.update_layout(margin=dict(r=0, t=0, l=0, b=0))
fig.show()