# Data Viz
### Purpose
The purpose of this notebook is to create some data visualizations to be used in the project sponser update.

### Author: 
Ian Davis
### Date: 
2020-09-05
### Update Date: 
2020-09-05

### Inputs 
1.3-rec-connecting-fips-ecosystem-data.csv - Comma separate file of the Christmas Bird Count and matches to 1 or more NOAA weather stations.
- Data Dictonary can be found here: http://www.audubon.org/sites/default/files/documents/cbc_report_field_definitions_2013.pdf

### Output Files
Plot html's

## Steps or Proceedures in the notebook 
- Set runtime options
- Import data
- Plots
    - Histogram of station distances

## References
- Figure Factory: # https://stackoverflow.com/questions/54734667/error-installing-geopandas-a-gdal-api-version-must-be-specified-in-anaconda
- FIPS query: https://gis.stackexchange.com/questions/294641/python-code-for-transforming-lat-long-into-fips-codes
- FIPS query: https://geo.fcc.gov/api/census/#!/block/get_block_find
- Geojson: https://geoffboeing.com/2015/10/exporting-python-data-geojson/
- Installing geopandas: https://stackoverflow.com/questions/54734667/error-installing-geopandas-a-gdal-api-version-must-be-specified-in-anaconda
- Colorscale: https://plotly.com/python/county-choropleth/
- Colors: http://www.impactlab.org/map/#usmeas=absolute&usyear=1981-2010&gmeas=absolute&gyear=1986-2005

### See data dictionary: 

http://www.audubon.org/sites/default/files/documents/cbc_report_field_definitions_2013.pdf

In [1]:
import numpy as np
import pandas as pd
import datetime
from scipy import stats
import sys
import gzip
import shutil

# for plotting
import plotly.express as px
import plotly.offline as ply
import plotly.graph_objects as go
import plotly.figure_factory as ff

# for GIS
import requests
import urllib
from urllib.request import urlopen
import json

# add scripts folder to path
sys.path.insert(1, '../scripts')

# user import
from calcs import main_calcs
from calcs import haversine_formula

In [2]:
# File Paths
PATH_TO_PAIRED_DATA = '../data/Cloud_data/1.3-rec-connecting-fips-ecosystem-data.txt'
PATH_TO_FIPS = "../data/Cloud_data/1.3.1-ijd-circles_and_fips.csv"

# User Options
offline_fips = True    # Get FIPS codes from offline .csv file
popen = True           # Auto-open plot HTML files when generated

## Functions

In [3]:
# Function to query and get FIPS county codes.
def get_fips(lat, lon):
    # Encode parameters
    params = urllib.parse.urlencode({'latitude': lat, 'longitude': lon, 'format': 'json'})
    # Contruct request URL
    url = 'https://geo.fcc.gov/api/census/block/find?' + params

    # Get response from API
    response = requests.get(url)

    # Parse json in response
    data = response.json()

    fips = data['County']['FIPS']

    print(lat, lon, fips)

    try:
        return float(fips)
    except:
        return np.nan

In [4]:
# Convert dataframe to geojson (UN-USED BELOW)
def df_to_geojson(df, properties, lat='latitude', lon='longitude'):
    geojson = {'type':'FeatureCollection', 'features':[]}
    for _, row in df.iterrows():
        feature = {'type':'Feature',
                   'properties':{},
                   'geometry':{'type':'Point',
                               'coordinates':[]}}
        feature['geometry']['coordinates'] = [row[lon],row[lat]]
        for prop in properties:
            feature['properties'][prop] = row[prop]
        geojson['features'].append(feature)
    return geojson

## Read in Data

In [5]:
# Select subset of columns to read in
# IJD: I was running into memory issues loading the whole dataset
# Only read in certain columns
fields = ['circle_name',
          'circle_id',
          'id',
          'ui',
          'lat',              # circle
          'lon',              # circle
          'county_fips',      # circle
          'country_state',
          'count_year',
          'count_date',
          'latitude',         # noaa
          'longitude',        # noaa
          'circle_elev',      # circle
          'elevation',        # noaa
          'min_snow',         # circle
          'max_snow',         # circle
          'am_rain',          # circle
          'pm_rain',          # circle
          'max_temp',         # circle
          'min_temp',         # circle
          'temp_max_value',   # noaa
          'temp_min_value'    # noaa
          ]

In [6]:
# read in data
df_paired = pd.read_csv(PATH_TO_PAIRED_DATA,
                        compression='gzip',
                        sep='\t',
                        skipinitialspace=True,
                        usecols=fields)


Columns (62) have mixed types.Specify dtype option on import or set low_memory=False.



## Data Screening & Filtering

In [7]:
# Copied in some calculations from calcs.py
for chunk in np.array_split(df_paired, 4):
    df_paired.loc[chunk.index, 'distance'] = df_paired.loc[chunk.index, ['lat', 'lon', 'latitude', 'longitude']].apply(haversine_formula, axis=1)

    # calculate elevation difference between circles and stations
    df_paired.loc[chunk.index, 'elev_diff'] = df_paired.loc[chunk.index, 'circle_elev'] - df_paired.loc[chunk.index, 'elevation']
    df_paired.loc[chunk.index, 'elev_diff'] = df_paired.loc[chunk.index, 'elev_diff'].abs()
    
    # Convert NOAA temperatures from a tenth of a degree to degrees
    df_paired.loc[:, 'noaa_tmax_value'] = df_paired.loc[:, 'temp_max_value'] / 10.0 * 1.8 + 32.0
    df_paired.loc[:, 'noaa_tmin_value'] = df_paired.loc[:, 'temp_min_value'] / 10.0 * 1.8 + 32.0

    # Remove temperature errors
    df_paired.loc[df_paired['max_temp'] > 150.0, 'max_temp'] = np.nan
    df_paired.loc[df_paired['noaa_tmax_value'] > 150.0, 'noaa_tmax_value'] = np.nan

In [8]:
# Create a Temportary String to Merge on
df_paired['temp_key_str'] = round(df_paired['lat'],3).astype(str) + round(df_paired['lon'],3).astype(str)
print("The number of unique Lat Lon combos in the dataset is: ")
df_paired['temp_key_str'].nunique()

df_circle = df_paired[["lat", "lon", "temp_key_str", "circle_name", "county_fips"]]
print(df_circle.shape)
df_circle = df_circle.drop_duplicates("temp_key_str")
print(df_circle.shape)

The number of unique Lat Lon combos in the dataset is: 
(756378, 5)
(3848, 5)


In [9]:
# Get County FIPS codes (already ran this and saved the output to a .csv file)

if not offline_fips:
    df_circle.loc[:, 'county_fips'] = df_circle.loc[:, ['lat', 'lon']].apply(lambda x: get_fips(x['lat'], x['lon']), axis=1)
    # Save to CSV
    df_circle.to_csv(PATH_TO_FIPS, index=False)
else:
    df_circle = pd.read_csv(PATH_TO_FIPS)

# Drop existing FIPS column
df_paired.drop('county_fips', axis=1, inplace=True)
# Merge with original dataset
df_paired = pd.merge(df_paired,
                     df_circle[['temp_key_str', 'county_fips']],
                     on=['temp_key_str'],
                     how='left',
                     copy=False
                     )

# Sanity check (should be less that 2000 NaN's for county_fips
print('The number of county_fips Nan\'s is:')
print(df_paired['county_fips'].isna().sum())

The number of county_fips Nan's is:
322


## Make Plots

### Unique Circle Locations

In [10]:
# Map plot of all unique circles
fig = go.Figure(data=go.Scattergeo(
        lon = df_circle['lon'],
        lat = df_circle['lat'],
        text = df_circle['circle_name'],
        mode = 'markers',
        marker=dict(opacity=0.4,
                    color="black")
        ))
fig.update_layout(
        title = 'Unique Circle Locations',
        geo_scope='usa',
    )
ply.plot(fig, filename='../plots/circles.html', auto_open=popen)

'../plots/circles.html'

### Circle to NOAA Station Distances

In [11]:
# Plot histogram of distances
fig = go.Figure()
fig.add_trace(go.Histogram(x=df_paired['distance'],
                           nbinsx=50))
fig.update_layout(title="Histogram of Distances Between Circles and Stations",
                  template="simple_white")
fig.update_xaxes(title_text='Distance [m]')
fig.update_yaxes(title_text='Counts')
ply.plot(fig, filename='../plots/distance.html', auto_open=popen)

'../plots/distance.html'

### Number of Matched Stations

In [12]:
# Plot histogram of station matches
fig = go.Figure()
fig.add_trace(go.Histogram(x=df_paired['ui'].value_counts(),
                           xbins=dict(start=0,
                                      end=50,
                                      size=1)))
fig.update_layout(title="Histogram of NOAA Station Matches",
                  template="simple_white")
fig.update_xaxes(title_text='Number of Stations Matched')
fig.update_yaxes(title_text='Counts')
ply.plot(fig, filename='../plots/matches.html', auto_open=popen)

'../plots/matches.html'

### Elevation Differences

In [13]:
# Plot histogram of elevation difference
fig = go.Figure()
fig.add_trace(go.Histogram(x=df_paired['elev_diff'],
                           xbins=dict(start=0,
                                      end=500,
                                      size=10)))
fig.update_layout(title="Histogram of Elevation Difference Between Circles and Stations",
                  template="simple_white")
fig.update_xaxes(title_text='Elevation Change [m]')
fig.update_yaxes(title_text='Counts')
ply.plot(fig, filename='../plots/elevation.html', auto_open=popen)

'../plots/elevation.html'

### Choropleth Map from GeoJSON

In [14]:
# Download U.S. counties in JSON format
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)

In [15]:
# Subset for 2010 data
df_2010 = df_paired.loc[df_paired['count_year'] == 2010, ['lat', 'lon', 'country_state', 'max_temp', 'min_temp', 'county_fips']]
df_2010 = df_2010.dropna(axis=0, subset=['max_temp', 'county_fips'])

In [16]:
fig = px.choropleth_mapbox(df_2010, geojson=counties, locations='county_fips', color='max_temp',
                           color_continuous_scale="Reds",
                           range_color=(0, 100),
                           mapbox_style="carto-positron",
                           zoom=3, center = {"lat": 37.0902, "lon": -95.7129},
                           opacity=0.5,
                           labels={'max_temp':'Maximum Temperature'}
                          )
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
ply.plot(fig, filename='../plots/choropleth_json2.html', auto_open=popen)

'../plots/choropleth_json2.html'

In [17]:
df_paired['max_temp'].max()

93.0

### Figure Factory Choropleth

In [18]:
# Setup
fips = df_2010['county_fips'].tolist()
values = df_2010['max_temp'].tolist()

colorscale = [
    '#00ACC1',
    '#26C6DA',
    '#B2EBF2',
    "#FFF9C4",
    '#FFEE58',
    '#FBC02D',
    '#FF7043',
    '#E64A19',
]

In [19]:
fig = ff.create_choropleth(
    fips=fips, values=values, scope=['usa'],
    show_state_data=True,
    show_hover=True,
    asp = 2.9,
    title_text = 'Maximum Circle Temps - 2010',
    legend_title = 'Temperature [F]',
    binning_endpoints=[-100.0, 0.0, 10.0, 20.0, 30.0, 40.0, 50.0],
    colorscale=colorscale,
)
fig.layout.template = None

ply.plot(fig, filename='../plots/choropleth_ff.html', auto_open=popen)

'../plots/choropleth_ff.html'