### 1.0 Imports

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import os
import glob
import datetime as dt
from math import radians, cos, sin, asin, sqrt

### 2.0 Load Data

Loading in the cleaned, exported data from my data wrangling step.

In [12]:
# Cleaned trees table exported by the data wrangling step
trees_data = pd.read_csv(filepath_or_buffer='../data/data_outputs/seattle_trees.csv')

# Cleaned climate normals table exported by the data wrangling step
climate_data = pd.read_csv(filepath_or_buffer='../data/data_outputs/washington_climate_normals.csv')

### 3.0 Explore and Combine Data

The first step I have in mind is creating a field that maps each tree to its nearest weather station from my climate normals data.

#### 3.1 Tie Together The Two Dataframes Using Lat-Lon

##### 3.1.1 Define Functions

In [5]:
def dist_b_points(lat1, long1, lat2, long2, radius_km=6371):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees) with the haversine formula.
    Credit https://medium.com/analytics-vidhya/finding-nearest-pair-of-latitude-and-longitude-match-using-python-ce50d62af546

    Parameters
    ----------
    lat1, long1 : float
        Latitude and longitude of the first point, in decimal degrees.
    lat2, long2 : float
        Latitude and longitude of the second point, in decimal degrees.
    radius_km : float, optional
        Radius of the sphere in kilometers. Defaults to 6371 (earth).

    Returns
    -------
    float
        Distance between the two points in kilometers. NaN inputs
        propagate to a NaN result.
    """
    # convert decimal degrees to radians
    lat1, long1, lat2, long2 = map(radians, [lat1, long1, lat2, long2])
    # haversine formula
    dlon = long2 - long1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    # Floating-point rounding can leave `a` a hair outside [0, 1] (e.g. for
    # antipodal points), which would make sqrt/asin raise a math domain error.
    # Plain comparisons are used so that a NaN `a` is left untouched.
    if a > 1.0:
        a = 1.0
    elif a < 0.0:
        a = 0.0
    c = 2 * asin(sqrt(a))
    return radius_km * c

In [8]:
def flag_nearest(df, lat, long, field, lat_col='lat', long_col='long'):
    """
    Label the closest weather station using the results of the dist_b_points function.
    Credit https://medium.com/analytics-vidhya/finding-nearest-pair-of-latitude-and-longitude-match-using-python-ce50d62af546

    Parameters
    ----------
    df : pandas.DataFrame
        Candidate rows to search; must contain `lat_col`, `long_col` and `field`.
    lat, long : float
        Latitude and longitude (decimal degrees) of the point to match.
    field : str
        Column of `df` whose value is returned for the nearest row.
    lat_col, long_col : str, optional
        Names of the latitude / longitude columns in `df`. Default to
        'lat' and 'long'; pass e.g. 'lat-climate' / 'long-climate' when the
        columns have been renamed.

    Returns
    -------
    The value of `field` in the row of `df` with the smallest distance to
    (lat, long).

    Raises
    ------
    ValueError
        If `df` has no rows, since there is no nearest row to report.
    """
    if len(df) == 0:
        raise ValueError("flag_nearest: df has no rows to search")
    # Distance from the query point to every row of df, indexed like df
    distances = df.apply(
        lambda row: dist_b_points(lat, long, row[lat_col], row[long_col]), axis = 1)
    return df.loc[distances.idxmin(), str(field)]

##### 3.1.2 Rename Lat-Lon Fields

In [14]:
#Need a reminder of my fields
#Print the column names, non-null counts and dtypes of the trees data
trees_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162080 entries, 0 to 162079
Data columns (total 10 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   planted_date               159111 non-null  object 
 1   most_recent_observation    162075 non-null  object 
 2   common_name                161408 non-null  object 
 3   longitude_coordinate       162080 non-null  float64
 4   latitude_coordinate        162080 non-null  float64
 5   diameter_breast_height_CM  162080 non-null  float64
 6   condition                  162080 non-null  object 
 7   native                     162080 non-null  object 
 8   age_at_obs                 159106 non-null  float64
 9   condition_index            162080 non-null  float64
dtypes: float64(5), object(5)
memory usage: 12.4+ MB


In [15]:
#Print the column names, non-null counts and dtypes of the climate data
climate_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33 entries, 0 to 32
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   STATION          33 non-null     object 
 1   LATITUDE         33 non-null     float64
 2   LONGITUDE        33 non-null     float64
 3   ELEVATION        33 non-null     float64
 4   NAME             33 non-null     object 
 5   ANN-TAVG-NORMAL  33 non-null     float64
 6   ANN-TMIN-NORMAL  33 non-null     float64
 7   ANN-TMAX-NORMAL  33 non-null     float64
 8   ANN-DUTR-NORMAL  33 non-null     float64
 9   ANN-PRCP-NORMAL  32 non-null     float64
dtypes: float64(8), object(2)
memory usage: 2.7+ KB


In [16]:
#Rename lat-long fields to easily identify between dfs
#Reassign rather than using inplace=True so the cell follows the usual
#"new frame from old frame" pattern; unknown keys are ignored by rename,
#so re-running the cell is harmless.
trees_data = trees_data.rename(
    columns={'longitude_coordinate': 'long-trees', 'latitude_coordinate': 'lat-trees'})
climate_data = climate_data.rename(
    columns={'LONGITUDE': 'long-climate', 'LATITUDE': 'lat-climate'})

#Confirm the new column names on the trees data
trees_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162080 entries, 0 to 162079
Data columns (total 10 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   planted_date               159111 non-null  object 
 1   most_recent_observation    162075 non-null  object 
 2   common_name                161408 non-null  object 
 3   long-trees                 162080 non-null  float64
 4   lat-trees                  162080 non-null  float64
 5   diameter_breast_height_CM  162080 non-null  float64
 6   condition                  162080 non-null  object 
 7   native                     162080 non-null  object 
 8   age_at_obs                 159106 non-null  float64
 9   condition_index            162080 non-null  float64
dtypes: float64(5), object(5)
memory usage: 12.4+ MB
