## Data Wrangling

### 1.0 Imports

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import os
import glob
import datetime as dt

### 2.0 Load Data

#### 2.1 Trees Data
\
The first source is a dataset hosted on datadryad.org with records of over 5 million trees in 63 US cities. \
https://datadryad.org/stash/dataset/doi:10.5061/dryad.2jm63xsrf \
\
The data is split into a .csv file per city. We will only be using the 'Seattle_Final_2022-06-18.csv' file.

In [7]:
# Read the Seattle inventory CSV into a DataFrame and preview the first five rows
trees = pd.read_csv(filepath_or_buffer='data/Seattle_Final_2022-06-18.csv')
trees.head(n=5)

Unnamed: 0,planted_date,most_recent_observation,common_name,scientific_name,city,state,longitude_coordinate,latitude_coordinate,location_type,address,...,most_recent_observation_type,zipcode,neighborhood,location_name,ward,district,overhead_utility,height_M,height_binned_M,percent_population
0,07/22/1991,04/27/2019,(european) white birch,Betula pendula,Seattle,Washington,-122.28208,47.635207,no_info,1817 40th Av E,...,,,,,,,,,,
1,08/16/1990,04/27/2019,Red oak,Quercus rubra,Seattle,Washington,-122.310243,47.597548,no_info,1632 S Weller St,...,,,,,,,,,,
2,06/25/1992,04/27/2019,Apple/crabapple,Malus,Seattle,Washington,-122.355204,47.673916,no_info,300 N 62Nd St,...,,,,,,,,,,
3,07/30/1991,04/27/2019,Kwanzan flowering cherry,Prunus L.,Seattle,Washington,-122.318952,47.649141,no_info,3120 Fuhrman Av E,...,,,,,,,,,,
4,07/16/1991,04/27/2019,English (hedge) maple,Acer campestre,Seattle,Washington,-122.344731,47.613092,no_info,120 Blanchard St,...,,,,,,,,,,


In [12]:
# List every column with its non-null count and dtype
trees.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165623 entries, 0 to 165622
Data columns (total 28 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   planted_date                      162654 non-null  object 
 1   most_recent_observation           165623 non-null  object 
 2   common_name                       164947 non-null  object 
 3   scientific_name                   164903 non-null  object 
 4   city                              165623 non-null  object 
 5   state                             165623 non-null  object 
 6   longitude_coordinate              165623 non-null  float64
 7   latitude_coordinate               165623 non-null  float64
 8   location_type                     165623 non-null  object 
 9   address                           165388 non-null  object 
 10  diameter_breast_height_CM         165623 non-null  float64
 11  condition                         165623 non-null  o

##### 2.1.1 Update Datatypes

Some columns have datatypes that need updating, so I'm going to save each column's dtype into a `{column: dtype}` dictionary, update the values that need changing, and pass the dictionary back to the DataFrame to cast the columns.

In [14]:
# Build a {column name: dtype name} mapping from the current frame.
# trees.dtypes is already a Series indexed by column name, so it is iterated
# directly rather than round-tripping through a transposed DataFrame and
# to_dict('list'); the resulting dictionary is the same.
field_dict = {column: str(dtype) for column, dtype in trees.dtypes.items()}
field_dict

{'planted_date': 'object',
 'most_recent_observation': 'object',
 'common_name': 'object',
 'scientific_name': 'object',
 'city': 'object',
 'state': 'object',
 'longitude_coordinate': 'float64',
 'latitude_coordinate': 'float64',
 'location_type': 'object',
 'address': 'object',
 'diameter_breast_height_CM': 'float64',
 'condition': 'object',
 'native': 'object',
 'diameter_breast_height_binned_CM': 'object',
 'greater_metro': 'object',
 'city_ID': 'float64',
 'tree_ID': 'float64',
 'retired_date': 'float64',
 'most_recent_observation_type': 'float64',
 'zipcode': 'float64',
 'neighborhood': 'float64',
 'location_name': 'float64',
 'ward': 'float64',
 'district': 'float64',
 'overhead_utility': 'float64',
 'height_M': 'float64',
 'height_binned_M': 'float64',
 'percent_population': 'float64'}

In [22]:
# Override the inferred dtypes with the ones I want standardized.
# - Columns pandas inferred as float64 but that should not be treated as
#   numeric measurements are switched to 'object'.
# - The three date columns are switched to 'datetime64[ns]'.
# Each column appears exactly once: a key repeated inside a dict literal
# silently keeps only its last value, so 'retired_date' is listed only as a
# datetime column.
field_dict.update({
 'city_ID': 'object',
 'tree_ID': 'object',
 'most_recent_observation_type': 'object',
 'zipcode': 'object',
 'neighborhood': 'object',
 'location_name': 'object',
 'ward': 'object',
 'district': 'object',
 'overhead_utility': 'object',
 'height_binned_M': 'object',
 'planted_date': 'datetime64[ns]',
 'most_recent_observation': 'datetime64[ns]',
 'retired_date': 'datetime64[ns]'})

In [23]:
# Validate updates: display the dictionary to confirm the dtype overrides were applied
field_dict

{'planted_date': 'datetime64[ns]',
 'most_recent_observation': 'datetime64[ns]',
 'common_name': 'object',
 'scientific_name': 'object',
 'city': 'object',
 'state': 'object',
 'longitude_coordinate': 'float64',
 'latitude_coordinate': 'float64',
 'location_type': 'object',
 'address': 'object',
 'diameter_breast_height_CM': 'float64',
 'condition': 'object',
 'native': 'object',
 'diameter_breast_height_binned_CM': 'object',
 'greater_metro': 'object',
 'city_ID': 'object',
 'tree_ID': 'object',
 'retired_date': 'datetime64[ns]',
 'most_recent_observation_type': 'object',
 'zipcode': 'object',
 'neighborhood': 'object',
 'location_name': 'object',
 'ward': 'object',
 'district': 'object',
 'overhead_utility': 'object',
 'height_M': 'float64',
 'height_binned_M': 'object',
 'percent_population': 'float64'}

In [27]:
# Update dtypes: cast each column to its target type from field_dict, then preview
trees = trees.astype(dtype=field_dict)
trees.head(n=5)

Unnamed: 0,planted_date,most_recent_observation,common_name,scientific_name,city,state,longitude_coordinate,latitude_coordinate,location_type,address,...,most_recent_observation_type,zipcode,neighborhood,location_name,ward,district,overhead_utility,height_M,height_binned_M,percent_population
0,1991-07-22,2019-04-27,(european) white birch,Betula pendula,Seattle,Washington,-122.28208,47.635207,no_info,1817 40th Av E,...,,,,,,,,,,
1,1990-08-16,2019-04-27,Red oak,Quercus rubra,Seattle,Washington,-122.310243,47.597548,no_info,1632 S Weller St,...,,,,,,,,,,
2,1992-06-25,2019-04-27,Apple/crabapple,Malus,Seattle,Washington,-122.355204,47.673916,no_info,300 N 62Nd St,...,,,,,,,,,,
3,1991-07-30,2019-04-27,Kwanzan flowering cherry,Prunus L.,Seattle,Washington,-122.318952,47.649141,no_info,3120 Fuhrman Av E,...,,,,,,,,,,
4,1991-07-16,2019-04-27,English (hedge) maple,Acer campestre,Seattle,Washington,-122.344731,47.613092,no_info,120 Blanchard St,...,,,,,,,,,,


In [28]:
# Descriptive statistics (count, mean, std, min, quartiles, max) after the dtype update
trees.describe()

Unnamed: 0,longitude_coordinate,latitude_coordinate,diameter_breast_height_CM,height_M,percent_population
count,165623.0,165623.0,165623.0,0.0,0.0
mean,-122.336843,47.62593,21.459268,,
std,0.038161,0.056614,18.750966,,
min,-122.419024,47.495836,0.0,,
25%,-122.368956,47.57606,7.62,,
50%,-122.335076,47.636938,15.24,,
75%,-122.306104,47.673012,30.48,,
max,-122.238583,47.734069,330.2,,


##### 2.1.2 Explore Missing Values