# Updating iNaturalist observation data
### Cleans and concatenates old and new exports
<br>Periodically, new data will have to be exported and added to the model.
<br>current update: through sep 30, 2022.
<br>Wildflower observation data comes from the iNaturalist export tool: https://www.inaturalist.org/observations/export   
<br>
[iNaturalist raw exports available here](https://github.com/Floydworks/WildflowerFinder_Phenology_Tool/tree/main/iNaturalist_observations_files)



______________________________

**Import the necessary libraries:**

In [1]:
#!pip install pyinaturalist
#!pip install pandas

from pyinaturalist.node_api import get_all_observations
import pandas as pd
import numpy as np
from datetime import date, datetime
import os

print("Libraries imported!")

Libraries imported!


# Information about the Parks

In [2]:
#PARK DICTIONARY 
park_info_dict =  { "Hearst San Simeon" :{"size(mi2)": "3.608","place_id":"3582", "region": "south coast", 'lat_long':(), 'station':(),'station_id':(), 'dataset':('none')}, 
                   "Pinnacles National Park": {"size(mi2)": "41.57","place_id":"5737","region": "south inland", 'lat_long':(), 'stations':(),'station_id':(), 'dataset':('train')}, 
                   "Tilden Regional Park": {"size(mi2)": "3.25","place_id":"3523","region": "east bay", 'lat_long':(37.894647, -122.241635), 'stations':('Berkeley'),'station_id':('USC00040693'), 'dataset':('train')} ,
#                  "Tilden Regional Park": {"size(mi2)": "3.25","place_id":"3523","region": "east bay", 'lat_long':(37.894647, -122.241635), 'stations':('Oakland'),'station_id':('USW00023230'), 'dataset':('train')} ,
                   "Briones Regional Park": {"size(mi2)": "9.56","place_id":"3706","region": "east bay", 'lat_long':(37.935804, -122.137413), 'stations':('Concord'),'station_id':('USW00023254'), 'dataset':('train')} ,
                   "Sunol Regional Wilderness": {"size(mi2)": "3.25","place_id":"3456","region": "east bay", 'lat_long':(37.510183, -121.82855), 'stations':('SanJose', 'Livermore'),'station_id':('USW00023293','USW00023285'), 'dataset':('train')}, 
                   "Carrizo National Monument": {"size(mi2)": "385.64","place_id":"","region": "south inland", 'lat_long':(), 'stations':(),'station_id':(), 'dataset':('none')} ,
                   "Tahoe National Forest": {"size(mi2)": "1361.71","place_id":"5879","region": "north inland", 'lat_long':(), 'stations':(),'station_id':(), 'dataset':('none')} ,
                   "Walker Canyon Ecological Reserve": {"size(mi2)": ".77","place_id":"4699","region": "south inland", 'lat_long':(), 'stations':(),'station_id':(), 'dataset':('none')}, 
                   "Antelope Valley Poppy": {"size(mi2)": "2.8","place_id":"","region": "", 'lat_long':(), 'stations':(),'station_id':(), 'dataset':('none')} ,
                   "Chino Hills Sate Park": {"size(mi2)": "22.03","place_id":"3996","region": "south inland", 'lat_long':(), 'stations':(),'station_id':(), 'dataset':('none')},
                   "Garin Regional Park": {"size(mi2)": "9.06","place_id":"5199","region": "east bay", 'lat_long':(37.63544, -122.02068), 'stations':('Hayward'),'station_id':('USW00093228'), 'dataset':('train')},
#                   "Las Trampas Regional Wilderness Park": {"size(mi2)": "9.03","place_id":"6196","region": "east bay", 'lat_long':(37.82, -122.05), 'stations':('LasTrampas (temp)','Danville (precip)'),'station_id':('USR0000CTRA','US1CACC0045'), 'dataset':('test')},
                   "Pleasanton Ridge Regional Park": {"size(mi2)": "14.20","place_id":"5777","region": "east bay", 'lat_long':(37.615409, -121.88456), 'stations':('Livermore', 'Hayward'),'station_id':('USW00023285','USW00093228'), 'dataset':('train')},
                   "Anthony Chabot Regional Park": {"size(mi2)": "5.1781","place_id":"5239","region": "east bay", 'lat_long':(37.766, -122.119), 'stations':('Oakland'),'station_id':('USW00023230'), 'dataset':('train')},
#                   "Black Diamond Mines Regional Preserve": {"size(mi2)": "9.38","place_id":"6102","region": "east bay", 'lat_long':(37.950278, -121.856944), 'stations':('Antioch'),'station_id':('USC00040232'), 'dataset':('test')},
                   "Joseph D Grant County Park": {"size(mi2)": "14.9266","place_id":"5339","region": "east bay", 'lat_long':(37.345495, -121.68717), 'stations':('SanJose', 'MtHamilton'),'station_id':('USW00023293','USC00045933'), 'dataset':('train')},
#                   "Morgan Territory Regional Preserve": {"size(mi2)": "8.17","place_id":"5582","region": "east bay", 'lat_long':(37.818311, -121.795883), 'stations':('MalloryRidge (temp)','Brentwood (precip)'),'station_id':(''), 'dataset':('test')},
                    "Mt Diablo State Park": {"size(mi2)": "31.25","place_id":"5586","region": "east bay", 'lat_long':(37.881698, -121.914155), 'stations':('MtDiablo'),'station_id':('USC00045915'), 'dataset':('test')}
                  }

#"_": {"size(mi2)": "_","place_id":"_","region": "_", 'lat_long':()}

#create dataframe of park information
park_info_df = pd.DataFrame.from_dict(park_info_dict, orient='index')
park_info_df = park_info_df.drop(['station'], axis=1)

park_info_df[park_info_df['region']=='east bay']

Unnamed: 0,size(mi2),place_id,region,lat_long,station_id,dataset,stations
Tilden Regional Park,3.25,3523,east bay,"(37.894647, -122.241635)",USC00040693,train,Berkeley
Briones Regional Park,9.56,3706,east bay,"(37.935804, -122.137413)",USW00023254,train,Concord
Sunol Regional Wilderness,3.25,3456,east bay,"(37.510183, -121.82855)","(USW00023293, USW00023285)",train,"(SanJose, Livermore)"
Garin Regional Park,9.06,5199,east bay,"(37.63544, -122.02068)",USW00093228,train,Hayward
Pleasanton Ridge Regional Park,14.2,5777,east bay,"(37.615409, -121.88456)","(USW00023285, USW00093228)",train,"(Livermore, Hayward)"
Anthony Chabot Regional Park,5.1781,5239,east bay,"(37.766, -122.119)",USW00023230,train,Oakland
Joseph D Grant County Park,14.9266,5339,east bay,"(37.345495, -121.68717)","(USW00023293, USC00045933)",train,"(SanJose, MtHamilton)"
Mt Diablo State Park,31.25,5586,east bay,"(37.881698, -121.914155)",USC00045915,test,MtDiablo


# Wildflower observations
**iNaturalist Observations using Export Tool:**
<br> Data is acquired [here](https://www.inaturalist.org/observations/export)
<br>Observations are associated with a URL and URLs of plant photos used in labeling
<br>Flowering PLants: taxon_id = 47125 
<br>Repository of observation data used in this study is [here]()

## new data, year 2022
### Will create some duplicate values

In [3]:
#files available at https://github.com/Floydworks/Capstone2_Wildflower_Phenology/tree/main/NOAA_climate_files

## get current directory
folder_path = '/Users/sandidge/Desktop/Python_Projects/Springboard_coursework/Capstone2_Wildflowers/iNat_2022/export_data/new_sep_2022'
#folder_path = 'your local file path to folder with downloaded data'

## list all file available 
all_files = os.listdir(folder_path)
#print(all_files)

## only store .csv filenames
csv_files = list(filter(lambda f: f.endswith('.csv'), all_files))
#print(csv_files)

## create a new list to store filesnames with no .csv extension
file_names = []
for x in range(len(csv_files)):
    file_names.append(csv_files[x].split('.')[0])
file_names

In [4]:

#create empty list to store dataframe names
df_new_names = []
print(type(df_new_names))

## Loop through to assign dataframe names
for file in file_names:
    final_df = file+"_df"
    print("Dataframe name : "+final_df, type(final_df))
    df_new_names.append(final_df)      #creat list of df names 'df_names'
   
    filename = file+".csv"
    ## In python to assign a string as a dataframe name, use globals()
    globals()[final_df] = pd.read_csv(r'/Users/sandidge/Desktop/Python_Projects/Springboard_coursework/Capstone2_Wildflowers/iNat_2022/export_data/new_sep_2022/'+filename)

    
print(df_new_names)

<class 'list'>
Dataframe name : pleasantonridge2022_df <class 'str'>
Dataframe name : tilden2022_df <class 'str'>
Dataframe name : josephdgrant2022_df <class 'str'>
Dataframe name : briones2022_df <class 'str'>
Dataframe name : garin2022_df <class 'str'>
Dataframe name : sunol2022_df <class 'str'>
Dataframe name : mtdiablo_sep2022_df <class 'str'>
Dataframe name : anthonychabot2022_df <class 'str'>
['pleasantonridge2022_df', 'tilden2022_df', 'josephdgrant2022_df', 'briones2022_df', 'garin2022_df', 'sunol2022_df', 'mtdiablo_sep2022_df', 'anthonychabot2022_df']


In [5]:
#add park name column to each df to identify location

#parks in training set
sunol2022_df['park'], sunol2022_df['region'] = ['Sunol', 'east bay']
briones2022_df['park'], briones2022_df['region'] = ['Briones', 'east bay']
tilden2022_df['park'],tilden2022_df['region'] = ['Tilden', 'east bay']
garin2022_df['park'], garin2022_df['region'] = ['Garin', 'east bay']
anthonychabot2022_df['park'],anthonychabot2022_df['region'] = ['AnthonyChabot', 'east bay']
josephdgrant2022_df['park'],josephdgrant2022_df['region'] = ['JDGrant', 'east bay']
pleasantonridge2022_df['park'],pleasantonridge2022_df['region'] = ['PleasantonRidge', 'east bay']

#parks in test set
mtdiablo_sep2022_df['park'], mtdiablo_sep2022_df['region'] = ['MtDiablo', 'east bay']



In [6]:
#concatenate the observations by parks dataframes
df_new = pd.concat([sunol2022_df, 
                briones2022_df, 
                tilden2022_df,
                anthonychabot2022_df,
                garin2022_df,
                josephdgrant2022_df,
                pleasantonridge2022_df,
                mtdiablo_sep2022_df, 
                ]
                )
print(df_new.shape)

#verify concatenation
if int(len(df_new)) == int(len(sunol2022_df)+
                       len(briones2022_df)+
                       len(tilden2022_df)+
                       len(anthonychabot2022_df)+
                       len(garin2022_df)+
                       len(josephdgrant2022_df)+
                       len(pleasantonridge2022_df)+
                       len(mtdiablo_sep2022_df)):
    print('Concatenation seems correct!')

(4028, 31)
Concatenation seems correct!


In [7]:
df_new['park'].unique()
print(df_new.columns)

Index(['id', 'observed_on', 'time_observed_at', 'time_zone', 'quality_grade',
       'url', 'image_url', 'description', 'num_identification_agreements',
       'num_identification_disagreements', 'captive_cultivated', 'place_guess',
       'latitude', 'longitude', 'geoprivacy', 'place_town_name',
       'place_county_name', 'species_guess', 'scientific_name', 'common_name',
       'iconic_taxon_name', 'taxon_id', 'taxon_family_name',
       'taxon_genus_name', 'taxon_species_name', 'park', 'region',
       'positional_accuracy', 'taxon_geoprivacy', 'observed_on_string',
       'created_at'],
      dtype='object')


In [8]:
#loop through parks and output first observation date, last observation date, number of observations

park_names = list(df_new['park'].unique())
#park_names

for p in park_names:
    df_temp = df_new[df_new['park']==p]
    name = p
    num_obs = len(df_temp)
    df_temp['observed_on'] = pd.to_datetime(df_temp['observed_on'])
    start = df_temp['observed_on'].min()
    end = df_temp['observed_on'].max()
    print('Park name: '+ str(p) +', '+'Num. of Obs = '+ str(num_obs))
    print(df_temp['observed_on'].min())
    print(df_temp['observed_on'].max())

Park name: Sunol, Num. of Obs = 731
2022-01-01 00:00:00
2022-09-27 00:00:00
Park name: Briones, Num. of Obs = 843
2022-01-09 00:00:00
2022-09-25 00:00:00
Park name: Tilden, Num. of Obs = 1313
2022-01-01 00:00:00
2022-09-26 00:00:00
Park name: AnthonyChabot, Num. of Obs = 543
2022-01-04 00:00:00
2022-09-30 00:00:00
Park name: Garin, Num. of Obs = 40
2022-01-30 00:00:00
2022-09-24 00:00:00
Park name: JDGrant, Num. of Obs = 400
2022-01-09 00:00:00
2022-09-17 00:00:00
Park name: PleasantonRidge, Num. of Obs = 114
2022-01-01 00:00:00
2022-09-26 00:00:00
Park name: MtDiablo, Num. of Obs = 44
2022-09-01 00:00:00
2022-09-30 00:00:00


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp['observed_on'] = pd.to_datetime(df_temp['observed_on'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp['observed_on'] = pd.to_datetime(df_temp['observed_on'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp['observed_on'] = pd.to_datetime(df_temp['observed_on'])
A value is tr

In [9]:
#see first rows of the concatenated dataframe of all new observations
df_new.head()

Unnamed: 0,id,observed_on,time_observed_at,time_zone,quality_grade,url,image_url,description,num_identification_agreements,num_identification_disagreements,...,taxon_id,taxon_family_name,taxon_genus_name,taxon_species_name,park,region,positional_accuracy,taxon_geoprivacy,observed_on_string,created_at
0,104188607,1/1/22,2022-01-01 21:16:00 UTC,Pacific Time (US & Canada),research,https://www.inaturalist.org/observations/10418...,https://inaturalist-open-data.s3.amazonaws.com...,,1,0,...,53359,Asteraceae,Baccharis,Baccharis pilularis,Sunol,east bay,,,,
1,104188609,1/1/22,2022-01-01 21:40:00 UTC,Pacific Time (US & Canada),research,https://www.inaturalist.org/observations/10418...,https://inaturalist-open-data.s3.amazonaws.com...,,1,0,...,56121,Brassicaceae,Capsella,Capsella bursa-pastoris,Sunol,east bay,,,,
2,104667115,1/8/22,2022-01-08 12:46:02 UTC,UTC,research,https://www.inaturalist.org/observations/10466...,https://static.inaturalist.org/photos/17543114...,,2,0,...,143799,Viburnaceae,Sambucus,Sambucus cerulea,Sunol,east bay,,,,
3,104681782,1/9/22,2022-01-09 11:15:26 UTC,UTC,research,https://www.inaturalist.org/observations/10468...,https://inaturalist-open-data.s3.amazonaws.com...,,1,0,...,49651,Brassicaceae,Cardamine,Cardamine californica,Sunol,east bay,,,,
4,104690215,1/8/22,2022-01-08 23:06:55 UTC,Pacific Time (US & Canada),research,https://www.inaturalist.org/observations/10469...,https://inaturalist-open-data.s3.amazonaws.com...,large population amidst cattle grazing slopes,1,0,...,400267,Montiaceae,Calandrinia,Calandrinia menziesii,Sunol,east bay,,,,


In [10]:
#add two additional columns with genus and species separated
df_new[['genus', 'species']] = df_new['taxon_species_name'].str.split(' ', expand=True)

In [11]:
#add datetime column
df_new['datetime'] = pd.to_datetime(df_new['observed_on'])

In [12]:
print(df_new['park'].unique())
df_new.head()

['Sunol' 'Briones' 'Tilden' 'AnthonyChabot' 'Garin' 'JDGrant'
 'PleasantonRidge' 'MtDiablo']


Unnamed: 0,id,observed_on,time_observed_at,time_zone,quality_grade,url,image_url,description,num_identification_agreements,num_identification_disagreements,...,taxon_species_name,park,region,positional_accuracy,taxon_geoprivacy,observed_on_string,created_at,genus,species,datetime
0,104188607,1/1/22,2022-01-01 21:16:00 UTC,Pacific Time (US & Canada),research,https://www.inaturalist.org/observations/10418...,https://inaturalist-open-data.s3.amazonaws.com...,,1,0,...,Baccharis pilularis,Sunol,east bay,,,,,Baccharis,pilularis,2022-01-01
1,104188609,1/1/22,2022-01-01 21:40:00 UTC,Pacific Time (US & Canada),research,https://www.inaturalist.org/observations/10418...,https://inaturalist-open-data.s3.amazonaws.com...,,1,0,...,Capsella bursa-pastoris,Sunol,east bay,,,,,Capsella,bursa-pastoris,2022-01-01
2,104667115,1/8/22,2022-01-08 12:46:02 UTC,UTC,research,https://www.inaturalist.org/observations/10466...,https://static.inaturalist.org/photos/17543114...,,2,0,...,Sambucus cerulea,Sunol,east bay,,,,,Sambucus,cerulea,2022-01-08
3,104681782,1/9/22,2022-01-09 11:15:26 UTC,UTC,research,https://www.inaturalist.org/observations/10468...,https://inaturalist-open-data.s3.amazonaws.com...,,1,0,...,Cardamine californica,Sunol,east bay,,,,,Cardamine,californica,2022-01-09
4,104690215,1/8/22,2022-01-08 23:06:55 UTC,Pacific Time (US & Canada),research,https://www.inaturalist.org/observations/10469...,https://inaturalist-open-data.s3.amazonaws.com...,large population amidst cattle grazing slopes,1,0,...,Calandrinia menziesii,Sunol,east bay,,,,,Calandrinia,menziesii,2022-01-08


In [13]:
print(df_new.isna().sum())

id                                     0
observed_on                            0
time_observed_at                       2
time_zone                              0
quality_grade                         44
url                                    0
image_url                              0
description                         3876
num_identification_agreements          0
num_identification_disagreements       0
captive_cultivated                     0
place_guess                            0
latitude                               0
longitude                              0
geoprivacy                          4028
place_town_name                     3818
place_county_name                      0
species_guess                         10
scientific_name                        0
common_name                           28
iconic_taxon_name                      0
taxon_id                               0
taxon_family_name                      0
taxon_genus_name                       1
taxon_species_na

In [14]:
#get df with just columns you need
df_new = df_new[['id', 'observed_on', 'datetime', 'park', 'region', 'latitude', 'longitude', 
         'taxon_species_name', 'genus', 'species', 'url', 'image_url']]

In [15]:
#rename temp cols
df_new.rename(columns = {'observed_on':'date', 'park':'park', 'region':'region','latitude':'latitude', 'longitude':'longitude', 
                           'taxon_species_name':'genus_species', 'genus':'genus', 'species':'species', 
                           'url':'url', 'image_url':'image_url'}, inplace = True)


In [16]:
df_new.head()

Unnamed: 0,id,date,datetime,park,region,latitude,longitude,genus_species,genus,species,url,image_url
0,104188607,1/1/22,2022-01-01,Sunol,east bay,37.530981,-121.819691,Baccharis pilularis,Baccharis,pilularis,https://www.inaturalist.org/observations/10418...,https://inaturalist-open-data.s3.amazonaws.com...
1,104188609,1/1/22,2022-01-01,Sunol,east bay,37.52706,-121.827025,Capsella bursa-pastoris,Capsella,bursa-pastoris,https://www.inaturalist.org/observations/10418...,https://inaturalist-open-data.s3.amazonaws.com...
2,104667115,1/8/22,2022-01-08,Sunol,east bay,37.523395,-121.833219,Sambucus cerulea,Sambucus,cerulea,https://www.inaturalist.org/observations/10466...,https://static.inaturalist.org/photos/17543114...
3,104681782,1/9/22,2022-01-09,Sunol,east bay,37.520038,-121.822708,Cardamine californica,Cardamine,californica,https://www.inaturalist.org/observations/10468...,https://inaturalist-open-data.s3.amazonaws.com...
4,104690215,1/8/22,2022-01-08,Sunol,east bay,37.509616,-121.824145,Calandrinia menziesii,Calandrinia,menziesii,https://www.inaturalist.org/observations/10469...,https://inaturalist-open-data.s3.amazonaws.com...


## old data: Sep. 2017 through various dates


In [37]:
#files available at https://github.com/Floydworks/Capstone2_Wildflower_Phenology/tree/main/NOAA_climate_files

## get current directory
folder_path = '/Users/sandidge/Desktop/Python_Projects/Springboard_coursework/Capstone2_Wildflowers/iNat_2022/export_data/'
#folder_path = 'your local file path to folder with downloaded data'

## list all file available 
all_files = os.listdir(folder_path)
#print(all_files)

## only store .csv filenames
csv_files = list(filter(lambda f: f.endswith('.csv'), all_files))
#print(csv_files)

## create a new list to store filesnames with no .csv extension
file_names = []
for x in range(len(csv_files)):
    file_names.append(csv_files[x].split('.')[0])
file_names

### create dataframes for each weather station dataset

In [38]:

#create empty list to store dataframe names
df_old_names = []
print(type(df_old_names))

## Loop through to assign dataframe names
for file in file_names:
    final_df = file+"_df"
    print("Dataframe name : "+final_df, type(final_df))
    df_old_names.append(final_df)      #creat list of df names 'df_names'
   
    filename = file+".csv"
    ## In python to assign a string as a dataframe name, use globals()
    globals()[final_df] = pd.read_csv(r'/Users/sandidge/Desktop/Python_Projects/Springboard_coursework/Capstone2_Wildflowers/iNat_2022/export_data/'+filename)

    
print(df_old_names)

<class 'list'>
Dataframe name : sunol_June25_247264_df <class 'str'>
Dataframe name : anthonychabot_sep15_254976_df <class 'str'>
Dataframe name : briones_June25_247265_df <class 'str'>
Dataframe name : mt_diablo_19sep22_df <class 'str'>
Dataframe name : josephdgrant_mar17_254980_df <class 'str'>
Dataframe name : tilden_June25_247275_df <class 'str'>
Dataframe name : garin_oct16_254955_df <class 'str'>
Dataframe name : pleasantonridge_sep15_254986_df <class 'str'>
['sunol_June25_247264_df', 'anthonychabot_sep15_254976_df', 'briones_June25_247265_df', 'mt_diablo_19sep22_df', 'josephdgrant_mar17_254980_df', 'tilden_June25_247275_df', 'garin_oct16_254955_df', 'pleasantonridge_sep15_254986_df']


In [39]:
#add park name column to each df to identify location

#parks in training set
sunol_June25_247264_df['park'], sunol_June25_247264_df['region'] = ['Sunol', 'east bay']
briones_June25_247265_df['park'], briones_June25_247265_df['region'] = ['Briones', 'east bay']
tilden_June25_247275_df['park'],tilden_June25_247275_df['region'] = ['Tilden', 'east bay']
garin_oct16_254955_df['park'], garin_oct16_254955_df['region'] = ['Garin', 'east bay']
anthonychabot_sep15_254976_df['park'],anthonychabot_sep15_254976_df['region'] = ['AnthonyChabot', 'east bay']
josephdgrant_mar17_254980_df['park'],josephdgrant_mar17_254980_df['region'] = ['JDGrant', 'east bay']
pleasantonridge_sep15_254986_df['park'],pleasantonridge_sep15_254986_df['region'] = ['PleasantonRidge', 'east bay']

#parks in test set
mt_diablo_19sep22_df['park'], mt_diablo_19sep22_df['region'] = ['MtDiablo', 'east bay']


In [40]:
#concatenate the observations by parks dataframes
df_old = pd.concat([sunol_June25_247264_df, 
                briones_June25_247265_df, 
                tilden_June25_247275_df,
                anthonychabot_sep15_254976_df,
                garin_oct16_254955_df,
                josephdgrant_mar17_254980_df,
                pleasantonridge_sep15_254986_df,
                mt_diablo_19sep22_df, 
                ]
                )
print(df_old.shape)

#verify concatenation
if int(len(df_old)) == int(len(sunol_June25_247264_df)+
                       len(briones_June25_247265_df)+
                       len(tilden_June25_247275_df)+
                       len(anthonychabot_sep15_254976_df)+
                       len(garin_oct16_254955_df)+
                       len(josephdgrant_mar17_254980_df)+
                       len(pleasantonridge_sep15_254986_df)+
                       len(mt_diablo_19sep22_df)):
    print('Concatenation seems correct!')

(44198, 29)
Concatenation seems correct!


In [41]:
df_old['park'].unique()

In [42]:
df_old.columns

In [43]:
#loop through parks and output first observation date, last observation date, number of observations

park_names = list(df_old['park'].unique())
#park_names

for p in park_names:
    df_temp = df_old[df_old['park']==p]
    name = p
    num_obs = len(df_temp)
    df_temp['observed_on'] = pd.to_datetime(df_temp['observed_on'])
    start = df_temp['observed_on'].min()
    end = df_temp['observed_on'].max()
    print('Park name: '+ str(p) +', '+'Num. of Obs = '+ str(num_obs))
    print(df_temp['observed_on'].min())
    print(df_temp['observed_on'].max())
   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp['observed_on'] = pd.to_datetime(df_temp['observed_on'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp['observed_on'] = pd.to_datetime(df_temp['observed_on'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp['observed_on'] = pd.to_datetime(df_temp['observed_on'])
A value is tr

Park name: Sunol, Num. of Obs = 2812
2017-10-03 00:00:00
2022-06-21 00:00:00
Park name: Briones, Num. of Obs = 3548
2017-10-07 00:00:00
2022-06-23 00:00:00
Park name: Tilden, Num. of Obs = 5703
2017-10-03 00:00:00
2022-06-23 00:00:00
Park name: AnthonyChabot, Num. of Obs = 2994
2015-09-30 00:00:00
2022-07-22 00:00:00
Park name: Garin, Num. of Obs = 444
2016-11-24 00:00:00
2022-06-15 00:00:00
Park name: JDGrant, Num. of Obs = 2519
2015-10-18 00:00:00
2022-07-16 00:00:00
Park name: PleasantonRidge, Num. of Obs = 685
2016-02-06 00:00:00
2022-06-08 00:00:00
Park name: MtDiablo, Num. of Obs = 25493
2016-10-01 00:00:00
2022-09-17 00:00:00


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp['observed_on'] = pd.to_datetime(df_temp['observed_on'])


In [44]:
#add two additional columns with genus and species separated
df_old[['genus', 'species']] = df_old['taxon_species_name'].str.split(' ', expand=True)

In [45]:
#make datetime column
df_old['datetime'] = pd.to_datetime(df_old['observed_on'])
df_old = df_old[df_old['datetime']>'2017-09-30']

In [46]:
print(df_old['park'].unique())
df_old.head()

['Sunol' 'Briones' 'Tilden' 'AnthonyChabot' 'Garin' 'JDGrant'
 'PleasantonRidge' 'MtDiablo']


Unnamed: 0,id,observed_on,time_observed_at,time_zone,quality_grade,url,image_url,description,num_identification_agreements,num_identification_disagreements,...,taxon_id,taxon_family_name,taxon_genus_name,taxon_species_name,park,region,observed_on_string,genus,species,datetime
0,8246725,10/3/17,2017-10-03 20:46:28 UTC,Pacific Time (US & Canada),research,https://www.inaturalist.org/observations/8246725,https://inaturalist-open-data.s3.amazonaws.com...,,1,0,...,47850,Fagaceae,Quercus,Quercus agrifolia,Sunol,east bay,,Quercus,agrifolia,2017-10-03
1,8246777,10/3/17,2017-10-03 20:47:08 UTC,Pacific Time (US & Canada),research,https://www.inaturalist.org/observations/8246777,https://inaturalist-open-data.s3.amazonaws.com...,,1,0,...,53351,Betulaceae,Alnus,Alnus rhombifolia,Sunol,east bay,,Alnus,rhombifolia,2017-10-03
2,8246800,10/3/17,2017-10-03 20:47:30 UTC,Pacific Time (US & Canada),research,https://www.inaturalist.org/observations/8246800,https://inaturalist-open-data.s3.amazonaws.com...,,1,0,...,58231,Platanaceae,Platanus,Platanus racemosa,Sunol,east bay,,Platanus,racemosa,2017-10-03
3,8246813,10/3/17,2017-10-03 20:48:23 UTC,Pacific Time (US & Canada),research,https://www.inaturalist.org/observations/8246813,https://inaturalist-open-data.s3.amazonaws.com...,,1,0,...,54502,Juglandaceae,Juglans,Juglans hindsii,Sunol,east bay,,Juglans,hindsii,2017-10-03
4,8246824,10/3/17,2017-10-03 20:48:55 UTC,Pacific Time (US & Canada),research,https://www.inaturalist.org/observations/8246824,https://inaturalist-open-data.s3.amazonaws.com...,,1,0,...,53445,Rosaceae,Rubus,Rubus ursinus,Sunol,east bay,,Rubus,ursinus,2017-10-03


In [47]:
print(df_old.isna().sum())

id                                      0
observed_on                             0
time_observed_at                      363
time_zone                               0
quality_grade                           0
url                                     0
image_url                               0
description                         39160
num_identification_agreements           0
num_identification_disagreements        0
captive_cultivated                      0
place_guess                             0
latitude                                0
longitude                               0
positional_accuracy                 10364
geoprivacy                          41686
place_town_name                     40358
place_county_name                       0
species_guess                         120
scientific_name                         0
common_name                           491
iconic_taxon_name                       0
taxon_id                                0
taxon_family_name                 

In [48]:
#get df with just columns you need
df_old = df_old[['id', 'observed_on', 'datetime', 'park', 'region', 'latitude', 'longitude', 
         'taxon_species_name', 'genus', 'species', 'url', 'image_url']]

In [49]:
#rename temp cols
df_old.rename(columns = {'observed_on':'date', 'park':'park', 'region':'region','latitude':'latitude', 'longitude':'longitude', 
                           'taxon_species_name':'genus_species', 'genus':'genus', 'species':'species', 
                           'url':'url', 'image_url':'image_url'}, inplace = True)
#df.head()

In [35]:
df_old.head()

Unnamed: 0,id,date,datetime,park,region,latitude,longitude,genus_species,genus,species,url,image_url
0,8246725,10/3/17,2017-10-03,Sunol,east bay,37.515778,-121.830642,Quercus agrifolia,Quercus,agrifolia,https://www.inaturalist.org/observations/8246725,https://inaturalist-open-data.s3.amazonaws.com...
1,8246777,10/3/17,2017-10-03,Sunol,east bay,37.515892,-121.830588,Alnus rhombifolia,Alnus,rhombifolia,https://www.inaturalist.org/observations/8246777,https://inaturalist-open-data.s3.amazonaws.com...
2,8246800,10/3/17,2017-10-03,Sunol,east bay,37.515688,-121.829925,Platanus racemosa,Platanus,racemosa,https://www.inaturalist.org/observations/8246800,https://inaturalist-open-data.s3.amazonaws.com...
3,8246813,10/3/17,2017-10-03,Sunol,east bay,37.515797,-121.830467,Juglans hindsii,Juglans,hindsii,https://www.inaturalist.org/observations/8246813,https://inaturalist-open-data.s3.amazonaws.com...
4,8246824,10/3/17,2017-10-03,Sunol,east bay,37.516022,-121.830453,Rubus ursinus,Rubus,ursinus,https://www.inaturalist.org/observations/8246824,https://inaturalist-open-data.s3.amazonaws.com...


# Concatenate the old and new datasets

In [50]:
#df_new.head()
df_new.shape

In [51]:
#df_old.head()
df_old.shape

In [52]:
df_updated = pd.concat([df_new, df_old])

print(df_updated.shape)

(45714, 12)


# Delete duplicated observations

In [53]:
 
#drop duplicated observation id's
df_updated = df_updated.drop_duplicates(subset='id', keep="first")
df_updated.shape

In [54]:
df_updated.head()

Unnamed: 0,id,date,datetime,park,region,latitude,longitude,genus_species,genus,species,url,image_url
0,104188607,1/1/22,2022-01-01,Sunol,east bay,37.530981,-121.819691,Baccharis pilularis,Baccharis,pilularis,https://www.inaturalist.org/observations/10418...,https://inaturalist-open-data.s3.amazonaws.com...
1,104188609,1/1/22,2022-01-01,Sunol,east bay,37.52706,-121.827025,Capsella bursa-pastoris,Capsella,bursa-pastoris,https://www.inaturalist.org/observations/10418...,https://inaturalist-open-data.s3.amazonaws.com...
2,104667115,1/8/22,2022-01-08,Sunol,east bay,37.523395,-121.833219,Sambucus cerulea,Sambucus,cerulea,https://www.inaturalist.org/observations/10466...,https://static.inaturalist.org/photos/17543114...
3,104681782,1/9/22,2022-01-09,Sunol,east bay,37.520038,-121.822708,Cardamine californica,Cardamine,californica,https://www.inaturalist.org/observations/10468...,https://inaturalist-open-data.s3.amazonaws.com...
4,104690215,1/8/22,2022-01-08,Sunol,east bay,37.509616,-121.824145,Calandrinia menziesii,Calandrinia,menziesii,https://www.inaturalist.org/observations/10469...,https://inaturalist-open-data.s3.amazonaws.com...


In [55]:
df_updated = df_updated[df_updated['date']>'09/30/17']
df_updated.shape

In [56]:
#loop through parks and output first observation date, last observation date, number of observations

park_names = list(df_updated['park'].unique())
#park_names

for p in park_names:
    df_temp = df_updated[df_updated['park']==p]
    name = p
    num_obs = len(df_temp)
    #df_temp['observed_on'] = pd.to_datetime(df_temp['observed_on'])
    start = df_temp['date'].min()
    end = df_temp['date'].max()
    print('Park name: '+ str(p) +', '+'Num. of Obs = '+ str(num_obs))
    print(df_temp['datetime'].min())
    print(df_temp['datetime'].max())
   

Park name: Sunol, Num. of Obs = 2878
2017-10-03 00:00:00
2022-09-27 00:00:00
Park name: Briones, Num. of Obs = 3649
2017-10-07 00:00:00
2022-09-25 00:00:00
Park name: Tilden, Num. of Obs = 6070
2017-10-03 00:00:00
2022-09-26 00:00:00
Park name: AnthonyChabot, Num. of Obs = 2866
2017-11-04 00:00:00
2022-09-30 00:00:00
Park name: Garin, Num. of Obs = 390
2017-11-24 00:00:00
2022-09-24 00:00:00
Park name: JDGrant, Num. of Obs = 2418
2017-10-11 00:00:00
2022-09-17 00:00:00
Park name: PleasantonRidge, Num. of Obs = 603
2017-10-14 00:00:00
2022-09-26 00:00:00
Park name: MtDiablo, Num. of Obs = 19048
2017-10-04 00:00:00
2022-09-30 00:00:00


In [57]:
#export data

df_updated.to_csv('/Users/sandidge/Desktop/Python_Projects/Springboard_coursework/Capstone2_Wildflowers/Public_Final/updated_inat_data.csv')

