In [111]:
!pip install geopandas shapely



In [112]:
import pandas as pd
import geopandas as gpd
import numpy as np
from shapely.geometry import Point 
import os
import time
from sklearn.ensemble import IsolationForest
from sklearn.cluster import KMeans

In [113]:
df = pd.read_csv("alldata.csv")
df.head() # Previewing alldata.csv before cleaning

Unnamed: 0.1,Unnamed: 0,Observation_ID,Date_of_observation,User_id,User_Tree_id,Species_name,Lat,Long,State_name,Leaves_fresh,Leaves_mature,Leaves_old,Flowers_bud,Flowers_open,Flowers_male,Flowers_Female,Fruits_unripe,Fruits_ripe,Fruits_open
0,1,388564.0,2020-01-01,20396.0,84299.0,Indian Almond- Terminalia catappa,12.15386,75.22397,Kerala,2.0,0.0,0.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
1,2,388565.0,2020-01-01,20396.0,84300.0,Indian Almond- Terminalia catappa,12.15386,75.22397,Kerala,2.0,0.0,0.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
2,3,388566.0,2020-01-01,20396.0,84301.0,Fish-tail palm- Caryota urens,12.1406,75.22145,Kerala,0.0,2.0,0.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
3,4,388567.0,2020-01-01,20396.0,84302.0,Mast Tree-Monoon longifolium,12.1406,75.22145,Kerala,1.0,2.0,0.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
4,5,388568.0,2020-01-01,20396.0,84303.0,Indian Almond- Terminalia catappa,12.1406,75.22145,Kerala,0.0,1.0,2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0


In [114]:
# Replacing incorrect -2 values with either NA or -2
all_species = list(df['Species_name'].value_counts().index) # all species named in order of prevalence
phenophases = list(df.columns[9:]) # Phenophases

def create_species_dict(*absent_phenophases):
    species_dict = dict(zip(phenophases, np.zeros(len(phenophases), int)))
    for phenophase in absent_phenophases:
        species_dict[phenophase] = 1
    return species_dict

handbook_dicts = {} # Dict mapping species to phenophase dicts. 
# phenophase dicts give absent phenophases in the associated species.
# Manually input absent phenophases from SeasonWatch handbook
handbook_dicts[all_species[0]] = create_species_dict('Flowers_open','Fruits_open')
handbook_dicts[all_species[1]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[2]] = create_species_dict('Flowers_open', 'Fruits_open')
handbook_dicts[all_species[3]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[4]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[5]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[6]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[7]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[8]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[9]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[10]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[11]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[12]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[13]] = create_species_dict('Flowers_bud', 'Flowers_open', 'Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[14]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[15]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[16]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[17]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[18]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[19]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[20]] = create_species_dict('Flowers_bud', 'Flowers_open', 'Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[21]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[22]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[23]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[24]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[25]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[26]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[27]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[28]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[29]] = create_species_dict('Flowers_bud', 'Flowers_open', 'Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[30]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[31]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[32]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[33]] = create_species_dict('Flowers_bud', 'Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[34]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[35]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[36]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[37]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[38]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[39]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[40]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[41]] = create_species_dict('Flowers_open', 'Fruits_open')
handbook_dicts[all_species[42]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[43]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[44]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[45]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[46]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[47]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[48]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[49]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[50]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[51]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[52]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[53]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[54]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[55]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[56]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[57]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[58]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[59]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[60]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[61]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[62]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[63]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[64]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[65]] = create_species_dict('Flowers_open', 'Fruits_open') # Silkworm Mulberry
handbook_dicts[all_species[66]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[67]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[68]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[69]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[70]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[71]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[72]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[73]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[74]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[75]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[76]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[77]] = create_species_dict('Flowers_open', 'Fruits_open') # Box-myrtle
handbook_dicts[all_species[78]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[79]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[80]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open') # Airi Mango
handbook_dicts[all_species[81]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[82]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[83]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[84]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[85]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[86]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[87]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[88]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[89]] = create_species_dict('Flowers_open','Fruits_open')
handbook_dicts[all_species[90]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[91]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[92]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[93]] = create_species_dict('Flowers_open','Fruits_open')
handbook_dicts[all_species[94]] = create_species_dict('Flowers_open') # Wild Almond
handbook_dicts[all_species[95]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[96]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[97]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[98]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[99]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[100]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[101]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[102]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[103]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[104]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open') # Alphonso Mango
handbook_dicts[all_species[105]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[106]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[107]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[108]] = create_species_dict('Flowers_open')
handbook_dicts[all_species[109]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[110]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[111]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[112]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[113]] = create_species_dict('Flowers_open','Fruits_open')
handbook_dicts[all_species[114]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[115]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open') # Aabehayat Mango
handbook_dicts[all_species[116]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[117]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[118]] = create_species_dict('Flowers_bud', 'Flowers_open', 'Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[119]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[120]] = create_species_dict('Flowers_open')
handbook_dicts[all_species[121]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[122]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[123]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[124]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[125]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[126]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[127]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[128]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[129]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open') # Manjeera Mango
handbook_dicts[all_species[130]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[131]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[132]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[133]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[134]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[135]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[136]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[137]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[138]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[139]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[140]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[141]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[142]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[143]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[144]] = create_species_dict('Flowers_male', 'Flowers_Female') # Blue Pine
handbook_dicts[all_species[145]] = create_species_dict('Flowers_open')
handbook_dicts[all_species[146]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[147]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[148]] = create_species_dict('Flowers_open','Fruits_open')
handbook_dicts[all_species[149]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[150]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[151]] = create_species_dict('Flowers_open', 'Fruits_open') # Indian Charcoal Tree
handbook_dicts[all_species[152]] = create_species_dict('Flowers_open','Fruits_open')
handbook_dicts[all_species[153]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[154]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[155]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[156]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[157]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[158]] = create_species_dict('Flowers_open')
handbook_dicts[all_species[159]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[160]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open') # Mallika Mango
handbook_dicts[all_species[161]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[162]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[163]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[164]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[165]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[166]] = create_species_dict('Flowers_open','Fruits_open') # Mohru Oak
handbook_dicts[all_species[167]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[168]] = create_species_dict('Flowers_bud', 'Flowers_open', 'Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[169]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[170]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[171]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[172]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open') # Chosa Mango
handbook_dicts[all_species[173]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[174]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open') # Olour Mango
handbook_dicts[all_species[175]] = create_species_dict('Flowers_open','Fruits_open')
handbook_dicts[all_species[176]] = create_species_dict('Flowers_bud', 'Flowers_male', 'Flowers_Female', 'Fruits_open')

# Replace Incorrect -2 Values
for species in all_species:
    species_df = df[df['Species_name'] == species]
    species_dict = handbook_dicts[species]
    for phenophase in phenophases:
        if species_dict[phenophase] == 0:
            false_positive_idx = species_df.index[species_df[phenophase] == -2] # Indices of reports that incorrectly assign -2 values (false positive) to phenophases that DO appear in the species
            df.loc[false_positive_idx, phenophase] = np.full(len(false_positive_idx),np.nan) # turn into NaN so they will be dropped
            
        if species_dict[phenophase] == 1:
            false_negative_idx = species_df.index[species_df[phenophase] != -2] # Indices of reports that incorrectly assign values other than -2 (false negative) to phenophases that DO NOT appear in the species
            df.loc[false_negative_idx, phenophase] = np.full(len(false_negative_idx),-2.0) # convert all values for the absent phenophase to -2

In [115]:
# Combining all mango varieties under the Species_name of Mango (all varieties)- Mangifera indica
df['Species_name'] = df['Species_name'].replace(to_replace=r'\w* Mango- Mangifera indica', value='Mango (all varieties)- Mangifera indica', regex=True)

In [116]:
# get rid of NA values and sort by species name
df = df.dropna()
df = df.drop(df.columns[0], axis=1)
df = df.sort_values(by='Species_name')
df.head()

Unnamed: 0,Observation_ID,Date_of_observation,User_id,User_Tree_id,Species_name,Lat,Long,State_name,Leaves_fresh,Leaves_mature,Leaves_old,Flowers_bud,Flowers_open,Flowers_male,Flowers_Female,Fruits_unripe,Fruits_ripe,Fruits_open
223170,266696.0,2019-01-23,15220.0,30035.0,African tulip- Spathodea campanulata,9.3852,76.5848,Kerala,1.0,2.0,1.0,1.0,2.0,-2.0,-2.0,1.0,0.0,0.0
259851,314564.0,2019-06-13,15220.0,30035.0,African tulip- Spathodea campanulata,9.3852,76.5848,Kerala,1.0,2.0,0.0,0.0,0.0,-2.0,-2.0,0.0,1.0,1.0
192003,242113.0,2018-12-02,18911.0,20472.0,African tulip- Spathodea campanulata,18.73225,73.68948,Maharashtra,1.0,2.0,1.0,1.0,2.0,-2.0,-2.0,1.0,1.0,0.0
449415,577102.0,2022-03-03,17259.0,98238.0,African tulip- Spathodea campanulata,20.17316,85.689,Odisha,0.0,2.0,1.0,2.0,2.0,-2.0,-2.0,0.0,0.0,0.0
449410,577097.0,2022-03-03,17259.0,112023.0,African tulip- Spathodea campanulata,20.1699,85.68567,Odisha,0.0,2.0,2.0,2.0,2.0,-2.0,-2.0,0.0,0.0,0.0


In [117]:
states_shapefile = gpd.read_file("india/gadm41_IND_3.shp")

In [118]:
# Function for filling state_name attribute based on coordinates for observations with NA state_name
def find_indian_state(latitude, longitude, gdf):
    point = Point(longitude, latitude)
    
    for _, state in gdf.iterrows():
        if state['geometry'].contains(point):
            return state['NAME_1']
    return None

state = find_indian_state(13.07248, 80.24340, states_shapefile)
print(state) # IF THE FUNCTION WORKS THIS SHOULD OUTPUT TAMIL NADU

Tamil Nadu


In [119]:
# Fill any missing state names in dataset
for i, row in df.iterrows():
    if pd.isna(row["State_name"]):
        df.at[i, "State_name"] = find_indian_state(row["Lat"], row["Long"], states_shapefile)

In [10]:
# Save updated all_data.csv
df.to_csv('updated_alldata.csv', index=False) # SAVED IT

In [120]:
# Reformats df to Year, Week formatting to match the reference data
df["Date_of_observation"] = pd.to_datetime(df["Date_of_observation"], format='mixed')
df["Year"] = df["Date_of_observation"].dt.isocalendar().year
df["Week"] = df["Date_of_observation"].dt.dayofyear // (366/48+0.0000000000001) # Weeks duration varies between 7 or 8 days
df["Week"] = df["Week"].astype(int)
# dt.dayofyear gives an index starting at 1, thus use 366 for leap years
# Add 0.0000000000001 bias so week is in range [0,47] instead of [0,48]

# !!! NEW CODE !!!

# EDITED

In [239]:
states

array(['Kerala', 'Maharashtra', 'Odisha', 'Puducherry', 'Karnataka',
       'Tamil Nadu', 'West Bengal', 'Telangana', 'Andhra Pradesh',
       'Rajasthan', 'Madhya Pradesh', 'Goa', 'Chhattisgarh', 'Manipur',
       'Uttar Pradesh', 'Uttarakhand', 'Nagaland', 'Assam', 'Tripura',
       'Punjab', 'Meghalaya', 'Andaman and Nicobar Islands', 'Gujarat',
       'Jammu and Kashmir', 'Himachal Pradesh', 'Haryana', 'Jharkhand',
       'Delhi', 'Bihar', 'Lakshadweep', 'Sikkim', 'Arunachal Pradesh',
       'Chandigarh', 'Dadra and Nagar Haveli'], dtype=object)

In [240]:
species

"Devil's tree- Alstonia scholaris"

In [None]:
states = df["State_name"].unique()
for state in states:
    state_df = df[df["State_name"] == state]
    years = state_df["Year"].sort_values().unique()
    for year in years:
        print(f"**********{year}**********")
        year_start_time = time.time()
        state_year_df = state_df[state_df["Year"] == year]
        species_list = state_year_df["Species_name"].unique()
        for species in species_list:
            species_state_year_df = state_year_df[state_year_df["Species_name"] == species]
            min_observations_for_outlier_detection = 100 # Hyperparameter

In [None]:
import time
def anomaly_detection_overall(df, min_observations_for_outlier_detection):
    start_time = time.time()
    invalid_indices = []
    states = df["State_name"].unique()
    for state in states:
        print(f"**********{state}**********")
        state_start_time = time.time()
        state_df = df[df["State_name"] == state]
        years = state_df["Year"].sort_values().unique()
        for year in years:
            print(f"**********{year}**********")
            year_start_time = time.time()
            state_year_df = state_df[state_df["Year"] == year]
            species_list = state_year_df["Species_name"].unique()
            for species in species_list:
                species_state_year_df = state_year_df[state_year_df["Species_name"] == species]
                if len(species_state_year_df) > min_observations_for_outlier_detection:
                    species_start_time = time.time()
                    invalid_indices += outlier_detection(species_state_year_df)
                    print(f"Length of DataFrame: {len(species_state_year_df)}")
                    print(f"Finished {species} in {state} during {year} in {time.time()-species_start_time} seconds")
            print(f"Finished {state} during {year} in {time.time()-year_start_time} seconds")
        print(f"Finished {state} in {time.time()-state_start_time} seconds")
    print(f"Finished completely in {time.time()-start_time} seconds")
    return invalid_indices
invalid_indices = anomaly_detection_overall(df, 100):
df.drop(invalid_indices)

In [270]:
import time
start_time = time.time()
invalid_indices = []
states = df["State_name"].unique()
for state in states:
    print(f"**********{state}**********")
    state_start_time = time.time()
    state_df = df[df["State_name"] == state]
    #years = state_df["Year"].sort_values().unique()
    years = [2018]
    for year in years:
        print(f"**********{year}**********")
        year_start_time = time.time()
        state_year_df = state_df[state_df["Year"] == year]
        species_list = state_year_df["Species_name"].unique()
        for species in species_list:
            species_state_year_df = state_year_df[state_year_df["Species_name"] == species]
            min_observations_for_outlier_detection = 100 # Hyperparameter
            if len(species_state_year_df) > min_observations_for_outlier_detection:
                species_start_time = time.time()
                invalid_indices += outlier_detection(species_state_year_df)
                print(f"Finished {species} in {state} during {year} in {time.time()-species_start_time} seconds")
                print(f"Current length of invalid_indices: {len(invalid_indices)}")
        print(f"Finished {state} during {year} in {time.time()-year_start_time} seconds")
        break
    break
    print(f"Finished {state} in {time.time()-state_start_time} seconds")
print(f"Finished completely in {time.time()-start_time} seconds")

**********Kerala**********
**********2018**********
Finished Amla- Phyllanthus emblica in Kerala during 2018 in 24.18339490890503 seconds
Current length of invalid_indices: 225
Finished Banyan-Ficus benghalensis in Kerala during 2018 in 19.874462127685547 seconds
Current length of invalid_indices: 301
Finished Chiku- Sapodilla-Manilkara zapota  in Kerala during 2018 in 31.85158896446228 seconds
Current length of invalid_indices: 416
Finished Coconut palm-Cocos nucifera in Kerala during 2018 in 33.53771495819092 seconds
Current length of invalid_indices: 1309
Finished Copper-pod- Peltophorum pterocarpum in Kerala during 2018 in 4.916529893875122 seconds
Current length of invalid_indices: 1370
Finished Country Fig- Ficus racemosa in Kerala during 2018 in 21.03408908843994 seconds
Current length of invalid_indices: 1433
Finished Curry leaf plant-Murraya koenigii in Kerala during 2018 in 18.290374994277954 seconds
Current length of invalid_indices: 1492
Finished Custard apple-Annona squamo

In [266]:
def outlier_detection(df, num_trees=500):
    df = df.drop(["Date_of_observation", "Observation_ID", "User_id", "User_Tree_id", "State_name", "Species_name", "Year"], axis=1)
    
    model = IsolationForest(n_estimators = num_trees, verbose = 1, random_state = 42)
    
    invalid_indices = []
    
    for week in df["Week"].sort_values().unique():
        
        week_df = df[df["Week"] == week]
        week_df = week_df.drop("Week", axis=1)
        model.fit(week_df)
        preds = model.predict(week_df)

        week_df["Predictions"] = preds
        invalid_indices += list(week_df[week_df["Predictions"] == -1].index)
    return invalid_indices

# Updated State Creator

In [11]:
# export new state dataframes to citizenData folder
new_dir_path = 'citizenData'
try:
    # Create the directory
    os.makedirs(new_dir_path, exist_ok=True)
    print(f"Directory '{new_dir_path}' created.")
except FileExistsError:
    print(f"Directory '{new_dir_path}' already exists.")

for state_name in df["State_name"].unique(): # This piece of code creates a new CSV for each state with Year and Week formatting to match the reference data
    state_df = df[df["State_name"] == state_name]    
    state_df.to_csv(f'citizenData/{state_name}_citizenData.csv', index=False)

Directory 'citizenData' already exists.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df["Date_of_observation"] = pd.to_datetime(state_df["Date_of_observation"], format='mixed')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df["Year"] = state_df["Date_of_observation"].dt.isocalendar().year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df["Week"] = state_df["Date_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df["Date_of_observation"] = pd.to_datetime(state_df["Date_of_observation"], format='mixed')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df["Year"] = state_df["Date_of_observation"].dt.isocalendar().year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df["Week"] = state_df["Date_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df["Date_of_observation"] = pd.to_datetime(state_df["Date_of_observation"], format='mixed')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df["Year"] = state_df["Date_of_observation"].dt.isocalendar().year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df["Week"] = state_df["Date_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df["Date_of_observation"] = pd.to_datetime(state_df["Date_of_observation"], format='mixed')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df["Year"] = state_df["Date_of_observation"].dt.isocalendar().year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df["Week"] = state_df["Date_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df["Date_of_observation"] = pd.to_datetime(state_df["Date_of_observation"], format='mixed')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df["Year"] = state_df["Date_of_observation"].dt.isocalendar().year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df["Week"] = state_df["Date_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df["Date_of_observation"] = pd.to_datetime(state_df["Date_of_observation"], format='mixed')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df["Year"] = state_df["Date_of_observation"].dt.isocalendar().year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df["Week"] = state_df["Date_

# Deprecated State Creator

In [11]:
# export new state dataframes to citizenData folder
new_dir_path = 'citizenData'
try:
    # Create the directory
    os.makedirs(new_dir_path, exist_ok=True)
    print(f"Directory '{new_dir_path}' created.")
except FileExistsError:
    print(f"Directory '{new_dir_path}' already exists.")

for state_name in df["State_name"].unique(): # This piece of code creates a new df for each state with Year and Week formatting to match the reference data
    state_df = df[df["State_name"] == state_name]
    state_df["Date_of_observation"] = pd.to_datetime(state_df["Date_of_observation"], format='mixed')
    state_df["Year"] = state_df["Date_of_observation"].dt.isocalendar().year
    state_df["Week"] = state_df["Date_of_observation"].dt.dayofyear // 7.583 #364/48 (dt.dayofyear gives an index, thus 364)
    state_df["Week"] = state_df["Week"].astype(int)
    
    state_df.to_csv(f'citizenData/{state_name}_citizenData.csv', index=False)

Directory 'citizenData' already exists.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df["Date_of_observation"] = pd.to_datetime(state_df["Date_of_observation"], format='mixed')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df["Year"] = state_df["Date_of_observation"].dt.isocalendar().year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df["Week"] = state_df["Date_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df["Date_of_observation"] = pd.to_datetime(state_df["Date_of_observation"], format='mixed')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df["Year"] = state_df["Date_of_observation"].dt.isocalendar().year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df["Week"] = state_df["Date_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df["Date_of_observation"] = pd.to_datetime(state_df["Date_of_observation"], format='mixed')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df["Year"] = state_df["Date_of_observation"].dt.isocalendar().year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df["Week"] = state_df["Date_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df["Date_of_observation"] = pd.to_datetime(state_df["Date_of_observation"], format='mixed')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df["Year"] = state_df["Date_of_observation"].dt.isocalendar().year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df["Week"] = state_df["Date_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df["Date_of_observation"] = pd.to_datetime(state_df["Date_of_observation"], format='mixed')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df["Year"] = state_df["Date_of_observation"].dt.isocalendar().year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df["Week"] = state_df["Date_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df["Date_of_observation"] = pd.to_datetime(state_df["Date_of_observation"], format='mixed')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df["Year"] = state_df["Date_of_observation"].dt.isocalendar().year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df["Week"] = state_df["Date_

# !!! NEW CODE !!!

# UNEDITED

In [None]:
def select_reference_data(state, species, year, num_trees):
    
    df = pd.read_csv(f"all data/citizen/{state}.csv")
    df = df[df["Species_name"] == species]
    df = df[df["Year"] == int(year)]
    df = df.drop(["Date_of_observation", "Observation_ID", "User_id", "User_Tree_id", "Species_id", "State_name", "Species_name", "Year"], axis=1)
    df = df.reset_index(drop=True)
    
    model = IsolationForest(n_estimators = num_trees, verbose = 1, random_state = 42)
    
    new_ref_dict = {} # Saves closest citizen observations to centroids.
    cent = {} # Saves cluster centers of each week for future use
    
    for week in df["Week"].sort_values().unique():
        
        # Isolation Forests
        
        week_df = df[df["Week"] == week]
        week_df = week_df.drop("Week", axis=1)
        week_df = week_df.reset_index(drop=True)
        
        model.fit(week_df)
        preds = model.predict(week_df)

        week_df["Predictions"] = preds

        valid_data = week_df[week_df["Predictions"] == 1]
        valid_data = valid_data.reset_index(drop=True)

In [12]:
# load lookup dicts for id -> name and name -> id from species_codes.csv
species_codes = pd.read_csv("species codes.csv", encoding='unicode_escape')

species_id_to_name = {}
species_name_to_id = {}

for i, row in species_codes.iterrows():
    species_id_to_name[row["species_id"]] = "{}-{}".format(row["species_primary_common_name"], row["species_scientific_name"])
    species_name_to_id["{}-{}".format(row["species_primary_common_name"], row["species_scientific_name"]).lower().replace(" ", "")] = row["species_id"]

In [13]:
# paths to citizen and reference data. create if they do not exist
citizen_path = "all data/citizen"
reference_path = "all data/reference"

try:
    # Create the nested directories
    os.makedirs(citizen_path)
    print(f"Nested directories '{citizen_path}' created.")
except FileExistsError:
    print(f"Nested directories '{citizen_path}' already exists.")

try:
    # Create the nested directories
    os.makedirs(reference_path)
    print(f"Nested directories '{reference_path}' created.")
except FileExistsError:
    print(f"Nested directories '{reference_path}' already exists.")

Nested directories 'all data/citizen' already exists.
Nested directories 'all data/reference' already exists.


In [14]:
# concatenate data for andaman and nicobar and delhi
aani_df = pd.read_csv("citizenData/Andaman and Nicobar Islands_citizenData.csv")
aan_df = pd.read_csv("citizenData/Andaman and Nicobar_citizenData.csv")
pd.concat([aan_df, aani_df],ignore_index=True).to_csv('citizenData/Andaman and Nicobar Islands_citizenData.csv',index=False)

nct_delhi_df = pd.read_csv("citizenData/NCT of Delhi_citizenData.csv")
delhi_df = pd.read_csv("citizenData/Delhi_citizenData.csv")
pd.concat([delhi_df, nct_delhi_df],ignore_index=True).to_csv('citizenData/Delhi_citizenData.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'citizenData/Andaman and Nicobar_citizenData.csv'

In [15]:
# update names in reference data

citizen_states = {}
reference_states = {}

for root, _, files in os.walk("citizenData"):
    for file_name in files:
        if file_name != ".DS_Store":
            file_path = os.path.join(root, file_name)
            # Read the CSV file into a DataFrame

            df = pd.read_csv(file_path)

            state = file_name.replace("_citizenData.csv", "")
            state = state.replace(" ", "_").lower()
            
            citizen_states[state] = file_name

for root, _, files in os.walk("reference_data"):
    for file_name in files:
        if file_name != ".DS_Store":
            file_path = os.path.join(root, file_name)
            
            df = pd.read_csv(file_path)
            
            state = file_name.replace("pvt_", "")
            state = state.replace(".csv", "")
            state = state.replace(" ", "_").lower()
            
            if state == "maharastra":
                reference_states["maharashtra"] = file_name
            else:
                reference_states[state] = file_name
                
species_codes = pd.read_csv("species codes.csv", encoding='unicode_escape')

species_id_to_name = {}
species_name_to_id = {}

for i, row in species_codes.iterrows():
    species_id_to_name[row["species_id"]] = "{}-{}".format(row["species_primary_common_name"], row["species_scientific_name"])
    species_name_to_id["{}-{}".format(row["species_primary_common_name"], row["species_scientific_name"]).lower().replace(" ", "")] = row["species_id"]
    
for state in reference_states:
    ref_df = pd.read_csv(f"reference_data/{reference_states[state]}")
    # remove unnamed columns if they exist
    for col in ref_df.columns:
        if 'Unnamed' in col or col == 'id':
            ref_df = ref_df.drop(col, axis=1)
    ref_df['species_name'] = ref_df['species_id'].map(species_id_to_name)
    ref_df.to_csv(f"all data/reference/{state}.csv", index=False)

In [16]:
species_name_to_id

{'whitebabool-acacialeucophloea': 1000,
 'babool-acacianilotica': 1001,
 'woodapple-aeglemarmelos': 1002,
 'indianhorsechestnut-aesculusindica': 1003,
 'treeofheaven-ailanthusexcelsa': 1004,
 'krishnasiris-albiziaamara': 1005,
 'siris-albizialebbeck': 1006,
 'whitesiris-albiziaprocera': 1007,
 "devil'stree-alstoniascholaris": 1008,
 'axlewood-anogeissuslatifolia': 1009,
 'greymangrove-avicenniamarina': 1010,
 'neem-azadirachtaindica': 1011,
 'purplebauhinia-bauhiniapurpurea': 1012,
 'jhinjheri-bauhiniaracemosa': 1013,
 'semla-bauhiniaretusa': 1014,
 'redsilkcotton-bombaxceiba': 1015,
 'toddypalm-borassusflabellifer': 1016,
 'flameoftheforest-buteamonosperma': 1017,
 'blackdammer-canariumstrictum': 1018,
 'fish-tailpalm-caryotaurens': 1019,
 'indianlaburnum-cassiafistula': 1020,
 'whitesilkcotton-ceibapentandra': 1021,
 'flosssilktree-ceibaspeciosa': 1022,
 'yellowsilkcotton-cochlospermumreligiosum': 1023,
 'mountaincoffee-coffeaarabica': 1024,
 'robustacoffee-coffearobusta': 1025,
 'la

In [18]:
# update names in citizen data
def reformat_mango_string(input_string):
    index = input_string.find("mango")
    
    prefix = input_string[:index]
    suffix = input_string[index+5:]  

    output_string = f"mango:{prefix}{suffix}"
    
    return output_string

for state in citizen_states:
    cit_df = pd.read_csv(f"citizenData/{citizen_states[state]}")
    print(state)

    for i, row in cit_df.iterrows():
        name = row["Species_name"].lower().replace(" ", "")
        
        if name == "arjuntree-terminaliaarjuna":
            cit_df.loc[i, "Species_name"] = species_id_to_name[1083]
            continue
        if name == "axlewoodtree-anogeissuslatifolia":
            cit_df.loc[i, "Species_name"] = species_id_to_name[1009]
            continue
        if name == "chiku-sapodilla-manilkarazapota\xa0":
            cit_df.loc[i, "Species_name"] = species_id_to_name[1188]
            continue
        if name == "dyer'soleander-wrightiatinctoria":
            cit_df.loc[i, "Species_name"] = species_id_to_name[1181]
            continue
        if name == "ficusmollis-softfig":
            cit_df.loc[i, "Species_name"] = species_id_to_name[1197]
            continue
        if name == "frangipani-templetree-plumeriarubra":
            cit_df.loc[i, "Species_name"] = species_id_to_name[1176]
            continue
        if name == "garuga-kharpat-garugapinnata":
            cit_df.loc[i, "Species_name"] = species_id_to_name[1038]
            continue
        if name == "ghostrree-sterculiaurens":
            cit_df.loc[i, "Species_name"] = species_id_to_name[1078]
            continue
        if name == "indianfrankincense-boswelliaserrata":
            cit_df.loc[i, "Species_name"] = species_id_to_name[1195]
            continue
        if name == "indiancoraltree-erythrinaindica":
            cit_df.loc[i, "Species_name"] = species_id_to_name[1034]
            continue
        if name == "kadamba-neolamarckiacadamba":
            cit_df.loc[i, "Species_name"] = species_id_to_name[1058]
            continue
        if name == "lanneacoromandelica-indianashtree":
            cit_df.loc[i, "Species_name"] = species_id_to_name[1194]
            continue
        if name == "mexicanoleander-yellowoleander-cascabelathevetia":
            cit_df.loc[i, "Species_name"] = species_id_to_name[1177]
            continue
        if name == "nightfloweringjasmine-harsingar-nyctanthesarbor-tristis":
            cit_df.loc[i, "Species_name"] = species_id_to_name[1059]
            continue
        if name == "prosopiscineraria-khejri":
            cit_df.loc[i, "Species_name"] = species_id_to_name[1201]
            continue
        if name == "raintree-samaneasaman":
            cit_df.loc[i, "Species_name"] = species_id_to_name[1162]
            continue
        if name == "redsilk-cotton-bombaxceiba":
            cit_df.loc[i, "Species_name"] = species_id_to_name[1015]
            continue
        if name == "whitesilk-cotton-ceibapentandra":
            cit_df.loc[i, "Species_name"] = species_id_to_name[1021]
            continue
        if name == "yellow-silkcottontree-cochlospermumreligiosum":
            cit_df.loc[i, "Species_name"] = species_id_to_name[1023]
            continue
        if name == "karkat-dogteak-dilleniapentagyna":
            cit_df.loc[i, "Species_name"] = species_id_to_name[1032]
            continue
        if name == "chosamango-mangiferaindica":
            cit_df.loc[i, "Species_name"] = species_id_to_name[1108]
            continue
        if name == "falsewhiteteak-mallotusnudiflorus":
            cit_df.loc[i, "Species_name"] = species_id_to_name[1088]
            continue
        if name == "floss-silktree-ceibaspeciosa":
            cit_df.loc[i, "Species_name"] = species_id_to_name[1022]
            continue
        if name == "largesebesten-bairola-cordiawallichii":
            cit_df.loc[i, "Species_name"] = species_id_to_name[1026]
            continue
        if name == "roxburghskydia-pulia-kydiacalycina":
            cit_df.loc[i, "Species_name"] = species_id_to_name[1044]
            continue
        if name == "wildrose-rosawebbiana":
            cit_df.loc[i, "Species_name"] = species_id_to_name[1206]
            continue
        if name == "albiziaodoratissima-blacksiris":
            cit_df.loc[i, "Species_name"] = species_id_to_name[1199]
            continue
        if name == "anogeissuspendula-kardhai":
            cit_df.loc[i, "Species_name"] = species_id_to_name[1198]
            continue
        if name == "brokenbonestree-oroxylumindicum":
            cit_df.loc[i, "Species_name"] = "Broken Bones Tree-Oroxylum Indicum"
            continue
        if name == "pyinmatree-andamancrapemyrtle-lagerstroemiahypoleuca":
            cit_df.loc[i, "Species_name"] = species_id_to_name[1216]
            continue
        if name == "crataevareligiosa-garlic-peartree":
            cit_df.loc[i, "Species_name"] = species_id_to_name[1196]
            continue
        if name == "guh-de-three-leafcapertree-cratevaadansonii":
            cit_df.loc[i, "Species_name"] = species_id_to_name[1217]
            continue
        if name == "aabehayatmango-mangiferaindica":
            cit_df.loc[i, "Species_name"] = species_id_to_name[1091]
            continue
        if name == "bedu-punjabfig-ficuspalmata":
            cit_df.loc[i, "Species_name"] = species_id_to_name[1235]
            continue
        if name == "chinar-platanusorientalis":
            cit_df.loc[i, "Species_name"] = "Chinar-Platanus Orientalis"
            continue
        if name == "prunusnepalensis-sohiong":
            cit_df.loc[i, "Species_name"] = species_id_to_name[1192]
            continue
        if name == "tecomellaundulata-roheda":
            cit_df.loc[i, "Species_name"] = species_id_to_name[1200]
            continue
        if name == "tigersmilkspruce-falconeriainsignis":
            cit_df.loc[i, "Species_name"] = "Tiger's Milk Spruce-Falconeria Insignis"
            continue
        if "mango" in name:
            mango_string = reformat_mango_string(name.lower().replace(" ", ""))
            if mango_string == "mango:(allvarieties)-mangiferaindica":
                cit_df.loc[i, "Species_name"] = species_id_to_name[1090]
            else :
                cit_df.loc[i, "Species_name"] = species_id_to_name[species_name_to_id[mango_string]]
            continue
        if species_name_to_id[name]:
            cit_df.loc[i, "Species_name"] = species_id_to_name[species_name_to_id[name]] # ??? why not just do name -Zach
    cit_df.insert(loc = 4, column = 'Species_id', value = [species_name_to_id.get(species.lower().replace(" ",""), np.nan) for species in cit_df["Species_name"]])
    cit_df.to_csv(f"all data/citizen/{state}.csv")

telangana
andhra_pradesh
goa
uttarakhand
haryana
chandigarh
sikkim
madhya_pradesh
jharkhand
himachal_pradesh
chhattisgarh
delhi
bihar
punjab
andaman_and_nicobar_islands
dadra_and_nagar_haveli
assam
kerala
puducherry
rajasthan
gujarat
lakshadweep
arunachal_pradesh
jammu_and_kashmir
uttar_pradesh
manipur
meghalaya
west_bengal
karnataka
tamil_nadu
nagaland
tripura
maharashtra
odisha


In [24]:
# Make sure all attributes have been added to citizen data

for state in citizen_states:
    if state == 'nct_of_delhi' or state == 'andaman_and_nicobar': # these got merged into multiple files
        continue
    cit_df = pd.read_csv(f"citizenData/{citizen_states[state]}")
    other_cit_df = pd.read_csv(f"all data/citizen/{state}.csv")
    other_cit_df['Lat'] = cit_df['Lat']
    other_cit_df['Long'] = cit_df['Long']
    other_cit_df['Observation_ID'] = cit_df['Observation_ID']
    other_cit_df['User_id'] = cit_df['User_id']
    other_cit_df['User_Tree_id'] = cit_df['User_Tree_id']
    other_cit_df = other_cit_df.drop('Unnamed: 0', axis=1)
    #other_cit_df = other_cit_df.drop('Unnamed: 0.1', axis=1)
    other_cit_df.to_csv(f'all data/citizen/{state}.csv', index=False)