In [1]:
!pip install geopandas shapely

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import pandas as pd
import geopandas as gpd
import numpy as np
from shapely.geometry import Point 
import math
import os
import time
from sklearn.ensemble import IsolationForest
from sklearn.cluster import KMeans

### Validation Label 0

In [3]:
df = pd.read_csv("alldata.csv")
val_df = df.copy()
val_df["validation_label"] = np.zeros(len(val_df))
df.head() # Previewing alldata.csv before cleaning

Unnamed: 0.1,Unnamed: 0,Observation_ID,Date_of_observation,User_id,User_Tree_id,Species_name,Lat,Long,State_name,Leaves_fresh,Leaves_mature,Leaves_old,Flowers_bud,Flowers_open,Flowers_male,Flowers_Female,Fruits_unripe,Fruits_ripe,Fruits_open
0,1,388564.0,2020-01-01,20396.0,84299.0,Indian Almond- Terminalia catappa,12.15386,75.22397,Kerala,2.0,0.0,0.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
1,2,388565.0,2020-01-01,20396.0,84300.0,Indian Almond- Terminalia catappa,12.15386,75.22397,Kerala,2.0,0.0,0.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
2,3,388566.0,2020-01-01,20396.0,84301.0,Fish-tail palm- Caryota urens,12.1406,75.22145,Kerala,0.0,2.0,0.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
3,4,388567.0,2020-01-01,20396.0,84302.0,Mast Tree-Monoon longifolium,12.1406,75.22145,Kerala,1.0,2.0,0.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
4,5,388568.0,2020-01-01,20396.0,84303.0,Indian Almond- Terminalia catappa,12.1406,75.22145,Kerala,0.0,1.0,2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0


# Labeling Incorrect -2 Values

### Validation Label 1

In [4]:
# Replacing incorrect -2 values with either NA or -2
all_species = list(df['Species_name'].value_counts().index) # all species named in order of prevalence
phenophases = list(df.columns[9:]) # Phenophases

def create_species_dict(*absent_phenophases):
    species_dict = dict(zip(phenophases, np.zeros(len(phenophases), int)))
    for phenophase in absent_phenophases:
        species_dict[phenophase] = 1
    return species_dict

handbook_dicts = {} # Dict mapping species to phenophase dicts. 
# phenophase dicts give absent phenophases in the associated species.
# Manually input absent phenophases from SeasonWatch handbook
handbook_dicts[all_species[0]] = create_species_dict('Flowers_open','Fruits_open')
handbook_dicts[all_species[1]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[2]] = create_species_dict('Flowers_open', 'Fruits_open')
handbook_dicts[all_species[3]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[4]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[5]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[6]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[7]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[8]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[9]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[10]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[11]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[12]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[13]] = create_species_dict('Flowers_bud', 'Flowers_open', 'Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[14]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[15]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[16]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[17]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[18]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[19]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[20]] = create_species_dict('Flowers_bud', 'Flowers_open', 'Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[21]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[22]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[23]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[24]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[25]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[26]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[27]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[28]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[29]] = create_species_dict('Flowers_bud', 'Flowers_open', 'Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[30]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[31]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[32]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[33]] = create_species_dict('Flowers_bud', 'Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[34]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[35]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[36]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[37]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[38]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[39]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[40]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[41]] = create_species_dict('Flowers_open', 'Fruits_open')
handbook_dicts[all_species[42]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[43]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[44]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[45]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[46]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[47]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[48]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[49]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[50]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[51]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[52]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[53]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[54]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[55]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[56]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[57]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[58]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[59]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[60]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[61]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[62]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[63]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[64]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[65]] = create_species_dict('Flowers_open', 'Fruits_open') # Silkworm Mulberry
handbook_dicts[all_species[66]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[67]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[68]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[69]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[70]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[71]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[72]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[73]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[74]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[75]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[76]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[77]] = create_species_dict('Flowers_open', 'Fruits_open') # Box-myrtle
handbook_dicts[all_species[78]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[79]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[80]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open') # Airi Mango
handbook_dicts[all_species[81]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[82]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[83]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[84]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[85]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[86]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[87]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[88]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[89]] = create_species_dict('Flowers_open','Fruits_open')
handbook_dicts[all_species[90]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[91]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[92]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[93]] = create_species_dict('Flowers_open','Fruits_open')
handbook_dicts[all_species[94]] = create_species_dict('Flowers_open') # Wild Almond
handbook_dicts[all_species[95]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[96]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[97]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[98]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[99]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[100]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[101]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[102]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[103]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[104]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open') # Alphonso Mango
handbook_dicts[all_species[105]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[106]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[107]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[108]] = create_species_dict('Flowers_open')
handbook_dicts[all_species[109]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[110]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[111]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[112]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[113]] = create_species_dict('Flowers_open','Fruits_open')
handbook_dicts[all_species[114]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[115]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open') # Aabehayat Mango
handbook_dicts[all_species[116]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[117]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[118]] = create_species_dict('Flowers_bud', 'Flowers_open', 'Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[119]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[120]] = create_species_dict('Flowers_open')
handbook_dicts[all_species[121]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[122]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[123]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[124]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[125]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[126]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[127]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[128]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[129]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open') # Manjeera Mango
handbook_dicts[all_species[130]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[131]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[132]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[133]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[134]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[135]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[136]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[137]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[138]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[139]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[140]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[141]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[142]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[143]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[144]] = create_species_dict('Flowers_male', 'Flowers_Female') # Blue Pine
handbook_dicts[all_species[145]] = create_species_dict('Flowers_open')
handbook_dicts[all_species[146]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[147]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[148]] = create_species_dict('Flowers_open','Fruits_open')
handbook_dicts[all_species[149]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[150]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[151]] = create_species_dict('Flowers_open', 'Fruits_open') # Indian Charcoal Tree
handbook_dicts[all_species[152]] = create_species_dict('Flowers_open','Fruits_open')
handbook_dicts[all_species[153]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[154]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[155]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[156]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[157]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[158]] = create_species_dict('Flowers_open')
handbook_dicts[all_species[159]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[160]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open') # Mallika Mango
handbook_dicts[all_species[161]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[162]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[163]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[164]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[165]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[166]] = create_species_dict('Flowers_open','Fruits_open') # Mohru Oak
handbook_dicts[all_species[167]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[168]] = create_species_dict('Flowers_bud', 'Flowers_open', 'Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[169]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[170]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[171]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[172]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open') # Chosa Mango
handbook_dicts[all_species[173]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[174]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open') # Olour Mango
handbook_dicts[all_species[175]] = create_species_dict('Flowers_open','Fruits_open')
handbook_dicts[all_species[176]] = create_species_dict('Flowers_bud', 'Flowers_male', 'Flowers_Female', 'Fruits_open')

# Replace Incorrect -2 Values
for species in all_species:
    species_df = df[df['Species_name'] == species]
    species_dict = handbook_dicts[species]
    for phenophase in phenophases:
        if species_dict[phenophase] == 0:
            false_positive_idx = species_df.index[species_df[phenophase] == -2] # Indices of reports that incorrectly assign -2 values (false positive) to phenophases that DO appear in the species
            df.loc[false_positive_idx, phenophase] = np.full(len(false_positive_idx),np.nan) # turn into NaN so they will be dropped
            val_df.loc[false_positive_idx, 'validation_label'] = np.full(len(false_positive_idx),1) # 1 represents observations incorrectly assigned -2 values
            
        if species_dict[phenophase] == 1:
            false_negative_idx = species_df.index[species_df[phenophase] != -2] # Indices of reports that incorrectly assign values other than -2 (false negative) to phenophases that DO NOT appear in the species
            df.loc[false_negative_idx, phenophase] = np.full(len(false_negative_idx),-2.0) # convert all values for the absent phenophase to -2

In [5]:
# Combining all mango varieties under the Species_name of Mango (all varieties)- Mangifera indica
df['Species_name'] = df['Species_name'].replace(to_replace=r'\w* Mango- Mangifera indica', value='Mango (all varieties)- Mangifera indica', regex=True)

# Filling in Missing States

In [6]:
states_shapefile = gpd.read_file("india/gadm41_IND_3.shp")

In [7]:
# Function for filling state_name attribute based on coordinates for observations with NA state_name
def find_indian_state(latitude, longitude, gdf):
    point = Point(longitude, latitude)
    
    for _, state in gdf.iterrows():
        if state['geometry'].contains(point):
            return state['NAME_1']
    return None

state = find_indian_state(13.07248, 80.24340, states_shapefile)
print(state) # IF THE FUNCTION WORKS THIS SHOULD OUTPUT TAMIL NADU

Tamil Nadu


In [8]:
# Fill any missing state names in dataset
# !!! Warning: This process take ~2.1 hours !!!
import time
start_time = time.time()
counter = 0
observations_missing_state_name = df[df["State_name"].isna()].drop(['State_name'], axis=1).dropna(how='any')
for idx, row in observations_missing_state_name.iterrows():
    if (counter % 1000) == 0:
        print(f"{counter}: {time.time()-start_time} seconds elapsed; {counter/len(observations_missing_state_name)*100}% Done")
    df.at[idx, "State_name"] = find_indian_state(row["Lat"], row["Long"], states_shapefile)
    counter += 1
# In case any states are labeled as a name inconsistent with our dataset
df['State_name'] = df['State_name'].replace('Andaman and Nicobar', 'Andaman and Nicobar Islands')
df['State_name'] = df['State_name'].replace('NCT of Delhi', 'Delhi')
print(f"Finished in {time.time()-start_time} seconds")


0: 0.03315591812133789 seconds elapsed; 0.0% Done
1000: 43.859100341796875 seconds elapsed; 6.737636437137852% Done
2000: 84.23898983001709 seconds elapsed; 13.475272874275705% Done
3000: 129.6504008769989 seconds elapsed; 20.212909311413558% Done
4000: 175.87285041809082 seconds elapsed; 26.95054574855141% Done
5000: 225.14478421211243 seconds elapsed; 33.68818218568926% Done
6000: 277.78693151474 seconds elapsed; 40.425818622827116% Done
7000: 339.83855867385864 seconds elapsed; 47.16345505996497% Done
8000: 397.55657172203064 seconds elapsed; 53.90109149710282% Done
9000: 448.48569440841675 seconds elapsed; 60.63872793424066% Done
10000: 498.5956838130951 seconds elapsed; 67.37636437137851% Done
11000: 557.9487283229828 seconds elapsed; 74.11400080851637% Done
12000: 618.5587477684021 seconds elapsed; 80.85163724565423% Done
13000: 659.1623032093048 seconds elapsed; 87.58927368279208% Done
14000: 698.4099836349487 seconds elapsed; 94.32691011992993% Done
Finished in 734.544679641723

# Labeling NA Values

### Validation Label 2

In [9]:
# Label rows with NA values and sort by species name
na_indices = val_df[val_df.isna().any(axis=1)].index
val_df.loc[na_indices, 'validation_label'] = np.full(len(na_indices), 2) # 2 represents observations with na values
df = df.dropna()
df = df.drop(df.columns[0], axis=1)
df = df.sort_values(by='Species_name')

In [10]:
# Reformats df to Year, Week formatting to match the reference data
df["Date_of_observation"] = pd.to_datetime(df["Date_of_observation"], format='mixed')
df["Year"] = df["Date_of_observation"].dt.isocalendar().year
df["Week"] = df["Date_of_observation"].dt.dayofyear // (366/48+0.0000000000001) # Weeks duration varies between 7 or 8 days
df["Week"] = df["Week"].astype(int)
# dt.dayofyear gives an index starting at 1, thus use 366 for leap years
# Add 0.0000000000001 bias so week is in range [0,47] instead of [0,48]

# !!! Use the following instead if 52 weeks are wanted !!!
# df["Year"] = df["Date_of_observation"].dt.isocalendar().week
# Warning: 52nd week will only be 1 or 2 days depending on if it's a leap year or not

# Outlier Detection

In [12]:
# Helper function for anomaly_detection_overall
def outlier_detection(df, num_trees=500): # Returns list of indices of observations deemed outliers by isolation forests
    df = df.drop(["Date_of_observation", "Observation_ID", "User_id", "User_Tree_id", "State_name", "Species_name", "Year"], axis=1)
    
    model = IsolationForest(n_estimators = num_trees, contamination=0.08, verbose = 1, random_state = 42)
    
    invalid_indices = []
    
    for week in df["Week"].sort_values().unique():
        week_df = df[df["Week"] == week]
        week_df = week_df.drop("Week", axis=1)
        
        model.fit(week_df)
        preds = model.predict(week_df)
        week_df["Predictions"] = preds
        
        invalid_indices += list(week_df[week_df["Predictions"] == -1].index)
    return invalid_indices

In [13]:
def anomaly_detection_overall(df, min_observations_for_outlier_detection):
    start_time = time.time()
    invalid_indices = []
    states = df["State_name"].unique()
    for state in states:
        print(f"**********{state}**********")
        state_start_time = time.time()
        state_df = df[df["State_name"] == state]
        years = state_df["Year"].sort_values().unique()
        for year in years:
            print(f"**********{year}**********")
            year_start_time = time.time()
            state_year_df = state_df[state_df["Year"] == year]
            species_list = state_year_df["Species_name"].unique()
            for species in species_list:
                species_state_year_df = state_year_df[state_year_df["Species_name"] == species]
                if len(species_state_year_df) > min_observations_for_outlier_detection:
                    species_start_time = time.time()
                    invalid_indices += outlier_detection(species_state_year_df)
                    print(f"Length of DataFrame: {len(species_state_year_df)}")
                    print(f"Finished {species} in {state} during {year} in {time.time()-species_start_time} seconds")
            print(f"Finished {state} during {year} in {time.time()-year_start_time} seconds")
        print(f"Finished {state} in {time.time()-state_start_time} seconds")
    print(f"Finished completely in {time.time()-start_time} seconds")
    return invalid_indices

Unnamed: 0,Observation_ID,Date_of_observation,User_id,User_Tree_id,Species_name,Lat,Long,State_name,Leaves_fresh,Leaves_mature,Leaves_old,Flowers_bud,Flowers_open,Flowers_male,Flowers_Female,Fruits_unripe,Fruits_ripe,Fruits_open,Year,Week
223170,266696.0,2019-01-23,15220.0,30035.0,African tulip- Spathodea campanulata,9.38520,76.58480,Kerala,1.0,2.0,1.0,1.0,2.0,-2.0,-2.0,1.0,0.0,0.0,2019,3
259851,314564.0,2019-06-13,15220.0,30035.0,African tulip- Spathodea campanulata,9.38520,76.58480,Kerala,1.0,2.0,0.0,0.0,0.0,-2.0,-2.0,0.0,1.0,1.0,2019,21
192003,242113.0,2018-12-02,18911.0,20472.0,African tulip- Spathodea campanulata,18.73225,73.68948,Maharashtra,1.0,2.0,1.0,1.0,2.0,-2.0,-2.0,1.0,1.0,0.0,2018,44
449415,577102.0,2022-03-03,17259.0,98238.0,African tulip- Spathodea campanulata,20.17316,85.68900,Odisha,0.0,2.0,1.0,2.0,2.0,-2.0,-2.0,0.0,0.0,0.0,2022,8
449410,577097.0,2022-03-03,17259.0,112023.0,African tulip- Spathodea campanulata,20.16990,85.68567,Odisha,0.0,2.0,2.0,2.0,2.0,-2.0,-2.0,0.0,0.0,0.0,2022,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364882,453117.0,2020-11-09,4919.0,7091.0,Wood Apple- Aegle marmelos,8.61354,76.83297,Kerala,2.0,2.0,1.0,2.0,1.0,-2.0,-2.0,2.0,1.0,-2.0,2020,41
163185,286684.0,2018-06-25,4884.0,7125.0,Wood Apple- Aegle marmelos,8.56742,77.01065,Kerala,2.0,2.0,2.0,1.0,1.0,-2.0,-2.0,1.0,1.0,-2.0,2018,23
555374,684758.0,2023-08-14,26933.0,144762.0,Yellow-silk cotton tree- Cochlospermum religiosum,18.97872,72.83551,Maharashtra,1.0,2.0,0.0,0.0,0.0,-2.0,-2.0,0.0,0.0,0.0,2023,29
489175,617837.0,2022-10-23,25641.0,123278.0,Yellow-silk cotton tree- Cochlospermum religiosum,10.73738,76.63250,Kerala,2.0,1.0,1.0,2.0,2.0,-2.0,-2.0,-1.0,-1.0,0.0,2022,38


### Validation Label 3

In [14]:
# Run anomaly detection on all citizen data and drop any anomalies
invalid_indices = anomaly_detection_overall(df, 15*52) # Choose 15*52 as the min num of observations because it means there's an average of 15 observations per week
val_df.loc[invalid_indices, 'validation_label'] = np.full(len(invalid_indices), 3) # 3 represents observations flagged as outliers by our anomaly detection system (isolation forests machine learning)
df = df.drop(invalid_indices)

**********Kerala**********
**********2014**********
Finished Kerala during 2014 in 0.0013806819915771484 seconds
**********2015**********
Finished Kerala during 2015 in 0.0009446144104003906 seconds
**********2016**********
Finished Kerala during 2016 in 0.0009205341339111328 seconds
**********2017**********
Finished Kerala during 2017 in 0.003741741180419922 seconds
**********2018**********
Length of DataFrame: 4672
Finished Coconut palm-Cocos nucifera in Kerala during 2018 in 22.096879720687866 seconds
Length of DataFrame: 3238
Finished Jackfruit- Artocarpus heterophyllus in Kerala during 2018 in 22.276249170303345 seconds
Length of DataFrame: 1170
Finished Jamun- Syzygium cumini in Kerala during 2018 in 21.82000184059143 seconds
Length of DataFrame: 2596
Finished Mango (all varieties)- Mangifera indica in Kerala during 2018 in 22.089894771575928 seconds
Length of DataFrame: 788
Finished Neem- Azadirachta indica in Kerala during 2018 in 21.74928593635559 seconds
Finished Kerala durin

# Save Data Cleaning Labeled Citizen Data in One File

In [15]:
# Save updated all_data.csv
val_df.to_csv('data_cleaning_labeled_alldata.csv', index=False) # SAVED IT

## Key for `validation_label` Column

| Number | Meaning |
| ------ | ------- |
| 0      | Kept    |
| 1      | Dropped because a phenophase was incorrectly reported as being -2 |
| 2      | Dropped because a phenophase had missing data (Null Values) |
| 3      | Dropped because observation was flagged as anomalous |