# Imports

In [1]:
# This is an uncommon library so we run a pip install of it just in case
!pip install geopandas shapely

Defaulting to user installation because normal site-packages is not writeable


In [2]:
# Common imports
import pandas as pd # Pandas for storing and manipulating CSV files as DataFrames (tables)
import geopandas as gpd # Geopandas for converting coordinates to states
from shapely.geometry import Point # Same as Geopandas
import numpy as np # Numpy for storing and manipulating arrays (lists)
import math # Math for finding NaN values
import os # OS for writing and reading directories (folders)
import time # Time for checking how long things take
from sklearn.ensemble import IsolationForest # Isolation Forests for anomaly detection

### Validation Label 0

In [3]:
df = pd.read_csv("alldata.csv") # Load raw citizen data in
df.head() # Previewing raw (before cleaning) citizen data

val_df = df.copy() # Initializing validation labels DataFrame as raw citizen data
val_df["validation_label"] = np.zeros(len(val_df)) # Initialize validation labels column as zero for every record

# Labeling Incorrect -2 Values

### Validation Label 1

In [4]:
# Replacing incorrect -2 values with either NA or -2
all_species = list(df['Species_name'].value_counts().index) # All species names in order of frequency
phenophases = list(df.columns[9:]) # Phenophase column names

def create_species_dict(*absent_phenophases):
    """
    Creates a dictionary informing if each phenophase is seen in a species or not (0 for seen, 1 for not seen)

    Args:
        absent_phenophase (List(string)): List of phenophases not seen in a species
        use_q (bool):  False means use MDP value iteration, true means use Q-learning
    Returns:
        species_dict (Dict(string,int)): A Dictionary mapping phenophases to binaries indicating presence in a species
    """
    species_dict = dict(zip(phenophases, np.zeros(len(phenophases), int)))
    for phenophase in absent_phenophases:
        species_dict[phenophase] = 1
    return species_dict

handbook_dicts = {} # Dict mapping species to species dicts. 
# Below are manually input absent phenophases for each species in the citizen dataset. Labels derived from SeasonWatch Tree Phenology Guide
handbook_dicts[all_species[0]] = create_species_dict('Flowers_open','Fruits_open')
handbook_dicts[all_species[1]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[2]] = create_species_dict('Flowers_open', 'Fruits_open')
handbook_dicts[all_species[3]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[4]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[5]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[6]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[7]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[8]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[9]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[10]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[11]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[12]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[13]] = create_species_dict('Flowers_bud', 'Flowers_open', 'Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[14]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[15]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[16]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[17]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[18]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[19]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[20]] = create_species_dict('Flowers_bud', 'Flowers_open', 'Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[21]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[22]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[23]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[24]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[25]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[26]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[27]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[28]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[29]] = create_species_dict('Flowers_bud', 'Flowers_open', 'Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[30]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[31]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[32]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[33]] = create_species_dict('Flowers_bud', 'Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[34]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[35]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[36]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[37]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[38]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[39]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[40]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[41]] = create_species_dict('Flowers_open', 'Fruits_open')
handbook_dicts[all_species[42]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[43]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[44]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[45]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[46]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[47]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[48]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[49]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[50]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[51]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[52]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[53]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[54]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[55]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[56]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[57]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[58]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[59]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[60]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[61]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[62]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[63]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[64]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[65]] = create_species_dict('Flowers_open', 'Fruits_open') # Silkworm Mulberry
handbook_dicts[all_species[66]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[67]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[68]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[69]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[70]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[71]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[72]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[73]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[74]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[75]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[76]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[77]] = create_species_dict('Flowers_open', 'Fruits_open') # Box-myrtle
handbook_dicts[all_species[78]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[79]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[80]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open') # Airi Mango
handbook_dicts[all_species[81]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[82]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[83]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[84]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[85]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[86]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[87]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[88]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[89]] = create_species_dict('Flowers_open','Fruits_open')
handbook_dicts[all_species[90]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[91]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[92]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[93]] = create_species_dict('Flowers_open','Fruits_open')
handbook_dicts[all_species[94]] = create_species_dict('Flowers_open') # Wild Almond
handbook_dicts[all_species[95]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[96]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[97]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[98]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[99]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[100]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[101]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[102]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[103]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[104]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open') # Alphonso Mango
handbook_dicts[all_species[105]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[106]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[107]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[108]] = create_species_dict('Flowers_open')
handbook_dicts[all_species[109]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[110]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[111]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[112]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[113]] = create_species_dict('Flowers_open','Fruits_open')
handbook_dicts[all_species[114]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[115]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open') # Aabehayat Mango
handbook_dicts[all_species[116]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[117]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[118]] = create_species_dict('Flowers_bud', 'Flowers_open', 'Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[119]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[120]] = create_species_dict('Flowers_open')
handbook_dicts[all_species[121]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[122]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[123]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[124]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[125]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[126]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[127]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[128]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[129]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open') # Manjeera Mango
handbook_dicts[all_species[130]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[131]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[132]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[133]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[134]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[135]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[136]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[137]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[138]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[139]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[140]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[141]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[142]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[143]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[144]] = create_species_dict('Flowers_male', 'Flowers_Female') # Blue Pine
handbook_dicts[all_species[145]] = create_species_dict('Flowers_open')
handbook_dicts[all_species[146]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[147]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[148]] = create_species_dict('Flowers_open','Fruits_open')
handbook_dicts[all_species[149]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[150]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[151]] = create_species_dict('Flowers_open', 'Fruits_open') # Indian Charcoal Tree
handbook_dicts[all_species[152]] = create_species_dict('Flowers_open','Fruits_open')
handbook_dicts[all_species[153]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[154]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[155]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[156]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[157]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[158]] = create_species_dict('Flowers_open')
handbook_dicts[all_species[159]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[160]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open') # Mallika Mango
handbook_dicts[all_species[161]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[162]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[163]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[164]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[165]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[166]] = create_species_dict('Flowers_open','Fruits_open') # Mohru Oak
handbook_dicts[all_species[167]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[168]] = create_species_dict('Flowers_bud', 'Flowers_open', 'Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[169]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[170]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[171]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[172]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open') # Chosa Mango
handbook_dicts[all_species[173]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[174]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open') # Olour Mango
handbook_dicts[all_species[175]] = create_species_dict('Flowers_open','Fruits_open')
handbook_dicts[all_species[176]] = create_species_dict('Flowers_bud', 'Flowers_male', 'Flowers_Female', 'Fruits_open')

# Replace Incorrect -2 Values
for species in all_species:
    species_df = df[df['Species_name'] == species]
    species_dict = handbook_dicts[species]
    for phenophase in phenophases:
        if species_dict[phenophase] == 0: # Phenophase seen in species
            false_positive_idx = species_df.index[species_df[phenophase] == -2] # Indices of reports that incorrectly assign -2 values (false positive) to phenophases SEEN in the species
            df.loc[false_positive_idx, phenophase] = np.full(len(false_positive_idx),100.612) # Turn all false positives into 100.612 (these observations will later be dropped but they cannot be Null/NaN/None values yet, so they will not be identified by .isna() check later)
            val_df.loc[false_positive_idx, 'validation_label'] = np.full(len(false_positive_idx),1) # Label false positive records as 1 because they will be dropped (1 represents observations incorrectly assigned -2 values)
        if species_dict[phenophase] == 1: # Phenophase NOT seen in species
            false_negative_idx = species_df.index[species_df[phenophase] != -2] # Indices of reports that incorrectly assign values other than -2 (false negative) to phenophases NOT SEEN in the species
            df.loc[false_negative_idx, phenophase] = np.full(len(false_negative_idx),-2.0) # Turn all false negatives into -2 values

 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp'
 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp'
 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp'
 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp'
 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp'
 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp'
 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp'
 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp'
 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp'
 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp'
 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp'
 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp'
 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp' 'temp']' has dtype incompatible with float64, please explicitly cast to a compatible

In [5]:
# Combining all mango varieties under the Species_name of Mango (all varieties)- Mangifera indica
df['Species_name'] = df['Species_name'].replace(to_replace=r'\w* Mango- Mangifera indica', value='Mango (all varieties)- Mangifera indica', regex=True)

# Filling in Missing States

In [6]:
states_shapefile = gpd.read_file("india/gadm41_IND_3.shp") # Load map of India with states as a coordinate grid with labels

In [7]:
# Function for filling state_name attribute based on coordinates for observations with NA state_name
def find_indian_state(latitude, longitude, gdf):
    """
    Finds the state name associated with a set of coordinates

    Args:
        latitude, longitude (int, int): Latitude and longitude coordinates
        gdf (GeoDataFrame): Coordinate grid of India with states labeled
    Returns:
        state['NAME_1'] (string): If coordinates map to somewhere within India, return state name
        None (None): Else, return None because the coordinates are unreliable
    """
    
    point = Point(longitude, latitude)
    
    for _, state in gdf.iterrows():
        if state['geometry'].contains(point):
            return state['NAME_1']
    return None

# Test
state = find_indian_state(13.07248, 80.24340, states_shapefile)
print(state) # IF THE FUNCTION WORKS THIS SHOULD OUTPUT TAMIL NADU

Tamil Nadu


In [8]:
# Fill any missing state names in dataset
import time 
start_time = time.time() # Measuring Time
counter = 0 # Number of observations checked so far
observations_missing_state_name = df[df["State_name"].isna()].drop(['State_name'], axis=1).dropna(how='any') # Taking a subset of citizen observations with all columns recorded (not NaN/Null/None) except State_name
for idx, row in observations_missing_state_name.iterrows(): # Iterate over observations missing a state name
    if (counter % 1000) == 0: # Report time every 1000 observations
        print(f"{counter}: {time.time()-start_time} seconds elapsed; {counter/len(observations_missing_state_name)*100}% Done")
    df.at[idx, "State_name"] = find_indian_state(row["Lat"], row["Long"], states_shapefile) # Add state name based on coordinates (if coordinates are reliable)
    counter += 1
# In case any states are labeled as a name inconsistent with our dataset
df['State_name'] = df['State_name'].replace('Andaman and Nicobar', 'Andaman and Nicobar Islands') # Two names for Andaman and Nicobar Islands
df['State_name'] = df['State_name'].replace('NCT of Delhi', 'Delhi') # Two names for Delhi
print(f"Finished in {time.time()-start_time} seconds") 


0: 0.07801318168640137 seconds elapsed; 0.0% Done
1000: 46.160059690475464 seconds elapsed; 6.418073294397023% Done
2000: 87.192462682724 seconds elapsed; 12.836146588794046% Done
3000: 129.68691158294678 seconds elapsed; 19.254219883191066% Done
4000: 174.14658093452454 seconds elapsed; 25.67229317758809% Done
5000: 216.5604043006897 seconds elapsed; 32.09036647198511% Done
6000: 270.04892206192017 seconds elapsed; 38.50843976638213% Done
7000: 325.77878046035767 seconds elapsed; 44.92651306077916% Done
8000: 379.12504482269287 seconds elapsed; 51.34458635517618% Done
9000: 432.1303160190582 seconds elapsed; 57.7626596495732% Done
10000: 477.7850797176361 seconds elapsed; 64.18073294397021% Done
11000: 525.3719263076782 seconds elapsed; 70.59880623836725% Done
12000: 582.4095170497894 seconds elapsed; 77.01687953276426% Done
13000: 632.4632956981659 seconds elapsed; 83.4349528271613% Done
14000: 668.8884508609772 seconds elapsed; 89.85302612155832% Done
15000: 705.4002416133881 second

# Labeling NA Values

### Validation Label 2

In [9]:
# Drop observations with Null/NaN/None values and label those rows with 2
na_indices = df[df.isna().any(axis=1)].index # Finds Null/NaN/None values in citizen data
val_df.loc[na_indices, 'validation_label'] = np.full(len(na_indices), 2) # 2 represents observations with Null/NaN/None values
df = df.drop(df.columns[0], axis=1) # Drop the index column called "Unnamed: 0"
df = df.replace(100.612,None) # Turn 100.612 to None so these values will be dropped. These values were kept as temp so that they will not be detected by na_indices
df = df.dropna(how='any') # Drop any observation with at least one NaN/Null/None value reported
df = df.sort_values(by='Species_name') # Order data by species for organization

In [10]:
# Reformats df to Year, Week formatting to match the reference data
df["Date_of_observation"] = pd.to_datetime(df["Date_of_observation"], format='mixed') # Change Date_of_observation column to datetime format/datatype

df["Year"] = df["Date_of_observation"].dt.isocalendar().year # Extract the year of each observation from the date column

"""
!!! Use the following instead if 52 weeks are wanted !!!
df["Year"] = df["Date_of_observation"].dt.isocalendar().week
Warning: 52nd week will only be 1 or 2 days depending on if it's a leap year or not
"""

# Weeks are 48 weeks instead of 52 because this follows the format of the reference data
df["Week"] = df["Date_of_observation"].dt.dayofyear // (366/48+0.0000000000001) # Week durations vary between 7 or 8 days (~50% of weeks are 7 days long. ~50% of weeks are 8 days long)
df["Week"] = df["Week"].astype(int) # Convert Week column from decimal to integer

"""
dt.dayofyear gives an index starting at 1, thus use 366 in equation for leap years
Add 0.0000000000001 bias to 48 (number of weeeks) so week is in range [0,47] instead of [0,48]
"""

'\ndt.dayofyear gives an index starting at 1, thus use 366 in equation for leap years\nAdd 0.0000000000001 bias to 48 (number of weeeks) so week is in range [0,47] instead of [0,48]\n'

# Anomaly Detection

In [11]:
def outlier_detection(df, num_trees=500): 
    """
    Helper function for anomaly_detection_overall. Gives indices of anomalous observations.

    Args:
        df (DataFrame): Observations spanning at least an entire year. Observations are preferably of the same species within the same state.
        num_trees (int): Number of trees making up an isolation forest. Higher number of trees reduces variance but slows down training.
    Returns:
        invalid_indices (List(int)): Index values in citizen data associated with observations deemed outliers by an isolation forest (invalid).
    """
    
    df = df.drop(["Date_of_observation", "Observation_ID", "User_id", "User_Tree_id", "State_name", "Species_name", "Year"], axis=1) # Drop columns not used in training
    
    model = IsolationForest(n_estimators = num_trees, contamination = 0.08, verbose = 1, random_state = 42) # Generate isolation forest
    # Note: Higher contamination score means more false positive outliers; Lower contamination score means more false negative outliers
    
    invalid_indices = []
    
    for week in df["Week"].sort_values().unique(): # Iterate over each week
        week_df = df[df["Week"] == week] # Use only observations from the current week
        week_df = week_df.drop("Week", axis=1) # Drop week column because it is not used in training
        
        model.fit(week_df) # Train isolation forest on particular week
        preds = model.predict(week_df) # Predict if each observation is an outlier
        week_df["Predictions"] = preds # Add prediction to observation data
        
        invalid_indices += list(week_df[week_df["Predictions"] == -1].index) # Record indices of outliers from the given week
    return invalid_indices 

In [12]:
def anomaly_detection_overall(df, min_observations_for_outlier_detection):
    """
    Performs anomaly detection on entire dataset

    Args:
        df (DataFrame): All citizen data
        min_observations_for_outlier_detection (int): Minimum number of observations in a subset of the data for anomaly detection to be performed
    Returns:
        invalid_indices (List(int)): Index values in citizen data associated with observations deemed outliers by isolation forests (invalid).
    """
    start_time = time.time() # Time
    invalid_indices = []
    states = df["State_name"].unique() # All states in dataset
    for state in states: # Iterate over all states
        print(f"**********{state}**********")
        state_start_time = time.time() # Time
        state_df = df[df["State_name"] == state] # Only observations from the given state
        years = state_df["Year"].sort_values().unique() # All years data was recorded in the given state
        for year in years: # Iterate over each year
            print(f"**********{year}**********")
            year_start_time = time.time() # Time
            state_year_df = state_df[state_df["Year"] == year] # Only observations in the given year from the given state
            species_list = state_year_df["Species_name"].unique() # All species in the given year from the given state
            for species in species_list: # Iterate over each species
                species_state_year_df = state_year_df[state_year_df["Species_name"] == species] # Only observations of the given species in the given year from the given state
                if len(species_state_year_df) > min_observations_for_outlier_detection: # Verifying there are enough observations for anomaly detection
                    species_start_time = time.time() # Time
                    outliers = outlier_detection(species_state_year_df) # Run outlier detection on observations
                    invalid_indices += outliers # Record indices of outliers
                    print(f"{len(outliers)}/{len(species_state_year_df)} observations invalid in {species} in {year} in {state}") # Outlier count
                    print(f"Finished {species} in {state} during {year} in {time.time()-species_start_time} seconds") # Time
            print(f"Finished {state} during {year} in {time.time()-year_start_time} seconds") # Time
        print(f"Finished {state} in {time.time()-state_start_time} seconds") # Time
    print(f"{len(invalid_indices)}/{len(df)} observations invalid overall") # Total outlier count
    print(f"Finished completely in {time.time()-start_time} seconds") # Time
    return invalid_indices

### Validation Label 3

In [13]:
# Run anomaly detection on all citizen data and drop any anomalies
invalid_indices = anomaly_detection_overall(df, 15*52) # Choose 15*52 as the min num of observations because it means there's an average of 15 observations per week
val_df.loc[invalid_indices, 'validation_label'] = np.full(len(invalid_indices), 3) # 3 represents observations flagged as outliers by our anomaly detection system (isolation forests machine learning)
df = df.drop(invalid_indices) # Drop observations deemed outliers

**********Kerala**********
**********2014**********
Finished Kerala during 2014 in 0.0014238357543945312 seconds
**********2015**********
Finished Kerala during 2015 in 0.0010044574737548828 seconds
**********2016**********
Finished Kerala during 2016 in 0.0009868144989013672 seconds
**********2017**********
Finished Kerala during 2017 in 0.004163980484008789 seconds
**********2018**********
389/4672 observations invalid in Coconut palm-Cocos nucifera in 2018 in Kerala
Finished Coconut palm-Cocos nucifera in Kerala during 2018 in 21.61777901649475 seconds
284/3238 observations invalid in Jackfruit- Artocarpus heterophyllus in 2018 in Kerala
Finished Jackfruit- Artocarpus heterophyllus in Kerala during 2018 in 21.711244106292725 seconds
113/1170 observations invalid in Jamun- Syzygium cumini in 2018 in Kerala
Finished Jamun- Syzygium cumini in Kerala during 2018 in 21.206090450286865 seconds
227/2596 observations invalid in Mango (all varieties)- Mangifera indica in 2018 in Kerala
Finis

# Save Data Cleaning Labeled Citizen Data in One File

In [15]:
# Save raw citizen data labeled with validation labels
val_df.to_csv('data_cleaning_labeled_alldata.csv', index=False)

## Key for `validation_label` Column

| Label | Meaning |
| :----: | :----- |
| 0      | Kept    |
| 1      | Dropped because a phenophase was incorrectly reported as being -2 |
| 2      | Dropped because a phenophase had missing data (Null Values) |
| 3      | Dropped because observation was flagged as anomalous |