# Imports

In [3]:
!pip install geopandas shapely



In [50]:
import pandas as pd
import geopandas as gpd
import numpy as np
from shapely.geometry import Point 
import math
import os
import time
from sklearn.ensemble import IsolationForest
from sklearn.cluster import KMeans

# Previewing Original Citizen Data

In [51]:
df = pd.read_csv("alldata.csv")
df.head() # Previewing alldata.csv before cleaning

Unnamed: 0.1,Unnamed: 0,Observation_ID,Date_of_observation,User_id,User_Tree_id,Species_name,Lat,Long,State_name,Leaves_fresh,Leaves_mature,Leaves_old,Flowers_bud,Flowers_open,Flowers_male,Flowers_Female,Fruits_unripe,Fruits_ripe,Fruits_open
0,1,388564.0,2020-01-01,20396.0,84299.0,Indian Almond- Terminalia catappa,12.15386,75.22397,Kerala,2.0,0.0,0.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
1,2,388565.0,2020-01-01,20396.0,84300.0,Indian Almond- Terminalia catappa,12.15386,75.22397,Kerala,2.0,0.0,0.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
2,3,388566.0,2020-01-01,20396.0,84301.0,Fish-tail palm- Caryota urens,12.1406,75.22145,Kerala,0.0,2.0,0.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
3,4,388567.0,2020-01-01,20396.0,84302.0,Mast Tree-Monoon longifolium,12.1406,75.22145,Kerala,1.0,2.0,0.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
4,5,388568.0,2020-01-01,20396.0,84303.0,Indian Almond- Terminalia catappa,12.1406,75.22145,Kerala,0.0,1.0,2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0


# Handling Incorrect -2 Values

In [52]:
# Replacing incorrect -2 values with either NA or -2
all_species = list(df['Species_name'].value_counts().index) # all species named in order of prevalence
phenophases = list(df.columns[9:]) # Phenophases

def create_species_dict(*absent_phenophases):
    species_dict = dict(zip(phenophases, np.zeros(len(phenophases), int)))
    for phenophase in absent_phenophases:
        species_dict[phenophase] = 1
    return species_dict

handbook_dicts = {} # Dict mapping species to phenophase dicts. 
# phenophase dicts give absent phenophases in the associated species.
# Manually input absent phenophases from SeasonWatch handbook
handbook_dicts[all_species[0]] = create_species_dict('Flowers_open','Fruits_open')
handbook_dicts[all_species[1]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[2]] = create_species_dict('Flowers_open', 'Fruits_open')
handbook_dicts[all_species[3]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[4]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[5]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[6]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[7]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[8]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[9]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[10]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[11]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[12]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[13]] = create_species_dict('Flowers_bud', 'Flowers_open', 'Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[14]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[15]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[16]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[17]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[18]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[19]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[20]] = create_species_dict('Flowers_bud', 'Flowers_open', 'Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[21]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[22]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[23]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[24]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[25]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[26]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[27]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[28]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[29]] = create_species_dict('Flowers_bud', 'Flowers_open', 'Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[30]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[31]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[32]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[33]] = create_species_dict('Flowers_bud', 'Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[34]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[35]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[36]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[37]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[38]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[39]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[40]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[41]] = create_species_dict('Flowers_open', 'Fruits_open')
handbook_dicts[all_species[42]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[43]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[44]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[45]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[46]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[47]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[48]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[49]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[50]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[51]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[52]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[53]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[54]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[55]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[56]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[57]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[58]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[59]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[60]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[61]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[62]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[63]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[64]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[65]] = create_species_dict('Flowers_open', 'Fruits_open') # Silkworm Mulberry
handbook_dicts[all_species[66]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[67]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[68]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[69]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[70]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[71]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[72]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[73]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[74]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[75]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[76]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[77]] = create_species_dict('Flowers_open', 'Fruits_open') # Box-myrtle
handbook_dicts[all_species[78]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[79]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[80]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open') # Airi Mango
handbook_dicts[all_species[81]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[82]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[83]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[84]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[85]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[86]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[87]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[88]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[89]] = create_species_dict('Flowers_open','Fruits_open')
handbook_dicts[all_species[90]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[91]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[92]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[93]] = create_species_dict('Flowers_open','Fruits_open')
handbook_dicts[all_species[94]] = create_species_dict('Flowers_open') # Wild Almond
handbook_dicts[all_species[95]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[96]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[97]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[98]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[99]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[100]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[101]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[102]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[103]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[104]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open') # Alphonso Mango
handbook_dicts[all_species[105]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[106]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[107]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[108]] = create_species_dict('Flowers_open')
handbook_dicts[all_species[109]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[110]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[111]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[112]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[113]] = create_species_dict('Flowers_open','Fruits_open')
handbook_dicts[all_species[114]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[115]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open') # Aabehayat Mango
handbook_dicts[all_species[116]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[117]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[118]] = create_species_dict('Flowers_bud', 'Flowers_open', 'Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[119]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[120]] = create_species_dict('Flowers_open')
handbook_dicts[all_species[121]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[122]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[123]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[124]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[125]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[126]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[127]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[128]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[129]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open') # Manjeera Mango
handbook_dicts[all_species[130]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[131]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[132]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[133]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[134]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[135]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[136]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[137]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[138]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[139]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[140]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[141]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[142]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[143]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[144]] = create_species_dict('Flowers_male', 'Flowers_Female') # Blue Pine
handbook_dicts[all_species[145]] = create_species_dict('Flowers_open')
handbook_dicts[all_species[146]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[147]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[148]] = create_species_dict('Flowers_open','Fruits_open')
handbook_dicts[all_species[149]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[150]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[151]] = create_species_dict('Flowers_open', 'Fruits_open') # Indian Charcoal Tree
handbook_dicts[all_species[152]] = create_species_dict('Flowers_open','Fruits_open')
handbook_dicts[all_species[153]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[154]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[155]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[156]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[157]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[158]] = create_species_dict('Flowers_open')
handbook_dicts[all_species[159]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[160]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open') # Mallika Mango
handbook_dicts[all_species[161]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[162]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[163]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[164]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[165]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[166]] = create_species_dict('Flowers_open','Fruits_open') # Mohru Oak
handbook_dicts[all_species[167]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[168]] = create_species_dict('Flowers_bud', 'Flowers_open', 'Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[169]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[170]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[171]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[172]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open') # Chosa Mango
handbook_dicts[all_species[173]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[174]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open') # Olour Mango
handbook_dicts[all_species[175]] = create_species_dict('Flowers_open','Fruits_open')
handbook_dicts[all_species[176]] = create_species_dict('Flowers_bud', 'Flowers_male', 'Flowers_Female', 'Fruits_open')

# Replace Incorrect -2 Values
for species in all_species:
    species_df = df[df['Species_name'] == species]
    species_dict = handbook_dicts[species]
    for phenophase in phenophases:
        if species_dict[phenophase] == 0:
            false_positive_idx = species_df.index[species_df[phenophase] == -2] # Indices of reports that incorrectly assign -2 values (false positive) to phenophases that DO appear in the species
            df.loc[false_positive_idx, phenophase] = np.full(len(false_positive_idx),np.nan) # turn into NaN so they will be dropped
            
        if species_dict[phenophase] == 1:
            false_negative_idx = species_df.index[species_df[phenophase] != -2] # Indices of reports that incorrectly assign values other than -2 (false negative) to phenophases that DO NOT appear in the species
            df.loc[false_negative_idx, phenophase] = np.full(len(false_negative_idx),-2.0) # convert all values for the absent phenophase to -2

# Combining Mango Varieties

In [53]:
# Combining all mango varieties under the Species_name of Mango (all varieties)- Mangifera indica
df['Species_name'] = df['Species_name'].replace(to_replace=r'\w* Mango- Mangifera indica', value='Mango (all varieties)- Mangifera indica', regex=True)

# Filling in Missing States

In [54]:
states_shapefile = gpd.read_file("india/gadm41_IND_3.shp")

In [55]:
# Function for filling state_name attribute based on coordinates for observations with NA state_name
def find_indian_state(latitude, longitude, gdf):
    point = Point(longitude, latitude)
    
    for _, state in gdf.iterrows():
        if state['geometry'].contains(point):
            return state['NAME_1']
    return None

state = find_indian_state(13.07248, 80.24340, states_shapefile)
print(state) # IF THE FUNCTION WORKS THIS SHOULD OUTPUT TAMIL NADU

Tamil Nadu


In [56]:
# Fill any missing state names in dataset
# !!! Warning: This process take ~2.1 hours !!!
import time
start_time = time.time()
counter = 0
observations_missing_state_name = df[df["State_name"].isna()].drop(['State_name'], axis=1).dropna(how='any')
for idx, row in observations_missing_state_name.iterrows():
    if (counter % 1000) == 0:
        print(f"{counter}: {time.time()-start_time} seconds elapsed; {counter/len(observations_missing_state_name)*100}% Done")
    df.at[idx, "State_name"] = find_indian_state(row["Lat"], row["Long"], states_shapefile)
    counter += 1
# In case any states are labeled as a name inconsistent with our dataset
df['State_name'] = df['State_name'].replace('Andaman and Nicobar', 'Andaman and Nicobar Islands')
df['State_name'] = df['State_name'].replace('NCT of Delhi', 'Delhi')
print(f"Finished in {time.time()-start_time} seconds")


0: 0.050238847732543945 seconds elapsed; 0.0% Done


KeyboardInterrupt: 

# Dropping NAs

In [None]:
# get rid of NA values and sort by species name
df = df.drop(df.columns[0], axis=1)
df = df.dropna()
df = df.sort_values(by='Species_name')

# Reformatting & Adding Date Columns

In [None]:
# Reformats df to Year, Week formatting to match the reference data
df["Date_of_observation"] = pd.to_datetime(df["Date_of_observation"], format='mixed')
df["Year"] = df["Date_of_observation"].dt.isocalendar().year
df["Week"] = df["Date_of_observation"].dt.dayofyear // (366/48+0.0000000000001) # Weeks duration varies between 7 or 8 days
df["Week"] = df["Week"].astype(int)
# dt.dayofyear gives an index starting at 1, thus use 366 for leap years
# Add 0.0000000000001 bias so week is in range [0,47] instead of [0,48]

# !!! Use the following instead if 52 weeks are wanted !!!
# df["Year"] = df["Date_of_observation"].dt.isocalendar().week
# Warning: 52nd week will only be 1 or 2 days depending on if it's a leap year or not

# Anomaly Detection

In [None]:
# Helper function for anomaly_detection_overall
def outlier_detection(df, num_trees=500): # Returns list of indices of observations deemed outliers by isolation forests
    df = df.drop(["Date_of_observation", "Observation_ID", "User_id", "User_Tree_id", "State_name", "Species_name", "Year"], axis=1)
    
    model = IsolationForest(n_estimators = num_trees, verbose = 1, random_state = 42)
    
    invalid_indices = []
    
    for week in df["Week"].sort_values().unique():
        week_df = df[df["Week"] == week]
        week_df = week_df.drop("Week", axis=1)
        
        model.fit(week_df)
        preds = model.predict(week_df)
        week_df["Predictions"] = preds
        
        invalid_indices += list(week_df[week_df["Predictions"] == -1].index)
    return invalid_indices

In [None]:
def anomaly_detection_overall(df, min_observations_for_outlier_detection):
    start_time = time.time()
    invalid_indices = []
    states = df["State_name"].unique()
    for state in states:
        print(f"**********{state}**********")
        state_start_time = time.time()
        state_df = df[df["State_name"] == state]
        years = state_df["Year"].sort_values().unique()
        outliers_state = 0
        for year in years:
            print(f"**********{year}**********")
            year_start_time = time.time()
            state_year_df = state_df[state_df["Year"] == year]
            species_list = state_year_df["Species_name"].unique()
            outliers_year = 0
            for species in species_list:
                species_state_year_df = state_year_df[state_year_df["Species_name"] == species]
                if len(species_state_year_df) > min_observations_for_outlier_detection:
                    species_start_time = time.time()
                    outliers = outlier_detection(species_state_year_df)
                    outliers_year += len(outliers)
                    outliers_state += len(outliers)
                    invalid_indices += outliers
                    print(f"{len(outliers)}/{len(species_state_year_df)} observations invalid in {species} in {year} in {state}")
                    print(f"Finished {species} in {state} during {year} in {time.time()-species_start_time} seconds")
            print(f"{len(outliers_year)}/{len(state_year_df)} observations invalid in {year} in {state}")
            print(f"Finished {state} during {year} in {time.time()-year_start_time} seconds")
        print(f"{len(outliers_state)}/{len(state_df)} observations invalid in {state}")
        print(f"Finished {state} in {time.time()-state_start_time} seconds")
    print(f"{len(invalid_indices)}/{len(df)} observations invalid overall")
    print(f"Finished completely in {time.time()-start_time} seconds")
    return invalid_indices

In [None]:
# Run anomaly detection on all citizen data and drop any anomalies
invalid_indices = anomaly_detection_overall(df, 15*52) # Choose 15*52 as the min num of observations because it means there's an average of 15 observations per week
df = df.drop(invalid_indices)

# Species ID <-> Name Dicts

In [None]:
# load lookup dicts for id -> name and name -> id from species_codes.csv
species_codes = pd.read_csv("species codes.csv", encoding='unicode_escape')

species_id_to_name = {}
species_name_to_id = {}

for i, row in species_codes.iterrows():
    species_id_to_name[row["species_id"]] = "{}-{}".format(row["species_primary_common_name"], row["species_scientific_name"])
    species_name_to_id["{}-{}".format(row["species_primary_common_name"], row["species_scientific_name"]).lower().replace(" ", "")] = row["species_id"]

# Encoding and Decoding Species ID & Species Names in Citizen Data

In [None]:
# update names in citizen data
# NEEDS SPECIES_ID_TO_NAME AND SPECIES_NAME_TO_ID ALREADY DEFINED
for i, row in df.iterrows():
    name = row["Species_name"].lower().replace(" ", "")

    if name == "arjuntree-terminaliaarjuna": # if statements to catch any misspellings or missing entries in species codes.csv
        df.loc[i, "Species_name"] = species_id_to_name[1083]
        continue
    if name == "axlewoodtree-anogeissuslatifolia":
        df.loc[i, "Species_name"] = species_id_to_name[1009]
        continue
    if name == "chiku-sapodilla-manilkarazapota\xa0":
        df.loc[i, "Species_name"] = species_id_to_name[1188]
        continue
    if name == "dyer'soleander-wrightiatinctoria":
        df.loc[i, "Species_name"] = species_id_to_name[1181]
        continue
    if name == "ficusmollis-softfig":
        df.loc[i, "Species_name"] = species_id_to_name[1197]
        continue
    if name == "frangipani-templetree-plumeriarubra":
        df.loc[i, "Species_name"] = species_id_to_name[1176]
        continue
    if name == "garuga-kharpat-garugapinnata":
        df.loc[i, "Species_name"] = species_id_to_name[1038]
        continue
    if name == "ghostrree-sterculiaurens":
        df.loc[i, "Species_name"] = species_id_to_name[1078]
        continue
    if name == "indianfrankincense-boswelliaserrata":
        df.loc[i, "Species_name"] = species_id_to_name[1195]
        continue
    if name == "indiancoraltree-erythrinaindica":
        df.loc[i, "Species_name"] = species_id_to_name[1034]
        continue
    if name == "kadamba-neolamarckiacadamba":
        df.loc[i, "Species_name"] = species_id_to_name[1058]
        continue
    if name == "lanneacoromandelica-indianashtree":
        df.loc[i, "Species_name"] = species_id_to_name[1194]
        continue
    if name == "mexicanoleander-yellowoleander-cascabelathevetia":
        df.loc[i, "Species_name"] = species_id_to_name[1177]
        continue
    if name == "nightfloweringjasmine-harsingar-nyctanthesarbor-tristis":
        df.loc[i, "Species_name"] = species_id_to_name[1059]
        continue
    if name == "prosopiscineraria-khejri":
        df.loc[i, "Species_name"] = species_id_to_name[1201]
        continue
    if name == "raintree-samaneasaman":
        df.loc[i, "Species_name"] = species_id_to_name[1162]
        continue
    if name == "redsilk-cotton-bombaxceiba":
        df.loc[i, "Species_name"] = species_id_to_name[1015]
        continue
    if name == "whitesilk-cotton-ceibapentandra":
        df.loc[i, "Species_name"] = species_id_to_name[1021]
        continue
    if name == "yellow-silkcottontree-cochlospermumreligiosum":
        df.loc[i, "Species_name"] = species_id_to_name[1023]
        continue
    if name == "karkat-dogteak-dilleniapentagyna":
        df.loc[i, "Species_name"] = species_id_to_name[1032]
        continue
    if name == "chosamango-mangiferaindica":
        df.loc[i, "Species_name"] = species_id_to_name[1108]
        continue
    if name == "falsewhiteteak-mallotusnudiflorus":
        df.loc[i, "Species_name"] = species_id_to_name[1088]
        continue
    if name == "floss-silktree-ceibaspeciosa":
        df.loc[i, "Species_name"] = species_id_to_name[1022]
        continue
    if name == "largesebesten-bairola-cordiawallichii":
        df.loc[i, "Species_name"] = species_id_to_name[1026]
        continue
    if name == "roxburghskydia-pulia-kydiacalycina":
        df.loc[i, "Species_name"] = species_id_to_name[1044]
        continue
    if name == "wildrose-rosawebbiana":
        df.loc[i, "Species_name"] = species_id_to_name[1206]
        continue
    if name == "albiziaodoratissima-blacksiris":
        df.loc[i, "Species_name"] = species_id_to_name[1199]
        continue
    if name == "anogeissuspendula-kardhai":
        df.loc[i, "Species_name"] = species_id_to_name[1198]
        continue
    if name == "brokenbonestree-oroxylumindicum":
        df.loc[i, "Species_name"] = "Broken Bones Tree-Oroxylum Indicum"
        continue
    if name == "pyinmatree-andamancrapemyrtle-lagerstroemiahypoleuca":
        df.loc[i, "Species_name"] = species_id_to_name[1216]
        continue
    if name == "crataevareligiosa-garlic-peartree":
        df.loc[i, "Species_name"] = species_id_to_name[1196]
        continue
    if name == "guh-de-three-leafcapertree-cratevaadansonii":
        df.loc[i, "Species_name"] = species_id_to_name[1217]
        continue
    if name == "aabehayatmango-mangiferaindica":
        df.loc[i, "Species_name"] = species_id_to_name[1091]
        continue
    if name == "bedu-punjabfig-ficuspalmata":
        df.loc[i, "Species_name"] = species_id_to_name[1235]
        continue
    if name == "chinar-platanusorientalis":
        df.loc[i, "Species_name"] = "Chinar-Platanus Orientalis"
        continue
    if name == "prunusnepalensis-sohiong":
        df.loc[i, "Species_name"] = species_id_to_name[1192]
        continue
    if name == "tecomellaundulata-roheda":
        df.loc[i, "Species_name"] = species_id_to_name[1200]
        continue
    if name == "tigersmilkspruce-falconeriainsignis":
        df.loc[i, "Species_name"] = "Tiger's Milk Spruce-Falconeria Insignis"
        continue
    if species_name_to_id[name]:
        df.loc[i, "Species_name"] = species_id_to_name[species_name_to_id[name]] # species dictionaries to undo the .lower().replace(" ","") formatting
df.insert(loc = 4, column = 'Species_id', value = [species_name_to_id.get(species.lower().replace(" ",""), np.nan) for species in df["Species_name"]])

# Save Updated Citizen Data in One File

In [None]:
# Save updated all_data.csv
df.to_csv('updated_alldata.csv', index=False) # SAVED IT

# Make Directories for Citizen and Reference Data

In [None]:
# paths to citizen and reference data. create if they do not exist
os.makedirs("all data/citizen", exist_ok=True)
os.makedirs("all data/reference", exist_ok=True)

# State DFs to citizenData folder

In [None]:
for state_name in df["State_name"].unique():
    state_df = df[df["State_name"] == state_name]
    state_name = state_name.replace(" ","_").lower() # Reformat state names to lowercase with _ instead of spaces
    state_df.to_csv(f"all data/citizen/{state_name}.csv", index=False)

# Reference Data Cleaning

In [91]:
# Dicts and Functions
code_to_attribute_dict = {'FL': 'Leaves_fresh', 'ML': 'Leaves_mature', 'DL': 'Leaves_old', 
                          'BD': 'Flowers_bud', 'OF': 'Flowers_open', 'MF': 'Flowers_male', 
                          'FF': 'Flowers_Female', 'UFR': 'Fruits_unripe', 'RFR': 'Fruits_ripe', 
                          'OFR': 'Fruits_open'}
code_to_category_dict = {'NA': None, '': None, '0': 0, '1': 1, '2': 2} #TODO - what should map to -1?

def code_to_attribute_category_tuple(code):
    code_attr_tup = code.split('_')
    code_attr_tup[0] = code_to_attribute_dict[code_attr_tup[0]]
    code_attr_tup[1] = code_to_category_dict[code_attr_tup[1]]
    return tuple(code_attr_tup)

def get_duplicates_df(df):
    duplicates_df = pd.DataFrame({'species_id': [], 'all_same': []})
    for species_id in df['species_id'].unique():
        df_filtered = df[df['species_id'] == species_id]
        if len(df_filtered) > 1:
            # see if categorical vectors are all the same:
            all_equal = True
            for colname in df.columns[3:-2]:
                all_equal = all_equal and df_filtered[colname].nunique() == 1
            duplicates_df.loc[len(duplicates_df) + 1] = pd.Series({'species_id': species_id, 'all_same': all_equal})
    return duplicates_df

def clean_df(df):
    # TODO:
    # - get rid of null values

    # get rid of duplicate rows
    print("getting rid of duplicates")
    duplicates_df = get_duplicates_df(df)
    for i in range(len(duplicates_df)):
        dup_id = duplicates_df.iloc[i]['species_id']
        drop_idx = list(df['species_id']).index(dup_id)
        df = df.drop(index=drop_idx)
    print("got rid of duplicates")
    # new columns: everything in df that is not a month, plus the 10 categorical codes, plus a 'week' column
    week_codes = ['Jan_wk1', 'Jan_wk2', 'Jan_wk3', 'Jan_wk4', 'Feb_wk1', 
                   'Feb_wk2', 'Feb_wk3', 'Feb_wk4', 'Mar_wk1', 'Mar_wk2',
                   'Mar_wk3', 'Mar_wk4', 'Apr_wk1', 'Apr_wk2', 'Apr_wk3', 'Apr_wk4',
                   'May_wk1', 'May_wk2', 'May_wk3', 'May_wk4', 'Jun_wk1', 'Jun_wk2',
                   'Jun_wk3', 'Jun_wk4', 'Jul_wk1', 'Jul_wk2', 'Jul_wk3', 'Jul_wk4',
                   'Aug_wk1', 'Aug_wk2', 'Aug_wk3', 'Aug_wk4', 'Sep_wk1', 'Sep_wk2',
                   'Sep_wk3', 'Sep_wk4', 'Oct_wk1', 'Oct_wk2', 'Oct_wk3', 'Oct_wk4',
                   'Nov_wk1', 'Nov_wk2', 'Nov_wk3', 'Nov_wk4', 'Dec_wk1', 'Dec_wk2','Dec_wk3', 'Dec_wk4']
    
    cat_codes = ['Leaves_fresh', 'Leaves_mature', 'Leaves_old', 'Flowers_bud',
       'Flowers_open', 'Flowers_male', 'Flowers_Female', 'Fruits_unripe',
       'Fruits_ripe', 'Fruits_open']

    base_cols = [not_week_code for not_week_code in filter(lambda c: c not in week_codes, list(df.columns))]
    try:
        base_cols.remove('created_at')
    except:
        print("WARNING: created_at column not found")
    
    new_cols = base_cols + ['week'] + cat_codes
    
    new_df = pd.DataFrame(columns=new_cols)

    num_idxs = len(df)
    for idx, row in df.iterrows():
        if idx % 10 == 0:
            print("{} % done".format(100 * idx / num_idxs))
        for week_idx, week_name in enumerate(week_codes):
            new_datapoint = {}
            for old_colname in base_cols:
                new_datapoint[old_colname] = row[old_colname]
            new_datapoint['week'] = week_idx
            cat_vector_for_week = row[week_name]

            # make nan map to a cat vector of all Nones
            # print(type(cat_vector_for_week), cat_vector_for_week)
            if not isinstance(cat_vector_for_week, str):
                if math.isnan(cat_vector_for_week):
                    for attr_name in code_to_attribute_dict.values():
                        new_datapoint[attr_name] = None
                    new_df.loc[len(new_df)] = new_datapoint
                    continue
            
            cat_vector_list = list(map(code_to_attribute_category_tuple, cat_vector_for_week.split(",")))
            for (attr, cat) in cat_vector_list:
                new_datapoint[attr] = cat

            # add new datapoint to df
            new_df.loc[len(new_df)] = new_datapoint
            # remove unnamed columns if they exist
            for col in new_df.columns:
                if 'Unnamed' in col or col == 'id':
                    new_df = new_df.drop(col, axis=1)
            new_df['species_name'] = new_df['species_id'].map(species_id_to_name) # Adding species_name column

    return new_df

# Saving Cleaned Reference Data Files

In [92]:
# clean all old pvt dataframes, put in ./reference_data

for tablename in os.listdir('./pvttables_raw/'):
    if tablename == '.DS_Store':
        continue
    print("cleaning {}".format(tablename))
    ref_df = pd.read_csv('./pvttables_raw/{}'.format(tablename), sep=';')
    tablename = tablename.replace("pvt_","")
    if tablename == 'maharastra.csv':
        tablename = 'maharashtra.csv'
    new_df = clean_df(ref_df)
    new_df.to_csv('./all data/reference/{}'.format(tablename), index=False)

cleaning pvt_andaman_and_nicobar_islands.csv
getting rid of duplicates
got rid of duplicates
0.0 % done
4.184100418410042 % done
8.368200836820083 % done
12.552301255230125 % done
16.736401673640167 % done
20.92050209205021 % done
25.10460251046025 % done
29.288702928870293 % done
33.47280334728033 % done
37.65690376569037 % done
41.84100418410042 % done
46.02510460251046 % done
50.2092050209205 % done
54.39330543933055 % done
58.57740585774059 % done
62.76150627615063 % done
66.94560669456067 % done
71.1297071129707 % done
75.31380753138075 % done
79.4979079497908 % done
83.68200836820084 % done
87.86610878661088 % done
92.05020920502092 % done
96.23430962343096 % done
100.418410041841 % done
cleaning pvt_rajasthan.csv
getting rid of duplicates
got rid of duplicates
0.0 % done
4.166666666666667 % done
8.333333333333334 % done
12.5 % done
16.666666666666668 % done
20.833333333333332 % done
25.0 % done
29.166666666666668 % done
33.333333333333336 % done
37.5 % done
41.666666666666664 % 

62.5 % done
66.66666666666667 % done
70.83333333333333 % done
75.0 % done
79.16666666666667 % done
83.33333333333333 % done
87.5 % done
91.66666666666667 % done
95.83333333333333 % done
100.0 % done
cleaning pvt_odisha.csv
getting rid of duplicates
got rid of duplicates
0.0 % done
4.166666666666667 % done
8.333333333333334 % done
12.5 % done
16.666666666666668 % done
20.833333333333332 % done
25.0 % done
29.166666666666668 % done
33.333333333333336 % done
37.5 % done
41.666666666666664 % done
45.833333333333336 % done
50.0 % done
54.166666666666664 % done
58.333333333333336 % done
62.5 % done
66.66666666666667 % done
70.83333333333333 % done
75.0 % done
79.16666666666667 % done
83.33333333333333 % done
87.5 % done
91.66666666666667 % done
95.83333333333333 % done
100.0 % done
cleaning pvt_tamil_nadu.csv
getting rid of duplicates
got rid of duplicates
0.0 % done
4.166666666666667 % done
8.333333333333334 % done
12.5 % done
16.666666666666668 % done
20.833333333333332 % done
25.0 % done
