# Imports

In [1]:
# This is an uncommon library so we run a pip install of it just in case
!pip install geopandas shapely



In [2]:
# Common imports
import pandas as pd # Pandas for storing and manipulating CSV files as DataFrames (tables)
import geopandas as gpd # Geopandas for converting coordinates to states
from shapely.geometry import Point # Same as Geopandas
import numpy as np # Numpy for storing and manipulating arrays (lists)
import math # Math for finding NaN values
import os # OS for writing and reading directories (folders)
import time # Time for checking how long things take
from sklearn.ensemble import IsolationForest # Isolation Forests for anomaly detection

# Previewing Original Citizen Data

In [3]:
df = pd.read_csv("alldata.csv") # Load raw citizen data in
df.head() # Previewing raw (before cleaning) citizen data

Unnamed: 0.1,Unnamed: 0,Observation_ID,Date_of_observation,User_id,User_Tree_id,Species_name,Lat,Long,State_name,Leaves_fresh,Leaves_mature,Leaves_old,Flowers_bud,Flowers_open,Flowers_male,Flowers_Female,Fruits_unripe,Fruits_ripe,Fruits_open
0,1,388564.0,2020-01-01,20396.0,84299.0,Indian Almond- Terminalia catappa,12.15386,75.22397,Kerala,2.0,0.0,0.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
1,2,388565.0,2020-01-01,20396.0,84300.0,Indian Almond- Terminalia catappa,12.15386,75.22397,Kerala,2.0,0.0,0.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
2,3,388566.0,2020-01-01,20396.0,84301.0,Fish-tail palm- Caryota urens,12.1406,75.22145,Kerala,0.0,2.0,0.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
3,4,388567.0,2020-01-01,20396.0,84302.0,Mast Tree-Monoon longifolium,12.1406,75.22145,Kerala,1.0,2.0,0.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
4,5,388568.0,2020-01-01,20396.0,84303.0,Indian Almond- Terminalia catappa,12.1406,75.22145,Kerala,0.0,1.0,2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0


#Manually input absent phenophases for each species in the citizen dataset
Labels derived from SeasonWatch Tree Phenology Guide

In [15]:
df = pd.read_csv("alldata.csv")

# Replacing incorrect -2 values with either NA or -2
all_species = list(df['Species_name'].value_counts().index) # All species names in order of frequency
phenophases = list(df.columns[9:]) # Phenophase column names

def create_species_dict(*absent_phenophases):
    """
    Creates a dictionary informing if each phenophase is seen in a species or not (0 for seen, 1 for not seen)

    Args:
        absent_phenophase (List(string)): List of phenophases not seen in a species
        use_q (bool):  False means use MDP value iteration, true means use Q-learning
    Returns:
        species_dict (Dict(string,int)): A Dictionary mapping phenophases to binaries indicating presence in a species
    """
    species_dict = dict(zip(phenophases, np.zeros(len(phenophases), int)))
    for phenophase in absent_phenophases:
        species_dict[phenophase] = 1
    return species_dict

handbook_dicts = {} # Dict mapping species to species dicts.
#
handbook_dicts[all_species[0]] = create_species_dict('Flowers_open','Fruits_open')
handbook_dicts[all_species[1]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[2]] = create_species_dict('Flowers_open', 'Fruits_open')
handbook_dicts[all_species[3]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[4]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[5]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[6]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[7]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[8]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[9]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[10]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[11]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[12]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[13]] = create_species_dict('Flowers_bud', 'Flowers_open', 'Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[14]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[15]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[16]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[17]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[18]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[19]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[20]] = create_species_dict('Flowers_bud', 'Flowers_open', 'Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[21]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[22]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[23]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[24]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[25]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[26]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[27]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[28]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[29]] = create_species_dict('Flowers_bud', 'Flowers_open', 'Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[30]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[31]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[32]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[33]] = create_species_dict('Flowers_bud', 'Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[34]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[35]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[36]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[37]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[38]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[39]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[40]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[41]] = create_species_dict('Flowers_open', 'Fruits_open')
handbook_dicts[all_species[42]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[43]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[44]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[45]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[46]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[47]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[48]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[49]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[50]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[51]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[52]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[53]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[54]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[55]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[56]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[57]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[58]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[59]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[60]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[61]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[62]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[63]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[64]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[65]] = create_species_dict('Flowers_open', 'Fruits_open') # Silkworm Mulberry
handbook_dicts[all_species[66]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[67]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[68]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[69]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[70]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[71]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[72]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[73]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[74]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[75]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[76]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[77]] = create_species_dict('Flowers_open', 'Fruits_open') # Box-myrtle
handbook_dicts[all_species[78]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[79]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[80]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open') # Airi Mango
handbook_dicts[all_species[81]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[82]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[83]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[84]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[85]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[86]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[87]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[88]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[89]] = create_species_dict('Flowers_open','Fruits_open')
handbook_dicts[all_species[90]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[91]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[92]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[93]] = create_species_dict('Flowers_open','Fruits_open')
handbook_dicts[all_species[94]] = create_species_dict('Flowers_open') # Wild Almond
handbook_dicts[all_species[95]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[96]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[97]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[98]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[99]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[100]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[101]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[102]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[103]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[104]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open') # Alphonso Mango
handbook_dicts[all_species[105]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[106]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[107]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[108]] = create_species_dict('Flowers_open')
handbook_dicts[all_species[109]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[110]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[111]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[112]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[113]] = create_species_dict('Flowers_open','Fruits_open')
handbook_dicts[all_species[114]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[115]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open') # Aabehayat Mango
handbook_dicts[all_species[116]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[117]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[118]] = create_species_dict('Flowers_bud', 'Flowers_open', 'Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[119]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[120]] = create_species_dict('Flowers_open')
handbook_dicts[all_species[121]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[122]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[123]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[124]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[125]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[126]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[127]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[128]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[129]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open') # Manjeera Mango
handbook_dicts[all_species[130]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[131]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[132]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[133]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[134]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[135]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[136]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[137]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[138]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[139]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[140]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[141]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[142]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[143]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[144]] = create_species_dict('Flowers_male', 'Flowers_Female') # Blue Pine
handbook_dicts[all_species[145]] = create_species_dict('Flowers_open')
handbook_dicts[all_species[146]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[147]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[148]] = create_species_dict('Flowers_open','Fruits_open')
handbook_dicts[all_species[149]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[150]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[151]] = create_species_dict('Flowers_open', 'Fruits_open') # Indian Charcoal Tree
handbook_dicts[all_species[152]] = create_species_dict('Flowers_open','Fruits_open')
handbook_dicts[all_species[153]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[154]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[155]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[156]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[157]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[158]] = create_species_dict('Flowers_open')
handbook_dicts[all_species[159]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[160]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open') # Mallika Mango
handbook_dicts[all_species[161]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[162]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[163]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[164]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[165]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[166]] = create_species_dict('Flowers_open','Fruits_open') # Mohru Oak
handbook_dicts[all_species[167]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[168]] = create_species_dict('Flowers_bud', 'Flowers_open', 'Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[169]] = create_species_dict('Flowers_male', 'Flowers_Female')
handbook_dicts[all_species[170]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[171]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[172]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open') # Chosa Mango
handbook_dicts[all_species[173]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open')
handbook_dicts[all_species[174]] = create_species_dict('Flowers_male', 'Flowers_Female', 'Fruits_open') # Olour Mango
handbook_dicts[all_species[175]] = create_species_dict('Flowers_open','Fruits_open')
handbook_dicts[all_species[176]] = create_species_dict('Flowers_bud', 'Flowers_male', 'Flowers_Female', 'Fruits_open')






#Handling -2 incorrect values

False positives and false negatives are grouped together with the value of -2

In [16]:
# Replace Incorrect -2 Values
for species in all_species:
    if species in handbook_dicts:  # Only apply handbook info if species is in the manually entered dictionary
        species_df = df[df['Species_name'] == species]
        species_dict = handbook_dicts[species]
        for phenophase in phenophases:
            if species_dict[phenophase] == 0:  # Phenophase seen in species
                false_positive_idx = species_df.index[species_df[phenophase] == -2]  # Indices of reports that incorrectly assign -2 values
                df.loc[false_positive_idx, phenophase] = np.full(len(false_positive_idx), np.nan)  # Turn all false positives into NaN
            if species_dict[phenophase] == 1:  # Phenophase NOT seen in species
                false_negative_idx = species_df.index[species_df[phenophase] != -2]  # Indices of reports that incorrectly assign values other than -2
                df.loc[false_negative_idx, phenophase] = np.full(len(false_negative_idx), -2.0)  # Turn all false negatives into -2 values

# Combining Mango Varieties

In [17]:
# Combining all mango varieties under the Species_name of Mango (all varieties)- Mangifera indica
df['Species_name'] = df['Species_name'].replace(to_replace=r'\w* Mango- Mangifera indica', value='Mango (all varieties)- Mangifera indica', regex=True)

# Filling in Missing States using GoogleAPI


In [36]:
import requests
import pandas as pd

# Function to get the state name from Google Maps Geocoding API
def find_indian_state_google(latitude, longitude, api_key):
    """
    Finds the state name associated with a set of coordinates using Google Maps Geocoding API

    Args:
        latitude, longitude (float, float): Latitude and longitude coordinates
        api_key (str): Your Google Maps API key
    Returns:
        state_name (str): The state name if within India, otherwise None
    """
    # Google Maps Geocoding API endpoint
    url = "https://maps.googleapis.com/maps/api/geocode/json"

    # Parameters for the API call
    params = {
        "latlng": f"{latitude},{longitude}",
        "key": api_key
    }

    # Make the request to the API
    response = requests.get(url, params=params)

    # Check if the request was successful
    if response.status_code == 200:
        data = response.json()

        # Extract the relevant address components (state and country)
        if data['status'] == 'OK':
            for result in data['results']:
                country = None
                state_name = None

                for component in result['address_components']:
                    if 'administrative_area_level_1' in component['types']:
                        state_name = component['long_name']  # Capture state name
                    if 'country' in component['types']:
                        country = component['short_name']  # Capture country code ('IN' for India)

                # If the country is India, return the state name
                if country == 'IN':
                    return state_name

        else:
            print("Geocoding API error:", data['status'])
    else:
        print("Error in request:", response.status_code)

    return None




In [37]:


# Load the dataset
file_path = 'alldata.csv'
df = pd.read_csv(file_path)

# Replace with your actual Google Maps API key
api_key = "USE YOUR API KEY"

# Find rows where State_name is missing, but Lat and Long are present
for index, row in df.iterrows():
    if pd.isna(row['State_name']) and not pd.isna(row['Lat']) and not pd.isna(row['Long']):
        latitude = row['Lat']
        longitude = row['Long']
        state_name = find_indian_state_google(latitude, longitude, api_key)

        # Update the DataFrame with the retrieved state name
        if state_name:
            df.at[index, 'State_name'] = state_name

# Save the updated DataFrame to a new CSV file
df.to_csv('updated_alldata.csv', index=False)

print("Data saved to 'updated_alldata.csv'.")



Geocoding API error: ZERO_RESULTS
Geocoding API error: ZERO_RESULTS
Geocoding API error: ZERO_RESULTS
Geocoding API error: ZERO_RESULTS
Geocoding API error: ZERO_RESULTS
Geocoding API error: ZERO_RESULTS
Data saved to 'updated_alldata.csv'.


#Dropping rows that are not needed

In [66]:

data = pd.read_csv(file_path)

# Dropping unnecessary columns
columns_to_drop = ['User_id', 'Observation_ID', 'User_Tree_id']
cleaned_data = data.drop(columns=columns_to_drop)

# Dropping rows where 'Lat' or 'Long' values are missing (NaN)
cleaned_data = cleaned_data.dropna(subset=['Lat', 'Long'])


cleaned_data.head()


Unnamed: 0.1,Unnamed: 0,Date_of_observation,Species_name,Lat,Long,State_name,Leaves_fresh,Leaves_mature,Leaves_old,Flowers_bud,Flowers_open,Flowers_male,Flowers_Female,Fruits_unripe,Fruits_ripe,Fruits_open
0,1,2020-01-01,Indian Almond- Terminalia catappa,12.15386,75.22397,Kerala,2.0,0.0,0.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
1,2,2020-01-01,Indian Almond- Terminalia catappa,12.15386,75.22397,Kerala,2.0,0.0,0.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
2,3,2020-01-01,Fish-tail palm- Caryota urens,12.1406,75.22145,Kerala,0.0,2.0,0.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
3,4,2020-01-01,Mast Tree-Monoon longifolium,12.1406,75.22145,Kerala,1.0,2.0,0.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
4,5,2020-01-01,Indian Almond- Terminalia catappa,12.1406,75.22145,Kerala,0.0,1.0,2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0


In [67]:
df= cleaned_data

# Reformatting & Adding Date Columns

In [68]:

import pandas as pd



cleaned_data = df


cleaned_data["Date_of_observation"] = pd.to_datetime(cleaned_data["Date_of_observation"], dayfirst=True, errors='coerce')

# Extract the year
cleaned_data["Year"] = cleaned_data["Date_of_observation"].dt.isocalendar().year

# Calculate the week number with 48 weeks instead of 52
cleaned_data["Week"] = cleaned_data["Date_of_observation"].dt.dayofyear // (366/48 + 0.0000000000001)

# Handle any potential NaN values in the Week column and convert to integers
cleaned_data["Week"] = cleaned_data["Week"].fillna(0).astype(int)

output_file_path = 'cleaned_data_with_year_week.csv'
cleaned_data.to_csv(output_file_path, index=False)

cleaned_data[["Date_of_observation", "Year", "Week"]].head()


Unnamed: 0,Date_of_observation,Year,Week
0,2020-01-01,2020,0
1,2020-01-01,2020,0
2,2020-01-01,2020,0
3,2020-01-01,2020,0
4,2020-01-01,2020,0


# Anomaly Detection

In [73]:
def outlier_detection(df, num_trees=500):
    """
    Helper function for anomaly_detection_overall. Gives indices of anomalous observations.

    Args:
        df (DataFrame): Observations spanning at least an entire year. Observations are preferably of the same species within the same state.
        num_trees (int): Number of trees making up an isolation forest. Higher number of trees reduces variance but slows down training.
    Returns:
        invalid_indices (List(int)): Index values in citizen data associated with observations deemed outliers by an isolation forest (invalid).
    """

    df = df.drop(["Date_of_observation", "State_name", "Species_name", "Year"], axis=1) # Drop columns not used in training

    model = IsolationForest(n_estimators = num_trees, contamination = 0.08, verbose = 1, random_state = 42) # Generate isolation forest
    # Note: Higher contamination score means more false positive outliers; Lower contamination score means more false negative outliers

    invalid_indices = []

    for week in df["Week"].sort_values().unique(): # Iterate over each week
        week_df = df[df["Week"] == week] # Use only observations from the current week
        week_df = week_df.drop("Week", axis=1) # Drop week column because it is not used in training

        model.fit(week_df) # Train isolation forest on particular week
        preds = model.predict(week_df) # Predict if each observation is an outlier
        week_df["Predictions"] = preds # Add prediction to observation data

        invalid_indices += list(week_df[week_df["Predictions"] == -1].index) # Record indices of outliers from the given week
    return invalid_indices

In [74]:
def anomaly_detection_overall(df, min_observations_for_outlier_detection):
    """
    Performs anomaly detection on entire dataset

    Args:
        df (DataFrame): All citizen data
        min_observations_for_outlier_detection (int): Minimum number of observations in a subset of the data for anomaly detection to be performed
    Returns:
        invalid_indices (List(int)): Index values in citizen data associated with observations deemed outliers by isolation forests (invalid).
    """
    start_time = time.time() # Time
    invalid_indices = []
    states = df["State_name"].unique() # All states in dataset
    for state in states: # Iterate over all states
        print(f"**********{state}**********")
        state_start_time = time.time() # Time
        state_df = df[df["State_name"] == state] # Only observations from the given state
        years = state_df["Year"].sort_values().unique() # All years data was recorded in the given state
        for year in years: # Iterate over each year
            print(f"**********{year}**********")
            year_start_time = time.time() # Time
            state_year_df = state_df[state_df["Year"] == year] # Only observations in the given year from the given state
            species_list = state_year_df["Species_name"].unique() # All species in the given year from the given state
            for species in species_list: # Iterate over each species
                species_state_year_df = state_year_df[state_year_df["Species_name"] == species] # Only observations of the given species in the given year from the given state
                if len(species_state_year_df) > min_observations_for_outlier_detection: # Verifying there are enough observations for anomaly detection
                    species_start_time = time.time() # Time
                    outliers = outlier_detection(species_state_year_df) # Run outlier detection on observations
                    invalid_indices += outliers # Record indices of outliers
                    print(f"{len(outliers)}/{len(species_state_year_df)} observations invalid in {species} in {year} in {state}") # Outlier count
                    print(f"Finished {species} in {state} during {year} in {time.time()-species_start_time} seconds") # Time
            print(f"Finished {state} during {year} in {time.time()-year_start_time} seconds") # Time
        print(f"Finished {state} in {time.time()-state_start_time} seconds") # Time
    print(f"{len(invalid_indices)}/{len(df)} observations invalid overall") # Total outlier count
    print(f"Finished completely in {time.time()-start_time} seconds") # Time
    return invalid_indices

In [86]:



# Then run the anomaly detection function
invalid_indices = anomaly_detection_overall(df, 15*52)

# Drop the detected outliers from the dataset
df = df.drop(invalid_indices)


**********Kerala**********
**********2015.0**********
Finished Kerala during 2015.0 in 0.0019032955169677734 seconds
**********2016.0**********
Finished Kerala during 2016.0 in 0.0022268295288085938 seconds
**********2017.0**********
Finished Kerala during 2017.0 in 0.004521846771240234 seconds
**********2018.0**********
155/1826 observations invalid in Jackfruit-Artocarpus heterophyllus in 2018.0 in Kerala
Finished Jackfruit-Artocarpus heterophyllus in Kerala during 2018.0 in 26.600977182388306 seconds
136/1551 observations invalid in Mango (all varieties)-Mangifera indica in 2018.0 in Kerala
Finished Mango (all varieties)-Mangifera indica in Kerala during 2018.0 in 22.8853120803833 seconds
233/2834 observations invalid in Coconut palm-Cocos nucifera in 2018.0 in Kerala
Finished Coconut palm-Cocos nucifera in Kerala during 2018.0 in 24.633702516555786 seconds
Finished Kerala during 2018.0 in 74.36199736595154 seconds
**********2019.0**********
83/906 observations invalid in Indian Lab

# Species ID <-> Name Dicts

In [89]:
# Load species lookup dicts for id <-> name from species_codes.csv
species_codes = pd.read_csv("species codes.csv", encoding='unicode_escape')

species_id_to_name = {}
species_name_to_id = {}

# Creating species id <-> name dictionaries
# Refomatting species names from "species codes.csv" to match those of the citizen data
# Load the reformatted data to dictionaries for converting species_name to species_id and vice versa
for i, row in species_codes.iterrows():
    species_id_to_name[row["species_id"]] = "{}-{}".format(row["species_primary_common_name"], row["species_scientific_name"])
    species_name_to_id["{}-{}".format(row["species_primary_common_name"], row["species_scientific_name"]).lower().replace(" ", "")] = row["species_id"]

# Encoding and Decoding Species ID & Species Names in Citizen Data

In [90]:
import numpy as np

# Reformat species names in citizen data
for i, row in df.iterrows():
    name = row["Species_name"].lower().replace(" ", "")  # Reformat species names

    # Manual corrections for known species names
    if name == "arjuntree-terminaliaarjuna":
        df.loc[i, "Species_name"] = species_id_to_name[1083]
        continue
    if name == "axlewoodtree-anogeissuslatifolia":
        df.loc[i, "Species_name"] = species_id_to_name[1009]
        continue
    if name == "chiku-sapodilla-manilkarazapota\xa0":
        df.loc[i, "Species_name"] = species_id_to_name[1188]
        continue
    if name == "dyer'soleander-wrightiatinctoria":
        df.loc[i, "Species_name"] = species_id_to_name[1181]
        continue
    if name == "ficusmollis-softfig":
        df.loc[i, "Species_name"] = species_id_to_name[1197]
        continue
    if name == "frangipani-templetree-plumeriarubra": # If statements to catch any misspellings or missing entries in species codes.csv
        df.loc[i, "Species_name"] = species_id_to_name[1176]
        continue
    if name == "garuga-kharpat-garugapinnata": # If statements to catch any misspellings or missing entries in species codes.csv
        df.loc[i, "Species_name"] = species_id_to_name[1038]
        continue
    if name == "ghostrree-sterculiaurens": # If statements to catch any misspellings or missing entries in species codes.csv
        df.loc[i, "Species_name"] = species_id_to_name[1078]
        continue
    if name == "indianfrankincense-boswelliaserrata": # If statements to catch any misspellings or missing entries in species codes.csv
        df.loc[i, "Species_name"] = species_id_to_name[1195]
        continue
    if name == "indiancoraltree-erythrinaindica": # If statements to catch any misspellings or missing entries in species codes.csv
        df.loc[i, "Species_name"] = species_id_to_name[1034]
        continue
    if name == "kadamba-neolamarckiacadamba": # If statements to catch any misspellings or missing entries in species codes.csv
        df.loc[i, "Species_name"] = species_id_to_name[1058]
        continue
    if name == "lanneacoromandelica-indianashtree": # If statements to catch any misspellings or missing entries in species codes.csv
        df.loc[i, "Species_name"] = species_id_to_name[1194]
        continue
    if name == "mexicanoleander-yellowoleander-cascabelathevetia": # If statements to catch any misspellings or missing entries in species codes.csv
        df.loc[i, "Species_name"] = species_id_to_name[1177]
        continue
    if name == "nightfloweringjasmine-harsingar-nyctanthesarbor-tristis": # If statements to catch any misspellings or missing entries in species codes.csv
        df.loc[i, "Species_name"] = species_id_to_name[1059]
        continue
    if name == "prosopiscineraria-khejri": # If statements to catch any misspellings or missing entries in species codes.csv
        df.loc[i, "Species_name"] = species_id_to_name[1201]
        continue
    if name == "raintree-samaneasaman": # If statements to catch any misspellings or missing entries in species codes.csv
        df.loc[i, "Species_name"] = species_id_to_name[1162]
        continue
    if name == "redsilk-cotton-bombaxceiba": # If statements to catch any misspellings or missing entries in species codes.csv
        df.loc[i, "Species_name"] = species_id_to_name[1015]
        continue
    if name == "whitesilk-cotton-ceibapentandra": # If statements to catch any misspellings or missing entries in species codes.csv
        df.loc[i, "Species_name"] = species_id_to_name[1021]
        continue
    if name == "yellow-silkcottontree-cochlospermumreligiosum": # If statements to catch any misspellings or missing entries in species codes.csv
        df.loc[i, "Species_name"] = species_id_to_name[1023]
        continue
    if name == "karkat-dogteak-dilleniapentagyna": # If statements to catch any misspellings or missing entries in species codes.csv
        df.loc[i, "Species_name"] = species_id_to_name[1032]
        continue
    if name == "chosamango-mangiferaindica": # If statements to catch any misspellings or missing entries in species codes.csv
        df.loc[i, "Species_name"] = species_id_to_name[1108]
        continue
    if name == "falsewhiteteak-mallotusnudiflorus": # If statements to catch any misspellings or missing entries in species codes.csv
        df.loc[i, "Species_name"] = species_id_to_name[1088]
        continue
    if name == "floss-silktree-ceibaspeciosa": # If statements to catch any misspellings or missing entries in species codes.csv
        df.loc[i, "Species_name"] = species_id_to_name[1022]
        continue
    if name == "largesebesten-bairola-cordiawallichii": # If statements to catch any misspellings or missing entries in species codes.csv
        df.loc[i, "Species_name"] = species_id_to_name[1026]
        continue
    if name == "roxburghskydia-pulia-kydiacalycina": # If statements to catch any misspellings or missing entries in species codes.csv
        df.loc[i, "Species_name"] = species_id_to_name[1044]
        continue
    if name == "wildrose-rosawebbiana": # If statements to catch any misspellings or missing entries in species codes.csv
        df.loc[i, "Species_name"] = species_id_to_name[1206]
        continue
    if name == "albiziaodoratissima-blacksiris": # If statements to catch any misspellings or missing entries in species codes.csv
        df.loc[i, "Species_name"] = species_id_to_name[1199]
        continue
    if name == "anogeissuspendula-kardhai": # If statements to catch any misspellings or missing entries in species codes.csv
        df.loc[i, "Species_name"] = species_id_to_name[1198]
        continue
    if name == "brokenbonestree-oroxylumindicum": # If statements to catch any misspellings or missing entries in species codes.csv
        df.loc[i, "Species_name"] = "Broken Bones Tree-Oroxylum Indicum"
        continue
    if name == "pyinmatree-andamancrapemyrtle-lagerstroemiahypoleuca": # If statements to catch any misspellings or missing entries in species codes.csv
        df.loc[i, "Species_name"] = species_id_to_name[1216]
        continue
    if name == "crataevareligiosa-garlic-peartree": # If statements to catch any misspellings or missing entries in species codes.csv
        df.loc[i, "Species_name"] = species_id_to_name[1196]
        continue
    if name == "guh-de-three-leafcapertree-cratevaadansonii": # If statements to catch any misspellings or missing entries in species codes.csv
        df.loc[i, "Species_name"] = species_id_to_name[1217]
        continue
    if name == "aabehayatmango-mangiferaindica": # If statements to catch any misspellings or missing entries in species codes.csv
        df.loc[i, "Species_name"] = species_id_to_name[1091]
        continue
    if name == "bedu-punjabfig-ficuspalmata": # If statements to catch any misspellings or missing entries in species codes.csv
        df.loc[i, "Species_name"] = species_id_to_name[1235]
        continue
    if name == "chinar-platanusorientalis": # If statements to catch any misspellings or missing entries in species codes.csv
        df.loc[i, "Species_name"] = "Chinar-Platanus Orientalis"
        continue
    if name == "prunusnepalensis-sohiong": # If statements to catch any misspellings or missing entries in species codes.csv
        df.loc[i, "Species_name"] = species_id_to_name[1192]
        continue
    if name == "tecomellaundulata-roheda": # If statements to catch any misspellings or missing entries in species codes.csv
        df.loc[i, "Species_name"] = species_id_to_name[1200]
        continue
    if name == "tigersmilkspruce-falconeriainsignis": # If statements to catch any misspellings or missing entries in species codes.csv
        df.loc[i, "Species_name"] = "Tiger's Milk Spruce-Falconeria Insignis"
        continue
    # More corrections for known misspellings or variations here...

    # General case with no known manual corrections
    species_id = species_name_to_id.get(name, np.nan)  # Safely get species_id or return NaN
    if not pd.isna(species_id):  # Only proceed if species_id is valid (i.e., not NaN)
        df.loc[i, "Species_name"] = species_id_to_name[species_id]  # Update to the correct name

# Adding a Species_id column for comparing to the reference database
df['Species_id'] = df["Species_name"].apply(lambda species: species_name_to_id.get(species.lower().replace(" ", ""), np.nan))

# Display the first few rows of the updated DataFrame
print(df.head())



   Unnamed: 0 Date_of_observation                      Species_name       Lat  \
0           1          2020-01-01  Indian Almond-Terminalia catappa  12.15386   
1           2          2020-01-01  Indian Almond-Terminalia catappa  12.15386   
2           3          2020-01-01      Fish-tail Palm-Caryota urens  12.14060   
3           4          2020-01-01      Mast Tree-Monoon longifolium  12.14060   
4           5          2020-01-01  Indian Almond-Terminalia catappa  12.14060   

       Long State_name  Leaves_fresh  Leaves_mature  Leaves_old  Flowers_bud  \
0  75.22397     Kerala           2.0            0.0         0.0         -2.0   
1  75.22397     Kerala           2.0            0.0         0.0         -2.0   
2  75.22145     Kerala           0.0            2.0         0.0         -2.0   
3  75.22145     Kerala           1.0            2.0         0.0         -2.0   
4  75.22145     Kerala           0.0            1.0         2.0         -2.0   

   Flowers_open  Flowers_male  F

# Save Updated Citizen Data in One File

In [93]:
# Save updated_alldata.csv to disk
df.to_csv('cleaned_alldata_version2.csv', index=False)

In [87]:
# prompt: I want to get total number of rows in cleaned_alldata_version2.csv

import pandas as pd

# Replace 'cleaned_alldata_version2.csv' with the actual path to your file
file_path = 'cleaned_data_with_year_week.csv'

try:
  df = pd.read_csv(file_path)
  total_rows = len(df)
  print(f"Total number of rows in cleaned_alldata_version2.csv: {total_rows}")
except FileNotFoundError:
  print(f"File not found: {file_path}")

Total number of rows in cleaned_alldata_version2.csv: 571834
