In [130]:
import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
import os
import numpy as np

In [131]:
citizen_fnames = os.listdir('../data/citizen_states_cleaned')
ref_fnames = os.listdir('../data/reference_states_cleaned')

['tamil_nadu.csv',
 'delhi.csv',
 'manipur.csv',
 'jharkhand.csv',
 'assam.csv',
 'rajasthan.csv',
 'jammu_and_kashmir.csv',
 'tripura.csv',
 'karnataka.csv',
 'puducherry.csv',
 'arunachal_pradesh.csv',
 'madhya_pradesh.csv',
 'odisha.csv',
 'meghalaya.csv',
 'gujarat.csv',
 'andhra_pradesh.csv',
 'andaman_and_nicobar_islands.csv',
 'kerala.csv',
 'haryana.csv',
 'maharashtra.csv',
 'bihar.csv']

In [132]:
# make the id lookup dict for the citizen datapoints
species_codes = pd.read_csv("../data/species codes.csv", encoding='unicode_escape')

species_id_to_name = {}
species_name_to_id = {}

for i, row in species_codes.iterrows():
    species_id_to_name[row["species_id"]] = "{}-{}".format(row["species_primary_common_name"], row["species_scientific_name"])
    species_name_to_id["{}-{}".format(row["species_primary_common_name"], row["species_scientific_name"]).lower().replace(" ", "")] = row["species_id"]

dict_keys(['whitebabool-acacialeucophloea', 'babool-acacianilotica', 'woodapple-aeglemarmelos', 'indianhorsechestnut-aesculusindica', 'treeofheaven-ailanthusexcelsa', 'krishnasiris-albiziaamara', 'siris-albizialebbeck', 'whitesiris-albiziaprocera', "devil'stree-alstoniascholaris", 'axlewood-anogeissuslatifolia', 'greymangrove-avicenniamarina', 'neem-azadirachtaindica', 'purplebauhinia-bauhiniapurpurea', 'jhinjheri-bauhiniaracemosa', 'semla-bauhiniaretusa', 'redsilkcotton-bombaxceiba', 'toddypalm-borassusflabellifer', 'flameoftheforest-buteamonosperma', 'blackdammer-canariumstrictum', 'fish-tailpalm-caryotaurens', 'indianlaburnum-cassiafistula', 'whitesilkcotton-ceibapentandra', 'flosssilktree-ceibaspeciosa', 'yellowsilkcotton-cochlospermumreligiosum', 'mountaincoffee-coffeaarabica', 'robustacoffee-coffearobusta', 'largesebesten-cordiawallichii', "tanner'stree-coriarianepalensis", 'takoli-dalbergialanceolaria', 'indianrosewood-dalbergiasissoo', 'gulmohur-delonixregia', 'elephantapple-di

In [138]:
# Find proportion of datapoints in citizen data which map to NA (missing values) in reference data
cat_attributes = ['Leaves_fresh', 'Leaves_mature', 'Leaves_old', 'Flowers_bud', 'Flowers_open', 'Flowers_male', 'Flowers_Female', 'Fruits_unripe', 'Fruits_ripe', 'Fruits_open']
val_df = pd.DataFrame(columns=['State Name', 'total_obs', 'proportion of -2 <-> NA', 'proportion of -1 <-> NA', 'proportion of 0 <-> NA', 'proportion of 1 <-> NA', 'proportion of 2 <-> NA'])

for fname in ref_fnames:
    print("calculating proportions for {}".format(fname))
    df_cit = pd.read_csv(f"../data/citizen_states_cleaned/{fname}")
    df_ref = pd.read_csv(f"../data/reference_states_cleaned/{fname}")
    n_obs = 10 * len(df_cit)
    
    species_idweek_to_data_dict = {}
    for _, row in df_ref.iterrows():
        data = list(map(lambda attr: row[attr], cat_attributes))
        species_idweek_to_data_dict[(row['week'], row['species_id'])] = data.copy()

    na_counts = {-2: 0, -1: 0, 0: 0, 1: 0, 2: 0}
    
    for _, row_i in df_cit.iterrows():
        week = row_i['Week']
        try:
            species_id = species_name_to_id[row_i['Species_name'].lower().replace(" ", "")]
        except:
            print("Warning: no id for {}".format(row_i['Species_name']))
            continue
        try:
            ref_row = species_idweek_to_data_dict[(row_i['Week'], species_id)]
        except:
            print("Warning: (week, id) pair ({}, {}) not found in reference data".format(row_i['Week'], species_id))
            continue
        for attr in cat_attributes:
            ref_row_attr_val = ref_row[cat_attributes.index(attr)]
            if ref_row_attr_val == -2:
                # print(attr, row_i[attr])
                if row_i[attr] in [-2.0, -1.0, 0, 1.0, 2.0]:
                    na_counts[int(row_i[attr])] += 1
    print(na_counts)
    val_df.loc[len(val_df)] = {'State Name': fname.replace('.csv', ''), 'total_obs': n_obs, 'proportion of -2 <-> NA': na_counts[-2]/n_obs, 'proportion of -1 <-> NA': na_counts[-1]/n_obs, 'proportion of 0 <-> NA': na_counts[0]/n_obs, 'proportion of 1 <-> NA': na_counts[1]/n_obs, 'proportion of 2 <-> NA': na_counts[2]/n_obs}
    

calculating proportions for tamil_nadu.csv
{-2: 36544, -1: 676, 0: 30611, 1: 23316, 2: 24705}
calculating proportions for delhi.csv
{-2: 1007, -1: 18, 0: 841, 1: 185, 2: 180}
calculating proportions for manipur.csv
{-2: 411, -1: 0, 0: 323, 1: 137, 2: 307}
calculating proportions for jharkhand.csv
{-2: 292, -1: 88, 0: 335, 1: 244, 2: 237}
calculating proportions for assam.csv
{-2: 1328, -1: 57, 0: 1662, 1: 920, 2: 1186}
calculating proportions for rajasthan.csv
{-2: 2698, -1: 46, 0: 4115, 1: 1259, 2: 1715}
calculating proportions for jammu_and_kashmir.csv
{-2: 36, -1: 0, 0: 52, 1: 29, 2: 25}
calculating proportions for tripura.csv
{-2: 95, -1: 0, 0: 53, 1: 45, 2: 61}
calculating proportions for karnataka.csv
{-2: 22216, -1: 686, 0: 23662, 1: 9887, 2: 15682}
calculating proportions for puducherry.csv
{-2: 5015, -1: 47, 0: 4763, 1: 5623, 2: 3449}
calculating proportions for arunachal_pradesh.csv
{-2: 214, -1: 0, 0: 195, 1: 61, 2: 88}
calculating proportions for madhya_pradesh.csv
{-2: 148

## Table of proportions of datapoints in citizen data which map to NA (missing) values in reference data

### total_obs
total number of categorical observations for that dataframe `10 * len(dataframe)`

### proportion of {-2, -1, 0, 1, 2} <-> NA
pct of `total_obs` which map one of {-2, -1, 0, 1, 2} to NA value



In [139]:
val_df.sort_values(by='total_obs', ascending=False).head()

Unnamed: 0,State Name,total_obs,proportion of -2 <-> NA,proportion of -1 <-> NA,proportion of 0 <-> NA,proportion of 1 <-> NA,proportion of 2 <-> NA
17,kerala,4872010,0.142509,0.002549,0.189944,0.174294,0.143917
0,tamil_nadu,149920,0.243757,0.004509,0.204182,0.155523,0.164788
8,karnataka,115510,0.19233,0.005939,0.204848,0.085594,0.135763
11,madhya_pradesh,95530,0.1557,0.013001,0.309829,0.121585,0.105538
19,maharashtra,89830,0.251531,0.007125,0.28928,0.135756,0.192441


## Analysis

- A large proportion of observations map to NA values; there is a lot of missing reference data
- Many citizens actually observe the attribute (e.g. observation is 1 or 2) for NA reference data, which suggests that this reference data should exist
