## This notebook calculates metrics for assessing the data quality of time-interval maps.

1. Importing necessary columns from Excel
2. Declaring categories and other constants
3. Creating a dictionary for unique combination of entries
4. Converting the sample contingency table entries into an estimated population
table using sampling intensity.
5. Calculating Commission and Omission entries for each category
6. Exporting the table as JSON

In [None]:
# Explicit %pip magic: installs into the environment of the running kernel and
# does not depend on IPython's automagic (which only applies to one-line cells).
%pip install openpyxl

#### Import necessary python libraries

In [None]:
import pandas as pd
from pprint import pprint
import json

#### Read PIE Error Excel data

In [None]:
# Workbook location; change this to point at your own copy of the data.
file_path = 'PIE_Error_Andre_Edited.xlsx'
# Name of the worksheet to read from the workbook.
sheet_name = 'Data'
# Only these columns are loaded. Later cells pick columns out of this list by
# position (indices 0, 2 and 3), so keep the order as it is.
columns_names = [
    'Stratum',
    'Interval',
    'MapInterval',
    'RefInterval',
]

df = pd.read_excel(
    io=file_path,
    sheet_name=sheet_name,
    usecols=columns_names,
)

In [None]:
# Preview the first rows to check that the requested columns were loaded.
df.head()

#### Unique values in columns used to create dictionary keys

In [None]:
# Interval labels; rows are selected by comparing the 'Interval' column to these.
intervals = ['2010-2012', '2012-2014', '2014-2016', '2016-2018', '2018-2021']
# Category labels; used for both the second and the third component of a count key.
categories = ['Stable Presence', 'Loss', 'Gain', 'Stable Absence']
# Stratum labels; each one is the first component of a count key.
stratum = ['All Absence', 'Gain', 'Other', 'Loss', 'All Presence']

#### Creating a dictionary for unique combination of entries

In [None]:
# One zero-initialised counter per '<stratum>_<category>_<category>' combination.
dic = {
    f'{strata}_{category}_{category2}': 0
    for strata in stratum
    for category in categories
    for category2 in categories
}

#### Looping through the rows to count the samples for each entry

In [None]:
# Per-interval sample counts:
#   dic2[interval]['<stratum>_<map label>_<reference label>'] -> number of rows.

# Column names are fixed for the whole run, so resolve them once instead of on
# every row (and without rebinding the `strata` loop variable to a column name).
stratum_col = columns_names[0]
map_col = columns_names[2]
ref_col = columns_names[3]

dic2 = {}
for interval in intervals:
    # Start every interval from a full set of zero counters so that
    # combinations with no samples are still present in the table.
    dic = {
        f'{strata}_{category}_{category2}': 0
        for strata in stratum
        for category in categories
        for category2 in categories
    }

    df2 = df[df['Interval'] == interval]
    # Walk the three label columns in lockstep rather than doing three
    # positional .iloc lookups per row.
    for a, b, c in zip(df2[stratum_col], df2[map_col], df2[ref_col]):
        # Only the pre-built keys exist, so a label outside the declared lists
        # raises KeyError here instead of being counted under a new key.
        dic[f'{a}_{b}_{c}'] += 1
    dic2[interval] = dic

In [None]:
# Show the sample counts collected for every interval.
pprint(dic2)

#### Weights calculated for each stratum depending on the sampling intensity

In [None]:
# Per-stratum weights. In the population-table cell only the keys of this dict
# are used (to iterate over the strata); the values multiplied with the sample
# counts come from weights2.
weights = {
           'All Absence': 1176064.92,
           'All Presence' : 81422.60,
           'Gain': 27678.11,
           'Loss': 37697.48,
           'Other': 83916.21
}
# Per-stratum weights that are multiplied with the sample counts.
# NOTE(review): these look like `weights` rescaled to a smaller range (each
# ratio is roughly 16,670-16,690) - confirm the intended units.
weights2 = {
           'All Absence': 70.56,
           'All Presence' : 4.88,
           'Gain': 1.66,
           'Loss': 2.26,
           'Other': 5.03
}

#### Converting the sample contingency table entries into an estimated population table

Using formula 5.2 from Chapter 5, "Application to Categorical Error Assessment with Sampling",
of the book "Metrics That Make a Difference": https://link.springer.com/book/10.1007/978-3-030-70765-1

In [None]:
# dic3[interval]['<category>_<category>'] is the weighted sum, over all strata,
# of the sample counts stored in dic2 for that pair of categories.
dic3 = {}
for interval in intervals:
    interval_counts = dic2[interval]
    table = {}
    for category in categories:
        for category2 in categories:
            pair = category + '_' + category2
            # Accumulate stratum by stratum, in the key order of `weights`.
            total = 0
            for stratum_name in weights.keys():
                total += weights2[stratum_name] * interval_counts[stratum_name + '_' + pair]
            table[pair] = total
    dic3[interval] = table

In [None]:
# Display the weighted table computed for each interval.
dic3

In [None]:
# Pivot dic3 from {interval: {pair: value}} into {pair: [value per interval]},
# keeping the intervals in dic3's own key order. A pair missing from an
# interval's table contributes 0.
dic4 = {
    f'{category}_{category2}': [
        dic3[year_data].get(f'{category}_{category2}', 0)
        for year_data in dic3
    ]
    for category in categories
    for category2 in categories
}


In [None]:
# Display the per-pair lists (one value per interval).
dic4

#### Calculating Commission for each category

In [None]:
# Commission for a category: the sum of the off-diagonal cells whose key starts
# with that category, i.e. '<category>_<other>' with other != category
# (mapped as the category, reference label is something else).
for category in categories:
    # Derive the keys from `categories` instead of filtering dic4, so the
    # '*_Commission' / '*_Omission' entries added to dic4 can never be picked up
    # and the cell can be re-run safely.
    off_diagonal = [f'{category}_{other}' for other in categories if other != category]
    # Build a brand-new list for every category. A single accumulator shared by
    # all categories would be stored under every '<category>_Commission' key and
    # would hold the running total over all categories instead of this one's.
    dic4[f'{category}_Commission'] = [
        sum(dic3[interval][key] for key in off_diagonal)
        for interval in intervals
    ]

#### Calculating Omission for each category

In [None]:
# Omission for a category: the sum of the off-diagonal cells whose key ends
# with that category, i.e. '<other>_<category>' with other != category
# (reference label is the category, mapped as something else).
for category in categories:
    # Derive the keys from `categories` instead of filtering dic4, so the
    # '*_Commission' / '*_Omission' entries added to dic4 can never be picked up
    # and the cell can be re-run safely.
    off_diagonal = [f'{other}_{category}' for other in categories if other != category]
    # Build a brand-new list for every category. A single accumulator shared by
    # all categories would be stored under every '<category>_Omission' key and
    # would hold the running total over all categories instead of this one's.
    dic4[f'{category}_Omission'] = [
        sum(dic3[interval][key] for key in off_diagonal)
        for interval in intervals
    ]

In [None]:
# Display the table including the Commission and Omission entries.
dic4

#### Rounding data in each entry to two decimals

In [None]:
def convert_decimals_to_2(data):
    """Return a copy of ``data`` with every number rounded to two decimals.

    Dicts and lists are walked recursively and rebuilt, so the input is not
    modified. Ints and floats are passed through ``round(value, 2)``.
    Booleans and any other type (strings, ``None``, ...) are returned as-is.

    Parameters
    ----------
    data : dict, list, int, float or any other object
        The (possibly nested) structure to round.

    Returns
    -------
    Same structure as ``data`` with the numeric leaves rounded.
    """
    if isinstance(data, dict):
        return {key: convert_decimals_to_2(value) for key, value in data.items()}
    if isinstance(data, list):
        return [convert_decimals_to_2(item) for item in data]
    # bool is a subclass of int: without this guard round() would turn
    # True/False into 1/0.
    if isinstance(data, bool):
        return data
    if isinstance(data, (int, float)):
        return round(data, 2)
    return data

In [None]:
# Rounded copy of dic4; dic4 itself is left untouched.
result_dict = convert_decimals_to_2(dic4)

In [None]:
# Show the rounded table that will be exported.
pprint(result_dict)

#### Exporting the table as JSON

In [None]:
# Specify the file path
# Destination of the exported table
json_file_path = 'table2.json'

# Serialise with a four-space indent and write the resulting text to the file
with open(json_file_path, 'w') as json_file:
    json_file.write(json.dumps(result_dict, indent=4))
