# Data Discrepancy between HeidICON and DANAM

This Jupyter notebook compares the file names stored in HeidICON and DANAM and exports the results to an Excel file.

In [1]:
import pandas as pd
from scripts.compare import *
from scripts.write_csv import list_from_txt
from IPython.display import display_markdown, display, Markdown

# Notebook-wide pandas display settings:
# never truncate the text inside a cell, and show up to 100 rows per frame.
pd.options.display.max_colwidth = None
pd.options.display.max_rows = 100

In [3]:
# Paths of the two input files handed to load_data.
# HeidICON export (Excel workbook).
heidicon_export = "heidicon_export.xlsx"
# DANAM export (JSON). Forward slashes are used because they resolve on
# Windows as well as on POSIX systems, and because a backslash-separated
# literal would contain the invalid escape sequences "\d" and "\M".
danam_export = "json/dict/Monument_2022-11-13_01-18-33.json"

In [4]:
# Load both the HeidICON export and the DANAM export to pandas.
# load_data returns a pair, unpacked here into heidicon_df and danam_df;
# the cells below pass both objects on to get_info_for_monument.
# NOTE(review): presumably load_data comes from the star import of
# scripts.compare and returns pandas DataFrames — confirm in that module.
# This might take up to a minute.
heidicon_df, danam_df = load_data(heidicon_export, danam_export)

The result returned by `get_info_for_monument` for a monument is indexed by the following keys:

| key | description |
|-----|-------------|
| mon_id | Monument ID |
| heidicon_img | Number of images in HeidICON |
| heidicon_nometa | Number of images in HeidICON without metadata |
| danam_img | Number of images in DANAM |
| danam_nometa | Number of images in DANAM without a valid caption |
| sds_img | Number of images in SDS |
| files | Files of the monument from SDS, DANAM, and HeidICON as dataframe | 
| missing_danam | Files from HeidICON that are missing in DANAM |
| count_missing_danam | Number of files missing in DANAM |
| missing_heidicon | Files from DANAM that are missing in HeidICON |
| count_missing_heidicon | Number of files missing in HeidICON |

In [5]:
# Evaluate the file names of every monument listed in mon/all.mon and
# export the collected results to the Excel file test.xlsx.
# This calculation might take up to 5 minutes, depending on the computer.
mon_ids = list_from_txt("mon/all.mon")

# One result per monument ID, in the order the IDs were read.
# The list is deliberately not called `all`: that name would shadow the
# builtin all() for every cell executed afterwards in this kernel.
monument_infos = [
    get_info_for_monument(mon_id, heidicon_df, danam_df)
    for mon_id in mon_ids
]

# Build the frame once from the complete list of results and write it out.
pd.DataFrame(monument_infos).to_excel("test.xlsx")

In [6]:
# Show the status of a single monument: its image counts rendered as a
# Markdown table, followed by its list of files on SDS, DANAM, and HeidICON.

# ID of the monument to inspect.
# NOTE(review): left empty here — presumably meant to be filled in before
# the cell is run; confirm how get_info_for_monument treats an empty ID.
mon_id = ''
res = get_info_for_monument(mon_id, heidicon_df, danam_df)

# Table template; the numbered placeholders are filled in the order of
# summary_keys below.
summary_template = """
| Monument ID   | {0}  |
|---|---|
| Number of images in HeidICON  | {1}  |
| Number of images in HeidICON without metadata  | {2}  |
| Number of images in DANAM  | {3}  |
| Number of image without valid caption in DANAM | {4}  |
| Number of images in SDS  | {5}  |

"""
summary_keys = ('mon_id', 'heidicon_img', 'heidicon_nometa',
                'danam_img', 'danam_nometa', 'sds_img')
summary_values = [res[key] for key in summary_keys]
display(Markdown(summary_template.format(*summary_values)))

# Last expression of the cell, so the notebook renders it as the output.
res['files']


| Monument ID   |   |
|---|---|
| Number of images in HeidICON  | 0  |
| Number of images in HeidICON without metadata  | 0  |
| Number of images in DANAM  | 225  |
| Number of image without valid caption in DANAM | 2  |
| Number of images in SDS  | 23972  |



Unnamed: 0,sds,danam,heidicon
0,,KIR0003-001_P_20201003_01,
1,0235_IMG_1936,KIR0003-002_P_20201003_01,
2,0237,KIR0003-003_P_20201003_01,
3,0361_(VO2B8809),KIR0003-004_P_20201003_01,
4,0421_(VO2B8740),KIR0003-005_P_20201003_01,
...,...,...,...
23967,_NEF2244,,
23968,_NEF2245,,
23969,plot,,
23970,roundsimaging,,
