## Notebook for validating a database one rule at a time
I use this for testing. You can programmatically write errors into the database on purpose to check that the tool finds them.

Each rule section starts with copying a source database into a scratch workspace and then modifies the database as necessary to test the rule. Checking topology is a little more difficult. With the copy, go to the map and change the geometry of one or more features, save your edits, and then come back to this notebook to check the rule.

Notes:
1. `arcpy.management.Delete(gdb_c)` at the end of each section is probably not always necessary. If the `copy` command at the beginning of a section fails, and there is no call to delete the database at the end, try adding it.
2. Most rule functions return a list. The first three items are used to build headers and anchors in the HTML reports; any items after that are the errors, usually formatted as HTML for the report.
3. If you edit any of the imported scripts (modules aliased as `vd`, `gdef`, `guf`, `alc`) while Pro is open, you need to reload them before you run the code cell again. After your edits, add a line such as `importlib.reload(vd)` to the top of the cell and try again.

In [None]:
import sys

# Modify this line to the path of the \Scripts folder of the GeMS Toolbox.
path_to_scripts_folder = r"C:\path\to\GeMS_Toolbox\Scripts"
# Append the value of the variable (not a literal "{...}" placeholder string)
# so the GeMS modules imported below can be found.
sys.path.append(str(path_to_scripts_folder))
import validate_database as vd
import GeMS_Definition as gdef
import GeMS_utilityFunctions as guf
import GeMS_ALaCarte as alc
from pathlib import Path
import importlib

In [None]:
# Paths to edit before running the rest of the notebook.
gdb = r"path\to\geodatabase\or\geopackage"  # source database (.gdb or .gpkg)
scratch = r"path\to\writable\scratch\space"  # folder that receives the working copy

# The working copy keeps the source's name and lives in the scratch folder.
gdb_n = Path(gdb).name
gdb_c = f"{scratch}\\{gdb_n}"

# Several checks take a flag saying whether the database is a geopackage.
is_gpkg = Path(gdb).suffix.lower() == ".gpkg"

# Actually assign the setting (a bare attribute reference does nothing) so the
# Copy calls in later cells may replace a leftover working copy.
arcpy.env.overwriteOutput = True

### Rule 2.1 - no core elements missing

In [None]:
# Remove each required element in turn from a fresh working copy and print
# what rule 2.1 reports.
for n in ("DataSources", "DescriptionOfMapUnits", "GeoMaterialDict", "GeologicMap", "ContactsAndFaults", "MapUnitPolys"):
    # exists() has to be called; the bare method object is always truthy and
    # would send Delete after a working copy that is not there.
    if Path(gdb_c).exists():
        arcpy.management.Delete(gdb_c)
    arcpy.management.Copy(gdb, gdb_c)
    db_dict = vd.guf.gdb_object_dict(gdb_c)
    if n in db_dict:
        # drop the element from the database and from the dictionary handed to the rule
        arcpy.management.Delete(db_dict[n]['catalogPath'])
        del db_dict[n]
    summary = vd.rule2_1(db_dict, is_gpkg)
    for s in summary:
        print(s)

# Clean up the last working copy so the next cell can start with its own Copy.
if Path(gdb_c).exists():
    arcpy.management.Delete(gdb_c)

In [None]:
# Rename a required element.
# When the new name only adds a prefix or suffix, the tool should still
# recognize the table as a GeMS object.
importlib.reload(vd)
arcpy.management.Copy(gdb, gdb_c)
d = guf.gdb_object_dict(gdb_c)

caf = d["ContactsAndFaults"]["catalogPath"]
new_caf = caf + "_2"
arcpy.management.Rename(caf, new_caf)

# rebuild the dictionary after the rename, then run the rule on it
d = vd.guf.gdb_object_dict(gdb_c)
summary = vd.rule2_1(d, is_gpkg)
for line in summary:
    print(line)

arcpy.management.Delete(gdb_c)

### Rule 2.2 - no missing or misdefined fields from core elements

In [None]:
# Make a single working copy. A second Copy onto the existing gdb_c is
# redundant and fails when overwriting is disabled.
importlib.reload(vd)
arcpy.management.Copy(gdb, gdb_c)
d = vd.guf.gdb_object_dict(gdb_c)

# change the names of some fields, delete others
change = {"MapUnit": "mapunit",
          "Type": "Type2",
          "HierarchyKey": "HKEY"}
delete = ("ExistenceConfidence", "Label")

for v in d.values():
    # only entries that list fields can be altered
    if "fields" not in v:
        continue
    table = v['catalogPath']
    flds = [f.name for f in v["fields"]]
    # AlterField can't be used on geopackages, so renaming is skipped for them
    if not is_gpkg:
        for a in change:
            if a in flds:
                arcpy.management.AlterField(table, a, change[a])

    for n in delete:
        if n in flds:
            arcpy.management.DeleteField(table, n)

# rebuild the dictionary so the check sees the altered fields
d = vd.guf.gdb_object_dict(gdb_c)
summary = vd.check_fields(d, 2, [])
for s in summary:
    print(s)

In [None]:
    
# remove the working copy so the next section can begin with a fresh Copy
arcpy.management.Delete(gdb_c)

### Rule 2.4 - no map units in MapUnitPolys that are not in DMU

In [None]:
# Write two MapUnit values ("foo" and "bar") into the first two rows of
# MapUnitPolys; the test intends these not to be in the DMU.
arcpy.management.Copy(gdb, gdb_c)
d = vd.guf.gdb_object_dict(gdb_c)
mup = d["MapUnitPolys"]["catalogPath"]
bogus_units = {0: "foo", 1: "bar"}
with arcpy.da.UpdateCursor(mup, "MapUnit") as cursor:
    for idx, rec in enumerate(cursor):
        if idx in bogus_units:
            rec[0] = bogus_units[idx]
        # every row is written back, changed or not
        cursor.updateRow(rec)

d = vd.guf.gdb_object_dict(gdb_c)
summary = vd.check_map_units(d, 2, [], {})
# only the first item of the result is printed
for msg in summary[0]:
    print(msg)

arcpy.management.Delete(gdb_c)

### Rule 2.5 - no duplicate map units in dmu

In [None]:
# Create a duplicate in the DMU: the MapUnit of the first non-null row is
# written onto the second non-null row.
arcpy.management.Copy(gdb, gdb_c)
d = vd.guf.gdb_object_dict(gdb_c)
dmu = d["DescriptionOfMapUnits"]["catalogPath"]
with arcpy.da.UpdateCursor(dmu, "MapUnit", where_clause="MapUnit is not null") as cursor:
    for i, row in enumerate(cursor):
        if i == 0:
            mu = row[0]
        elif i == 1:
            row[0] = mu
            # only the modified row needs to be written back
            cursor.updateRow(row)
            break

# use the same field-name spelling as the cursor ("MapUnit")
summary = guf.get_duplicates(dmu, "MapUnit")
print(summary)

arcpy.management.Delete(gdb_c)

### Rule 2.6 - field values in required elements are defined in glossary

In [None]:
# A required element can be renamed; gems_equivalent should still be assigned
# correctly and its fields should still be checked.
arcpy.management.Copy(gdb, gdb_c)
d = guf.gdb_object_dict(gdb_c)
caf = d["ContactsAndFaults"]['catalogPath']
renamed = f"{caf}_2"
arcpy.management.Rename(caf, renamed)

d = guf.gdb_object_dict(gdb_c)
# print each object name next to its gems_equivalent entry
for name in d:
    print(name, d[name]["gems_equivalent"])

summary = vd.glossary_check(d, 2, [])
for item in summary:
    print(item)
arcpy.management.Delete(gdb_c)

In [None]:
# Overwrite the Type value of the first ContactsAndFaults row with an
# arbitrary term ("foobar").
arcpy.management.Copy(gdb, gdb_c)
d = guf.gdb_object_dict(gdb_c)
caf = d["ContactsAndFaults"]['catalogPath']
with arcpy.da.UpdateCursor(caf, "Type") as cursor:
    for idx, rec in enumerate(cursor):
        if idx != 0:
            continue
        rec[0] = "foobar"
        cursor.updateRow(rec)

summary = vd.glossary_check(d, 2, [])
for item in summary:
    print(item)
arcpy.management.Delete(gdb_c)

### Rule 2.7 - no duplicate terms in Glossary

In [None]:
# Duplicate a Glossary term: the first row's Term is written onto the second row.
arcpy.management.Copy(gdb, gdb_c)
d = guf.gdb_object_dict(gdb_c)
glo = d["Glossary"]['catalogPath']
with arcpy.da.UpdateCursor(glo, "Term") as cursor:
    for idx, rec in enumerate(cursor):
        if idx == 0:
            term = rec[0]
        elif idx == 1:
            rec[0] = term
        # every row is written back, changed or not
        cursor.updateRow(rec)

d = guf.gdb_object_dict(gdb_c)
summary = guf.get_duplicates(glo, "Term")
for dup in summary:
    print(dup)
arcpy.management.Delete(gdb_c)

### Rule 2.8 - all SourceIDs in required elements are in DataSources

In [None]:
# Give the first ContactsAndFaults row a DataSourceID ("foobar") that the test
# intends to be missing from DataSources.
arcpy.management.Copy(gdb, gdb_c)
d = guf.gdb_object_dict(gdb_c)
caf = d["ContactsAndFaults"]["catalogPath"]
with arcpy.da.UpdateCursor(caf, "DataSourceID") as cursor:
    for idx, rec in enumerate(cursor):
        if idx != 0:
            continue
        rec[0] = "foobar"
        cursor.updateRow(rec)

d = guf.gdb_object_dict(gdb_c)

errors, all_sources = vd.sources_check(d, 2, [])
print(errors)
# to inspect the collected sources as well: print(all_sources)
arcpy.management.Delete(gdb_c)

### Rule 2.9 - no duplicate DataSources_IDs

In [None]:
# Create a duplicate DataSources_ID: the first row's ID is written onto the
# second row.
arcpy.management.Copy(gdb, gdb_c)
d = guf.gdb_object_dict(gdb_c)
ds = d["DataSources"]["catalogPath"]
with arcpy.da.UpdateCursor(ds, "DataSources_ID") as cursor:
    for idx, rec in enumerate(cursor):
        if idx == 0:
            val = rec[0]
        elif idx == 1:
            rec[0] = val
        # every row is written back, changed or not
        cursor.updateRow(rec)

duplicates = guf.get_duplicates(ds, "DataSources_ID")
print(duplicates)
arcpy.management.Delete(gdb_c)

### Rule 3.1 - non-core elements conform to schema

In [None]:
# Add an optional GeMS-defined feature class (OverlayPolys) to a fresh copy.
arcpy.management.Copy(gdb, gdb_c)
d = guf.gdb_object_dict(gdb_c)

# The feature class name is the same for both database types; only the
# feature dataset and the object the spatial reference is read from differ.
fc = "OverlayPolys"
if is_gpkg:
    fd = "#"
    sr = d["MapUnitPolys"]["spatialReference"].name
else:
    fd = "GeologicMap"
    sr = d["GeologicMap"]["spatialReference"].name

# one three-column row: feature dataset, spatial reference name, feature class
vt = arcpy.ValueTable(3)
vt.addRow(f"{fd} {sr} {fc}")
alc.process(gdb_c, vt)

d = guf.gdb_object_dict(gdb_c)
results = vd.check_fields(d, 3, [])
for entry in results[0]:
    print(entry)

In [None]:
# Strip two required fields from the optional feature class and recheck.
# Uses the dictionary `d` and the working copy left by the preceding cell.
fc = "OverlayPolys"
delete_fields = ["Type", "Label"]
fc_path = d[fc]['catalogPath']
for f in delete_fields:
    arcpy.management.DeleteField(fc_path, f)

d = guf.gdb_object_dict(gdb_c)
results = vd.check_fields(d, 3, [])
print(results)

arcpy.management.Delete(gdb_c)

In [None]:
# Re-add a required field with the wrong type and length.
# Nullability is, again, not being checked here.
arcpy.management.Copy(gdb, gdb_c)
d = guf.gdb_object_dict(gdb_c)

fc = "OverlayPolys"
if is_gpkg:
    fd = "#"
    sr = d["MapUnitPolys"]["spatialReference"].name
else:
    fd = "GeologicMap"
    sr = d["GeologicMap"]["spatialReference"].name

# create the optional feature class first
vt = arcpy.ValueTable(3)
vt.addRow(f"{fd} {sr} {fc}")
alc.process(gdb_c, vt)
d = guf.gdb_object_dict(gdb_c)

# swap the Label field for one with a different definition
overlay = d["OverlayPolys"]['catalogPath']
arcpy.management.DeleteField(overlay, "Label")
# length and f_type are set separately;
# length is only considered when the type is text
length = 25
f_type = "float"  # "text"
arcpy.management.AddField(overlay, "Label", f_type, field_length=length)

d = guf.gdb_object_dict(gdb_c)
results = vd.check_fields(d, 3, [])
print(results)

### Rule 3.2 - All MapUnitPolys and ContactsAndFaults based feature classes obey Level 3 topology rules

In [None]:
# No new working copy is made here (the Copy call stays disabled):
#arcpy.management.Copy(gdb, gdb_c)
# Remove Topology.gdb from the scratch folder if one is already there.
t_path = Path(scratch) / "Topology.gdb"
# exists() has to be called; the bare method object is always truthy
if t_path.exists():
    arcpy.management.Delete(str(t_path))

In [None]:
# Experiment with the topology here:
# change the names of the topology pairs, etc.
d = guf.gdb_object_dict(gdb_c)
# the topology pairs are the second item of the rule 2.1 result
rule2_1_result = vd.rule2_1(d, is_gpkg)
topo_pairs = rule2_1_result[1]
topo_results = vd.check_topology(d, scratch, False, topo_pairs)
print(topo_results)

### Rule 3.3 - no missing required values

In [None]:
# Start from a fresh working copy.
arcpy.management.Copy(gdb, gdb_c)
workdir = Path(r"C:\AAA\gems\testing\scratch")
d = guf.gdb_object_dict(gdb_c)

# Null out MapUnit (a NoNulls field) in the first two MapUnitPolys rows.
mup = d["MapUnitPolys"]["catalogPath"]
with arcpy.da.UpdateCursor(mup, "MapUnit") as cursor:
    for idx, rec in enumerate(cursor):
        if idx in (0, 1):
            rec[0] = None
        # every row is written back, changed or not
        cursor.updateRow(rec)

# The dictionary is deliberately not rebuilt before the check:
# d = guf.gdb_object_dict(gdb_c)
results = vd.rule3_3(d)
print(results)

### Rule 3.4 - no missing terms in Glossary

`defined_term_fields_list = (
    "Type",
    "ExistenceConfidence",
    "IdentityConfidence",
    "ParagraphStyle",
    "GeoMaterialConfidence",
    "ErrorMeasure",
    "AgeUnits",
    "LocationMethod",
    "ScientificConfidence",
)`

Values in `defined_term_fields_list` fields that are not found in Glossary are errors. Values in other fields whose names end in `type`, `confidence`, or `method` and that are not found in Glossary are warnings.

In [None]:
# Start from a fresh working copy.
arcpy.management.Copy(gdb, gdb_c)
d = guf.gdb_object_dict(gdb_c)

# Put a term ("foobar") into the Type field of the first CartographicLines
# row; the test intends it to be absent from the glossary.
cart_lines = d["CartographicLines"]["catalogPath"]
with arcpy.da.UpdateCursor(cart_lines, "Type") as cursor:
    for idx, rec in enumerate(cursor):
        if idx == 0:
            rec[0] = "foobar"
        # every row is written back, changed or not
        cursor.updateRow(rec)

# Level 2 pass: collects the glossary terms.
msgs, all_gloss_terms = vd.glossary_check(d, 2, [])
print(msgs)

# Level 3 pass: fed with the terms gathered at level 2.
msgs, all_gloss_terms, warnings = vd.glossary_check(d, 3, all_gloss_terms)

print(msgs)
print(warnings)

### Rule 3.5 - no unnecessary terms in Glossary

In [None]:
# Start from a fresh working copy.
arcpy.management.Copy(gdb, gdb_c)
d = guf.gdb_object_dict(gdb_c)

# Overwrite the first Glossary term with a value ("foobar") that the test
# intends to be unused anywhere in the database.
gloss = d["Glossary"]["catalogPath"]
with arcpy.da.UpdateCursor(gloss, "Term") as cursor:
    for idx, rec in enumerate(cursor):
        if idx == 0:
            rec[0] = "foobar"
        # every row is written back, changed or not
        cursor.updateRow(rec)

# glossary_check has to run at level 2 and then level 3 to build
# all_gloss_terms; the terms are the second item of each result.
all_gloss_terms = vd.glossary_check(d, 2, [])[1]
all_gloss_terms = vd.glossary_check(d, 3, all_gloss_terms)[1]

results = vd.rule3_5_and_7(d, "glossary", all_gloss_terms)
print(results)

### Rule 3.6 - no missing sources in DataSources

In [None]:
# Start from a fresh working copy.
arcpy.management.Copy(gdb, gdb_c)
d = guf.gdb_object_dict(gdb_c)

# In Stations, give the first row a DataSourceID the test intends to be
# missing from DataSources, and blank out the DataSourceID of the second row.
sta = d["Stations"]["catalogPath"]
replacements = {0: "DASfoobar", 1: None}
with arcpy.da.UpdateCursor(sta, "DataSourceID") as cursor:
    for idx, rec in enumerate(cursor):
        if idx in replacements:
            rec[0] = replacements[idx]
        # every row is written back, changed or not
        cursor.updateRow(rec)

# Level 2 pass first, to collect all_sources from the required core elements.
all_sources = []
msgs, all_sources = vd.sources_check(d, 2, all_sources)
print(msgs)

# Level 3 pass checks the rest.
msgs, all_sources = vd.sources_check(d, 3, all_sources)
print(msgs)

### Rule 3.7 - no unnecessary sources in DataSources

In [None]:
# Start from a fresh working copy.
arcpy.management.Copy(gdb, gdb_c)
workdir = Path(r"C:\AAA\gems\testing\scratch")
d = guf.gdb_object_dict(gdb_c)

# Overwrite the first DataSources_ID with a value ("DASfoobar") that the test
# intends to be unused anywhere in the database.
ds = d["DataSources"]["catalogPath"]
with arcpy.da.UpdateCursor(ds, "DataSources_ID") as cursor:
    for idx, rec in enumerate(cursor):
        if idx == 0:
            rec[0] = "DASfoobar"
        # every row is written back, changed or not
        cursor.updateRow(rec)

# sources_check has to run at level 2 and then level 3 to build all_sources.
all_sources = []
for level in (2, 3):
    msgs, all_sources = vd.sources_check(d, level, all_sources)

# finally run rule3_5_and_7 against the collected sources
results = vd.rule3_5_and_7(d, "datasources", all_sources)
print(results)

### Rule 3.8 - No map units without entries in DescriptionOfMapUnits

In [None]:
# Start from a fresh working copy.
arcpy.management.Copy(gdb, gdb_c)
d = guf.gdb_object_dict(gdb_c)

# Level 2 pass of check_map_units, which collects map units from MapUnitPolys.
all_map_units = []
fds_map_units = {}
msgs, all_map_units, fds_map_units = vd.check_map_units(d, 2, all_map_units, fds_map_units)

# Put made-up map units into a non-core element (Stations): "foo" in MapUnit
# of the first row and "bar" in ObservedMapunit of the second row.
stations = d["Stations"]["catalogPath"]
with arcpy.da.UpdateCursor(stations, ["MapUnit", "ObservedMapunit"]) as cursor:
    for idx, rec in enumerate(cursor):
        if idx == 0:
            rec[0] = "foo"
        elif idx == 1:
            rec[1] = "bar"
        # every row is written back, changed or not
        cursor.updateRow(rec)

# Level 3 pass extends all_map_units with units from all tables with 'MapUnit'.
(msgs3_8, msgs3_9, all_map_units, fds_map_units, mu_warnings) = vd.check_map_units(d, 3, all_map_units, fds_map_units)

print("Rule 3.8")
for message in msgs3_8:
    print(message)
print("Warnings")
for message in mu_warnings:
    print(message)

### Rule 3.9 - No unnecessary MapUnits in DescriptionOfMapUnits

In [None]:
# Overwrite the first MapUnit in DescriptionOfMapUnits with an extra unit.
# Uses `d`, `all_map_units` and `fds_map_units` left by the preceding cell.
dmu = d["DescriptionOfMapUnits"]["catalogPath"]
with arcpy.da.UpdateCursor(dmu, "MapUnit") as cursor:
    for idx, rec in enumerate(cursor):
        if idx == 0:
            rec[0] = "foobar"
        # every row is written back, changed or not
        cursor.updateRow(rec)

(msgs3_8, msgs3_9, all_map_units, fds_map_units, mu_warnings) = vd.check_map_units(d, 3, all_map_units, fds_map_units)

# only the rule 3.9 messages are of interest here
for message in msgs3_9:
    print(message)

### Rule 3.10 - HierarchyKey values in DescriptionOfMapUnits are unique and well formed

In [None]:
# Start from a fresh working copy.
arcpy.management.Copy(gdb, gdb_c)
d = guf.gdb_object_dict(gdb_c)
dmu = d["DescriptionOfMapUnits"]["catalogPath"]

# Take a look at the HierarchyKeys. The cursor is opened in a `with` block so
# it is released before a later cell opens an UpdateCursor on the same table.
with arcpy.da.SearchCursor(dmu, "HierarchyKey") as cursor:
    hkeys = [r[0] for r in cursor]

# Null keys cannot be compared with strings, so sort them to the end.
hkeys.sort(key=lambda k: (k is None, "" if k is None else k))
for hkey in hkeys:
    print(hkey)

In [None]:
# Replace the first HierarchyKey with an odd value ("1/2").
# Uses the dictionary `d` left by the preceding cell.
dmu = d["DescriptionOfMapUnits"]["catalogPath"]
with arcpy.da.UpdateCursor(dmu, "HierarchyKey") as cursor:
    for idx, rec in enumerate(cursor):
        if idx == 0:
            rec[0] = "1/2"
        # every row is written back, changed or not
        cursor.updateRow(rec)

results = vd.rule3_10(d)
print(results)

### Rule 3.11 - All values of GeoMaterial are defined in GeoMaterialDict

In [None]:
# Start from a fresh working copy.
arcpy.management.Copy(gdb, gdb_c)
d = guf.gdb_object_dict(gdb_c)

# GeoMaterialDict.csv is looked up next to the validate_database module.
file = vd.__file__
scripts = Path(file).parent
ref_gmd = scripts.joinpath("GeoMaterialDict.csv")

# Overwrite the first GeoMaterial value in the DMU with an odd one ("choss").
dmu = d["DescriptionOfMapUnits"]["catalogPath"]
with arcpy.da.UpdateCursor(dmu, "GeoMaterial") as cursor:
    for idx, rec in enumerate(cursor):
        if idx == 0:
            rec[0] = "choss"
        # every row is written back, changed or not
        cursor.updateRow(rec)

results = vd.rule3_11(d, str(ref_gmd))
print(results)

### Rule 3.12 - No duplicate \_ID values

In [None]:
# Start from a fresh working copy.
arcpy.management.Copy(gdb, gdb_c)
d = guf.gdb_object_dict(gdb_c)

# Duplicate an _ID value: the first row's ContactsAndFaults_ID is written
# onto the second row.
caf = d["ContactsAndFaults"]["catalogPath"]
with arcpy.da.UpdateCursor(caf, "ContactsAndFaults_ID") as cursor:
    for idx, rec in enumerate(cursor):
        if idx == 0:
            val = rec[0]
        elif idx == 1:
            rec[0] = val
        # every row is written back, changed or not
        cursor.updateRow(rec)

results = vd.rule3_12(d)
print(results)

### Rule 3.13 - No zero-length or whitespace-only strings

In [None]:
# Reload the validator and start from a fresh working copy.
importlib.reload(vd)
arcpy.management.Copy(gdb, gdb_c)
d = guf.gdb_object_dict(gdb_c)

# Write some bad "null" stand-ins into Type for the first three
# ContactsAndFaults rows: an empty string, a single space, and "<NULL>".
caf = d["ContactsAndFaults"]["catalogPath"]
bad_values = {0: "", 1: " ", 2: "<NULL>"}
with arcpy.da.UpdateCursor(caf, "Type") as cursor:
    for idx, rec in enumerate(cursor):
        if idx in bad_values:
            rec[0] = bad_values[idx]
        # every row is written back, changed or not
        cursor.updateRow(rec)

results = vd.rule3_13(d)
# the first two items of the result are kept; only the first is printed
zero, leading = results[0], results[1]
for entry in zero:
    print(entry)