# Basic CIF Validation

This notebook demonstrates how to use `cif_validator` to validate CIF files against DDLm dictionaries.

We'll create a simple dictionary and show validation of both valid and invalid CIF content.

In [10]:
from cif_validator import ValidationMode, Validator, validate

## Define a Simple Dictionary

We'll create a minimal DDLm dictionary that defines a few data items for crystal structures:
- Cell lengths (must be non-negative real numbers; the range `0.0:` has no upper bound)
- Cell angles (must be between 0 and 180 degrees)
- Space group name

In [11]:
# DDLm dictionary source used by every validation example in this notebook.
# It is a raw string so the backslash in the `#\#CIF_2.0` magic code stays
# literal. The text declares:
#   - a Head category (SIMPLE_HEAD),
#   - a CELL category with three Real length items (range `0.0:`) and three
#     Real angle items (range `0.0:180.0`),
#   - a SYMMETRY category with a single Text item.
# NOTE(review): the literal opens with a newline, so the magic code sits on the
# second line of the text rather than the first — confirm the parser is happy
# with that placement.
simple_dictionary = r"""
#\#CIF_2.0
##############################################################################
#                     Simple Crystal Structure Dictionary                   #
##############################################################################

data_SIMPLE_CRYSTAL

    _dictionary.title            SIMPLE_CRYSTAL
    _dictionary.class            Instance
    _dictionary.version          1.0.0
    _dictionary.date             2025-01-01
    _dictionary.uri              https://example.com/simple.dic
    _dictionary.ddl_conformance  4.2.0
    _dictionary.namespace        CifCore

    _description.text
;
   A simple dictionary for demonstrating CIF validation.
;

save_SIMPLE_HEAD

    _definition.id               SIMPLE_HEAD
    _definition.scope            Category
    _definition.class            Head
    _definition.update           2025-01-01
    _name.category_id            SIMPLE_CRYSTAL
    _name.object_id              HEAD

save_

# CELL category
save_CELL

    _definition.id               CELL
    _definition.scope            Category
    _definition.class            Set
    _definition.update           2025-01-01
    _name.category_id            SIMPLE_HEAD
    _name.object_id              CELL

    _description.text
;
   Unit cell parameters.
;

save_

save_cell.length_a

    _definition.id               '_cell.length_a'
    _definition.update           2025-01-01
    _name.category_id            cell
    _name.object_id              length_a

    _description.text            'Unit cell length a in angstroms.'

    _type.purpose                Measurand
    _type.source                 Recorded
    _type.container              Single
    _type.contents               Real

    _enumeration.range           0.0:

save_

save_cell.length_b

    _definition.id               '_cell.length_b'
    _definition.update           2025-01-01
    _name.category_id            cell
    _name.object_id              length_b

    _description.text            'Unit cell length b in angstroms.'

    _type.purpose                Measurand
    _type.source                 Recorded
    _type.container              Single
    _type.contents               Real

    _enumeration.range           0.0:

save_

save_cell.length_c

    _definition.id               '_cell.length_c'
    _definition.update           2025-01-01
    _name.category_id            cell
    _name.object_id              length_c

    _description.text            'Unit cell length c in angstroms.'

    _type.purpose                Measurand
    _type.source                 Recorded
    _type.container              Single
    _type.contents               Real

    _enumeration.range           0.0:

save_

save_cell.angle_alpha

    _definition.id               '_cell.angle_alpha'
    _definition.update           2025-01-01
    _name.category_id            cell
    _name.object_id              angle_alpha

    _description.text            'Angle between b and c axes in degrees.'

    _type.purpose                Measurand
    _type.source                 Recorded
    _type.container              Single
    _type.contents               Real

    _enumeration.range           0.0:180.0

save_

save_cell.angle_beta

    _definition.id               '_cell.angle_beta'
    _definition.update           2025-01-01
    _name.category_id            cell
    _name.object_id              angle_beta

    _description.text            'Angle between a and c axes in degrees.'

    _type.purpose                Measurand
    _type.source                 Recorded
    _type.container              Single
    _type.contents               Real

    _enumeration.range           0.0:180.0

save_

save_cell.angle_gamma

    _definition.id               '_cell.angle_gamma'
    _definition.update           2025-01-01
    _name.category_id            cell
    _name.object_id              angle_gamma

    _description.text            'Angle between a and b axes in degrees.'

    _type.purpose                Measurand
    _type.source                 Recorded
    _type.container              Single
    _type.contents               Real

    _enumeration.range           0.0:180.0

save_

# SYMMETRY category
save_SYMMETRY

    _definition.id               SYMMETRY
    _definition.scope            Category
    _definition.class            Set
    _definition.update           2025-01-01
    _name.category_id            SIMPLE_HEAD
    _name.object_id              SYMMETRY

    _description.text            'Space group information.'

save_

save_symmetry.space_group_name_H-M

    _definition.id               '_symmetry.space_group_name_H-M'
    _definition.update           2025-01-01
    _name.category_id            symmetry
    _name.object_id              space_group_name_H-M

    _description.text            'Hermann-Mauguin space group symbol.'

    _type.purpose                State
    _type.source                 Assigned
    _type.container              Single
    _type.contents               Text

save_
"""

## Valid CIF Content

This CIF contains valid data that conforms to the dictionary:
- All cell lengths are positive
- All angles are between 0 and 180 degrees

In [12]:
# CIF text for a single data block (`data_nacl`) whose items are all defined
# in `simple_dictionary`: three lengths of 5.6402, three angles of 90.0 and a
# quoted space-group symbol.
valid_cif = """
data_nacl

_cell.length_a    5.6402
_cell.length_b    5.6402
_cell.length_c    5.6402
_cell.angle_alpha 90.0
_cell.angle_beta  90.0
_cell.angle_gamma 90.0

_symmetry.space_group_name_H-M  'F m -3 m'
"""

## Validate Using One-Shot Function

The `validate()` function is a convenient way to validate a single CIF against a dictionary.

In [16]:
# One-shot validation: the CIF text and the dictionary text are both passed
# to validate() in a single call.
result = validate(valid_cif, simple_dictionary)

print(f"Is valid: {result.is_valid}")
print(f"Errors: {result.error_count}")
print(f"Warnings: {result.warning_count}")

# Guard the example: if the "valid" document ever stops validating, fail here
# and show the offending errors instead of silently printing a raw list.
assert result.is_valid, f"valid_cif unexpectedly failed validation: {result.errors}"

Is valid: True
Errors: 0
ERROR: []


## Invalid CIF Content

This CIF contains several validation errors:
1. Negative cell length (`_cell.length_a -5.0`) - violates range constraint
2. Angle outside valid range (`_cell.angle_alpha 200.0`) - exceeds 180.0
3. Wrong type (`_cell.length_b 'not-a-number'`) - should be Real

In [17]:
# CIF text with three deliberate violations of `simple_dictionary`:
#   - `_cell.length_a` is -5.0 (below the `0.0:` range),
#   - `_cell.length_b` is a quoted, non-numeric string (item is declared Real),
#   - `_cell.angle_alpha` is 200.0 (above the `0.0:180.0` range).
# The literal starts with a blank line, so `data_invalid` is on line 2 of the
# text, `_cell.length_a` on line 4, `_cell.length_b` on line 5 and
# `_cell.angle_alpha` on line 7.
invalid_cif = """
data_invalid

_cell.length_a    -5.0
_cell.length_b    'not-a-number'
_cell.length_c    10.0
_cell.angle_alpha 200.0
_cell.angle_beta  90.0
_cell.angle_gamma 90.0

_symmetry.space_group_name_H-M  'P 1'
"""

In [18]:
# Validate the deliberately broken document and report what was found.
result = validate(invalid_cif, simple_dictionary)

# Summary block, followed by one blank separator line.
print(
    f"Is valid: {result.is_valid}",
    f"Errors: {result.error_count}",
    f"Warnings: {result.warning_count}",
    "",
    sep="\n",
)

# One small report per error; the data name line is only emitted when the
# error carries a data name.
for err in result.errors:
    report = [
        f"Error at line {err.span.start_line}:",
        f"  Category: {err.category}",
        f"  Message: {err.message}",
    ]
    if err.data_name:
        report.append(f"  Data name: {err.data_name}")
    print(*report, "", sep="\n")

Is valid: False
Errors: 3

Error at line 5:
  Category: type error
  Message: Type error for '_cell.length_b': expected real number, got text 'not-a-number'
  Data name: _cell.length_b

Error at line 4:
  Category: range error
  Message: Value -5 for '_cell.length_a' is outside allowed range >= 0
  Data name: _cell.length_a

Error at line 7:
  Category: range error
  Message: Value 200 for '_cell.angle_alpha' is outside allowed range 0 to 180
  Data name: _cell.angle_alpha



## Using the Validator Class

For validating multiple CIF files against the same dictionary, use the `Validator` class.
This is more efficient as the dictionary is parsed only once.

In [None]:
# A single Validator instance, loaded with the dictionary, is reused by the
# cells that follow.
validator = Validator()
validator.add_dictionary(simple_dictionary)

# Available modes: Strict, Lenient, Pedantic.
validator.set_mode(ValidationMode.Strict)

mode_name = validator.mode.name
print(f"Validation mode: {mode_name}")

In [None]:
# (label, CIF text) pairs to run through the shared validator.
cif_documents = [("valid_nacl", valid_cif), ("invalid_example", invalid_cif)]

# One summary line per document: PASS/FAIL plus the error and warning counts.
for label, document in cif_documents:
    outcome = validator.validate(document)
    if outcome.is_valid:
        verdict = "PASS"
    else:
        verdict = "FAIL"
    print(f"{label}: {verdict} (errors: {outcome.error_count}, warnings: {outcome.warning_count})")

## Error Categories

The validator categorizes errors to help identify the type of problem:

- `TypeError`: Value has wrong type (e.g., text instead of number)
- `RangeError`: Value outside allowed range
- `EnumerationError`: Value not in allowed set
- `UnknownDataName`: Data item not defined in dictionary
- `MissingMandatory`: Required data item is missing
- `LoopStructure`: Problem with loop structure
- `LinkError`: Linked item reference is invalid
- `DictionaryError`: Problem with the dictionary itself

In [None]:

# Bucket the validation errors under their category name.
result = validator.validate(invalid_cif)

# Plain dict keyed by category name; insertion order follows the order in
# which each category is first seen in result.errors.
errors_by_category = {}
for err in result.errors:
    errors_by_category.setdefault(err.category.name, []).append(err)

# Print each category as a heading followed by its errors as bullet lines.
for category, members in errors_by_category.items():
    print(f"\n{category}:")
    for err in members:
        print(f"  - Line {err.span.start_line}: {err.message}")

## Span Information

Each error includes precise location information (span) for IDE integration.

In [None]:
# Re-validate so this cell does not depend on a result left by another cell.
result = validator.validate(invalid_cif)

# For every error: its start/end line and column, then the message, then a
# blank separator line.
for err in result.errors:
    loc = err.span
    print(
        f"Error span: line {loc.start_line}, col {loc.start_col} -> line {loc.end_line}, col {loc.end_col}",
        f"  Message: {err.message}",
        "",
        sep="\n",
    )

## Summary

Key points:

1. Use `validate()` for one-shot validation of a single CIF
2. Use `Validator` class for validating multiple CIFs against the same dictionary
3. Errors include category, message, and precise span information
4. Choose validation mode based on strictness needs (Strict, Lenient, Pedantic)