In [15]:
from pathlib import Path
from pprint import pprint

import pandas as pd
from frictionless import describe, extract, validate, transform

from frictionless import describe_schema

In [2]:
data_dir = Path("../data/")
articles = data_dir / "articles.csv"

## Describing

In [3]:
resource = describe(articles, infer_missing_values=[""])

In [23]:
schema = describe_schema(resource)

In [24]:
schema.metadata_attach()

[{'name': 'proposal_id', 'type': 'string'},
 {'name': 'program', 'type': 'string'},
 {'name': 'score', 'type': 'number'},
 {'name': 'DOI', 'type': 'string'},
 {'name': 'journal_name', 'type': 'string'},
 {'name': 'publisher', 'type': 'string'},
 {'name': 'coci_citations', 'type': 'integer'},
 {'name': 'references', 'type': 'integer'},
 {'name': 'authors_count', 'type': 'integer'},
 {'name': 'title', 'type': 'string'},
 {'name': 'created', 'type': 'integer'},
 {'name': 'deposited', 'type': 'integer'},
 {'name': 'indexed', 'type': 'integer'},
 {'name': 'published-online', 'type': 'integer'},
 {'name': 'issued', 'type': 'integer'}]

In [20]:
schema.primary_key = "DOI"
schema.get_field("DOI").constraints["unique"] = True
schema.get_field("proposal_id").description = "ID for proposals"
schema.get_field("program").title = "Research Program"
schema.get_field("program").description = "Name of research program within CDRMP"
schema.get_field("program").description = "Score for citation text and DOI matching using the Crossref matcher"

In [21]:
pprint(schema)

{'fields': [{'description': 'ID for proposals',
             'name': 'proposal_id',
             'type': 'string'},
            {'description': 'Score for citation text and DOI matching using '
                            'the Crossref matcher',
             'name': 'program',
             'title': 'Research Program',
             'type': 'string'},
            {'name': 'score', 'type': 'number'},
            {'constraints': {'unique': True}, 'name': 'DOI', 'type': 'string'},
            {'name': 'journal_name', 'type': 'string'},
            {'name': 'publisher', 'type': 'string'},
            {'name': 'coci_citations', 'type': 'integer'},
            {'name': 'references', 'type': 'integer'},
            {'name': 'authors_count', 'type': 'integer'},
            {'name': 'title', 'type': 'string'},
            {'name': 'created', 'type': 'integer'},
            {'name': 'deposited', 'type': 'integer'},
            {'name': 'indexed', 'type': 'integer'},
            {'name': 'published

## Validating

In [10]:
pprint(validate(articles))

{'errors': [],
 'stats': {'errors': 0, 'tables': 1},
 'tables': [{'compression': 'no',
             'compressionPath': '',
             'dialect': {},
             'encoding': 'utf-8',
             'errors': [],
             'format': 'csv',
             'hashing': 'md5',
             'header': ['proposal_id',
                        'program',
                        'score',
                        'DOI',
                        'journal_name',
                        'publisher',
                        'coci_citations',
                        'references',
                        'authors_count',
                        'title',
                        'created',
                        'deposited',
                        'indexed',
                        'published-online',
                        'issued'],
             'partial': False,
             'path': '../data/articles.csv',
             'query': {},
             'schema': {'fields': [{'name': 'proposal_id', 'type': 'st

## Extracting