In [1]:
from pathlib import Path
from pprint import pprint

import pandas as pd
from frictionless import describe, extract, validate, transform
from frictionless import describe_schema

## Scholarly Metrics Data Package Development

This notebook contains several cycles of increasingly complex use cases for the scholarly metrics data package.

Each use case and MVP comes with a short plain-text description, a particular dataset, and a frictionless schema that describes the dataset.

### 1. Scite

#### 1.1 Citation Counts for article

Describe a simple dataset with citation counts provided from Crossref.

In [14]:
data_dir = Path("../prototypes/1_citations/")
metadata = pd.read_csv(data_dir / "event_contexts.csv")
traces = pd.read_csv(data_dir / "citation_traces.csv")
cit_patterns = pd.read_csv(data_dir / "citation_patterns.csv")
article_patterns = pd.read_csv(data_dir / "article_patterns.csv")

In [32]:
f = data_dir / "event_contexts.csv"
event_contexts = describe(f)["schema"]

In [33]:
event_contexts

{'fields': [{'name': 'doi', 'type': 'string'},
  {'name': 'slug', 'type': 'string'},
  {'name': 'type', 'type': 'string'},
  {'name': 'title', 'type': 'string'},
  {'name': 'abstract', 'type': 'string'},
  {'name': 'authors', 'type': 'string'},
  {'name': 'keywords', 'type': 'string'},
  {'name': 'year', 'type': 'number'},
  {'name': 'shortJournal', 'type': 'string'},
  {'name': 'publisher', 'type': 'string'},
  {'name': 'issue', 'type': 'string'},
  {'name': 'volume', 'type': 'integer'},
  {'name': 'page', 'type': 'string'},
  {'name': 'retracted', 'type': 'boolean'},
  {'name': 'memberId', 'type': 'integer'},
  {'name': 'issns', 'type': 'string'},
  {'name': 'editorialNotices', 'type': 'string'},
  {'name': 'journalSlug', 'type': 'string'},
  {'name': 'journal', 'type': 'string'},
  {'name': 'rwStatus', 'type': 'any'}]}

In [20]:
desc = describe(data_dir / "citation_traces.csv")
trace_schema = desc["schema"]

{'fields': [{'name': 'id', 'type': 'integer'},
  {'name': 'source', 'type': 'string'},
  {'name': 'target', 'type': 'string'}]}

In [28]:
trace_schema.get_field("id").title = "Unique ID for a source-target pair"
trace_schema.get_field("source").title = "DOI of citing article"
trace_schema.get_field("source").type = "string"
trace_schema.get_field("target").title = "DOI of cited article"
trace_schema.get_field("target").type = "string"

In [29]:
trace_schema

{'fields': [{'name': 'id',
   'type': 'integer',
   'title': 'Unique ID for a source-target pair'},
  {'name': 'source', 'type': 'string', 'title': 'DOI of citing article'},
  {'name': 'target', 'type': 'string', 'title': 'DOI of cited article'}]}

In [30]:
desc = describe(data_dir / "citation_patterns.csv")
cit_patterns = desc["schema"]

In [31]:
cit_patterns

{'fields': [{'name': 'source', 'type': 'string'},
  {'name': 'target', 'type': 'string'},
  {'name': 'total_source_mentions', 'type': 'integer'},
  {'name': 'total_source_refs', 'type': 'integer'},
  {'name': 'mentions', 'type': 'integer'},
  {'name': 'norm_refs', 'type': 'number'},
  {'name': 'norm_mentions', 'type': 'number'},
  {'name': 'mentions_per_ref', 'type': 'number'},
  {'name': 'wf1', 'type': 'number'},
  {'name': 'wf2', 'type': 'number'},
  {'name': 'wf3', 'type': 'number'}]}

In [None]:
class Process:
    def __init__(self, name):
        self.name = name

class Performing(Process):
    def __init__(self)
        super().__init__(self)
    
class Tracing(Process):
    def __init__(self)
        super().__init__(self)
        
class Patterning(Process):
    def __init__(self)
        super().__init__(self)

class Valuing(Process):
    def __init__(self)
        super().__init__(self)

In [None]:
class Phenomenom:
    def __init__(self):
        self.title = ""
        self.description = ""

class Event(Phenomenom):
    def __init__(self)
        super().__init__(self)
        
class Trace(Phenomenom):
    def __init__(self)
        super().__init__(self)
    
class Pattern(Phenomenom):
    def __init__(self)
        super().__init__(self)

In [43]:
citations = {
    "name": "scholarly citation",
    "local": "scholarly article",
    "reference": "mentions",
    "remote": "scholarly article"
}

In [45]:
citation_links = {
    "name": "citation link",
    "description": "All references deposited to COCI through the I4OC including a long list of publishers (notably, Elsevier is not contributing its reference data)" \
                    "These references are then matched and aggregated for articles.",
    "tracked_event": citations,
    "coverage": "COCI publishers",
    
}

In [46]:
pattern = {
    "name": "aggregation",
    "description": "Local "
    "trace": citation_links
}

In [52]:
pprint(schema.get_field("citation_count")["pattern"])

{'name': 'aggregation',
 'trace': {'description': 'All references deposited to COCI through the I4OC '
                          'including a long list of publishers (notably, '
                          'Elsevier is not contributing its reference '
                          'data)These references are then matched and '
                          'aggregated for articles.',
           'name': 'citation link',
           'tracked_event': {'local': 'scholarly article',
                             'name': 'scholarly citation',
                             'reference': 'mentions',
                             'remote': 'scholarly article'}}}


#### 1.2 Multiple Citation Counts

Describe a dataset with several citation counts provided from Crossref, Web of Science, and Google Scholar.

### 2. Altmetrics

#### 2.1 Facebook shares from Altmetric.com

Describe a dataset of aggregated Facebook share counts provided by Altmetric

#### 2.2 Public and Private Facebook Shares

Describe a dataset with Altmetric Facebook counts and private Facebook share numbers aggregated by our method.

### 3. References

#### 3.1 Basic reference data

Describe a dataset of in-text references provided by Scite.ai.

#### 3.2 References and Citations

Describe a dataset consisting of citation counts derived from a reference data.