# Generate Test Data

## Setup target Graph

In [1]:
from rdflib import Graph, URIRef

In [2]:
# Set up the empty graph that will hold the generated data
g = Graph()

The defined **prefixes** are stored in the `GolemQuery` class in `sparql_queries.py`. These are the canonical prefixes:

In [3]:
# Use the prefixes defined on the GolemQuery class
from sparql_queries import GolemQuery
golem = GolemQuery()
# Last expression of the cell: displays the list of prefix/uri entries
golem.prefixes

[{'prefix': 'gd', 'uri': 'http://data.golemlab.eu/data/'},
 {'prefix': 'gt', 'uri': 'http://data.golemlab.eu/data/entity/type/'},
 {'prefix': 'crm', 'uri': 'http://www.cidoc-crm.org/cidoc-crm/'},
 {'prefix': 'owl', 'uri': 'http://www.w3.org/2002/07/owl#'},
 {'prefix': 'xsd', 'uri': 'http://www.w3.org/2001/XMLSchema#'},
 {'prefix': 'cls', 'uri': 'http://clscor.io/ontology/'},
 {'prefix': 'go', 'uri': 'http://golemlab.eu/ontology/'},
 {'prefix': 'lrm', 'uri': 'http://www.cidoc-crm.org/cidoc-crm/lrmoo/'},
 {'prefix': 'rdfs', 'uri': 'http://www.w3.org/2000/01/rdf-schema#'},
 {'prefix': 'nif',
  'uri': 'http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#'}]

In [4]:
# Register every prefix/namespace pair on the graph's namespace manager
for entry in golem.prefixes:
    prefix, namespace_uri = entry["prefix"], entry["uri"]
    g.namespace_manager.bind(prefix, URIRef(namespace_uri))

## Corpus

Corpus data can be created by using the class `Corpus` in `corpus.py`.
We create a "Harry Potter Corpus".

In [5]:
from corpus import Corpus

In [6]:
# Set the id of the Harry Potter corpus
potter_corpus_id = "potter_corpus"

In [7]:
# Build the corpus instance with all of its attributes set
harry_potter_corpus = Corpus(
    id=potter_corpus_id,
    # the URI is the "gd" prefix URI followed by the corpus id
    uri=golem.get_prefix_uri("gd") + potter_corpus_id,
    name="Harry Potter Corpus",
    acronym="potter",
    # NOTE(review): "form" looks like a typo for "from"; left unchanged here
    # because this string is part of the generated corpus metadata
    description="Harry Potter Corpus derived form AO3.",
    licence={
        "name": "CC0",
        "uri": "https://creativecommons.org/publicdomain/zero/1.0",
    },
    repository={"url": "https://github.com/GOLEM-lab/potter_corpus"},
)

In [8]:
# The metrics are put together by hand and then attached to the corpus
harry_potter_corpus_metrics = {
    "chapters": 500,
    "paragraphs": 9000,
    "characters": 4000,
    "male": 1990,
    "female": 1990,
    "nonbinary": 20,
    "comments": 7000,
    "wordsInDocuments": 500000,
    "wordsInComments": 20000,
}

harry_potter_corpus.metrics = harry_potter_corpus_metrics

With an instance of the `Corpus` class we can return the metadata as defined in the schema. We can also validate it:

In [9]:
# Return the corpus metadata; the metrics are included and validation is switched on
harry_potter_corpus.get_metadata(include_metrics=True,validation=True)

{'id': 'potter_corpus',
 'uri': 'http://data.golemlab.eu/data/potter_corpus',
 'corpusName': 'Harry Potter Corpus',
 'acronym': 'potter',
 'corpusDescription': 'Harry Potter Corpus derived form AO3.',
 'licence': 'CC0',
 'licenceUrl': 'https://creativecommons.org/publicdomain/zero/1.0',
 'repository': 'https://github.com/GOLEM-lab/potter_corpus',
 'metrics': {'chapters': 500,
  'paragraphs': 9000,
  'characters': 4000,
  'male': 1990,
  'female': 1990,
  'nonbinary': 20,
  'comments': 7000,
  'wordsInDocuments': 500000,
  'wordsInComments': 20000}}

RDF data is generated by calling the method `generate_graph()` on the corpus instance.

In [10]:
# Generate the RDF graph for the corpus and keep it for inspection and merging
harry_potter_corpus_g = harry_potter_corpus.generate_graph()

In [11]:
# Print the serialized RDF of the corpus graph
print(harry_potter_corpus_g.serialize())

@prefix cls: <http://clscor.io/ontology/> .
@prefix crm: <http://www.cidoc-crm.org/cidoc-crm/> .
@prefix gd: <http://data.golemlab.eu/data/> .
@prefix gt: <http://data.golemlab.eu/data/entity/type/> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

<http://data.golemlab.eu/data/potter_corpus/acronym> a crm:E41_Appellation ;
    crm:P1i_identifies gd:potter_corpus ;
    crm:P2_has_type gt:corpus_acronym ;
    rdf:value "potter" .

<http://data.golemlab.eu/data/potter_corpus/corpus_name> a crm:E41_Appellation ;
    crm:P1i_identifies gd:potter_corpus ;
    crm:P2_has_type gt:corpus_name ;
    rdf:value "Harry Potter Corpus" .

<http://data.golemlab.eu/data/potter_corpus/dimension/chapters> a crm:E54_Dimension ;
    crm:P90_has_value "500"^^xsd:int .

<http://data.golemlab.eu/data/potter_corpus/dimension/characters> a crm:E54_Dimension ;
    crm:P90_has_value "4000"^^xs

Add this to the data graph `g`:

In [12]:
# Add the corpus graph to the data graph
g = g + harry_potter_corpus_g

## Characters
Use the class `Character` in `character.py`.

In [13]:
from character import Character

### Canon-Characters

Canonical Character "Harry Potter":

In [14]:
# Canon character "Harry Potter"; the id is re-used by later cells
canon_harry_potter_id = "C000000001"
canon_harry_potter = Character(
    id=canon_harry_potter_id,
    uri=golem.get_prefix_uri("gd") + canon_harry_potter_id,
    name="Harry Potter",
    character_type="canon",
    gender="male",
    refs=[dict(ref="Q3244512", type="wikidata")],
    # ids of the corpora the character is included in
    corpus_ids=[potter_corpus_id],
)

In [15]:
# Generate the RDF graph for the canon character
canon_harry_potter_g = canon_harry_potter.generate_graph()

In [16]:
# Print the serialized RDF of the canon character graph
print(canon_harry_potter_g.serialize())

@prefix crm: <http://www.cidoc-crm.org/cidoc-crm/> .
@prefix gd: <http://data.golemlab.eu/data/> .
@prefix go: <http://golemlab.eu/ontology/> .
@prefix gt: <http://data.golemlab.eu/data/entity/type/> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

<http://data.golemlab.eu/data/C000000001/character_name> a crm:E41_Appellation ;
    crm:P2_has_type gt:character_name ;
    rdf:value "Harry Potter" .

<http://data.golemlab.eu/data/C000000001/id> a crm:E42_Identifier ;
    crm:P2_has_type gt:id ;
    rdf:value "C000000001" .

<http://data.golemlab.eu/data/C000000001/wd> a crm:E42_Identifier ;
    crm:P1i_identifies gd:C000000001 ;
    crm:P2_has_type gt:wikidata ;
    rdf:value "Q3244512" .

gd:potter_corpus crm:P148_has_component gd:C000000001 .

gd:C000000001 a go:C1_Character_Concept ;
    rdfs:label "Harry Potter" ;
    crm:P148i_is_component_of gd:potter_corpus ;
    crm:P1_is_identified_by <http://data.golemlab.eu

A second character — Hermione Granger:

In [17]:
# Canon character "Hermione Granger"; the id is re-used by later cells
canon_hermione_id = "C000000002"
canon_hermione = Character(
    id=canon_hermione_id,
    uri=golem.get_prefix_uri("gd") + canon_hermione_id,
    name="Hermione Granger",
    character_type="canon",
    gender="female",
    refs=[dict(ref="Q174009", type="wikidata")],
    corpus_ids=[potter_corpus_id],
)

In [18]:
canon_hermione_g = canon_hermione.generate_graph()
# Could inspect by printing it:
# print(canon_hermione_g.serialize())

Add the two characters to the graph:

In [19]:
# Add both canon character graphs to the data graph
g = g + canon_harry_potter_g + canon_hermione_g

### Fanon-Characters

In [20]:
fanon_harry_potter_id = "C000000003"

# Relation marking the fanon character as a derivative of the canon one
fanon_harry_potter_relations = [
    dict(type="derivative_of", id=canon_harry_potter_id),
]

fanon_harry_potter = Character(
    id=fanon_harry_potter_id,
    uri=golem.get_prefix_uri("gd") + fanon_harry_potter_id,
    name="Harry Potter",
    character_type="fanon",
    gender="male",
    corpus_ids=[potter_corpus_id],
    relations=fanon_harry_potter_relations,
)

In [21]:
# Generate the RDF graph for the fanon character and print its serialization
fanon_harry_potter_g = fanon_harry_potter.generate_graph()
print(fanon_harry_potter_g.serialize())

@prefix crm: <http://www.cidoc-crm.org/cidoc-crm/> .
@prefix gd: <http://data.golemlab.eu/data/> .
@prefix go: <http://golemlab.eu/ontology/> .
@prefix gt: <http://data.golemlab.eu/data/entity/type/> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

gd:C000000001 crm:P130i_features_are_also_found_on gd:C000000003 .

<http://data.golemlab.eu/data/C000000003/character_name> a crm:E41_Appellation ;
    crm:P2_has_type gt:character_name ;
    rdf:value "Harry Potter" .

<http://data.golemlab.eu/data/C000000003/id> a crm:E42_Identifier ;
    crm:P2_has_type gt:id ;
    rdf:value "C000000003" .

gd:potter_corpus crm:P148_has_component gd:C000000003 .

gd:C000000003 a go:C1_Character_Concept ;
    rdfs:label "Harry Potter" ;
    crm:P130_shows_features_of gd:C000000001 ;
    crm:P148i_is_component_of gd:potter_corpus ;
    crm:P1_is_identified_by <http://data.golemlab.eu/data/C000000003/character_name>,
        <http://data

In [22]:
# Add the fanon character graph to the data graph
g = g + fanon_harry_potter_g

## Work
Use the class `Work` from `work.py`. This class can only be used to create data and is currently not used in the API.

In [23]:
from work import Work

### Harry Potter and the Philosopher's Stone

In [24]:
# id of the work "Harry Potter and the Philosopher's Stone"
philosophers_stone_work_id = "W000000001"

# we will also need the author and re-use this ID later
j_k_rowling_id = "A000000001"

The `Work` class needs to know which characters are connected to the work. We re-use the ids of the characters defined above; `"effect": "created"` means that the characters were invented/created in this work.

In [25]:
# Both canon characters are recorded with the effect "created" for this work
philosophers_stone_work_character_data = [
    {"id": character_id, "effect": "created"}
    for character_id in (canon_harry_potter_id, canon_hermione_id)
]


In [26]:
# Instantiate the work with its title, characters, author, date and reference
philosophers_stone = Work(
    id=philosophers_stone_work_id,
    uri=golem.get_prefix_uri("gd") + philosophers_stone_work_id,
    title="Harry Potter and the Philosopher's Stone",
    characters=philosophers_stone_work_character_data,
    authors=[dict(id=j_k_rowling_id)],
    dates=dict(created=1997),
    refs=[dict(ref="Q43361", type="wikidata")],
)

In [27]:
# Generate the RDF graph for the work and print its serialization
philosophers_stone_g = philosophers_stone.generate_graph()
print(philosophers_stone_g.serialize())

@prefix crm: <http://www.cidoc-crm.org/cidoc-crm/> .
@prefix gd: <http://data.golemlab.eu/data/> .
@prefix gt: <http://data.golemlab.eu/data/entity/type/> .
@prefix lrm: <http://www.cidoc-crm.org/cidoc-crm/lrmoo/> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

gd:A000000001 crm:P14i_performed <http://data.golemlab.eu/data/W000000001/creation> .

gd:C000000001 crm:P94i_was_created_by <http://data.golemlab.eu/data/W000000001/creation> .

gd:C000000002 crm:P94i_was_created_by <http://data.golemlab.eu/data/W000000001/creation> .

<http://data.golemlab.eu/data/W000000001/creation/ts> a crm:E52_Time-Span ;
    crm:P4i_is_time-span_of <http://data.golemlab.eu/data/W000000001/creation> ;
    rdf:value 1997 .

<http://data.golemlab.eu/data/W000000001/id> a crm:E42_Identifier ;
    crm:P1i_identifies gd:W000000001 ;
    crm:P2_has_type gt:id ;
    rdf:value "W000000001" .



Add to the data graph:

In [28]:
# Add the work graph to the data graph
g = g + philosophers_stone_g

### Fanfiction story

In [37]:
fanfic_work_id = "W000000002"
# NOTE(review): no `Author` instance with this id is created in the cells shown here
fanfic_author_id = "A000000002"

# The story uses the canon Harry Potter and creates the fanon one
fanfic_work_character_data = [
    dict(id=canon_harry_potter_id, effect="used"),
    dict(id=fanon_harry_potter_id, effect="created"),
]

fanfic_work = Work(
    id=fanfic_work_id,
    uri=golem.get_prefix_uri("gd") + fanfic_work_id,
    title="A Harry Potter Fanfiction Story",
    characters=fanfic_work_character_data,
    authors=[dict(id=fanfic_author_id)],
)

In [38]:
# Generate the RDF graph for the fanfiction work and print its serialization
fanfic_work_g = fanfic_work.generate_graph()
print(fanfic_work_g.serialize())

# Add the fanfiction work to the data graph so that it ends up in the exported
# test data, in the same way as the other generated graphs in this notebook.
g = g + fanfic_work_g

@prefix crm: <http://www.cidoc-crm.org/cidoc-crm/> .
@prefix gd: <http://data.golemlab.eu/data/> .
@prefix gt: <http://data.golemlab.eu/data/entity/type/> .
@prefix lrm: <http://www.cidoc-crm.org/cidoc-crm/lrmoo/> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

gd:A000000002 crm:P14i_performed <http://data.golemlab.eu/data/W000000002/creation> .

gd:C000000003 crm:P94i_was_created_by <http://data.golemlab.eu/data/W000000002/creation> .

<http://data.golemlab.eu/data/W000000002/id> a crm:E42_Identifier ;
    crm:P1i_identifies gd:W000000002 ;
    crm:P2_has_type gt:id ;
    rdf:value "W000000002" .

<http://data.golemlab.eu/data/W000000002/title> a crm:E35_Title ;
    crm:P102i_is_title_of gd:W000000002 ;
    rdf:value "A Harry Potter Fanfiction Story" .

gd:W000000002 a lrm:F1_Work ;
    rdfs:label "A Harry Potter Fanfiction Story" ;
    crm:P102_has_title <http://data.golemlab.eu/data/W000000002/title> ;
    crm:P

## Author
Use the class `Author` from `author.py`. This class can only be used to create data and is currently not used in the API.

In [29]:
from author import Author

### J. K. Rowling

In [30]:
# Author entry for J. K. Rowling; re-uses the id defined for the work above
j_k_rowling = Author(
    id=j_k_rowling_id,
    uri=golem.get_prefix_uri("gd") + j_k_rowling_id,
    name="Rowling, J. K.",
    refs=[dict(ref="Q34660", type="wikidata")],
)

In [31]:
# Generate the RDF graph for the author and print its serialization
j_k_rowling_g = j_k_rowling.generate_graph()
print(j_k_rowling_g.serialize())

@prefix crm: <http://www.cidoc-crm.org/cidoc-crm/> .
@prefix gd: <http://data.golemlab.eu/data/> .
@prefix gt: <http://data.golemlab.eu/data/entity/type/> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

<http://data.golemlab.eu/data/A000000001/id> a crm:E42_Identifier ;
    crm:P1i_identifies gd:A000000001 ;
    crm:P2_has_type gt:id ;
    rdf:value "A000000001" .

<http://data.golemlab.eu/data/A000000001/name> a crm:E41_Appellation ;
    crm:P1i_identifies gd:A000000001 ;
    crm:P2_has_type gt:author_name ;
    rdf:value "Rowling, J. K." .

<http://data.golemlab.eu/data/A000000001/wd> a crm:E42_Identifier ;
    crm:P1i_identifies gd:A000000001 ;
    crm:P2_has_type gt:wikidata ;
    rdf:value "Q34660" .

gd:A000000001 a crm:E39_Actor ;
    rdfs:label "Rowling, J. K." ;
    crm:P1_is_identified_by <http://data.golemlab.eu/data/A000000001/id>,
        <http://data.golemlab.eu/data/A000000001/name>,
        <http://

Add the data to the data graph:

In [32]:
# Add the author graph to the data graph
g = g + j_k_rowling_g

## Export test data

In [33]:
# print everything
# print(g.serialize())

In [34]:
# Write the complete data graph to the example data file in "ttl" format
g.serialize(destination="data/generated_example_data.ttl",format="ttl")

<Graph identifier=N739323a13812435fb48f91ab74fcd5fe (<class 'rdflib.graph.Graph'>)>