In [1]:
%load_ext lab_black

# Explore `graphdata` 🕸️

The graph data, derived from the preprocessed data, is provided in CSV files. 
<br/>
There are three groups of files (where "weighted" stands for "line-weighted or token-weighted"):
- `{play_name}_ce*`: generate classic character co-occurrence network representations (clique expansions of hypergraphs)
  - nodes: characters
    - `*_ce.nodes.csv` for all representations
  - edges: binarized co-occurrence on stage in a text unit, where text unit $\in \{\text{scene}, \text{(stage) group}\}$
    - `*_ce-scene-mw.edges.csv` for weighted or binary multigraphs with text units resolved at the scene level
    - `*_ce-scene-w.edges.csv` for count-weighted graphs with text units resolved at the scene level
    - `*_ce-group-mw.edges.csv` for weighted or binary multigraphs with text units resolved at the stage group level
    - `*_ce-group-w.edges.csv` for count-weighted graphs with text units resolved at the stage group level
- `{play_name}_se*`: generate bipartite graph representations (star expansions of hypergraphs)
  - nodes: characters and text units, where text unit $\in \{\text{scene}, \text{(stage) group}\}$
    - `*_se-scene.nodes.csv` for bipartite graphs with text units resolved at the scene level
    - `*_se-group.nodes.csv` for bipartite graphs with text units resolved at the stage group level
    - `*_se-speech.nodes.csv` for bipartite graphs with text units resolved at the stage group level and edges resolved at the speech act level (provided for convenience)
  - edges: characters on stage in a text unit or speech acts (uttered or witnessed) in a text unit ($\text{speech}$)
    - `*_se-scene-w.edges.csv` for weighted or binary bipartite graphs with text units resolved at the scene level
    - `*_se-group-w.edges.csv` for weighted or binary bipartite graphs with text units resolved at the stage group level
    - `*_se-speech-mwd.edges.csv` for weighted or binary directed bipartite multigraphs with text units resolved at the stage group level and edges resolved at the speech act level
    - `*_se-speech-wd.edges.csv` for weighted or binary directed bipartite graphs with text units resolved at the stage group level and edges resolved at the speech act level (edge weights from `*_se-speech-mwd.edges.csv` aggregated into single edges)
- `{play_name}_hg*`: generate hypergraph representations
  - nodes: characters
    - `*_hg.nodes.csv` for all representations (contain global node weight columns for convenience)
  - edges: (non-binarized) co-occurrence on stage in a text unit, where text unit $\in \{\text{scene}, \text{(stage) group}\}$, or speech acts ($\text{speech}$)
    - `*_hg-scene-mw.edges.csv` for weighted or binary hypergraphs with text units resolved at the scene level
    - `*_hg-scene-mw.node-weights.csv` for edge-specific node weights in weighted hypergraphs with text units resolved at the scene level
    - `*_hg-group-mw.edges.csv` for weighted or binary hypergraphs with text units resolved at the stage group level
    - `*_hg-group-mw.node-weights.csv` for edge-specific node weights in weighted hypergraphs with text units resolved at the stage group level
    - `*_hg-speech-mwd.edges.csv` for weighted or binary directed multihypergraphs with text units resolved at the speech act level
    - `*_hg-speech-wd.edges.csv` for weighted or binary directed hypergraphs with text units resolved at the speech act level (edge weights from `*_hg-speech-mwd.edges.csv` aggregated into single edges)
    
The columns of the files depend on the file type; we give some examples below.
<br/>
We adopt the following naming conventions:
- Node column names
  - In edge lists of undirected graphs, the node identifiers are named `node1` and `node2`.
  - In edge lists of directed graphs, the node identifiers are named `source` and `target`.
- Edge weight names
  - In edge lists of weighted graphs, the potential weight columns are named `n_lines` and `n_tokens`.
  - In edge lists of count-weighted graphs, the potential weight column is named `count`.
- Edge indices
  - In edge lists of multigraphs that are not hypergraphs, the column `edge_index` serves to distinguish multiedges.
  - In edge lists of multigraphs that are hypergraphs, the column `stagegroup` (or, if present, `setting`) serves to distinguish multiedges.
  
You can build graphs and hypergraphs using the graph loader functions from the `hyperbard` source code.
<br/>
Alternatively, the CSV files themselves can be filtered and passed to the graph and hypergraph constructors provided by other libraries, such as `networkx` or `hypernetx` (which `hyperbard` uses under the hood).

In [2]:
from glob import glob
import pandas as pd
from statics import GRAPHDATA_PATH

In [3]:
# Collect every CSV file in the graph data directory, sorted by path so that the
# listing order is deterministic. The explicit ".csv" suffix avoids picking up
# files whose names merely end in the letters "csv".
graphdata_files = sorted(glob(f"{GRAPHDATA_PATH}/*.csv"))

In [None]:
# Preview the first 19 paths. The file listing at the top of this notebook names
# 19 file types per play (5 clique-expansion, 7 star-expansion, 7 hypergraph),
# which is presumably why 19 was chosen -- TODO confirm.
graphdata_files[0:19]

## Clique Expansions

In [5]:
# Node list for the clique-expansion representations of the play.
ce_nodes_path = f"{GRAPHDATA_PATH}/a-midsummer-nights-dream_ce.nodes.csv"
ce_nodes = pd.read_csv(ce_nodes_path)
ce_nodes.head(n=5)

Unnamed: 0,node
0,#ATTENDANTS_MND
1,#Bottom_MND
2,#Demetrius_MND
3,#Egeus_MND
4,#FAIRIES.OBERON_MND


In [6]:
# Clique-expansion multigraph edge list with text units resolved per scene.
ce_scene_mw_path = f"{GRAPHDATA_PATH}/a-midsummer-nights-dream_ce-scene-mw.edges.csv"
ce_scene_mw_edges = pd.read_csv(ce_scene_mw_path)
ce_scene_mw_edges.head(n=5)

Unnamed: 0,node1,node2,key,act,scene,n_tokens,n_lines,edge_index
0,#ATTENDANTS_MND,#Demetrius_MND,0,1,1,1920,257,1
1,#ATTENDANTS_MND,#Egeus_MND,0,1,1,1920,257,1
2,#ATTENDANTS_MND,#Helena_MND,0,1,1,1920,257,1
3,#ATTENDANTS_MND,#Hermia_MND,0,1,1,1920,257,1
4,#ATTENDANTS_MND,#Hippolyta_MND,0,1,1,1920,257,1


In [7]:
# Clique-expansion multigraph edge list with text units resolved per stage group.
ce_group_mw_path = f"{GRAPHDATA_PATH}/a-midsummer-nights-dream_ce-group-mw.edges.csv"
ce_group_mw_edges = pd.read_csv(ce_group_mw_path)
ce_group_mw_edges.head(n=5)

Unnamed: 0,node1,node2,key,act,scene,stagegroup,n_tokens,n_lines,edge_index
0,#ATTENDANTS_MND,#Hippolyta_MND,0,1,1,1,108,16,1
1,#ATTENDANTS_MND,#Philostrate_MND,0,1,1,1,108,16,1
2,#ATTENDANTS_MND,#Theseus_MND,0,1,1,1,108,16,1
3,#Hippolyta_MND,#Philostrate_MND,0,1,1,1,108,16,1
4,#Hippolyta_MND,#Theseus_MND,0,1,1,1,108,16,1


## Star Expansions

In [8]:
# Star-expansion node list with text units resolved per stage group.
se_group_nodes_path = f"{GRAPHDATA_PATH}/a-midsummer-nights-dream_se-group.nodes.csv"
se_group_nodes = pd.read_csv(se_group_nodes_path)
se_group_nodes.head(n=5)

Unnamed: 0,node,node_type
0,#ATTENDANTS_MND,character
1,#Bottom_MND,character
2,#Demetrius_MND,character
3,#Egeus_MND,character
4,#FAIRIES.OBERON_MND,character


In [9]:
# Star-expansion edge list with text units resolved per stage group.
# The path is built from GRAPHDATA_PATH, as in the clique-expansion cells, rather
# than a hard-coded "graphdata/" directory that only resolves from one working
# directory.
se_group_edges = pd.read_csv(
    f"{GRAPHDATA_PATH}/a-midsummer-nights-dream_se-group-w.edges.csv"
)
se_group_edges.head()

Unnamed: 0,node1,node2,n_lines,n_tokens
0,#ATTENDANTS_MND,1.01.0001,16,108
1,#Hippolyta_MND,1.01.0001,16,108
2,#Philostrate_MND,1.01.0001,16,108
3,#Theseus_MND,1.01.0001,16,108
4,#ATTENDANTS_MND,1.01.0002,4,29


## Hypergraphs

In [10]:
# Hypergraph node list, including the global node weight columns.
# The path is built from GRAPHDATA_PATH, as in the clique-expansion cells, rather
# than a hard-coded "graphdata/" directory that only resolves from one working
# directory.
hg_nodes = pd.read_csv(f"{GRAPHDATA_PATH}/a-midsummer-nights-dream_hg.nodes.csv")
hg_nodes.head()

Unnamed: 0,node,n_tokens_onstage,n_tokens_speaker,n_lines_onstage,n_lines_speaker
0,#ATTENDANTS_MND,4204,0,603,0
1,#Bottom_MND,5129,2040,727,275
2,#Demetrius_MND,6786,1066,944,143
3,#Egeus_MND,1443,296,196,42
4,#FAIRIES.OBERON_MND,1658,0,215,0


In [11]:
# Hypergraph edge list with text units resolved per stage group.
# The path is built from GRAPHDATA_PATH, as in the clique-expansion cells, rather
# than a hard-coded "graphdata/" directory that only resolves from one working
# directory.
hg_group_edges = pd.read_csv(
    f"{GRAPHDATA_PATH}/a-midsummer-nights-dream_hg-group-mw.edges.csv"
)
hg_group_edges.head()

Unnamed: 0,act,scene,stagegroup,n_tokens,n_lines,onstage
0,1,1,1,108,16,#ATTENDANTS_MND #Hippolyta_MND #Philostrate_MN...
1,1,1,2,29,4,#ATTENDANTS_MND #Hippolyta_MND #Theseus_MND
2,1,1,3,799,109,#ATTENDANTS_MND #Demetrius_MND #Egeus_MND #Her...
3,1,1,4,413,53,#Hermia_MND #Lysander_MND
4,1,1,5,353,47,#Helena_MND #Hermia_MND #Lysander_MND


In [12]:
# Edge-specific node weights for the stage-group-level hypergraph.
# The path is built from GRAPHDATA_PATH, as in the clique-expansion cells, rather
# than a hard-coded "graphdata/" directory that only resolves from one working
# directory.
hg_group_node_weights = pd.read_csv(
    f"{GRAPHDATA_PATH}/a-midsummer-nights-dream_hg-group-mw.node-weights.csv"
)
hg_group_node_weights.head()

Unnamed: 0,act,scene,stagegroup,node,n_tokens_speaker,n_lines_speaker,n_tokens_onstage,n_lines_onstage
0,1,1,1,#Hippolyta_MND,35,5,108,16
1,1,1,1,#Theseus_MND,73,11,108,16
2,1,1,2,#Theseus_MND,29,4,29,4
3,1,1,3,#Demetrius_MND,13,2,799,109
4,1,1,3,#Egeus_MND,217,30,799,109
