# Data inspection

We load the two edge lists used in this project (`Cross_Reference.csv` and `Backbone_structure.csv`) and sanity-check that they look reasonable: sizes, year coverage, missing values, and duplicate edges.

In [None]:
from pathlib import Path

# Locate the repo root by walking upwards from the current working directory
# until a directory containing a "data" folder is found. The kernel's working
# directory is not guaranteed to be the repo root (e.g. when the notebook is
# launched from an editor), so paths relative to the cwd cannot be relied on.
repo_root = Path.cwd().resolve()
# Check the cwd itself first, then every ancestor up to the filesystem root.
# The candidate tuple is built before the loop rebinds `repo_root`.
for repo_root in (repo_root, *repo_root.parents):
    # is_dir() rather than exists(): a plain file named "data" must not match.
    if (repo_root / "data").is_dir():
        break
else:
    # Loop finished without `break`: no ancestor has a data/ folder.
    raise FileNotFoundError("Could not find repo root (no 'data/' folder found)")

DATA_DIR = repo_root / "data"
RAW_DIR = DATA_DIR / "raw"

# Local copy of the dataset, expected under data/raw/.
DATASET_DIR = RAW_DIR / "uscode-complexity-main"

# Displayed so the resolved location is visible in the cell output.
DATASET_DIR

WindowsPath('C:/Users/byron/Documents/graph_modelling_choices/graph-modelling-optimization-outcomes/data/raw/uscode-complexity-main')

In [None]:
# Confirm the two edge list files exist before loading them.
# Both files live in the same "Data Set3" folder of the dataset.
dataset3_dir = DATASET_DIR / "Data" / "Data Records" / "Data Set3"
cross_path = dataset3_dir / "Cross_Reference.csv"
backbone_path = dataset3_dir / "Backbone_structure.csv"

# Raise explicitly instead of using `assert`: assert statements are removed
# when Python runs with -O, which would silently disable this check.
if not cross_path.exists():
    raise FileNotFoundError(f"Missing file: {cross_path}")
if not backbone_path.exists():
    raise FileNotFoundError(f"Missing file: {backbone_path}")

# Show both resolved paths in the cell output.
cross_path, backbone_path

(WindowsPath('C:/Users/byron/Documents/graph_modelling_choices/graph-modelling-optimization-outcomes/data/raw/uscode-complexity-main/Data/Data Records/Data Set3/Cross_Reference.csv'),
 WindowsPath('C:/Users/byron/Documents/graph_modelling_choices/graph-modelling-optimization-outcomes/data/raw/uscode-complexity-main/Data/Data Records/Data Set3/Backbone_structure.csv'))

In [3]:
import pandas as pd

# Read both edge lists with pandas' default CSV settings (cross-reference file
# first, then backbone), and show each frame's (rows, columns) as a quick size check.
cross, backbone = (pd.read_csv(csv_path) for csv_path in (cross_path, backbone_path))

(cross.shape, backbone.shape)

((59155, 10), (2490, 12))

In [4]:
# Preview the first five rows of the cross-reference edge list.
cross.head(n=5)

Unnamed: 0,Citing,Cited,Year,Citation,Citing Title,Cited Title,Citing Word Count,Cited Word Count,Citing/Word Counts,Cited/Word Counts
0,2,5,1926,3,The Congress,Government Organization and Employees,16065,62319,0.000187,4.8e-05
1,2,18,1926,2,The Congress,Crimes and Criminal Procedure,16065,75339,0.000124,2.7e-05
2,2,41,1926,1,The Congress,Public Contracts,16065,1657,6.2e-05,0.000604
3,2,44,1926,1,The Congress,Public Printing and Documents,16065,22933,6.2e-05,4.4e-05
4,3,5,1926,5,The President,Government Organization and Employees,4022,62319,0.001243,8e-05


In [5]:
# Preview the first five rows of the backbone edge list.
backbone.head(n=5)

Unnamed: 0,Citing,Cited,Year,Weight,Citing Title,Cited Title,Citing Word Count,Cited Word Count,Citing/Word Counts,Cited/Word Counts,Citing Group,Cited Group
0,5,1,1926,8,Government Organization and Employees,General Provisions,62319,1003,0.000128,0.007976,Government Structure,Government Structure
1,8,18,1926,9,Aliens and National and Citizenship,Crimes and Criminal Procedure,52213,75339,0.000172,0.000119,Society,Society
2,29,10,1926,6,Labor,Armed Forces,4295,82467,0.001397,7.3e-05,Society,National Defense
3,10,5,1926,8,Armed Forces,Government Organization and Employees,82467,62319,9.7e-05,0.000128,National Defense,Government Structure
4,10,1,1926,13,Armed Forces,General Provisions,82467,1003,0.000158,0.012961,National Defense,Government Structure


In [6]:
# Print the column names of each edge list as plain Python lists so the two
# schemas can be compared at a glance.
print("Cross_Reference columns:", cross.columns.tolist())
print("Backbone_structure columns:", backbone.columns.tolist())

Cross_Reference columns: ['Citing', 'Cited', 'Year', 'Citation', 'Citing Title', 'Cited Title', 'Citing Word Count', 'Cited Word Count', 'Citing/Word Counts', 'Cited/Word Counts']
Backbone_structure columns: ['Citing', 'Cited', 'Year', 'Weight', 'Citing Title', 'Cited Title', 'Citing Word Count', 'Cited Word Count', 'Citing/Word Counts', 'Cited/Word Counts', 'Citing Group', 'Cited Group']


In [10]:
# Per-column non-null counts and dtypes for the cross-reference edge list
# (used here to spot missing values and unexpected column types).
cross.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59155 entries, 0 to 59154
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Citing              59155 non-null  int64  
 1   Cited               59155 non-null  int64  
 2   Year                59155 non-null  int64  
 3   Citation            59155 non-null  int64  
 4   Citing Title        59155 non-null  object 
 5   Cited Title         59155 non-null  object 
 6   Citing Word Count   59155 non-null  int64  
 7   Cited Word Count    59155 non-null  int64  
 8   Citing/Word Counts  59155 non-null  float64
 9   Cited/Word Counts   59155 non-null  float64
dtypes: float64(2), int64(6), object(2)
memory usage: 4.5+ MB


In [9]:
# Per-column non-null counts and dtypes for the backbone edge list
# (used here to spot missing values and unexpected column types).
backbone.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2490 entries, 0 to 2489
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Citing              2490 non-null   int64  
 1   Cited               2490 non-null   int64  
 2   Year                2490 non-null   int64  
 3   Weight              2490 non-null   int64  
 4   Citing Title        2490 non-null   object 
 5   Cited Title         2490 non-null   object 
 6   Citing Word Count   2490 non-null   int64  
 7   Cited Word Count    2490 non-null   int64  
 8   Citing/Word Counts  2490 non-null   float64
 9   Cited/Word Counts   2490 non-null   float64
 10  Citing Group        2490 non-null   object 
 11  Cited Group         2490 non-null   object 
dtypes: float64(2), int64(6), object(4)
memory usage: 233.6+ KB


In [None]:
# Report the first and last year present in each edge list. Whether to use
# every year or a single snapshot is decided later.
print(f"Cross years: {cross['Year'].min()} to {cross['Year'].max()}")
print(f"Backbone years: {backbone['Year'].min()} to {backbone['Year'].max()}")

Cross years: 1926 to 2023
Backbone years: 1926 to 2023


In [None]:
# Count rows that repeat an earlier (Citing, Cited, Year) combination.
# A non-zero count would mean the weights of repeated edges might have to be
# aggregated (e.g. summed) before building a graph.
print(f"Cross duplicates: {cross.duplicated(subset=['Citing', 'Cited', 'Year']).sum()}")
print(f"Backbone duplicates: {backbone.duplicated(subset=['Citing', 'Cited', 'Year']).sum()}")

Cross duplicates: 0
Backbone duplicates: 0


## Next steps

I will work with three graph representations of the same legal system.

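- **Full cross-reference graph:**
  nodes are individual legal units (`Citing` -> `Cited`).
  Edge weight is the number of citations (`Citation` column).
  *To confirm:* in the sample rows above, each `Citing`/`Cited` id appears alongside a single title name (e.g. `2` with "The Congress"), so these units may already be titles.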

- **Backbone graph:**
  same node definition as the full graph, restricted to the simplified backbone edges provided with the dataset.
  Edge weight is the backbone `Weight` column.

- **Title-level graph:**
  nodes are legal titles rather than individual units (`Citing Title` -> `Cited Title`).
  Edge weight is the total number of citations between two titles (sum of `Citation`).

In [15]:
# Declarative summary of the three graph representations to build later.
# Each entry names the CSV it is derived from, the columns used as edge
# source/target, the edge weight, and whether edges are directed.
representation_plan = {
    # Full cross-reference graph: weight is the per-row citation count.
    "A_full": {
        "file": "Cross_Reference.csv",
        "source": "Citing",
        "target": "Cited",
        "weight": "Citation",
        "directed": True,
    },
    # Backbone graph: weight is the dataset-provided backbone weight.
    "B_backbone": {
        "file": "Backbone_structure.csv",
        "source": "Citing",
        "target": "Cited",
        "weight": "Weight",
        "directed": True,
    },
    # Title-level graph: "sum(Citation)" describes an aggregation to perform,
    # it is not the name of an existing column.
    "C_title_agg": {
        "file": "Cross_Reference.csv",
        "source": "Citing Title",
        "target": "Cited Title",
        "weight": "sum(Citation)",
        "directed": True,
    },
}

# Display the plan as the cell output.
representation_plan

{'A_full': {'file': 'Cross_Reference.csv',
  'source': 'Citing',
  'target': 'Cited',
  'weight': 'Citation',
  'directed': True},
 'B_backbone': {'file': 'Backbone_structure.csv',
  'source': 'Citing',
  'target': 'Cited',
  'weight': 'Weight',
  'directed': True},
 'C_title_agg': {'file': 'Cross_Reference.csv',
  'source': 'Citing Title',
  'target': 'Cited Title',
  'weight': 'sum(Citation)',
  'directed': True}}