In [1]:
import pandas as pd
import os

In [2]:
base_path = "../data/raw/uscode-complexity-main"
os.listdir(base_path)

['.gitignore',
 '.vscode',
 'contents_functions.py',
 'Data',
 'Data_Set1_Figures_part1.py',
 'Data_Set1_FIgures_part2.py',
 'Data_Set1_SI_Figures.py',
 'Data_Set2_Structure_Parsing.py',
 'Data_Set2_Tree_stat.py',
 'Data_Set3_Edge_list.py',
 'download_html.py',
 'fallback_pdf.py',
 'Figures',
 'ocr_processing_gemini.py',
 'parsing_functions.py',
 'README.md',
 'requirements.txt',
 'Technical_Validation_Figures.py',
 'tree_functions.py']

In [3]:
os.listdir(base_path + "/Data")

['Data Records',
 'OCR samples',
 'OCR_sample_processed',
 'Technical Validation',
 'Title2Name.csv',
 'US_govinfo']

In [4]:
os.listdir(base_path + "/Data/Data Records")

['Data Set1', 'Data Set2', 'Data Set3']

In [5]:
os.listdir(base_path + "/Data/Data Records/Data Set3")

['Backbone_structure.csv', 'Cross_Reference.csv', 'test_edge_list_web.csv']

In [6]:
cross_path = base_path + "/Data/Data Records/Data Set3/Cross_Reference.csv"
backbone_path = base_path + "/Data/Data Records/Data Set3/Backbone_structure.csv"

cross = pd.read_csv(cross_path)
backbone = pd.read_csv(backbone_path)

print("Cross_Reference shape:", cross.shape)
print("Backbone_structure shape:", backbone.shape)

print("\n Cross_Reference columns:", list(cross.columns))
print("Backbone_structure columns:", list(backbone.columns))

cross.head()

Cross_Reference shape: (59155, 10)
Backbone_structure shape: (2490, 12)

 Cross_Reference columns: ['Citing', 'Cited', 'Year', 'Citation', 'Citing Title', 'Cited Title', 'Citing Word Count', 'Cited Word Count', 'Citing/Word Counts', 'Cited/Word Counts']
Backbone_structure columns: ['Citing', 'Cited', 'Year', 'Weight', 'Citing Title', 'Cited Title', 'Citing Word Count', 'Cited Word Count', 'Citing/Word Counts', 'Cited/Word Counts', 'Citing Group', 'Cited Group']


Unnamed: 0,Citing,Cited,Year,Citation,Citing Title,Cited Title,Citing Word Count,Cited Word Count,Citing/Word Counts,Cited/Word Counts
0,2,5,1926,3,The Congress,Government Organization and Employees,16065,62319,0.000187,4.8e-05
1,2,18,1926,2,The Congress,Crimes and Criminal Procedure,16065,75339,0.000124,2.7e-05
2,2,41,1926,1,The Congress,Public Contracts,16065,1657,6.2e-05,0.000604
3,2,44,1926,1,The Congress,Public Printing and Documents,16065,22933,6.2e-05,4.4e-05
4,3,5,1926,5,The President,Government Organization and Employees,4022,62319,0.001243,8e-05


In [7]:
backbone.head()

Unnamed: 0,Citing,Cited,Year,Weight,Citing Title,Cited Title,Citing Word Count,Cited Word Count,Citing/Word Counts,Cited/Word Counts,Citing Group,Cited Group
0,5,1,1926,8,Government Organization and Employees,General Provisions,62319,1003,0.000128,0.007976,Government Structure,Government Structure
1,8,18,1926,9,Aliens and National and Citizenship,Crimes and Criminal Procedure,52213,75339,0.000172,0.000119,Society,Society
2,29,10,1926,6,Labor,Armed Forces,4295,82467,0.001397,7.3e-05,Society,National Defense
3,10,5,1926,8,Armed Forces,Government Organization and Employees,82467,62319,9.7e-05,0.000128,National Defense,Government Structure
4,10,1,1926,13,Armed Forces,General Provisions,82467,1003,0.000158,0.012961,National Defense,Government Structure


In [8]:
cross.info()
backbone.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59155 entries, 0 to 59154
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Citing              59155 non-null  int64  
 1   Cited               59155 non-null  int64  
 2   Year                59155 non-null  int64  
 3   Citation            59155 non-null  int64  
 4   Citing Title        59155 non-null  object 
 5   Cited Title         59155 non-null  object 
 6   Citing Word Count   59155 non-null  int64  
 7   Cited Word Count    59155 non-null  int64  
 8   Citing/Word Counts  59155 non-null  float64
 9   Cited/Word Counts   59155 non-null  float64
dtypes: float64(2), int64(6), object(2)
memory usage: 4.5+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2490 entries, 0 to 2489
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Citing              2490 non-nu

In [9]:
cross_missing = cross.isna().sum().sort_values(ascending=False)
backbone_missing = backbone.isna().sum().sort_values(ascending=False)

print("Cross missing values:\n", cross_missing[cross_missing > 0])
print("\nBackbone missing values:\n", backbone_missing[backbone_missing > 0])

Cross missing values:
 Series([], dtype: int64)

Backbone missing values:
 Series([], dtype: int64)


In [None]:
print("Cross years:", cross["Year"].min(), "to", cross["Year"].max())
print("Backbone years:", backbone["Year"].min(), "to", backbone["Year"].max())

print("\nCross year counts (top 10):")
print(cross["Year"].value_counts().head())

print("\nBackbone year counts (top 10):")
print(backbone["Year"].value_counts().head())

Cross years: 1926 to 2023
Backbone years: 1926 to 2023

Cross year counts (top 10):
Year
2023    1677
2022    1676
2020    1667
2021    1667
2018    1655
Name: count, dtype: int64

Backbone year counts (top 10):
Year
2022    92
2023    92
2020    90
2021    89
2018    80
Name: count, dtype: int64


In [None]:
cross_dupes = cross.duplicated(subset=["Citing", "Cited", "Year"]).sum()
backbone_dupes = backbone.duplicated(subset=["Citing", "Cited", "Year"]).sum()

print("Cross duplicates on (Citing, Cited, Year):", cross_dupes)
print("Backbone duplicates on (Citing, Cited, Year):", backbone_dupes)

Cross duplicates on (Citing, Cited, Year): 0
Backbone duplicates on (Citing, Cited, Year): 0


In [12]:
print("Cross Citation stats:")
print(cross["Citation"].describe())

print("\nBackbone Weight stats:")
print(backbone["Weight"].describe())

Cross Citation stats:
count    59155.000000
mean        39.622568
std         97.675446
min          1.000000
25%          3.000000
50%          9.000000
75%         32.000000
max       2018.000000
Name: Citation, dtype: float64

Backbone Weight stats:
count    2490.000000
mean      335.511245
std       286.428619
min         5.000000
25%       128.250000
50%       272.000000
75%       447.000000
max      2018.000000
Name: Weight, dtype: float64


In [13]:
graph_plan = {
    "Representation A (Full)": {
        "file": "Cross_Reference.csv",
        "source_col": "Citing",
        "target_col": "Cited",
        "weight_col": "Citation",
        "directed": True
    },
    "Representation B (Backbone)": {
        "file": "Backbone_structure.csv",
        "source_col": "Citing",
        "target_col": "Cited",
        "weight_col": "Weight",
        "directed": True
    }
}

graph_plan

{'Representation A (Full)': {'file': 'Cross_Reference.csv',
  'source_col': 'Citing',
  'target_col': 'Cited',
  'weight_col': 'Citation',
  'directed': True},
 'Representation B (Backbone)': {'file': 'Backbone_structure.csv',
  'source_col': 'Citing',
  'target_col': 'Cited',
  'weight_col': 'Weight',
  'directed': True}}