## Import dependencies

In [1]:
import pandas as pd
import sqlite3

## Loading Citation-Paper Datasets

In [5]:
# Connect to the SQLite database file
conn = sqlite3.connect("data/openalex_works-13m.db")

In [6]:
### Table Name = 'works'
# Read the SQL query results directly into a pandas DataFrame
df = pd.read_sql_query("""
    SELECT *
    FROM works
    """,
    conn
)
print(df.head())

            id                            doi  apc_list_price   topic  \
0  W1775749144  10.1016/s0021-9258(19)52451-6          2500.0  T10602   
1  W2582743722                           None             NaN  T10538   
2  W2100837269               10.1038/227680a0         11690.0  T11048   
3  W2128635872         10.1006/abio.1976.9999          3320.0  T10333   
4  W4293247451   10.1016/0003-2697(76)90527-3          3320.0  T10333   

   referenced_works_count                                   referenced_works  \
0                      20  ["W1507976594", "W1515052776", "W1535643256", ...   
1                       0                                                 []   
2                      21  ["W1510011675", "W1561690684", "W1591624429", ...   
3                      19  ["W1506043065", "W1532876153", "W1568632913", ...   
4                      19  ["W1506043065", "W1568632913", "W1775749144", ...   

                                             authors  cited_by_count  \
0  ["A51

In [24]:
print(type(df["referenced_works"][0]))
print(type(df["referenced_works_count"][0]))
print(type(df["authors"][0]))
print("Number of topics:", len(set(df["topic"])))

df.info()

<class 'str'>
<class 'numpy.int64'>
<class 'str'>
Number of topics: 4513
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13825424 entries, 0 to 13825423
Data columns (total 9 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   id                      object 
 1   doi                     object 
 2   apc_list_price          float64
 3   topic                   object 
 4   referenced_works_count  int64  
 5   referenced_works        object 
 6   authors                 object 
 7   cited_by_count          int64  
 8   publication_date        object 
dtypes: float64(1), int64(2), object(6)
memory usage: 949.3+ MB


## Filter Paper Data based on Topics

In [16]:
target_topic = "T10181"

topic_df = df[df['topic'] == target_topic]

In [17]:
print(f"Number of works in topic {target_topic}: {len(topic_df)}")
print(topic_df.head())
topic_df.info()

Number of works in topic T10181: 11401
              id                        doi  apc_list_price   topic  \
57   W4385245566  10.48550/arxiv.1706.03762             NaN  T10181   
315  W1880262756      10.5555/944919.944937             NaN  T10181   
448  W2157331557        10.3115/v1/d14-1179             NaN  T10181   
517  W2963403868                       None             NaN  T10181   
568  W2101105183    10.3115/1073083.1073135             NaN  T10181   

     referenced_works_count  \
57                        0   
315                      28   
448                      31   
517                       0   
568                       3   

                                      referenced_works  \
57                                                  []   
315  ["W125892352", "W1508165687", "W1516111018", "...   
448  ["W1490600648", "W1606347560", "W1675954498", ...   
517                                                 []   
568      ["W2001810881", "W2732923061", "W3037252522"]   

## Build Paper Network

In [18]:
import ast
import networkx as nx
import pandas as pd

# Don't import cudf/cugraph here - we'll do it later with proper GPU detection
# Build a citation edge list from the existing `df`, run NetworkX (CPU) and cuGraph (GPU if available),
# compute PageRank and degree, and show top results.
import matplotlib.pyplot as plt

# 1) Parse referenced_works into Python lists (handles already-list objects or JSON-like strings)
def parse_list_cell(x):
    if x is None:
        return []
    if isinstance(x, (list, tuple)):
        return list(x)
    try:
        return ast.literal_eval(x)
    except Exception:
        # fallback: try to strip and split (very defensive)
        s = str(x).strip()
        if s.startswith("[") and s.endswith("]"):
            s = s[1:-1].strip()
        if not s:
            return []
        return [e.strip().strip('"').strip("'") for e in s.split(",") if e.strip()]
    
topic_df['referenced_works_parsed'] = topic_df['referenced_works'].apply(parse_list_cell)

# 2) Create an edge list DataFrame (source -> referenced_work)
edges = []
for src, refs in zip(topic_df['id'], topic_df['referenced_works_parsed']):
    for dst in refs:
        # ignore empty ids
        if dst:
            edges.append((src, dst))

edges_df = pd.DataFrame(edges, columns=['source', 'target'])
print(f"Built {len(edges_df)} edges")

Built 299526 edges


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  topic_df['referenced_works_parsed'] = topic_df['referenced_works'].apply(parse_list_cell)


## Running PageRank on Network

In [8]:
# 3) NetworkX (CPU) graph and analysis
G = nx.DiGraph()
G.add_edges_from(edges_df.itertuples(index=False, name=None))
print("NetworkX nodes, edges:", G.number_of_nodes(), G.number_of_edges())

# Basic metrics: degree and PageRank
nx_in_deg = dict(G.in_degree())
nx_out_deg = dict(G.out_degree())
nx_pagerank = nx.pagerank(G, alpha=0.85)

# Show top-10 by PageRank
top_nx_pr = sorted(nx_pagerank.items(), key=lambda kv: kv[1], reverse=True)[:10]
print("Top NetworkX PageRank (top 10):")
for node, score in top_nx_pr:
    print(node, score)

# Optional: small layout plot (only if graph small)
if G.number_of_nodes() <= 200:
    plt.figure(figsize=(8, 6))
    pos = nx.spring_layout(G, k=0.5, seed=42)
    nx.draw_networkx_nodes(G, pos, node_size=20)
    nx.draw_networkx_edges(G, pos, alpha=0.3)
    plt.title("Citation network (NetworkX)")
    plt.axis('off')
    plt.show()

NetworkX nodes, edges: 320699 819037
Top NetworkX PageRank (top 10):
W1775749144 0.00010770610030505522
W2100837269 8.925982623963923e-05
W1580023435 4.82279780686072e-05
W2152708819 3.3096220141780066e-05
W2129232972 3.0398343010996745e-05
W4285719527 2.9925987394631795e-05
W1536299500 2.9678960593807327e-05
W1968834637 2.9542463783937527e-05
W2018289835 2.7365750425364704e-05
W2138270253 2.500389092530062e-05


In [19]:
# 4) cuGraph (GPU) - run if available
import cudf
import cugraph as cg

use_cugraph = True

if use_cugraph and len(edges_df) > 0:
    # convert pandas edges_df to cudf
    cudf_edges = cudf.from_pandas(edges_df.astype(str))  # ensure string dtype for ids
    # cuGraph expects numeric vertex IDs for many algorithms; create a mapping
    unique_vertices = cudf.concat([cudf_edges['source'], cudf_edges['target']]).unique()
    vid_df = unique_vertices.reset_index(drop=True).to_frame(name='vertex_id')
    vid_df['int_id'] = cudf.RangeIndex(len(vid_df))
    # join to get numeric src/dst columns
    cudf_edges = cudf_edges.merge(vid_df, left_on='source', right_on='vertex_id', how='left').rename(columns={'int_id':'src_id'}).drop(columns=['vertex_id'])
    cudf_edges = cudf_edges.merge(vid_df, left_on='target', right_on='vertex_id', how='left').rename(columns={'int_id':'dst_id'}).drop(columns=['vertex_id'])
    # build cuGraph Graph with store_transposed=True for optimal PageRank performance
    G_gpu = cg.Graph(directed=True)
    # from_cudf_edgelist: graph.from_cudf_edgelist(cudf_edges, source='src_id', destination='dst_id', edge_attr=None)
    try:
        G_gpu.from_cudf_edgelist(cudf_edges, source='src_id', destination='dst_id', edge_attr=None)
    except Exception:
        # older API: G_gpu.add_edge_list(...)
        G_gpu.add_edge_list(cudf_edges['src_id'], cudf_edges['dst_id'], None)
    print("cuGraph nodes, edges (approx):", G_gpu.number_of_vertices(), G_gpu.number_of_edges())

    # PageRank on GPU (returns cudf DataFrame with vertex and pagerank)
    try:
        pr_df = cg.pagerank(G_gpu)
    except Exception:
        # alternate signature
        pr_df = cg.pagerank(G_gpu, alpha=0.85)
    # pr_df has numeric vertex ids; join back to original ids
    pr_df = pr_df.merge(vid_df, left_on='vertex', right_on='int_id', how='left')
    top_gpu_pr = pr_df.sort_values('pagerank', ascending=False).head(10)
    print("Top cuGraph PageRank (top 10):")
    print(top_gpu_pr[['vertex_id', 'pagerank']].to_pandas())

    # Degree (in-degree) using cuGraph Graph methods
    try:
        deg_df = G_gpu.in_degree()
        deg_df = deg_df.merge(vid_df, left_on='vertex', right_on='int_id', how='left')
        print("\nTop cuGraph in-degree (top 10):")
        print(deg_df.sort_values('degree', ascending=False).head(10)[['vertex_id','degree']].to_pandas())
    except Exception as e:
        print(f"cuGraph degree computation failed: {e}")
else:
    print("â„¹ Skipping cuGraph analysis (GPU not available or no edges).")
    print("  NetworkX results above are still valid for CPU-based analysis.")

cuGraph nodes, edges (approx): 106070 299526
Top cuGraph PageRank (top 10):
         vertex_id  pagerank
14736  W2101105183  0.000526
14739  W1632114991  0.000417
14737  W4285719527  0.000369
14749  W2006969979  0.000359
14738  W2964308564  0.000300
2531   W2153653739  0.000230
14751  W2038721957  0.000223
14750  W2124807415  0.000202
14741  W2133564696  0.000196
10267  W2061271742  0.000193

Top cuGraph in-degree (top 10):
         vertex_id  degree
16352  W2101105183    1013
16353  W4285719527     975
16354  W2964308564     792
16355  W1632114991     680
16356  W2962784628     632
16357  W2133564696     622
16358  W4385245566     614
16359  W2963403868     573
16360  W2130942839     550
16361  W2064675550     504


