# Graph construction and feature extraction

This notebook builds an undirected co-authorship graph from `trimmed_dataset.csv`.
Nodes: authors (attribute: total_papers).
Edges: co-authorship links (attribute: weight = number of joint publications).

Outputs: graph object (NetworkX), node summary CSV, edge summary CSV, and a .png for visualization.

In [41]:
import pandas as pd
import networkx as nx
from collections import Counter
import itertools
import matplotlib.pyplot as plt
import os
import warnings
warnings.filterwarnings('ignore')

In [None]:
csv_path = 'trimmed_dataset.csv'
assert os.path.exists(csv_path), f'File not found: {csv_path}'
df = pd.read_csv(csv_path)
print('Loaded dataset with shape:', df.shape)
df.head()

Loaded dataset with shape: (166, 15)


Unnamed: 0,abstract,authors,authors_parsed,categories,comments,doi,id,journal-ref,license,report-no,submitter,title,update_date,versions,author_list
0,Context. Swift data are revolutionising our ...,"P.A. Evans (1), A.P. Beardmore (1), K.L. Page ...","[[""Evans"", ""P. A."", """"], [""Beardmore"", ""A. P.""...",astro-ph,"8 pages, 6 figures, Accepted for publication i...",10.1051/0004-6361:20077530,704.0128,,,,Kim Page,An online repository of Swift/XRT light curves...,2009-11-13,"[{""version"": ""v1"", ""created"": ""Mon, 2 Apr 2007...","['P. A. Evans', 'A. P. Beardmore', 'K. L. Page..."
1,The star HE 1305-0007 is a metal-poor double...,"Wen-Yuan Cui (1,2,3), D. N. Cui (1), Y. S. Du ...","[[""Cui"", ""Wen-Yuan"", """"], [""Cui"", ""D. N."", """"]...",astro-ph,"4 pages, 3 figures, paper accepted for publica...",10.1088/0256-307X/24/5/081,704.0576,"Chin.Phys.Lett.24:1417-1421,2007",,,Wenyuan Cui,Neutron-Capture Elements in the Double-Enhance...,2009-06-23,"[{""version"": ""v1"", ""created"": ""Wed, 4 Apr 2007...","['Wen-Yuan Cui', 'D. N. Cui', 'Y. S. Du', 'B. ..."
2,The search for MSSM Higgs bosons will be an ...,"S. Gennai, S. Heinemeyer, A. Kalinowski, R. Ki...","[[""Gennai"", ""S."", """"], [""Heinemeyer"", ""S."", """"...",hep-ph,"24 pages, 8 figures",10.1140/epjc/s10052-007-0398-0,704.0619,"Eur.Phys.J.C52:383-395,2007",,"DCPT/07/12, IPPP/07/06",Sven Heinemeyer,Search for Heavy Neutral MSSM Higgs Bosons wit...,2008-11-26,"[{""version"": ""v1"", ""created"": ""Wed, 4 Apr 2007...","['S. Gennai', 'S. Heinemeyer', 'A. Kalinowski'..."
3,The blazar PKS0537-441 has been observed by ...,"E. Pian (1), P. Romano (2,3), A. Treves (4), G...","[[""Pian"", ""E."", """"], [""Romano"", ""P."", """"], [""T...",astro-ph,"24 pages, 7 figures, 3 tables, in press in the...",10.1086/518469,704.0958,"Astrophys.J.664:106-116,2007",,,Elena Pian,Simultaneous Swift and REM monitoring of the b...,2009-06-23,"[{""version"": ""v1"", ""created"": ""Fri, 6 Apr 2007...","['E. Pian', 'P. Romano', 'A. Treves', 'G. Ghis..."
4,Precise measurements of the single spin asym...,"H. Okada, I. Alekseev, A. Bravar, G. Bunce, S....","[[""Okada"", ""H."", """"], [""Alekseev"", ""I."", """"], ...",hep-ex,4 pages,10.1063/1.2750871,704.1031,"AIP Conf.Proc.915:681-684,2007",,,Hiromi Okada Dr.,Measurements of Single and Double Spin Asymmet...,2010-12-13,"[{""version"": ""v1"", ""created"": ""Sun, 8 Apr 2007...","['H. Okada', 'I. Alekseev', 'A. Bravar', 'G. B..."


In [None]:
# Parse authors into a list per row with validation rules
# The `authors` column contains comma-separated author names like:
# 'C. Balazs, E. L. Berger, P. M. Nadolsky'
import re

def is_valid_author(name):
    """Check if a string looks like a valid author name."""
    # Common noise patterns to filter out
    noise_patterns = [
        r'^et al\.?$',  # "et al" or "et al."
        r'^[0-9\(\)]+$',  # Just numbers or parentheses
        r'^[A-Z][A-Z]+$',  # All caps words (likely institutions)
        r'^The\s+.*\s+Collaboration$',  # Collaboration names
        r'^.*\d{4}.*$',  # Anything containing a year
        r'^(Germany|France|Italy|USA|UK|Japan|China)$',  # Common country names
        r'^(University|Institute|Lab|Laboratory).*$',  # Institution starts
        r'^.*\((ed|eds|editor|editors)\)$',  # Editor markers
        r'^\s*$',  # Empty or whitespace
    ]
    
    # If it matches any noise pattern, it's not valid
    for pattern in noise_patterns:
        if re.match(pattern, name, re.IGNORECASE):
            return False
    
    # Basic author name pattern (allowing initials, names with spaces)
    valid_patterns = [
        # Initial(s) + Last name: "A. B. Smith" or "A.B. Smith"
        r'^[A-Z]\.(\s*[A-Z]\.)*\s+[A-Z][a-zA-Z\'\-]+$',
        # Full name patterns: "John Smith" or "John A. Smith"
        r'^[A-Z][a-z]+(\s+[A-Z]\.)*\s+[A-Z][a-zA-Z\'\-]+$',
        # Last, First format: "Smith, John A."
        r'^[A-Z][a-zA-Z\'\-]+,\s*[A-Z][a-zA-Z\'\-]+(\s+[A-Z]\.)*$',
        # von/van/de patterns: "von Neumann" or "de Broglie"
        r'^(von|van|de|del|della|der|den|di|le|la)\s+[A-Z][a-zA-Z\'\-]+$'
    ]
    
    # Must match at least one valid pattern
    return any(re.match(pattern, name.strip()) for pattern in valid_patterns)

def parse_authors(s):
    if pd.isna(s):
        return []
    # Split on comma and clean each part
    parts = [p.strip() for p in s.split(',') if p.strip()]
    # Keep only valid author names
    valid_authors = [author for author in parts if is_valid_author(author)]
    return valid_authors

# Apply the improved parsing
df['author_list'] = df['authors'].astype(str).apply(parse_authors)

In [44]:
# Validation: check for any remaining potential noise in the parsed authors
all_authors = set()
for authors in df['author_list']:
    all_authors.update(authors)

print(f"\nTotal unique authors found: {len(all_authors)}")
print("\nSample of parsed author names (first 20):")
print('\n'.join(sorted(list(all_authors))[:20]))

# Look for potential remaining noise (very short or suspicious patterns)
suspicious = [name for name in all_authors if len(name.split()) < 2 or re.search(r'\d', name)]
if suspicious:
    print("\nPotential noise entries to review:")
    print('\n'.join(suspicious))
else:
    print("\nNo obvious noise entries found in the parsed authors.")


Total unique authors found: 2408

Sample of parsed author names (first 20):
A.
  Balanda
A.
  Bevan
A.
  Bueno
A.
  Caccianiga
A.
  Ceccucci
A.
  Chieffi
A.
  Evans
A.
  Falcone
A.
  Ferrero
A.
  Gillitzer
A.
  Khodjamirian
A.
  Kyriakis
A.
  Latina
A.
  Lenz
A.
  Mazure
A.
  Mizuno
A.
  Moretti
A.
  Nagaytsev
A.
  Nikitenko
A.
  Onofre

No obvious noise entries found in the parsed authors.


In [45]:
# Build undirected co-authorship graph
G = nx.Graph()
author_papers = Counter()  # counts how many papers each author has
edge_counts = Counter()    # counts joint publications between two authors

for authors in df['author_list']:
    # use unique authors for the paper (in case of duplicates)
    unique_authors = list(dict.fromkeys([a for a in authors if a]))
    # increment paper count for each author (one per paper)
    for a in unique_authors:
        author_papers[a] += 1
    # add co-authorship edges for every pair in this paper
    for a, b in itertools.combinations(sorted(unique_authors), 2):
        edge_counts[(a, b)] += 1

# Add nodes with total_papers attribute
for author, count in author_papers.items():
    G.add_node(author, total_papers=int(count))

# Add edges with weight attribute (number of joint publications)
for (a, b), w in edge_counts.items():
    G.add_edge(a, b, weight=int(w))

print('Graph built: nodes =', G.number_of_nodes(), 'edges =', G.number_of_edges())

Graph built: nodes = 2408 edges = 91193


In [46]:
# Compute basic graph statistics
n_nodes = G.number_of_nodes()
n_edges = G.number_of_edges()
density = nx.density(G)
degrees = dict(G.degree())
avg_degree = sum(degrees.values()) / n_nodes if n_nodes > 0 else 0

print(f'Total nodes: {n_nodes}')
print(f'Total edges: {n_edges}')
print(f'Density: {density:.6f}')
print(f'Average degree: {avg_degree:.3f}')

Total nodes: 2408
Total edges: 91193
Density: 0.031467
Average degree: 75.742


In [47]:
# Create summary tables for documentation
nodes_df = pd.DataFrame([{'author': n, 'total_papers': G.nodes[n].get('total_papers', 0), 'degree': G.degree(n)} for n in G.nodes()])
nodes_df = nodes_df.sort_values(['total_papers','degree'], ascending=[False, False]).reset_index(drop=True)

edges_df = pd.DataFrame([{'author_1': u, 'author_2': v, 'weight': d.get('weight',1)} for u,v,d in G.edges(data=True)])
edges_df = edges_df.sort_values('weight', ascending=False).reset_index(drop=True)

print('Top 10 authors by total_papers:')
display(nodes_df.head(10))

print('Top 10 co-author pairs by joint publications:')
display(edges_df.head(10))

Top 10 authors by total_papers:


Unnamed: 0,author,total_papers,degree
0,N. Gehrels,24,315
1,S. Heinemeyer,14,257
2,T. Suzuki,14,208
3,G. Weiglein,11,248
4,J. P. Osborne,8,174
5,H. A. Krimm,7,132
6,G. Chincarini,6,178
7,T. Saito,6,146
8,P. Giommi,6,88
9,T. Goto,5,224


Top 10 co-author pairs by joint publications:


Unnamed: 0,author_1,author_2,weight
0,S. Heinemeyer,G. Weiglein,11
1,N. Gehrels,H. A. Krimm,6
2,J. Tueller,C. B. Markwardt,5
3,G. Chincarini,G. Tagliaferri,5
4,J. Chiba,T. Ishikawa,4
5,T. Ishiwatari,K. Itahashi,4
6,T. Hanaki,M. Sato,4
7,T. Hanaki,M. Iwasaki,4
8,T. Hanaki,M. Iio,4
9,D. Tomono,E. Widmann,4


In [None]:
# Save artifacts: summaries and graph file for visualization
out_nodes = '/outputs/graph_construction_and_feature_extraction/nodes_summary.csv'
out_edges = '/outputs/graph_construction_and_feature_extraction/edges_summary.csv'
nodes_df.to_csv(out_nodes, index=False)
edges_df.to_csv(out_edges, index=False)
print('Saved:', out_nodes, out_edges)

In [None]:
# Set up the plot with a decent size
plt.figure(figsize=(15, 15))

# Use a spring layout for node positioning
# Scale by node degree to emphasize important authors
node_size = [G.degree(node) * 100 for node in G.nodes()]
pos = nx.spring_layout(G, k=1, iterations=50)

# Draw the network
nx.draw_networkx_nodes(G, pos, node_size=node_size, alpha=0.6, 
                      node_color='lightblue')
nx.draw_networkx_edges(G, pos, alpha=0.2, width=0.5)

# Add labels only for top authors (to avoid cluttering)
top_authors = nodes_df.head(10)['author'].tolist()
labels = {node: node if node in top_authors else '' for node in G.nodes()}
nx.draw_networkx_labels(G, pos, labels, font_size=8)

plt.title('Co-authorship Network\nNode size represents number of collaborations\nLabels shown for top 10 authors')
plt.axis('off')

# Save the plot
out_png = '/outputs/coauthorship_network.png'
plt.savefig(out_png, dpi=300, bbox_inches='tight')
plt.close()
print('Saved network visualization:', out_png)

## Notes
- The author parsing strategy here is a conservative split on commas. Does not take into account different format (e.g., `Last, First`).