# Graph construction and feature extraction

This notebook builds an undirected co-authorship graph from `trimmed_dataset.csv`.
Nodes: authors (attribute: total_papers).
Edges: co-authorship links (attribute: weight = number of joint publications).

Outputs: an in-memory NetworkX graph object, a node summary CSV, an edge summary CSV, and a PNG visualization of the network.

In [4]:
import pandas as pd
import ast
import re
import networkx as nx
from collections import Counter
import itertools
import matplotlib.pyplot as plt
import os
import warnings
warnings.filterwarnings('ignore')

In [5]:
# Location of the trimmed dataset used to build the co-authorship graph
csv_path = '/mnt/c/Users/Isaac/Documents/Visual Studio Code/GraphMining_Amalzen/trimmed_dataset.csv'

# Raise an explicit exception rather than using `assert`, which is skipped
# entirely when Python runs with -O and would hide a missing input file.
if not os.path.exists(csv_path):
    raise FileNotFoundError(f'File not found: {csv_path}')

# Read the `id` column as text: the identifiers look numeric (the preview shows
# values like 704.0128), so default type inference turns them into floats and
# can drop leading or trailing zeros.
df = pd.read_csv(csv_path, dtype={'id': str})
print('Loaded dataset with shape:', df.shape)
df.head()

Loaded dataset with shape: (166, 15)


Unnamed: 0,abstract,authors,authors_parsed,categories,comments,doi,id,journal-ref,license,report-no,submitter,title,update_date,versions,author_list
0,Context. Swift data are revolutionising our ...,"P.A. Evans (1), A.P. Beardmore (1), K.L. Page ...","[[""Evans"", ""P. A."", """"], [""Beardmore"", ""A. P.""...",astro-ph,"8 pages, 6 figures, Accepted for publication i...",10.1051/0004-6361:20077530,704.0128,,,,Kim Page,An online repository of Swift/XRT light curves...,2009-11-13,"[{""version"": ""v1"", ""created"": ""Mon, 2 Apr 2007...","['P. A. Evans', 'A. P. Beardmore', 'K. L. Page..."
1,The star HE 1305-0007 is a metal-poor double...,"Wen-Yuan Cui (1,2,3), D. N. Cui (1), Y. S. Du ...","[[""Cui"", ""Wen-Yuan"", """"], [""Cui"", ""D. N."", """"]...",astro-ph,"4 pages, 3 figures, paper accepted for publica...",10.1088/0256-307X/24/5/081,704.0576,"Chin.Phys.Lett.24:1417-1421,2007",,,Wenyuan Cui,Neutron-Capture Elements in the Double-Enhance...,2009-06-23,"[{""version"": ""v1"", ""created"": ""Wed, 4 Apr 2007...","['Wen-Yuan Cui', 'D. N. Cui', 'Y. S. Du', 'B. ..."
2,The search for MSSM Higgs bosons will be an ...,"S. Gennai, S. Heinemeyer, A. Kalinowski, R. Ki...","[[""Gennai"", ""S."", """"], [""Heinemeyer"", ""S."", """"...",hep-ph,"24 pages, 8 figures",10.1140/epjc/s10052-007-0398-0,704.0619,"Eur.Phys.J.C52:383-395,2007",,"DCPT/07/12, IPPP/07/06",Sven Heinemeyer,Search for Heavy Neutral MSSM Higgs Bosons wit...,2008-11-26,"[{""version"": ""v1"", ""created"": ""Wed, 4 Apr 2007...","['S. Gennai', 'S. Heinemeyer', 'A. Kalinowski'..."
3,The blazar PKS0537-441 has been observed by ...,"E. Pian (1), P. Romano (2,3), A. Treves (4), G...","[[""Pian"", ""E."", """"], [""Romano"", ""P."", """"], [""T...",astro-ph,"24 pages, 7 figures, 3 tables, in press in the...",10.1086/518469,704.0958,"Astrophys.J.664:106-116,2007",,,Elena Pian,Simultaneous Swift and REM monitoring of the b...,2009-06-23,"[{""version"": ""v1"", ""created"": ""Fri, 6 Apr 2007...","['E. Pian', 'P. Romano', 'A. Treves', 'G. Ghis..."
4,Precise measurements of the single spin asym...,"H. Okada, I. Alekseev, A. Bravar, G. Bunce, S....","[[""Okada"", ""H."", """"], [""Alekseev"", ""I."", """"], ...",hep-ex,4 pages,10.1063/1.2750871,704.1031,"AIP Conf.Proc.915:681-684,2007",,,Hiromi Okada Dr.,Measurements of Single and Double Spin Asymmet...,2010-12-13,"[{""version"": ""v1"", ""created"": ""Sun, 8 Apr 2007...","['H. Okada', 'I. Alekseev', 'A. Bravar', 'G. B..."


In [6]:
# Parse authors from authors_parsed column (structured format)
def extract_authors_parsed(parsed_str):
    """Build display names from one `authors_parsed` cell.

    Parameters
    ----------
    parsed_str : str | list | None
        Usually the string form of a list of ``[lastname, firstname, third]``
        entries as read from the CSV. An already-parsed list/tuple is accepted
        as well. Missing values (NaN / None) yield an empty list.

    Returns
    -------
    list[str]
        One name per usable entry, in input order, formatted as
        ``"first third last"`` with empty components omitted.
        An empty list is returned when the cell is missing or cannot be parsed
        as a Python literal list.

    Notes
    -----
    * Entries that are not sequences of at least two fields are skipped
      individually instead of discarding the whole paper.
    * Non-string components (e.g. ``None``) are treated as empty.
    * NOTE(review): the third field is treated as a middle name here; it may
      actually be a name suffix in the upstream data - confirm. The ordering is
      left unchanged so that author identities stay stable.
    """
    if isinstance(parsed_str, (list, tuple)):
        # Already structured (e.g. data that never went through CSV)
        author_list = parsed_str
    else:
        if pd.isna(parsed_str):
            return []
        try:
            # Parse the string representation of the list
            author_list = ast.literal_eval(parsed_str)
        except (ValueError, SyntaxError, TypeError):
            # Only parsing failures are treated as "no authors"; anything else
            # is a genuine bug and should surface instead of being swallowed.
            return []

    if not isinstance(author_list, (list, tuple)):
        return []

    def _clean(part):
        # Tolerate None / non-string components rather than failing the paper
        return part.strip() if isinstance(part, str) else ""

    full_names = []
    for author in author_list:
        # Must be a sequence with at least lastname and firstname slots
        if not isinstance(author, (list, tuple)) or len(author) < 2:
            continue

        last = _clean(author[0])
        first = _clean(author[1])
        middle = _clean(author[2]) if len(author) > 2 else ""

        # Join only the non-empty components: "first middle last"
        full_name = " ".join(part for part in (first, middle, last) if part)
        if full_name:
            full_names.append(full_name)

    return full_names

# Rebuild the per-paper author lists from the structured authors_parsed column
# (this replaces the author_list column that was loaded from the CSV)
df['author_list'] = df['authors_parsed'].apply(extract_authors_parsed)

# Validation: collect every distinct author name across all papers
all_authors = set()
for authors in df['author_list']:
    all_authors.update(authors)

# Sort once so every listing printed below is reproducible; iterating the set
# directly gives an order that can change between kernel restarts.
sorted_authors = sorted(all_authors)

print(f"\nTotal unique authors found: {len(all_authors)}")
print("\nSample of parsed author names (first 20):")
print('\n'.join(sorted_authors[:20]))

# Basic validation of name formats: flag names with fewer than two
# whitespace-separated tokens, or containing any digit
malformed = [
    name for name in sorted_authors
    if len(name.split()) < 2 or re.search(r'\d', name)
]

if malformed:
    print("\nPotential issues in parsed names:")
    print('\n'.join(malformed))
else:
    print("\nAll parsed names appear to be well-formed.")


Total unique authors found: 2792

Sample of parsed author names (first 20):
--
A. A. Abdo
A. A. Lednev
A. A. Mozhegorov
A. A. Petrov
A. A. Wells
A. Acha
A. Akindinov
A. Alici
A. Alshino
A. Amoroso
A. Andronic
A. Arbuzov
A. Argan
A. B. Gridnev
A. B. Hill
A. B. Kaidalov
A. Badertscher
A. Balanda
A. Baran

Potential issues in parsed names:
Zhaxisangzhu
Labaciren
Krishichayan
Risdiana
--
Danzengluobu
Dinh-V-Trung


In [7]:
# Validation: check for any remaining potential noise in the parsed authors
all_authors = set()
for authors in df['author_list']:
    all_authors.update(authors)

# Work from a sorted list so the printed review list is stable across runs
# (set iteration order is not guaranteed to be the same after a restart).
sorted_authors = sorted(all_authors)

print(f"\nTotal unique authors found: {len(all_authors)}")
print("\nSample of parsed author names (first 20):")
print('\n'.join(sorted_authors[:20]))

# Look for potential remaining noise: single-token names or names with digits
suspicious = [
    name for name in sorted_authors
    if len(name.split()) < 2 or re.search(r'\d', name)
]
if suspicious:
    print("\nPotential noise entries to review:")
    print('\n'.join(suspicious))
else:
    print("\nNo obvious noise entries found in the parsed authors.")


Total unique authors found: 2792

Sample of parsed author names (first 20):
--
A. A. Abdo
A. A. Lednev
A. A. Mozhegorov
A. A. Petrov
A. A. Wells
A. Acha
A. Akindinov
A. Alici
A. Alshino
A. Amoroso
A. Andronic
A. Arbuzov
A. Argan
A. B. Gridnev
A. B. Hill
A. B. Kaidalov
A. Badertscher
A. Balanda
A. Baran

Potential noise entries to review:
Zhaxisangzhu
Labaciren
Krishichayan
Risdiana
--
Danzengluobu
Dinh-V-Trung


In [8]:
# Assemble the undirected co-authorship graph from the per-paper author lists
author_papers = Counter()  # author -> number of papers the author appears on
edge_counts = Counter()    # (author_a, author_b) -> number of shared papers

for paper_authors in df['author_list']:
    # Drop empty names and repeats within one paper, keeping first-seen order
    unique_authors = list(dict.fromkeys(name for name in paper_authors if name))
    # Each author gets exactly one count per paper
    author_papers.update(unique_authors)
    # Sorting first gives every pair a canonical (a, b) key regardless of
    # the order in which the two authors are listed on a paper
    edge_counts.update(itertools.combinations(sorted(unique_authors), 2))

G = nx.Graph()

# Nodes carry total_papers; edges carry weight (number of joint publications)
G.add_nodes_from(
    (author, {'total_papers': int(n_papers)})
    for author, n_papers in author_papers.items()
)
G.add_edges_from(
    (first, second, {'weight': int(n_joint)})
    for (first, second), n_joint in edge_counts.items()
)

print('Graph built: nodes =', G.number_of_nodes(), 'edges =', G.number_of_edges())

Graph built: nodes = 2792 edges = 118561


In [9]:
# Headline statistics for the co-authorship graph
n_nodes, n_edges = G.number_of_nodes(), G.number_of_edges()
density = nx.density(G)
degrees = dict(G.degree())

# Mean degree; reported as 0 for an empty graph to avoid dividing by zero
if n_nodes > 0:
    avg_degree = sum(degrees.values()) / n_nodes
else:
    avg_degree = 0

summary_lines = [
    f'Total nodes: {n_nodes}',
    f'Total edges: {n_edges}',
    f'Density: {density:.6f}',
    f'Average degree: {avg_degree:.3f}',
]
print('\n'.join(summary_lines))

Total nodes: 2792
Total edges: 118561
Density: 0.030430
Average degree: 84.929


In [10]:
# Tabulate nodes and edges for documentation

# One record per author: paper count (0 if the attribute is absent) and degree
node_records = []
for author, attrs in G.nodes(data=True):
    node_records.append({
        'author': author,
        'total_papers': attrs.get('total_papers', 0),
        'degree': G.degree(author),
    })
nodes_df = (
    pd.DataFrame(node_records)
    .sort_values(['total_papers', 'degree'], ascending=[False, False])
    .reset_index(drop=True)
)

# One record per co-authorship edge: weight falls back to 1 if missing
edge_records = []
for u, v, attrs in G.edges(data=True):
    edge_records.append({
        'author_1': u,
        'author_2': v,
        'weight': attrs.get('weight', 1),
    })
edges_df = (
    pd.DataFrame(edge_records)
    .sort_values('weight', ascending=False)
    .reset_index(drop=True)
)

print('Top 10 authors by total_papers:')
display(nodes_df.head(10))

print('Top 10 co-author pairs by joint publications:')
display(edges_df.head(10))

Top 10 authors by total_papers:


Unnamed: 0,author,total_papers,degree
0,N. Gehrels,35,372
1,J. P. Osborne,22,298
2,T. Suzuki,16,299
3,D. N. Burrows,15,193
4,S. Heinemeyer,14,264
5,G. Weiglein,14,263
6,P. Romano,12,168
7,B. Zhang,11,322
8,K. L. Page,11,155
9,G. Chincarini,10,201


Top 10 co-author pairs by joint publications:


Unnamed: 0,author_1,author_2,weight
0,S. Heinemeyer,G. Weiglein,13
1,D. N. Burrows,N. Gehrels,12
2,J. P. Osborne,D. N. Burrows,10
3,J. P. Osborne,N. Gehrels,10
4,P. Romano,S. Campana,9
5,P. Romano,G. Chincarini,9
6,K. L. Page,J. P. Osborne,9
7,G. Tagliaferri,G. Chincarini,9
8,N. Gehrels,P. Romano,9
9,S. Campana,G. Chincarini,8


In [11]:
# Save artifacts: node and edge summary tables as CSV
out_dir = '/mnt/c/Users/Isaac/Documents/Visual Studio Code/GraphMining_Amalzen/outputs/graph_construction_and_feature_extraction'

# Create the output folder if needed; to_csv does not create missing parent
# directories and would otherwise fail on a fresh checkout.
os.makedirs(out_dir, exist_ok=True)

out_nodes = f'{out_dir}/nodes_summary.csv'
out_edges = f'{out_dir}/edges_summary.csv'
nodes_df.to_csv(out_nodes, index=False)
edges_df.to_csv(out_edges, index=False)
print('Saved:', out_nodes, out_edges)

Saved: /mnt/c/Users/Isaac/Documents/Visual Studio Code/GraphMining_Amalzen/outputs/graph_construction_and_feature_extraction/nodes_summary.csv /mnt/c/Users/Isaac/Documents/Visual Studio Code/GraphMining_Amalzen/outputs/graph_construction_and_feature_extraction/edges_summary.csv


In [12]:
# Create an enhanced network visualization
# Use an explicit figure/axes pair so nothing depends on pyplot's implicit
# "current figure" state left over from other cells.
fig, ax = plt.subplots(figsize=(20, 16))

# Node sizes scale linearly with degree: 50 for degree 0 up to 350 for the
# highest-degree node.
node_degrees = dict(G.degree())
# `default=0` covers an empty graph; `or 1` avoids dividing by zero when every
# node is isolated (all degrees are 0).
max_degree = max(node_degrees.values(), default=0) or 1
node_size = [((node_degrees[node] / max_degree) * 300 + 50) for node in G.nodes()]

# Use the same layout parameters as the Louvain visualization for consistency
# (fixed seed keeps the layout reproducible between runs)
print("Calculating layout...")
pos = nx.spring_layout(G, k=0.5, iterations=50, seed=42)

# Draw edges with improved visibility (matching Louvain style)
nx.draw_networkx_edges(G, pos,
                       alpha=0.2,
                       width=0.5,
                       edge_color='gray',
                       ax=ax)

# Draw nodes with better visibility and styling
nx.draw_networkx_nodes(G, pos,
                       node_size=node_size,
                       node_color='steelblue',
                       alpha=0.8,
                       edgecolors='black',
                       linewidths=0.5,
                       ax=ax)

# Label only the 10 highest-degree authors to keep the figure readable
top_authors = sorted(G.degree(), key=lambda x: x[1], reverse=True)[:10]
labels = {node: node for node, degree in top_authors}
nx.draw_networkx_labels(G, pos, labels,
                        font_size=8,
                        font_weight='bold',
                        bbox=dict(facecolor='white',
                                  alpha=0.7,
                                  edgecolor='none',
                                  pad=0.5),
                        ax=ax)

# Add title and styling
ax.set_title('Co-authorship Network\n',
             fontsize=16,
             fontweight='bold')

# Add legend-like text for interpretation
fig.text(0.02, 0.02,
         'Node size: Proportional to number of collaborations\n' +
         'Node color: Author (sized by collaboration count)\n' +
         'Edge: Represents co-authorship between authors',
         fontsize=10,
         bbox=dict(facecolor='white', alpha=0.9, edgecolor='gray'))

ax.axis('off')
fig.tight_layout()

# Save the plot with high resolution
output_path = '/mnt/c/Users/Isaac/Documents/Visual Studio Code/GraphMining_Amalzen/outputs/graph_construction_and_feature_extraction/coauthorship_network.png'
# savefig does not create missing parent directories
os.makedirs(os.path.dirname(output_path), exist_ok=True)
fig.savefig(output_path,
            dpi=300,
            bbox_inches='tight',
            facecolor='white')
# Close this specific figure to release its memory
plt.close(fig)
print(f'✓ Saved network visualization: {output_path}')

Calculating layout...
✓ Saved network visualization: /mnt/c/Users/Isaac/Documents/Visual Studio Code/GraphMining_Amalzen/outputs/graph_construction_and_feature_extraction/coauthorship_network.png
✓ Saved network visualization: /mnt/c/Users/Isaac/Documents/Visual Studio Code/GraphMining_Amalzen/outputs/graph_construction_and_feature_extraction/coauthorship_network.png


## Notes
- Authors are parsed from the structured `authors_parsed` column which contains [lastname, firstname, middle] entries
- Names are normalized to "Firstname [Middle] Lastname" format for consistency across all notebooks
- This matches the format used in centrality and community detection analyses
- The graph maintains the same author identities as other notebooks, enabling direct comparison of results