In [2]:
import pandas as pd

# Import the four dataframes
df_artists = pd.read_csv('/Users/jamesemcnally/Documents/GitHub/spring-2026-hitmakers/df_artists.csv')
df_songs = pd.read_csv('/Users/jamesemcnally/Documents/GitHub/spring-2026-hitmakers/df_songs.csv')
df_albums = pd.read_csv('/Users/jamesemcnally/Documents/GitHub/spring-2026-hitmakers/df_albums.csv')
df_artists_top_10_songs_only = pd.read_csv('/Users/jamesemcnally/Documents/GitHub/spring-2026-hitmakers/df_artists_top_10_songs_only.csv')

# Display basic info about each dataframe
print(f"df_artists shape: {df_artists.shape}")
print(f"df_songs shape: {df_songs.shape}")
print(f"df_albums shape: {df_albums.shape}")
print(f"df_artists_top_10_songs_only shape: {df_artists_top_10_songs_only.shape}")


df_artists shape: (14226, 54)
df_songs shape: (5099, 16)
df_albums shape: (4569, 16)
df_artists_top_10_songs_only shape: (2420, 54)


In [12]:
import psycopg2

DB_PARAMS = {
    'dbname': 'postgres',  # Connect to default postgres database
    'user': 'musicbrainz',
    'password': 'musicbrainz',
    'host': 'localhost',
    'port': 5432
}

try:
    conn = psycopg2.connect(**DB_PARAMS)
    conn.autocommit = True
    
    with conn.cursor() as cur:
        # List all databases
        cur.execute("""
            SELECT datname 
            FROM pg_database 
            WHERE datistemplate = false
            ORDER BY datname
        """)
        databases = cur.fetchall()
        
        print("Available databases:")
        for db in databases:
            print(f"  {db[0]}")
    
    conn.close()
except Exception as e:
    print(f"Error: {e}")


Available databases:
  musicbrainz
  musicbrainz_db
  postgres


In [16]:
import pandas as pd
import psycopg2
from tqdm import tqdm

# Database connection parameters
DB_PARAMS = {
    'dbname': 'musicbrainz_db',
    'user': 'musicbrainz',
    'password': 'musicbrainz',
    'host': 'localhost',
    'port': 5432
}

print("Step 1: Loading all dataframes...")
df_artists = pd.read_csv('/Users/jamesemcnally/Documents/GitHub/spring-2026-hitmakers/df_artists.csv')
df_songs = pd.read_csv('/Users/jamesemcnally/Documents/GitHub/spring-2026-hitmakers/df_songs.csv')
df_albums = pd.read_csv('/Users/jamesemcnally/Documents/GitHub/spring-2026-hitmakers/df_albums.csv')
df_artists_top_10_songs_only = pd.read_csv('/Users/jamesemcnally/Documents/GitHub/spring-2026-hitmakers/df_artists_top_10_songs_only.csv')

print(f"  df_artists: {df_artists.shape}")
print(f"  df_songs: {df_songs.shape}")
print(f"  df_albums: {df_albums.shape}")
print(f"  df_artists_top_10_songs_only: {df_artists_top_10_songs_only.shape}")

# Step 2: Get all unique artist names
print("\nStep 2: Collecting unique artist names...")
all_artist_names = set()

if 'performer_normalized' in df_artists.columns:
    all_artist_names.update(df_artists['performer_normalized'].dropna().unique())

if 'performer_normalized' in df_songs.columns:
    all_artist_names.update(df_songs['performer_normalized'].dropna().unique())

if 'performer_normalized' in df_albums.columns:
    all_artist_names.update(df_albums['performer_normalized'].dropna().unique())

if 'performer_normalized' in df_artists_top_10_songs_only.columns:
    all_artist_names.update(df_artists_top_10_songs_only['performer_normalized'].dropna().unique())

all_artist_names = list(all_artist_names)
print(f"  Found {len(all_artist_names)} unique artist names to look up")

# Step 3: Query MusicBrainz database for artist IDs in batches
print("\nStep 3: Querying MusicBrainz database for artist IDs (batch mode)...")

artist_name_to_id = {}
batch_size = 1000  # Process 1000 names at a time

conn = psycopg2.connect(**DB_PARAMS)
conn.autocommit = True

num_batches = (len(all_artist_names) + batch_size - 1) // batch_size

with conn.cursor() as cur:
    for i in tqdm(range(num_batches), desc="Processing batches"):
        batch_names = all_artist_names[i * batch_size:(i + 1) * batch_size]
        
        # Single query for the entire batch using unnest and join
        query = """
            SELECT DISTINCT ON (LOWER(input_name)) 
                input_name, 
                artist.id, 
                artist.name
            FROM unnest(%s::text[]) AS input_name
            LEFT JOIN artist ON LOWER(artist.name) = LOWER(input_name)
            WHERE artist.id IS NOT NULL
        """
        
        cur.execute(query, (batch_names,))
        results = cur.fetchall()
        
        # Map results
        for input_name, artist_id, matched_name in results:
            artist_name_to_id[input_name] = str(artist_id)

conn.close()

# Find which artists weren't matched
not_found = [name for name in all_artist_names if name not in artist_name_to_id]

print(f"  Successfully matched: {len(artist_name_to_id):,} artists")
print(f"  Not found in MusicBrainz: {len(not_found):,} artists")

if not_found:
    print(f"\nSample of artists not found (first 10):")
    for name in not_found[:10]:
        print(f"    - {name}")

# Step 4: Add musicbrainz_artist_id column to all dataframes
print("\nStep 4: Adding musicbrainz_artist_id column to all dataframes...")

def add_mb_id_column(df, name):
    if 'performer_normalized' in df.columns:
        df['musicbrainz_artist_id'] = df['performer_normalized'].map(artist_name_to_id)
        matched = df['musicbrainz_artist_id'].notna().sum()
        total = len(df)
        print(f"  {name}: {matched:,}/{total:,} rows matched ({matched/total*100:.1f}%)")
    else:
        print(f"  {name}: No 'performer_normalized' column found")
    return df

df_artists = add_mb_id_column(df_artists, 'df_artists')
df_songs = add_mb_id_column(df_songs, 'df_songs')
df_albums = add_mb_id_column(df_albums, 'df_albums')
df_artists_top_10_songs_only = add_mb_id_column(df_artists_top_10_songs_only, 'df_artists_top_10_songs_only')

# Step 5: Save updated dataframes
print("\nStep 5: Saving updated dataframes...")
df_artists.to_csv('/Users/jamesemcnally/Documents/GitHub/spring-2026-hitmakers/df_artists.csv', index=False)
print("  ✓ Saved df_artists.csv")

df_songs.to_csv('/Users/jamesemcnally/Documents/GitHub/spring-2026-hitmakers/df_songs.csv', index=False)
print("  ✓ Saved df_songs.csv")

df_albums.to_csv('/Users/jamesemcnally/Documents/GitHub/spring-2026-hitmakers/df_albums.csv', index=False)
print("  ✓ Saved df_albums.csv")

df_artists_top_10_songs_only.to_csv('/Users/jamesemcnally/Documents/GitHub/spring-2026-hitmakers/df_artists_top_10_songs_only.csv', index=False)
print("  ✓ Saved df_artists_top_10_songs_only.csv")

print("\n" + "="*80)
print("COMPLETE: All dataframes now have 'musicbrainz_artist_id' column")
print("="*80)


Step 1: Loading all dataframes...
  df_artists: (14226, 54)
  df_songs: (5099, 16)
  df_albums: (4569, 16)
  df_artists_top_10_songs_only: (2420, 54)

Step 2: Collecting unique artist names...
  Found 14226 unique artist names to look up

Step 3: Querying MusicBrainz database for artist IDs (batch mode)...


Processing batches: 100%|██████████| 15/15 [00:27<00:00,  1.81s/it]

  Successfully matched: 11,564 artists
  Not found in MusicBrainz: 2,662 artists

Sample of artists not found (first 10):
    - tsol
    - the dropkick murphys
    - auli'i cravalho
    - beele
    - k.w.s.
    - danyel gerard
    - persuasions
    - '68
    - 21 savage & tyler, the creator
    - steve stevens atomic playboys

Step 4: Adding musicbrainz_artist_id column to all dataframes...
  df_artists: 11,564/14,226 rows matched (81.3%)
  df_songs: 4,500/5,099 rows matched (88.3%)
  df_albums: 4,181/4,569 rows matched (91.5%)
  df_artists_top_10_songs_only: 2,041/2,420 rows matched (84.3%)

Step 5: Saving updated dataframes...
  ✓ Saved df_artists.csv
  ✓ Saved df_songs.csv
  ✓ Saved df_albums.csv
  ✓ Saved df_artists_top_10_songs_only.csv

COMPLETE: All dataframes now have 'musicbrainz_artist_id' column





In [25]:
import pandas as pd
import psycopg2
from psycopg2 import sql
from tqdm import tqdm
import os
from pathlib import Path

# Database connection parameters
DB_PARAMS = {
    'dbname': 'musicbrainz_db',
    'user': 'musicbrainz',
    'password': 'musicbrainz',
    'host': 'localhost',
    'port': 5432
}

# ============================================================================
# LOAD BILLBOARD ARTIST IDs
# ============================================================================

print("Loading Billboard artist IDs...")
df_artists_top_10 = pd.read_csv('/Users/jamesemcnally/Documents/GitHub/spring-2026-hitmakers/df_artists_top_10_songs_only.csv')

# Get artist IDs and filter out NaN values, convert to integers
artist_ids = df_artists_top_10['musicbrainz_artist_id'].dropna().astype(int).unique().tolist()

print(f"Total artists in df_artists_top_10_songs_only: {len(df_artists_top_10)}")
print(f"Artists with MusicBrainz IDs: {len(artist_ids)}")
print(f"Artists without MusicBrainz IDs: {df_artists_top_10['musicbrainz_artist_id'].isna().sum()}")

# ============================================================================
# PHASE 1: Extract Master Edge List
# ============================================================================

print("\n" + "="*80)
print("PHASE 1: Extracting Master Edge List from MusicBrainz")
print("="*80)

def extract_master_edge_list(artist_ids, batch_size=500):
    """
    Extract all collaboration edges for the given artist IDs.
    Uses batching to avoid memory issues with large IN clauses.
    """
    
    # SQL query template - Uses INTEGER type, not UUID
    sql_query = """
    WITH EarliestRG AS (
        -- Anchor every album/single to its original release year
        SELECT r.release_group, MIN(rc.date_year) as first_year
        FROM release r
        JOIN release_country rc ON r.id = rc.release
        WHERE rc.date_year IS NOT NULL
        GROUP BY r.release_group
    ),
    CollaborationEdges AS (
        -- Find every unique pair of artists on the same credit
        SELECT 
            acn1.artist AS artist_a,
            acn2.artist AS artist_b,
            rg.id AS release_group_id,
            erg.first_year
        FROM artist_credit_name acn1
        JOIN artist_credit_name acn2 ON acn1.artist_credit = acn2.artist_credit
        JOIN release_group rg ON rg.artist_credit = acn1.artist_credit
        JOIN EarliestRG erg ON rg.id = erg.release_group
        WHERE acn1.artist = ANY(%s::integer[])
          AND acn1.artist < acn2.artist
    )
    SELECT * FROM CollaborationEdges;
    """
    
    all_edges = []
    
    # Process in batches
    num_batches = (len(artist_ids) + batch_size - 1) // batch_size
    
    conn = psycopg2.connect(**DB_PARAMS)
    conn.autocommit = True
    
    with conn.cursor() as cur:
        for i in tqdm(range(num_batches), desc="Extracting batches"):
            batch_ids = artist_ids[i*batch_size:(i+1)*batch_size]
            
            # Execute query for this batch
            cur.execute(sql_query, (batch_ids,))
            
            # Fetch results
            batch_results = cur.fetchall()
            all_edges.extend(batch_results)
            
            print(f"  Batch {i+1}/{num_batches}: Found {len(batch_results):,} edges")
    
    conn.close()
    
    # Convert to DataFrame
    df_edges = pd.DataFrame(all_edges, columns=['artist_a', 'artist_b', 'release_group_id', 'first_year'])
    
    return df_edges

# Extract the master edge list
print("\nExecuting SQL query (this may take several minutes)...")
master_edge_list = extract_master_edge_list(artist_ids)

print(f"\nTotal edges extracted: {len(master_edge_list):,}")
print(f"Year range: {master_edge_list['first_year'].min()} - {master_edge_list['first_year'].max()}")
print(f"Unique artist pairs: {master_edge_list[['artist_a', 'artist_b']].drop_duplicates().shape[0]:,}")

# Save to parquet
output_path = '/Users/jamesemcnally/Documents/GitHub/spring-2026-hitmakers/master_edge_list.parquet'
master_edge_list.to_parquet(output_path, index=False)
print(f"\nMaster edge list saved to: {output_path}")

# Also save as CSV for easy inspection
csv_path = '/Users/jamesemcnally/Documents/GitHub/spring-2026-hitmakers/master_edge_list.csv'
master_edge_list.to_csv(csv_path, index=False)
print(f"Also saved as CSV: {csv_path}")

# ============================================================================
# PHASE 2: Generate Temporal Network Snapshots
# ============================================================================

print("\n" + "="*80)
print("PHASE 2: Generating Temporal Network Snapshots")
print("="*80)

# Create output directory structure
networks_dir = Path('/Users/jamesemcnally/Documents/GitHub/spring-2026-hitmakers/networks')
networks_dir.mkdir(exist_ok=True)

# Determine year range
min_year = int(master_edge_list['first_year'].min())
max_year = int(master_edge_list['first_year'].max())
years = range(max(1958, min_year), min(2025, max_year + 1))

print(f"\nGenerating networks for years {max(1958, min_year)} to {min(2024, max_year)}")
print(f"Total years to process: {len(list(years))}")

# Process each year
for year in tqdm(list(years), desc="Generating yearly networks"):
    
    # Create year directory
    year_dir = networks_dir / str(year)
    year_dir.mkdir(exist_ok=True)
    
    # 1. Yearly Snapshot: Edges where first_year == year
    yearly = master_edge_list[master_edge_list['first_year'] == year].copy()
    yearly_dedup = yearly[['artist_a', 'artist_b']].drop_duplicates()
    yearly_dedup.to_csv(year_dir / 'yearly_snapshot.csv', index=False)
    
    # 2. Cumulative Network: Edges where first_year <= year
    cumulative = master_edge_list[master_edge_list['first_year'] <= year].copy()
    cumulative_dedup = cumulative[['artist_a', 'artist_b']].drop_duplicates()
    cumulative_dedup.to_csv(year_dir / 'cumulative_network.csv', index=False)
    
    # 3. Rolling 10-Year Network: Edges where (year - 9) <= first_year <= year
    rolling_start = year - 9
    rolling = master_edge_list[
        (master_edge_list['first_year'] >= rolling_start) & 
        (master_edge_list['first_year'] <= year)
    ].copy()
    rolling_dedup = rolling[['artist_a', 'artist_b']].drop_duplicates()
    rolling_dedup.to_csv(year_dir / 'rolling_10year.csv', index=False)

print("\n" + "="*80)
print("PROCESSING COMPLETE")
print("="*80)

# Summary statistics
print("\nSummary Statistics:")
print(f"  Master edge list: {len(master_edge_list):,} total collaboration records")
print(f"  Unique artist pairs: {master_edge_list[['artist_a', 'artist_b']].drop_duplicates().shape[0]:,}")
print(f"  Year range: {min_year} - {max_year}")
print(f"  Networks generated: {len(list(years))} years × 3 network types = {len(list(years)) * 3} files")
print(f"\nOutput locations:")
print(f"  Master edge list: {output_path}")
print(f"  Yearly networks: {networks_dir}/[YEAR]/")
print(f"    - yearly_snapshot.csv (edges from that year only)")
print(f"    - cumulative_network.csv (all edges up to that year)")
print(f"    - rolling_10year.csv (edges from previous 10 years)")


Loading Billboard artist IDs...
Total artists in df_artists_top_10_songs_only: 2420
Artists with MusicBrainz IDs: 2041
Artists without MusicBrainz IDs: 379

PHASE 1: Extracting Master Edge List from MusicBrainz

Executing SQL query (this may take several minutes)...


Extracting batches:  20%|██        | 1/5 [00:19<01:16, 19.21s/it]

  Batch 1/5: Found 6,175 edges


Extracting batches:  40%|████      | 2/5 [00:35<00:53, 17.68s/it]

  Batch 2/5: Found 4,569 edges


Extracting batches:  60%|██████    | 3/5 [00:52<00:34, 17.26s/it]

  Batch 3/5: Found 10,629 edges


Extracting batches:  80%|████████  | 4/5 [01:07<00:16, 16.14s/it]

  Batch 4/5: Found 3,298 edges


Extracting batches: 100%|██████████| 5/5 [01:21<00:00, 16.28s/it]

  Batch 5/5: Found 577 edges

Total edges extracted: 25,248
Year range: 1939 - 2026
Unique artist pairs: 15,404






Master edge list saved to: /Users/jamesemcnally/Documents/GitHub/spring-2026-hitmakers/master_edge_list.parquet
Also saved as CSV: /Users/jamesemcnally/Documents/GitHub/spring-2026-hitmakers/master_edge_list.csv

PHASE 2: Generating Temporal Network Snapshots

Generating networks for years 1958 to 2024
Total years to process: 67


Generating yearly networks: 100%|██████████| 67/67 [00:00<00:00, 239.69it/s]


PROCESSING COMPLETE

Summary Statistics:
  Master edge list: 25,248 total collaboration records
  Unique artist pairs: 15,404
  Year range: 1939 - 2026
  Networks generated: 67 years × 3 network types = 201 files

Output locations:
  Master edge list: /Users/jamesemcnally/Documents/GitHub/spring-2026-hitmakers/master_edge_list.parquet
  Yearly networks: /Users/jamesemcnally/Documents/GitHub/spring-2026-hitmakers/networks/[YEAR]/
    - yearly_snapshot.csv (edges from that year only)
    - cumulative_network.csv (all edges up to that year)
    - rolling_10year.csv (edges from previous 10 years)





In [26]:
import pandas as pd
import psycopg2
from psycopg2 import sql
from tqdm import tqdm
import os
from pathlib import Path

# Database connection parameters
DB_PARAMS = {
    'dbname': 'musicbrainz_db',
    'user': 'musicbrainz',
    'password': 'musicbrainz',
    'host': 'localhost',
    'port': 5432
}

# ============================================================================
# LOAD BILLBOARD ARTIST IDs (ALL ARTISTS)
# ============================================================================

print("Loading Billboard artist IDs (ALL ARTISTS)...")
df_artists = pd.read_csv('/Users/jamesemcnally/Documents/GitHub/spring-2026-hitmakers/df_artists.csv')

# Get artist IDs and filter out NaN values, convert to integers
artist_ids = df_artists['musicbrainz_artist_id'].dropna().astype(int).unique().tolist()

print(f"Total artists in df_artists: {len(df_artists)}")
print(f"Artists with MusicBrainz IDs: {len(artist_ids)}")
print(f"Artists without MusicBrainz IDs: {df_artists['musicbrainz_artist_id'].isna().sum()}")

# ============================================================================
# PHASE 1: Extract Master Edge List
# ============================================================================

print("\n" + "="*80)
print("PHASE 1: Extracting Master Edge List from MusicBrainz")
print("="*80)

def extract_master_edge_list(artist_ids, batch_size=500):
    """
    Extract all collaboration edges for the given artist IDs.
    Uses batching to avoid memory issues with large IN clauses.
    """
    
    # SQL query template - Uses INTEGER type, not UUID
    sql_query = """
    WITH EarliestRG AS (
        -- Anchor every album/single to its original release year
        SELECT r.release_group, MIN(rc.date_year) as first_year
        FROM release r
        JOIN release_country rc ON r.id = rc.release
        WHERE rc.date_year IS NOT NULL
        GROUP BY r.release_group
    ),
    CollaborationEdges AS (
        -- Find every unique pair of artists on the same credit
        SELECT 
            acn1.artist AS artist_a,
            acn2.artist AS artist_b,
            rg.id AS release_group_id,
            erg.first_year
        FROM artist_credit_name acn1
        JOIN artist_credit_name acn2 ON acn1.artist_credit = acn2.artist_credit
        JOIN release_group rg ON rg.artist_credit = acn1.artist_credit
        JOIN EarliestRG erg ON rg.id = erg.release_group
        WHERE acn1.artist = ANY(%s::integer[])
          AND acn1.artist < acn2.artist
    )
    SELECT * FROM CollaborationEdges;
    """
    
    all_edges = []
    
    # Process in batches
    num_batches = (len(artist_ids) + batch_size - 1) // batch_size
    
    conn = psycopg2.connect(**DB_PARAMS)
    conn.autocommit = True
    
    with conn.cursor() as cur:
        for i in tqdm(range(num_batches), desc="Extracting batches"):
            batch_ids = artist_ids[i*batch_size:(i+1)*batch_size]
            
            # Execute query for this batch
            cur.execute(sql_query, (batch_ids,))
            
            # Fetch results
            batch_results = cur.fetchall()
            all_edges.extend(batch_results)
            
            print(f"  Batch {i+1}/{num_batches}: Found {len(batch_results):,} edges")
    
    conn.close()
    
    # Convert to DataFrame
    df_edges = pd.DataFrame(all_edges, columns=['artist_a', 'artist_b', 'release_group_id', 'first_year'])
    
    return df_edges

# Extract the master edge list
print("\nExecuting SQL query (this may take several minutes)...")
master_edge_list = extract_master_edge_list(artist_ids)

print(f"\nTotal edges extracted: {len(master_edge_list):,}")
print(f"Year range: {master_edge_list['first_year'].min()} - {master_edge_list['first_year'].max()}")
print(f"Unique artist pairs: {master_edge_list[['artist_a', 'artist_b']].drop_duplicates().shape[0]:,}")

# Save to parquet
output_path = '/Users/jamesemcnally/Documents/GitHub/spring-2026-hitmakers/master_edge_list_all_artists.parquet'
master_edge_list.to_parquet(output_path, index=False)
print(f"\nMaster edge list saved to: {output_path}")

# Also save as CSV for easy inspection
csv_path = '/Users/jamesemcnally/Documents/GitHub/spring-2026-hitmakers/master_edge_list_all_artists.csv'
master_edge_list.to_csv(csv_path, index=False)
print(f"Also saved as CSV: {csv_path}")

# ============================================================================
# PHASE 2: Generate Temporal Network Snapshots
# ============================================================================

print("\n" + "="*80)
print("PHASE 2: Generating Temporal Network Snapshots")
print("="*80)

# Create output directory structure
networks_dir = Path('/Users/jamesemcnally/Documents/GitHub/spring-2026-hitmakers/networks_all_artists')
networks_dir.mkdir(exist_ok=True)

# Determine year range
min_year = int(master_edge_list['first_year'].min())
max_year = int(master_edge_list['first_year'].max())
years = range(max(1958, min_year), min(2025, max_year + 1))

print(f"\nGenerating networks for years {max(1958, min_year)} to {min(2024, max_year)}")
print(f"Total years to process: {len(list(years))}")

# Process each year
for year in tqdm(list(years), desc="Generating yearly networks"):
    
    # Create year directory
    year_dir = networks_dir / str(year)
    year_dir.mkdir(exist_ok=True)
    
    # 1. Yearly Snapshot: Edges where first_year == year
    yearly = master_edge_list[master_edge_list['first_year'] == year].copy()
    yearly_dedup = yearly[['artist_a', 'artist_b']].drop_duplicates()
    yearly_dedup.to_csv(year_dir / 'yearly_snapshot.csv', index=False)
    
    # 2. Cumulative Network: Edges where first_year <= year
    cumulative = master_edge_list[master_edge_list['first_year'] <= year].copy()
    cumulative_dedup = cumulative[['artist_a', 'artist_b']].drop_duplicates()
    cumulative_dedup.to_csv(year_dir / 'cumulative_network.csv', index=False)
    
    # 3. Rolling 10-Year Network: Edges where (year - 9) <= first_year <= year
    rolling_start = year - 9
    rolling = master_edge_list[
        (master_edge_list['first_year'] >= rolling_start) & 
        (master_edge_list['first_year'] <= year)
    ].copy()
    rolling_dedup = rolling[['artist_a', 'artist_b']].drop_duplicates()
    rolling_dedup.to_csv(year_dir / 'rolling_10year.csv', index=False)

print("\n" + "="*80)
print("PROCESSING COMPLETE")
print("="*80)

# Summary statistics
print("\nSummary Statistics:")
print(f"  Master edge list: {len(master_edge_list):,} total collaboration records")
print(f"  Unique artist pairs: {master_edge_list[['artist_a', 'artist_b']].drop_duplicates().shape[0]:,}")
print(f"  Year range: {min_year} - {max_year}")
print(f"  Networks generated: {len(list(years))} years × 3 network types = {len(list(years)) * 3} files")
print(f"\nOutput locations:")
print(f"  Master edge list: {output_path}")
print(f"  Yearly networks: {networks_dir}/[YEAR]/")
print(f"    - yearly_snapshot.csv (edges from that year only)")
print(f"    - cumulative_network.csv (all edges up to that year)")
print(f"    - rolling_10year.csv (edges from previous 10 years)")


Loading Billboard artist IDs (ALL ARTISTS)...
Total artists in df_artists: 14226
Artists with MusicBrainz IDs: 11564
Artists without MusicBrainz IDs: 2662

PHASE 1: Extracting Master Edge List from MusicBrainz

Executing SQL query (this may take several minutes)...


Extracting batches:   4%|▍         | 1/24 [00:15<06:06, 15.95s/it]

  Batch 1/24: Found 2,518 edges


Extracting batches:   8%|▊         | 2/24 [00:29<05:19, 14.52s/it]

  Batch 2/24: Found 2,467 edges


Extracting batches:  12%|█▎        | 3/24 [00:44<05:08, 14.67s/it]

  Batch 3/24: Found 3,277 edges


Extracting batches:  17%|█▋        | 4/24 [00:58<04:50, 14.53s/it]

  Batch 4/24: Found 2,861 edges


Extracting batches:  21%|██        | 5/24 [01:12<04:30, 14.24s/it]

  Batch 5/24: Found 2,609 edges


Extracting batches:  25%|██▌       | 6/24 [01:26<04:14, 14.12s/it]

  Batch 6/24: Found 2,772 edges


Extracting batches:  29%|██▉       | 7/24 [01:40<03:59, 14.07s/it]

  Batch 7/24: Found 2,870 edges


Extracting batches:  33%|███▎      | 8/24 [01:53<03:42, 13.89s/it]

  Batch 8/24: Found 2,226 edges


Extracting batches:  38%|███▊      | 9/24 [02:07<03:26, 13.75s/it]

  Batch 9/24: Found 2,195 edges


Extracting batches:  42%|████▏     | 10/24 [02:21<03:13, 13.85s/it]

  Batch 10/24: Found 2,202 edges


Extracting batches:  46%|████▌     | 11/24 [02:36<03:03, 14.15s/it]

  Batch 11/24: Found 3,074 edges


Extracting batches:  50%|█████     | 12/24 [02:51<02:52, 14.40s/it]

  Batch 12/24: Found 9,887 edges


Extracting batches:  54%|█████▍    | 13/24 [03:05<02:38, 14.39s/it]

  Batch 13/24: Found 3,465 edges


Extracting batches:  58%|█████▊    | 14/24 [03:19<02:24, 14.44s/it]

  Batch 14/24: Found 2,722 edges


Extracting batches:  62%|██████▎   | 15/24 [03:35<02:12, 14.71s/it]

  Batch 15/24: Found 3,107 edges


Extracting batches:  67%|██████▋   | 16/24 [03:48<01:55, 14.38s/it]

  Batch 16/24: Found 1,904 edges


Extracting batches:  71%|███████   | 17/24 [04:04<01:42, 14.71s/it]

  Batch 17/24: Found 4,001 edges


Extracting batches:  75%|███████▌  | 18/24 [04:18<01:27, 14.64s/it]

  Batch 18/24: Found 2,560 edges


Extracting batches:  79%|███████▉  | 19/24 [04:33<01:13, 14.62s/it]

  Batch 19/24: Found 1,357 edges


Extracting batches:  83%|████████▎ | 20/24 [04:48<00:58, 14.68s/it]

  Batch 20/24: Found 1,134 edges


Extracting batches:  88%|████████▊ | 21/24 [05:01<00:43, 14.34s/it]

  Batch 21/24: Found 543 edges


Extracting batches:  92%|█████████▏| 22/24 [05:15<00:28, 14.17s/it]

  Batch 22/24: Found 1,848 edges


Extracting batches:  96%|█████████▌| 23/24 [05:29<00:14, 14.02s/it]

  Batch 23/24: Found 2,648 edges


Extracting batches: 100%|██████████| 24/24 [05:42<00:00, 14.29s/it]


  Batch 24/24: Found 1,104 edges

Total edges extracted: 65,351
Year range: 1926 - 2026
Unique artist pairs: 42,887

Master edge list saved to: /Users/jamesemcnally/Documents/GitHub/spring-2026-hitmakers/master_edge_list_all_artists.parquet
Also saved as CSV: /Users/jamesemcnally/Documents/GitHub/spring-2026-hitmakers/master_edge_list_all_artists.csv

PHASE 2: Generating Temporal Network Snapshots

Generating networks for years 1958 to 2024
Total years to process: 67


Generating yearly networks: 100%|██████████| 67/67 [00:00<00:00, 117.34it/s]


PROCESSING COMPLETE

Summary Statistics:
  Master edge list: 65,351 total collaboration records
  Unique artist pairs: 42,887
  Year range: 1926 - 2026
  Networks generated: 67 years × 3 network types = 201 files

Output locations:
  Master edge list: /Users/jamesemcnally/Documents/GitHub/spring-2026-hitmakers/master_edge_list_all_artists.parquet
  Yearly networks: /Users/jamesemcnally/Documents/GitHub/spring-2026-hitmakers/networks_all_artists/[YEAR]/
    - yearly_snapshot.csv (edges from that year only)
    - cumulative_network.csv (all edges up to that year)
    - rolling_10year.csv (edges from previous 10 years)





In [27]:
import pandas as pd
import networkx as nx
from pathlib import Path
from tqdm import tqdm
import warnings

warnings.filterwarnings('ignore')

# ============================================================================
# CONFIGURATION
# ============================================================================
NETWORKS_DIR = Path('/Users/jamesemcnally/Documents/GitHub/spring-2026-hitmakers/networks_all_artists')
DF_ARTISTS_PATH = '/Users/jamesemcnally/Documents/GitHub/spring-2026-hitmakers/df_artists.csv'
OUTPUT_PATH = '/Users/jamesemcnally/Documents/GitHub/spring-2026-hitmakers/df_artists_with_network_metrics.csv'

# ============================================================================
# DATA PREPARATION
# ============================================================================

print("Loading df_artists...")
df_artists = pd.read_csv(DF_ARTISTS_PATH)

# Keep IDs as integers (matching the network edge lists)
df_artists['musicbrainz_artist_id'] = df_artists['musicbrainz_artist_id'].fillna(-1).astype(int)

# Initialize all metric columns with None
metric_names = ['degree', 'closeness', 'harmonic_closeness', 'betweenness', 'eigenvector']
network_types_short = ['yearly', 'rolling10', 'cumulative']

for metric in metric_names:
    for net_type in network_types_short:
        df_artists[f'{metric}_centrality_top10_{net_type}'] = None
        df_artists[f'{metric}_centrality_firstsong_{net_type}'] = None

# Map years to lists of artist indices
print("Building year-to-artist index maps...")
top10_year_map = {}
for idx, row in df_artists.iterrows():
    year = row['first_year_top_10_songs']
    if pd.notna(year):
        year = int(year)
        if year not in top10_year_map:
            top10_year_map[year] = []
        top10_year_map[year].append(idx)

firstsong_year_map = {}
for idx, row in df_artists.iterrows():
    year = row['first_song_year']
    if pd.notna(year):
        year = int(year)
        if year not in firstsong_year_map:
            firstsong_year_map[year] = []
        firstsong_year_map[year].append(idx)

# Get unique sorted years to process
all_relevant_years = sorted(set(top10_year_map.keys()) | set(firstsong_year_map.keys()))
print(f"Found {len(all_relevant_years)} unique years to process")

# ============================================================================
# CENTRALITY COMPUTATION ENGINE
# ============================================================================

def get_all_metrics(G):
    """Calculates all 5 centralities for ALL nodes in G at once."""
    if G is None or len(G) == 0:
        return {m: {} for m in metric_names}
    
    print(f" (Nodes: {len(G):,}, Edges: {len(G.edges()):,})", end="")
    
    metrics = {}
    
    try:
        metrics['degree'] = nx.degree_centrality(G)
    except:
        metrics['degree'] = {}
    
    try:
        metrics['closeness'] = nx.closeness_centrality(G)
    except:
        metrics['closeness'] = {}
    
    try:
        metrics['harmonic_closeness'] = nx.harmonic_centrality(G)
    except:
        metrics['harmonic_closeness'] = {}
    
    try:
        # Sample for large graphs to speed up betweenness
        k = min(len(G), 500) if len(G) > 500 else None
        metrics['betweenness'] = nx.betweenness_centrality(G, k=k)
    except:
        metrics['betweenness'] = {}
    
    try:
        metrics['eigenvector'] = nx.eigenvector_centrality(G, max_iter=1000)
    except:
        metrics['eigenvector'] = {}
        
    return metrics

# ============================================================================
# MAIN LOOP (By Year)
# ============================================================================

network_types = {
    'yearly': 'yearly_snapshot',
    'rolling10': 'rolling_10year',
    'cumulative': 'cumulative_network'
}

for year in tqdm(all_relevant_years, desc="Processing years"):
    print(f"\n{'='*80}\nYear {year}")
    
    for net_key, net_filename in network_types.items():
        
        # 1. Load the network file once
        file_path = NETWORKS_DIR / str(year) / f'{net_filename}.csv'
        if not file_path.exists():
            print(f"  {net_key}: File not found, skipping")
            continue
        
        try:
            edges = pd.read_csv(file_path)
            if len(edges) == 0:
                print(f"  {net_key}: Empty network, skipping")
                continue
                
            G = nx.from_pandas_edgelist(edges, 'artist_a', 'artist_b')
            
            # 2. Compute metrics for every node in this year's graph
            print(f"  {net_key}: Computing metrics...", end="")
            all_node_metrics = get_all_metrics(G)
            print(" ✓")
            
            # 3. Update all artists who had their first Top 10 hit in this year
            if year in top10_year_map:
                for idx in top10_year_map[year]:
                    m_id = int(df_artists.at[idx, 'musicbrainz_artist_id'])
                    if m_id == -1:  # Skip artists without MusicBrainz ID
                        continue
                    
                    for m_name in metric_names:
                        val = all_node_metrics[m_name].get(m_id)
                        df_artists.at[idx, f'{m_name}_centrality_top10_{net_key}'] = val

            # 4. Update all artists who had their first charting song in this year
            if year in firstsong_year_map:
                for idx in firstsong_year_map[year]:
                    m_id = int(df_artists.at[idx, 'musicbrainz_artist_id'])
                    if m_id == -1:  # Skip artists without MusicBrainz ID
                        continue
                    
                    for m_name in metric_names:
                        val = all_node_metrics[m_name].get(m_id)
                        df_artists.at[idx, f'{m_name}_centrality_firstsong_{net_key}'] = val
        
        except Exception as e:
            print(f"  {net_key}: Error - {e}")
            continue

# ============================================================================
# SAVE RESULTS
# ============================================================================

print("\n" + "="*80)
print(f"Saving results to {OUTPUT_PATH}...")
df_artists.to_csv(OUTPUT_PATH, index=False)
print("✓ Complete!")

# Summary
print("\n" + "="*80)
print("SUMMARY")
print("="*80)
print(f"\nMetrics calculated for {len(all_relevant_years)} unique years")
print(f"Total columns in output: {len(df_artists.columns)}")
print(f"\nOutput saved to: {OUTPUT_PATH}")


Loading df_artists...
Building year-to-artist index maps...
Found 69 unique years to process


Processing years:   0%|          | 0/69 [00:00<?, ?it/s]


Year 1958
  yearly: Computing metrics... (Nodes: 197, Edges: 152) ✓
  rolling10: Computing metrics... (Nodes: 691, Edges: 737) ✓
  cumulative: Computing metrics... (Nodes: 873, Edges: 1,034)

Processing years:   1%|▏         | 1/69 [00:00<00:42,  1.62it/s]

 ✓

Year 1959
  yearly: Computing metrics... (Nodes: 181, Edges: 135) ✓
  rolling10: Computing metrics... (Nodes: 759, Edges: 813) ✓
  cumulative: Computing metrics... (Nodes: 960, Edges: 1,136)

Processing years:   3%|▎         | 2/69 [00:01<00:44,  1.51it/s]

 ✓

Year 1960
  yearly: Computing metrics... (Nodes: 162, Edges: 110) ✓
  rolling10: Computing metrics... (Nodes: 812, Edges: 846) ✓
  cumulative: Computing metrics... (Nodes: 1,045, Edges: 1,221)

Processing years:   4%|▍         | 3/69 [00:02<00:49,  1.32it/s]

 ✓

Year 1961
  yearly: Computing metrics... (Nodes: 173, Edges: 126) ✓
  rolling10: Computing metrics... (Nodes: 857, Edges: 879) ✓
  cumulative: Computing metrics... (Nodes: 1,136, Edges: 1,318)

Processing years:   6%|▌         | 4/69 [00:03<00:50,  1.28it/s]

 ✓

Year 1962
  yearly: Computing metrics... (Nodes: 212, Edges: 149) ✓
  rolling10: Computing metrics... (Nodes: 937, Edges: 955) ✓
  cumulative: Computing metrics... (Nodes: 1,248, Edges: 1,437)

Processing years:   7%|▋         | 5/69 [00:03<00:54,  1.17it/s]

 ✓

Year 1963
  yearly: Computing metrics... (Nodes: 169, Edges: 127) ✓
  rolling10: Computing metrics... (Nodes: 984, Edges: 1,007) ✓
  cumulative: Computing metrics... (Nodes: 1,322, Edges: 1,525)

Processing years:   9%|▊         | 6/69 [00:04<00:56,  1.12it/s]

 ✓

Year 1964
  yearly: Computing metrics... (Nodes: 170, Edges: 119) ✓
  rolling10: Computing metrics... (Nodes: 1,042, Edges: 1,032) ✓
  cumulative: Computing metrics... (Nodes: 1,415, Edges: 1,611)

Processing years:  10%|█         | 7/69 [00:05<00:58,  1.06it/s]

 ✓

Year 1965
  yearly: Computing metrics... (Nodes: 227, Edges: 182) ✓
  rolling10: Computing metrics... (Nodes: 1,100, Edges: 1,101) ✓
  cumulative: Computing metrics... (Nodes: 1,520, Edges: 1,746)

Processing years:  12%|█▏        | 8/69 [00:07<01:02,  1.03s/it]

 ✓

Year 1966
  yearly: Computing metrics... (Nodes: 191, Edges: 137) ✓
  rolling10: Computing metrics... (Nodes: 1,132, Edges: 1,106) ✓
  cumulative: Computing metrics... (Nodes: 1,621, Edges: 1,848)

Processing years:  13%|█▎        | 9/69 [00:08<01:07,  1.12s/it]

 ✓

Year 1967
  yearly: Computing metrics... (Nodes: 159, Edges: 114) ✓
  rolling10: Computing metrics... (Nodes: 1,138, Edges: 1,097) ✓
  cumulative: Computing metrics... (Nodes: 1,693, Edges: 1,935)

Processing years:  14%|█▍        | 10/69 [00:09<01:09,  1.18s/it]

 ✓

Year 1968
  yearly: Computing metrics... (Nodes: 221, Edges: 154) ✓
  rolling10: Computing metrics... (Nodes: 1,151, Edges: 1,102) ✓
  cumulative: Computing metrics... (Nodes: 1,806, Edges: 2,058)

Processing years:  16%|█▌        | 11/69 [00:11<01:13,  1.27s/it]

 ✓

Year 1969
  yearly: Computing metrics... (Nodes: 179, Edges: 146) ✓
  rolling10: Computing metrics... (Nodes: 1,134, Edges: 1,103) ✓
  cumulative: Computing metrics... (Nodes: 1,875, Edges: 2,156)

Processing years:  17%|█▋        | 12/69 [00:12<01:18,  1.38s/it]

 ✓

Year 1970
  yearly: Computing metrics... (Nodes: 240, Edges: 190) ✓
  rolling10: Computing metrics... (Nodes: 1,174, Edges: 1,169) ✓
  cumulative: Computing metrics... (Nodes: 1,988, Edges: 2,292)

Processing years:  19%|█▉        | 13/69 [00:14<01:26,  1.54s/it]

 ✓

Year 1971
  yearly: Computing metrics... (Nodes: 218, Edges: 185) ✓
  rolling10: Computing metrics... (Nodes: 1,196, Edges: 1,210) ✓
  cumulative: Computing metrics... (Nodes: 2,084, Edges: 2,425)

Processing years:  20%|██        | 14/69 [00:16<01:32,  1.68s/it]

 ✓

Year 1972
  yearly: Computing metrics... (Nodes: 298, Edges: 264) ✓
  rolling10: Computing metrics... (Nodes: 1,259, Edges: 1,308) ✓
  cumulative: Computing metrics... (Nodes: 2,240, Edges: 2,626)

Processing years:  22%|██▏       | 15/69 [00:19<01:42,  1.90s/it]

 ✓

Year 1973
  yearly: Computing metrics... (Nodes: 294, Edges: 261) ✓
  rolling10: Computing metrics... (Nodes: 1,336, Edges: 1,408) ✓
  cumulative: Computing metrics... (Nodes: 2,383, Edges: 2,818)

Processing years:  23%|██▎       | 16/69 [00:21<01:51,  2.10s/it]

 ✓

Year 1974
  yearly: Computing metrics... (Nodes: 270, Edges: 231) ✓
  rolling10: Computing metrics... (Nodes: 1,420, Edges: 1,513) ✓
  cumulative: Computing metrics... (Nodes: 2,527, Edges: 3,005)

Processing years:  25%|██▍       | 17/69 [00:24<02:01,  2.34s/it]

 ✓

Year 1975
  yearly: Computing metrics... (Nodes: 207, Edges: 187) ✓
  rolling10: Computing metrics... (Nodes: 1,407, Edges: 1,500) ✓
  cumulative: Computing metrics... (Nodes: 2,608, Edges: 3,126)

Processing years:  26%|██▌       | 18/69 [00:27<02:09,  2.54s/it]

 ✓

Year 1976
  yearly: Computing metrics... (Nodes: 252, Edges: 192) ✓
  rolling10: Computing metrics... (Nodes: 1,448, Edges: 1,557) ✓
  cumulative: Computing metrics... (Nodes: 2,718, Edges: 3,263)

Processing years:  28%|██▊       | 19/69 [00:31<02:17,  2.76s/it]

 ✓

Year 1977
  yearly: Computing metrics... (Nodes: 270, Edges: 196) ✓
  rolling10: Computing metrics... (Nodes: 1,532, Edges: 1,637) ✓
  cumulative: Computing metrics... (Nodes: 2,855, Edges: 3,415)

Processing years:  29%|██▉       | 20/69 [00:34<02:26,  2.99s/it]

 ✓

Year 1978
  yearly: Computing metrics... (Nodes: 257, Edges: 209) ✓
  rolling10: Computing metrics... (Nodes: 1,553, Edges: 1,672) ✓
  cumulative: Computing metrics... (Nodes: 2,967, Edges: 3,556)

Processing years:  30%|███       | 21/69 [00:38<02:38,  3.29s/it]

 ✓

Year 1979
  yearly: Computing metrics... (Nodes: 277, Edges: 249) ✓
  rolling10: Computing metrics... (Nodes: 1,617, Edges: 1,761) ✓
  cumulative: Computing metrics... (Nodes: 3,083, Edges: 3,737)

Processing years:  32%|███▏      | 22/69 [00:42<02:48,  3.60s/it]

 ✓

Year 1980
  yearly: Computing metrics... (Nodes: 290, Edges: 259) ✓
  rolling10: Computing metrics... (Nodes: 1,670, Edges: 1,833) ✓
  cumulative: Computing metrics... (Nodes: 3,218, Edges: 3,930)

Processing years:  33%|███▎      | 23/69 [00:47<02:59,  3.90s/it]

 ✓

Year 1981
  yearly: Computing metrics... (Nodes: 331, Edges: 273) ✓
  rolling10: Computing metrics... (Nodes: 1,737, Edges: 1,917) ✓
  cumulative: Computing metrics... (Nodes: 3,354, Edges: 4,131)

Processing years:  35%|███▍      | 24/69 [00:52<03:09,  4.22s/it]

 ✓

Year 1982
  yearly: Computing metrics... (Nodes: 340, Edges: 290) ✓
  rolling10: Computing metrics... (Nodes: 1,763, Edges: 1,951) ✓
  cumulative: Computing metrics... (Nodes: 3,503, Edges: 4,348)

Processing years:  36%|███▌      | 25/69 [00:58<03:26,  4.69s/it]

 ✓

Year 1983
  yearly: Computing metrics... (Nodes: 309, Edges: 236) ✓
  rolling10: Computing metrics... (Nodes: 1,770, Edges: 1,932) ✓
  cumulative: Computing metrics... (Nodes: 3,617, Edges: 4,503)

Processing years:  38%|███▊      | 26/69 [01:04<03:36,  5.04s/it]

 ✓

Year 1984
  yearly: Computing metrics... (Nodes: 334, Edges: 273) ✓
  rolling10: Computing metrics... (Nodes: 1,820, Edges: 1,959) ✓
  cumulative: Computing metrics... (Nodes: 3,765, Edges: 4,706)

Processing years:  39%|███▉      | 27/69 [01:10<03:48,  5.44s/it]

 ✓

Year 1985
  yearly: Computing metrics... (Nodes: 406, Edges: 330) ✓
  rolling10: Computing metrics... (Nodes: 1,944, Edges: 2,069) ✓
  cumulative: Computing metrics... (Nodes: 3,945, Edges: 4,939)

Processing years:  41%|████      | 28/69 [01:17<04:02,  5.90s/it]

 ✓

Year 1986
  yearly: Computing metrics... (Nodes: 429, Edges: 369) ✓
  rolling10: Computing metrics... (Nodes: 2,057, Edges: 2,201) ✓
  cumulative: Computing metrics... (Nodes: 4,126, Edges: 5,189)

Processing years:  42%|████▏     | 29/69 [01:25<04:17,  6.43s/it]

 ✓

Year 1987
  yearly: Computing metrics... (Nodes: 517, Edges: 522) ✓
  rolling10: Computing metrics... (Nodes: 2,153, Edges: 2,420) ✓
  cumulative: Computing metrics... (Nodes: 4,336, Edges: 5,529)

Processing years:  43%|████▎     | 30/69 [01:33<04:37,  7.13s/it]

 ✓

Year 1988
  yearly: Computing metrics... (Nodes: 564, Edges: 522) ✓
  rolling10: Computing metrics... (Nodes: 2,339, Edges: 2,652) ✓
  cumulative: Computing metrics... (Nodes: 4,570, Edges: 5,863)

Processing years:  45%|████▍     | 31/69 [01:43<05:03,  7.98s/it]

 ✓

Year 1989
  yearly: Computing metrics... (Nodes: 595, Edges: 508) ✓
  rolling10: Computing metrics... (Nodes: 2,486, Edges: 2,820) ✓
  cumulative: Computing metrics... (Nodes: 4,786, Edges: 6,174)

Processing years:  46%|████▋     | 32/69 [01:54<05:28,  8.87s/it]

 ✓

Year 1990
  yearly: Computing metrics... (Nodes: 659, Edges: 592) ✓
  rolling10: Computing metrics... (Nodes: 2,690, Edges: 3,054) ✓
  cumulative: Computing metrics... (Nodes: 5,059, Edges: 6,565)

Processing years:  48%|████▊     | 33/69 [02:06<05:55,  9.88s/it]

 ✓

Year 1991
  yearly: Computing metrics... (Nodes: 663, Edges: 625) ✓
  rolling10: Computing metrics... (Nodes: 2,859, Edges: 3,280) ✓
  cumulative: Computing metrics... (Nodes: 5,303, Edges: 6,947)

Processing years:  49%|████▉     | 34/69 [02:20<06:21, 10.90s/it]

 ✓

Year 1992
  yearly: Computing metrics... (Nodes: 768, Edges: 708) ✓
  rolling10: Computing metrics... (Nodes: 3,066, Edges: 3,580) ✓
  cumulative: Computing metrics... (Nodes: 5,604, Edges: 7,399)

Processing years:  51%|█████     | 35/69 [02:34<06:48, 12.01s/it]

 ✓

Year 1993
  yearly: Computing metrics... (Nodes: 712, Edges: 668) ✓
  rolling10: Computing metrics... (Nodes: 3,277, Edges: 3,874) ✓
  cumulative: Computing metrics... (Nodes: 5,861, Edges: 7,796)

Processing years:  52%|█████▏    | 36/69 [02:50<07:14, 13.15s/it]

 ✓

Year 1994
  yearly: Computing metrics... (Nodes: 793, Edges: 720) ✓
  rolling10: Computing metrics... (Nodes: 3,537, Edges: 4,209) ✓
  cumulative: Computing metrics... (Nodes: 6,183, Edges: 8,253)

Processing years:  54%|█████▎    | 37/69 [03:08<07:41, 14.42s/it]

 ✓

Year 1995
  yearly: Computing metrics... (Nodes: 747, Edges: 644) ✓
  rolling10: Computing metrics... (Nodes: 3,739, Edges: 4,444) ✓
  cumulative: Computing metrics... (Nodes: 6,491, Edges: 8,671)

Processing years:  55%|█████▌    | 38/69 [03:27<08:12, 15.87s/it]

 ✓

Year 1996
  yearly: Computing metrics... (Nodes: 773, Edges: 716) ✓
  rolling10: Computing metrics... (Nodes: 3,921, Edges: 4,711) ✓
  cumulative: Computing metrics... (Nodes: 6,798, Edges: 9,138)

Processing years:  57%|█████▋    | 39/69 [03:49<08:49, 17.64s/it]

 ✓

Year 1997
  yearly: Computing metrics... (Nodes: 776, Edges: 682) ✓
  rolling10: Computing metrics... (Nodes: 4,113, Edges: 4,868) ✓
  cumulative: Computing metrics... (Nodes: 7,125, Edges: 9,585)

Processing years:  58%|█████▊    | 40/69 [04:12<09:18, 19.27s/it]

 ✓

Year 1998
  yearly: Computing metrics... (Nodes: 804, Edges: 672) ✓
  rolling10: Computing metrics... (Nodes: 4,316, Edges: 5,045) ✓
  cumulative: Computing metrics... (Nodes: 7,453, Edges: 10,020)

Processing years:  59%|█████▉    | 41/69 [04:38<09:57, 21.33s/it]

 ✓

Year 1999
  yearly: Computing metrics... (Nodes: 876, Edges: 1,254) ✓
  rolling10: Computing metrics... (Nodes: 4,551, Edges: 5,813) ✓
  cumulative: Computing metrics... (Nodes: 7,845, Edges: 11,043)

Processing years:  61%|██████    | 42/69 [05:07<10:40, 23.71s/it]

 ✓

Year 2000
  yearly: Computing metrics... (Nodes: 820, Edges: 700) ✓
  rolling10: Computing metrics... (Nodes: 4,684, Edges: 5,952) ✓
  cumulative: Computing metrics... (Nodes: 8,199, Edges: 11,510)

Processing years:  62%|██████▏   | 43/69 [05:39<11:16, 26.03s/it]

 ✓

Year 2001
  yearly: Computing metrics... (Nodes: 778, Edges: 684) ✓
  rolling10: Computing metrics... (Nodes: 4,786, Edges: 6,024) ✓
  cumulative: Computing metrics... (Nodes: 8,513, Edges: 11,925)

Processing years:  64%|██████▍   | 44/69 [06:12<11:46, 28.27s/it]

 ✓

Year 2002
  yearly: Computing metrics... (Nodes: 826, Edges: 641) ✓
  rolling10: Computing metrics... (Nodes: 4,869, Edges: 6,028) ✓
  cumulative: Computing metrics... (Nodes: 8,861, Edges: 12,356)

Processing years:  65%|██████▌   | 45/69 [06:48<12:10, 30.45s/it]

 ✓

Year 2003
  yearly: Computing metrics... (Nodes: 816, Edges: 723) ✓
  rolling10: Computing metrics... (Nodes: 4,979, Edges: 6,167) ✓
  cumulative: Computing metrics... (Nodes: 9,194, Edges: 12,852)

Processing years:  67%|██████▋   | 46/69 [07:26<12:35, 32.83s/it]

 ✓

Year 2004
  yearly: Computing metrics... (Nodes: 797, Edges: 642) ✓
  rolling10: Computing metrics... (Nodes: 5,019, Edges: 6,163) ✓
  cumulative: Computing metrics... (Nodes: 9,526, Edges: 13,304)

Processing years:  68%|██████▊   | 47/69 [08:07<12:53, 35.17s/it]

 ✓

Year 2005
  yearly: Computing metrics... (Nodes: 933, Edges: 821) ✓
  rolling10: Computing metrics... (Nodes: 5,152, Edges: 6,340) ✓
  cumulative: Computing metrics... (Nodes: 9,914, Edges: 13,899)

Processing years:  70%|██████▉   | 48/69 [08:51<13:17, 37.96s/it]

 ✓

Year 2006
  yearly: Computing metrics... (Nodes: 1,014, Edges: 871) ✓
  rolling10: Computing metrics... (Nodes: 5,304, Edges: 6,490) ✓
  cumulative: Computing metrics... (Nodes: 10,335, Edges: 14,531)

Processing years:  71%|███████   | 49/69 [09:39<13:41, 41.08s/it]

 ✓

Year 2007
  yearly: Computing metrics... (Nodes: 956, Edges: 820) ✓
  rolling10: Computing metrics... (Nodes: 5,466, Edges: 6,661) ✓
  cumulative: Computing metrics... (Nodes: 10,744, Edges: 15,120)

Processing years:  72%|███████▏  | 50/69 [10:31<14:01, 44.27s/it]

 ✓

Year 2008
  yearly: Computing metrics... (Nodes: 1,025, Edges: 806) ✓
  rolling10: Computing metrics... (Nodes: 5,607, Edges: 6,826) ✓
  cumulative: Computing metrics... (Nodes: 11,186, Edges: 15,724)

Processing years:  74%|███████▍  | 51/69 [11:26<14:11, 47.32s/it]

 ✓

Year 2009
  yearly: Computing metrics... (Nodes: 1,019, Edges: 854) ✓
  rolling10: Computing metrics... (Nodes: 5,702, Edges: 6,461) ✓
  cumulative: Computing metrics... (Nodes: 11,658, Edges: 16,397)

Processing years:  75%|███████▌  | 52/69 [12:25<14:23, 50.82s/it]

 ✓

Year 2010
  yearly: Computing metrics... (Nodes: 1,139, Edges: 1,048) ✓
  rolling10: Computing metrics... (Nodes: 5,891, Edges: 6,815) ✓
  cumulative: Computing metrics... (Nodes: 12,174, Edges: 17,251)

Processing years:  77%|███████▋  | 53/69 [13:31<14:48, 55.52s/it]

 ✓

Year 2011
  yearly: Computing metrics... (Nodes: 1,234, Edges: 1,074) ✓
  rolling10: Computing metrics... (Nodes: 6,144, Edges: 7,192) ✓
  cumulative: Computing metrics... (Nodes: 12,720, Edges: 18,084)

Processing years:  78%|███████▊  | 54/69 [26:20<1:07:23, 269.57s/it]

 ✓

Year 2012
  yearly: Computing metrics... (Nodes: 1,400, Edges: 1,322) ✓
  rolling10: Computing metrics... (Nodes: 6,489, Edges: 7,819) ✓
  cumulative: Computing metrics... (Nodes: 13,356, Edges: 19,124)

Processing years:  80%|███████▉  | 55/69 [27:40<49:38, 212.74s/it]  

 ✓

Year 2013
  yearly: Computing metrics... (Nodes: 1,462, Edges: 1,284) ✓
  rolling10: Computing metrics... (Nodes: 6,925, Edges: 8,338) ✓
  cumulative: Computing metrics... (Nodes: 14,067, Edges: 20,158)

Processing years:  81%|████████  | 56/69 [29:09<38:03, 175.68s/it]

 ✓

Year 2014
  yearly: Computing metrics... (Nodes: 1,574, Edges: 1,396) ✓
  rolling10: Computing metrics... (Nodes: 7,374, Edges: 8,999) ✓
  cumulative: Computing metrics... (Nodes: 14,793, Edges: 21,273)

Processing years:  83%|████████▎ | 57/69 [30:49<30:35, 152.99s/it]

 ✓

Year 2015
  yearly: Computing metrics... (Nodes: 1,694, Edges: 1,650) ✓
  rolling10: Computing metrics... (Nodes: 7,789, Edges: 9,736) ✓
  cumulative: Computing metrics... (Nodes: 15,531, Edges: 22,598)

Processing years:  84%|████████▍ | 58/69 [32:45<25:59, 141.80s/it]

 ✓

Year 2016
  yearly: Computing metrics... (Nodes: 1,833, Edges: 1,927) ✓
  rolling10: Computing metrics... (Nodes: 8,235, Edges: 10,686) ✓
  cumulative: Computing metrics... (Nodes: 16,378, Edges: 24,180)

Processing years:  86%|████████▌ | 59/69 [34:57<23:08, 138.85s/it]

 ✓

Year 2017
  yearly: Computing metrics... (Nodes: 1,995, Edges: 2,232) ✓
  rolling10: Computing metrics... (Nodes: 8,752, Edges: 11,863) ✓
  cumulative: Computing metrics... (Nodes: 17,287, Edges: 25,979)

Processing years:  87%|████████▋ | 60/69 [37:29<21:25, 142.80s/it]

 ✓

Year 2018
  yearly: Computing metrics... (Nodes: 2,241, Edges: 2,391) ✓
  rolling10: Computing metrics... (Nodes: 9,373, Edges: 13,176) ✓
  cumulative: Computing metrics... (Nodes: 18,283, Edges: 27,880)

Processing years:  88%|████████▊ | 61/69 [59:57<1:07:15, 504.40s/it]

 ✓

Year 2019
  yearly: Computing metrics... (Nodes: 2,435, Edges: 2,584) ✓
  rolling10: Computing metrics... (Nodes: 10,091, Edges: 14,641) ✓
  cumulative: Computing metrics... (Nodes: 19,368, Edges: 29,974)

Processing years:  90%|████████▉ | 62/69 [1:07:14<56:29, 484.23s/it]

 ✓

Year 2020
  yearly: Computing metrics... (Nodes: 2,814, Edges: 3,603) ✓
  rolling10: Computing metrics... (Nodes: 10,989, Edges: 16,952) ✓
  cumulative: Computing metrics... (Nodes: 20,626, Edges: 33,069)

Processing years:  91%|█████████▏| 63/69 [1:11:23<41:21, 413.64s/it]

 ✓

Year 2021
  yearly: Computing metrics... (Nodes: 2,939, Edges: 3,186) ✓
  rolling10: Computing metrics... (Nodes: 11,901, Edges: 18,828) ✓
  cumulative: Computing metrics... (Nodes: 21,909, Edges: 35,729)

Processing years:  93%|█████████▎| 64/69 [1:16:43<32:07, 385.49s/it]

 ✓

Year 2022
  yearly: Computing metrics... (Nodes: 2,514, Edges: 2,300) ✓
  rolling10: Computing metrics... (Nodes: 12,377, Edges: 19,665) ✓
  cumulative: Computing metrics... (Nodes: 22,917, Edges: 37,597)

Processing years:  94%|█████████▍| 65/69 [1:32:13<36:34, 548.70s/it]

 ✓

Year 2023
  yearly: Computing metrics... (Nodes: 2,492, Edges: 2,287) ✓
  rolling10: Computing metrics... (Nodes: 12,841, Edges: 20,531) ✓
  cumulative: Computing metrics... (Nodes: 23,929, Edges: 39,467)

Processing years:  96%|█████████▌| 66/69 [1:38:28<24:50, 496.79s/it]

 ✓

Year 2024
  yearly: Computing metrics... (Nodes: 2,393, Edges: 2,113) ✓
  rolling10: Computing metrics... (Nodes: 13,186, Edges: 21,200) ✓
  cumulative: Computing metrics... (Nodes: 24,858, Edges: 41,190)

Processing years: 100%|██████████| 69/69 [1:45:34<00:00, 91.81s/it] 

 ✓

Year 2025
  yearly: File not found, skipping
  rolling10: File not found, skipping
  cumulative: File not found, skipping

Year 2026
  yearly: File not found, skipping
  rolling10: File not found, skipping
  cumulative: File not found, skipping

Saving results to /Users/jamesemcnally/Documents/GitHub/spring-2026-hitmakers/df_artists_with_network_metrics.csv...
✓ Complete!

SUMMARY

Metrics calculated for 69 unique years
Total columns in output: 85

Output saved to: /Users/jamesemcnally/Documents/GitHub/spring-2026-hitmakers/df_artists_with_network_metrics.csv





In [28]:
import pandas as pd

# ============================================================================
# LOAD DATA
# ============================================================================

print("Loading dataframes...")
df_artists_with_metrics = pd.read_csv('/Users/jamesemcnally/Documents/GitHub/spring-2026-hitmakers/df_artists_with_network_metrics.csv')
df_artists_top_10 = pd.read_csv('/Users/jamesemcnally/Documents/GitHub/spring-2026-hitmakers/df_artists_top_10_songs_only.csv')

print(f"df_artists_with_metrics: {df_artists_with_metrics.shape}")
print(f"df_artists_top_10_songs_only: {df_artists_top_10.shape}")

# ============================================================================
# IDENTIFY NETWORK METRIC COLUMNS
# ============================================================================

# Get all the network metric column names
network_columns = [col for col in df_artists_with_metrics.columns 
                   if '_centrality_' in col]

print(f"\nFound {len(network_columns)} network metric columns to transfer")

# ============================================================================
# MERGE DATAFRAMES
# ============================================================================

print("\nMerging dataframes on 'performer_normalized'...")

# Select only the columns we need from df_artists_with_metrics
columns_to_merge = ['performer_normalized'] + network_columns
df_metrics_subset = df_artists_with_metrics[columns_to_merge]

# Merge with df_artists_top_10
df_result = df_artists_top_10.merge(
    df_metrics_subset,
    on='performer_normalized',
    how='left'
)

print(f"Result shape: {df_result.shape}")

# ============================================================================
# VERIFY MERGE
# ============================================================================

# Check how many artists got matched
matched = df_result[network_columns[0]].notna().sum()
total = len(df_result)
print(f"\nMetrics added for {matched}/{total} artists ({matched/total*100:.1f}%)")

# ============================================================================
# SAVE RESULTS
# ============================================================================

output_path = '/Users/jamesemcnally/Documents/GitHub/spring-2026-hitmakers/df_artists_top_10_songs_only_with_network_metrics.csv'
print(f"\nSaving to {output_path}...")
df_result.to_csv(output_path, index=False)

print("✓ Complete!")

# ============================================================================
# SUMMARY
# ============================================================================

print("\n" + "="*80)
print("SUMMARY")
print("="*80)
print(f"Original df_artists_top_10_songs_only columns: {len(df_artists_top_10.columns)}")
print(f"Network metric columns added: {len(network_columns)}")
print(f"New total columns: {len(df_result.columns)}")
print(f"\nOutput file: df_artists_top_10_songs_only_with_network_metrics.csv")


Loading dataframes...
df_artists_with_metrics: (14226, 85)
df_artists_top_10_songs_only: (2420, 55)

Found 30 network metric columns to transfer

Merging dataframes on 'performer_normalized'...
Result shape: (2420, 85)

Metrics added for 407/2420 artists (16.8%)

Saving to /Users/jamesemcnally/Documents/GitHub/spring-2026-hitmakers/df_artists_top_10_songs_only_with_network_metrics.csv...
✓ Complete!

SUMMARY
Original df_artists_top_10_songs_only columns: 55
Network metric columns added: 30
New total columns: 85

Output file: df_artists_top_10_songs_only_with_network_metrics.csv
