# 0. Load library

In [1]:
import pandas as pd
import numpy as np

# 1. Target

The target corporation

In [2]:
# Name of the lead (target) organization; later cells use this value both as the
# sponsor name and as the type label of the target's row in the type table
Target_corporation = "Lead B"

# 2. Analyze Cleaned Original Partner List

Load the target corporation's partner list (.csv file)

In [None]:
# Read the partner list and trim surrounding whitespace from every string cell in one pass.
# NOTE(review): the path below is empty -- fill in the partner-list CSV before running.
# No encoding is passed, so pandas reads the file as UTF-8; if that fails with an
# "invalid continuation byte" error, pass encoding='latin1' to read_csv.
df_partner_list = pd.read_csv('').apply(
  lambda column: column.map(
    lambda cell: cell.strip() if isinstance(cell, str) else cell
  )
)

# Non-string cells (numbers, NaN) are passed through unchanged
print("Stripped all leading/trailing spaces from string attributes.")

Stripped all leading/trailing spaces from string attributes.


In [None]:
# Preview the first rows to confirm the partner list loaded and was stripped as expected
df_partner_list.head()

In [5]:
# Expand the abbreviated organisation types into their full names
type_rename_mapping = dict(
  gov="government",
  univ="university",
  npo="nonprofit organization",
)

# Values that are not keys of the mapping are left untouched
df_partner_list["Type"] = df_partner_list["Type"].replace(to_replace=type_rename_mapping)

In [None]:
# Preview the first rows again to verify the "Type" values after the rename
df_partner_list.head()

Check unique partners

In [7]:
# Rows that share a sponsor name count as one partner, whatever their other columns say,
# e.g. (3M business MEDA MN 2022) and (3M business UMN MN 2024) are a single partner
Partner_list = df_partner_list.loc[:, "Sponsors"].unique()

# Report how many distinct partners were found
print("Unique number of partners are:", len(Partner_list))
# To inspect every name, uncomment:
# print(Partner_list)

Unique number of partners are: 84


Output type distribution of the dataset

In [8]:
# A plain value_counts on the raw table would over-count:
#   (3M business MEDA MN 2022) and (3M business UMN MN 2024) are ONE business institution,
#   while (Medtronic medical MEDA MN 2022) and (Medtronic business MEDA MN 2023) must be
#   counted once per type, because one institution carries two types.
# (Naive version, for comparison: df_partner_list['Type'].value_counts())

# Keep each (Sponsor, Type) pair exactly once
unique_type_entries = df_partner_list[['Sponsors', 'Type']].drop_duplicates()

# Tally the types over those unique pairs and lay the result out as a two-column table
type_counts = unique_type_entries['Type'].value_counts()
type_distribution = pd.DataFrame({
  'Type': type_counts.index,
  'Count': type_counts.values,
})

# Show the table
print(type_distribution)

                     Type  Count
0  nonprofit organization     33
1               financial     19
2              government     15
3              university      9
4                business      8


# 3. Analyze linked result

Load connection result dataset (.csv file)

In [9]:
# Read the detected connections and trim surrounding whitespace from every string cell.
# No encoding is passed, so the file is read as UTF-8; if decoding fails with an
# "invalid continuation byte" error, pass encoding='latin1' to read_csv.
df_result = pd.read_csv('masked_final_results_expand.csv').apply(
  lambda column: column.map(
    lambda cell: cell.strip() if isinstance(cell, str) else cell
  )
)

# Non-string cells (numbers, NaN) are passed through unchanged
print("Stripped all leading/trailing spaces from string attributes.")

Stripped all leading/trailing spaces from string attributes.


In [10]:
# Preview the first rows of the connection results (Edge, Google_proof, Hyperlink_proof)
df_result.head()

Unnamed: 0,Edge,Google_proof,Hyperlink_proof
0,"('Ascend New York', 'BOC Capital Corp')",SearchResult(url=https://www.ascendnewyorkcity...,https://boccapital.org/
1,"('BXL Business Incubator', 'Ascend New York')","SearchResult(url=https://www.bxl.nyc/, title=B...",
2,"('Bronx Community College', 'Ascend New York')",SearchResult(url=https://www.ascendnewyorkcity...,https://www.bcc.cuny.edu
3,"('Ascend New York', 'Bronx Cooperative Develop...",SearchResult(url=https://www.ascendnewyorkcity...,
4,"('CDFI Fund', 'Ascend New York')",SearchResult(url=https://www.ascendnewyorkcity...,


Google_text_mentioned connection

In [11]:
# Rows with any value in "Google_proof" are connections backed by Google search text
num_google_mentions = df_result['Google_proof'].count()

print(f"Number of Google_text_mentioned connections: {num_google_mentions}")

Number of Google_text_mentioned connections: 484


Hyperlink_mentioned connection

In [12]:
# Rows with any value in "Hyperlink_proof" are connections backed by a hyperlink
num_hyperlink_mentions = df_result['Hyperlink_proof'].count()

print(f"Number of Hyperlink mentioned connections: {num_hyperlink_mentions}")

Number of Hyperlink mentioned connections: 139


In [13]:
# Total number of detected connections = number of rows in the result table
# (a row may carry Google proof, hyperlink proof, both, or neither)
print(f"Total detected connections: {df_result.shape[0]}")

Total detected connections: 635


# 4. Create Graph

The aim is to recreate the graphs presented in the paper: \
https://www.researchgate.net/publication/374243743_UNIVERSITY-LED_ENTREPRENEURSHIP_ECOSYSTEM_BUILDING_IN_UNDERSERVED_COMMUNITIES_FROM_A_NETWORK_PERSPECTIVE

Tool used:
* NetworkX: https://pypi.org/project/networkx/

## Install necessary libraries

In [14]:
!pip install matplotlib networkx



In [15]:
import matplotlib.pyplot as plt
import networkx as nx
import ast  # For safely evaluating stringified tuples
import matplotlib.patches as mpatches # Add legend patches

## Basic Graph, with label on

In [16]:
# Start from an empty undirected graph
Connection_Graph = nx.Graph()

# Each "Edge" cell holds a (source, target) pair, usually stored as its string repr
for raw_edge in df_result['Edge']:
  # Turn "('A', 'B')" back into a real tuple; literal_eval only parses literals
  endpoints = ast.literal_eval(raw_edge) if isinstance(raw_edge, str) else raw_edge
  source, target = endpoints
  Connection_Graph.add_edge(source, target)

# Summary of what was built
print(f"Graph created with {Connection_Graph.number_of_nodes()} nodes and {Connection_Graph.number_of_edges()} edges.")

Graph created with 85 nodes and 635 edges.


In [17]:
# Compute the spring layout once, with a fixed seed so the node positions are
# reproducible, and reuse it in the drawing cells that pass pos=pos
pos = nx.spring_layout(Connection_Graph, seed=42)

In [18]:
# Very large, high-resolution canvas so individual labels stay legible when zoomed in
plt.figure(figsize=(64, 64), dpi=300)

# Draw every node with its name, reusing the precomputed layout
nx.draw_networkx(
  Connection_Graph,
  pos=pos,
  with_labels=True,
  # Node appearance
  node_size=800,
  node_color='skyblue',
  # Label appearance
  font_size=20,
  font_color='black',
  # Edge appearance
  edge_color='gray',
  width=2,
)
plt.title("SNA Connection Graph", fontsize=100)
plt.show()

Output hidden; open in https://colab.research.google.com to view.

## Graph with node size scaled dynamically by degree, labels on

In [None]:
# Works on the connection graph that is currently in Connection_Graph

# Materialise the degree view as a list of (node, degree) pairs
degree_list = [(name, count) for name, count in Connection_Graph.degree()]

# Highest degree first
sorted_degrees = sorted(degree_list, key=lambda pair: pair[1], reverse=True)

# Show the ten best-connected nodes
for node, degree in sorted_degrees[:10]:
  print(f"{node}: {degree}")

In [None]:
# Node size grows with degree: 50 size units per connection
degree_dict = dict(Connection_Graph.degree())
node_sizes = [50 * degree_dict[name] for name in Connection_Graph.nodes()]

# Only the ten best-connected nodes (from sorted_degrees) get a text label
top_10_nodes = [name for name, _ in sorted_degrees[:10]]
labels = dict(zip(top_10_nodes, top_10_nodes))

# Highlight those same ten nodes in orange, everything else stays sky blue
node_colors = []
for name in Connection_Graph.nodes():
  node_colors.append("orange" if name in top_10_nodes else "skyblue")

# Very large, high-resolution canvas
plt.figure(figsize=(64, 64), dpi=300)

# Draw nodes and edges without labels; labels are added separately below
nx.draw_networkx(
  Connection_Graph,
  pos=pos,
  with_labels=False,
  # Node appearance
  node_size=node_sizes,
  node_color=node_colors,
  # Label size (unused here because labels are off)
  font_size=20,
  # Edge appearance
  edge_color='gray',
  width=2,
)
# Label only the top-10 nodes
nx.draw_networkx_labels(Connection_Graph, pos, labels=labels, font_size=20, font_color='black')
plt.title("SNA Connection Graph", fontsize=100)
plt.show()

Output hidden; open in https://colab.research.google.com to view.

## Graph with variation on edge type and label off

In [None]:
# Start from an empty undirected graph
Connection_Graph = nx.Graph()

# (has Google proof, has hyperlink proof) -> connection type.
# Per the original note, every edge has at least one proof except the ones that
# link directly to the target corporation, which end up as "none".
connection_type_by_evidence = {
    (True, True): "both",
    (True, False): "google",
    (False, True): "hyperlink",
    (False, False): "none",
}

for _, record in df_result.iterrows():
    # The edge is usually stored as the string repr of a tuple; parse it back
    endpoints = record['Edge']
    if isinstance(endpoints, str):
        endpoints = ast.literal_eval(endpoints)
    source, target = endpoints

    # A proof column counts as present when it is not NaN
    evidence = (
        bool(pd.notna(record['Google_proof'])),
        bool(pd.notna(record['Hyperlink_proof'])),
    )

    # Store the classification on the edge itself
    Connection_Graph.add_edge(source, target, connection_type=connection_type_by_evidence[evidence])

# Summary of what was built
print(f"Graph created with {Connection_Graph.number_of_nodes()} nodes and {Connection_Graph.number_of_edges()} edges.")

Graph created with 327 nodes and 3195 edges.


In [None]:
# Sanity check: fetch the first edge together with its attribute dict to confirm
# that the "connection_type" attribute was stored on the edges
first_edge = list(Connection_Graph.edges(data=True))[0]
print(first_edge)

('21st Century Bank', '3M', {'connection_type': 'google'})


In [None]:
# Single source of truth for edge styling: connection_type -> (colour, legend label).
# Colours and legend are both derived from this table so they cannot drift apart.
edge_style_by_type = {
  "both": ("purple", "Both Connection detected"),
  "google": ("red", "Google Search Connection"),
  "hyperlink": ("blue", "Hyperlink Connection"),
  "none": ("green", "Connection towards target corporate"),
}

# One colour per edge, in the graph's edge order. An unknown type is reported
# explicitly: silently skipping it would leave the colour list shorter than the
# edge list and shift the colours onto the wrong edges.
edge_colors = []
for u, v in Connection_Graph.edges():
  connection_type = Connection_Graph[u][v].get("connection_type")
  if connection_type not in edge_style_by_type:
    raise ValueError(
      f"Edge ({u!r}, {v!r}) has unexpected connection_type {connection_type!r}"
    )
  edge_colors.append(edge_style_by_type[connection_type][0])

# Legend entries, one per connection type
legend_patches = [
  mpatches.Patch(color=color, label=label)
  for color, label in edge_style_by_type.values()
]

# Visualize large size image with high resolution
plt.figure(figsize=(64, 64), dpi=300)

# Adjust attributes and draw graph
nx.draw_networkx(
  Connection_Graph,
  # Reuse the precomputed layout for reproducibility
  pos=pos,
  # Adjust node size
  node_size=800,
  # Set node color
  node_color='skyblue',
  # Set edge color (one entry per edge)
  edge_color=edge_colors,
  # Hide node labels
  with_labels=False,
  # Set edge width
  width=2
)
plt.legend(handles=legend_patches, loc='upper left', fontsize=50)
plt.title("SNA Connection Graph", fontsize=100)
plt.show()

## Graph with variation on node type and label off

### [Run this one] Display single corporation type (dynamic size)

In [19]:
# Preview the table of unique (Sponsors, Type) pairs used to look up node types
unique_type_entries.head()

Unnamed: 0,Sponsors,Type
0,Ascend New York,nonprofit organization
1,Association of Neighborhood and Housing Develo...,nonprofit organization
2,BOC Capital Corp,nonprofit organization
3,BOC Development Corporation,nonprofit organization
4,BOC Women's Business Center,nonprofit organization


In [20]:
# Display the target corporation name that the next cell adds to the type table
Target_corporation

'Lead B'

In [21]:
# Add the target corporation to the type table, using its own name as its type
# so it gets a dedicated colour/legend entry later on.
new_row = pd.DataFrame([{"Sponsors": Target_corporation, "Type": Target_corporation}])

# Re-running this cell must not append the row a second time, so only add it
# when the exact (Sponsors, Type) pair is not in the table yet.
target_already_present = (
  (unique_type_entries["Sponsors"] == Target_corporation)
  & (unique_type_entries["Type"] == Target_corporation)
).any()
if not target_already_present:
  unique_type_entries = pd.concat([unique_type_entries, new_row], ignore_index=True)

In [22]:
# List every distinct type present in the table; the colour mapping used for
# drawing should have an entry for each of these values
unique_type_entries["Type"].unique()

array(['nonprofit organization', 'financial', 'university', 'government',
       'business', 'Lead B'], dtype=object)

In [23]:
# Build the sponsor -> type lookup once instead of filtering the DataFrame for
# every edge. A sponsor with several types keeps only the first one listed.
first_type_by_sponsor = {}
for sponsor, sponsor_type in zip(unique_type_entries["Sponsors"], unique_type_entries["Type"]):
    first_type_by_sponsor.setdefault(sponsor, sponsor_type)

# Initialize an undirected graph
Connection_Graph = nx.Graph()

# Add edges from the result dataframe
for _, row in df_result.iterrows():
    # Convert stringified tuples to python tuples
    edge = row['Edge']
    if isinstance(edge, str):
        edge = ast.literal_eval(edge)
    source, target = edge

    # Every endpoint needs a type; name the offending sponsor(s) instead of
    # failing with an anonymous "list index out of range"
    missing = [name for name in (source, target) if name not in first_type_by_sponsor]
    if missing:
        raise KeyError(f"No type found in unique_type_entries for: {missing}")
    source_type = first_type_by_sponsor[source]
    target_type = first_type_by_sponsor[target]

    # Add nodes with one type only
    Connection_Graph.add_node(source, node_type=source_type)
    Connection_Graph.add_node(target, node_type=target_type)

    # Add the (attribute-free) edge between the two typed nodes
    Connection_Graph.add_edge(source, target)

# Optionally print some info
print(f"Graph created with {Connection_Graph.number_of_nodes()} nodes and {Connection_Graph.number_of_edges()} edges.")

Graph created with 85 nodes and 635 edges.


In [24]:
# Materialise the degree view as a list of (node, degree) pairs
degree_list = [(name, count) for name, count in Connection_Graph.degree()]

# Highest degree first
sorted_degrees = sorted(degree_list, key=lambda pair: pair[1], reverse=True)

# Show the ten best-connected nodes
for node, degree in sorted_degrees[:10]:
  print(f"{node}: {degree}")

# Node size grows with degree: 50 size units per connection
degree_dict = dict(Connection_Graph.degree())
node_sizes = [50 * degree_dict[name] for name in Connection_Graph.nodes()]

# Only those ten nodes get a text label (label text = node name)
top_10_nodes = [name for name, _ in sorted_degrees[:10]]
labels = dict(zip(top_10_nodes, top_10_nodes))

Lead B: 84
Data: 48
Empire State Development: 45
U.S. Small Business Administration: 44
Hofstra University: 38
NYS Assembly: 38
JPMorgan Chase: 33
Brooklyn Public Library: 33
CITY University of New York: 32
NYC Economic Development Corporation: 31


In [None]:
# Assign a colour to each node type.
# The target corporation's type is its own name, so key it on Target_corporation
# rather than on a hard-coded string that breaks when the target changes.
type_to_color = {
  "financial": "blue",
  "business": "red",
  "nonprofit organization": "green",
  "university": "purple",
  "government": "orange",
  Target_corporation: "white",
}

# Types missing from the mapping get a neutral colour instead of None, which
# matplotlib cannot draw
unmapped_color = "lightgray"

node_colors = []
for node in Connection_Graph.nodes():
  node_type = Connection_Graph.nodes[node].get("node_type")
  node_colors.append(type_to_color.get(node_type, unmapped_color))

# Create legend (plus an entry for unmapped types, only when any node uses it)
legend_patches = [mpatches.Patch(color=color, label=type_) for type_, color in type_to_color.items()]
if unmapped_color in node_colors:
  legend_patches.append(mpatches.Patch(color=unmapped_color, label="other / unmapped type"))

# Visualize large size image with high resolution
plt.figure(figsize=(64, 64), dpi=300)

# Adjust attributes and draw graph
nx.draw_networkx(
  Connection_Graph,
  # Reuse the precomputed layout for reproducibility
  pos=pos,
  # Node size scaled by degree (computed in the previous cell)
  node_size=node_sizes,
  # Set node color
  node_color=node_colors,
  # Set edge color
  edge_color="gray",
  # Hide default labels; the top-10 labels are drawn separately below
  with_labels=False,
  # Set edge width
  width=2
)
# Add labels manually
nx.draw_networkx_labels(Connection_Graph, pos, labels=labels, font_size=20, font_color='black')
plt.legend(handles=legend_patches, loc='upper left', fontsize=50)
plt.title("SNA Connection Graph", fontsize=100)
plt.show()

### Display single corporation type

In [None]:
# Preview the table of unique (Sponsors, Type) pairs used to look up node types
unique_type_entries.head()

Unnamed: 0,Sponsors,Type
0,AAA,private
1,ADP,private
2,ANDALE Construction,private
3,APS Foundation,npo
4,AT&T,private


In [None]:
# Display the target corporation name that the next cell adds to the type table
Target_corporation

In [None]:
# Add the target corporation to the type table, using its own name as its type.
new_row = pd.DataFrame([{"Sponsors": Target_corporation, "Type": Target_corporation}])

# The row may already exist (this cell re-run, or an equivalent cell executed
# earlier in the notebook); only append when the exact pair is absent.
target_already_present = (
  (unique_type_entries["Sponsors"] == Target_corporation)
  & (unique_type_entries["Type"] == Target_corporation)
).any()
if not target_already_present:
  unique_type_entries = pd.concat([unique_type_entries, new_row], ignore_index=True)

In [None]:
# List every distinct type present in the table; the colour mapping used for
# drawing should have an entry for each of these values
unique_type_entries["Type"].unique()

In [None]:
# Build the sponsor -> type lookup once instead of filtering the DataFrame for
# every edge. A sponsor with several types keeps only the first one listed.
first_type_by_sponsor = {}
for sponsor, sponsor_type in zip(unique_type_entries["Sponsors"], unique_type_entries["Type"]):
    first_type_by_sponsor.setdefault(sponsor, sponsor_type)

# Initialize an undirected graph
Connection_Graph = nx.Graph()

# Add edges from the result dataframe
for _, row in df_result.iterrows():
    # Convert stringified tuples to python tuples
    edge = row['Edge']
    if isinstance(edge, str):
        edge = ast.literal_eval(edge)
    source, target = edge

    # Every endpoint needs a type; name the offending sponsor(s) instead of
    # failing with an anonymous "list index out of range"
    missing = [name for name in (source, target) if name not in first_type_by_sponsor]
    if missing:
        raise KeyError(f"No type found in unique_type_entries for: {missing}")
    source_type = first_type_by_sponsor[source]
    target_type = first_type_by_sponsor[target]

    # Add nodes with one type only
    Connection_Graph.add_node(source, node_type=source_type)
    Connection_Graph.add_node(target, node_type=target_type)

    # Add the (attribute-free) edge between the two typed nodes
    Connection_Graph.add_edge(source, target)

# Optionally print some info
print(f"Graph created with {Connection_Graph.number_of_nodes()} nodes and {Connection_Graph.number_of_edges()} edges.")

Graph created with 141 nodes and 1807 edges.


In [None]:
# Assign a colour to each node type.
# NOTE(review): these keys (abbreviated types, "AZ HCC") look like they belong to
# a specific dataset -- confirm they match unique_type_entries["Type"].unique()
# and the current Target_corporation before relying on the figure.
type_to_color = {
  "financial": "blue",
  "private": "red",
  "npo": "green",
  "med": "cyan",
  "univ": "purple",
  "gov": "orange",
  "AZ HCC": "black",
}

# Types missing from the mapping get a neutral colour instead of None, which
# matplotlib cannot draw
unmapped_color = "lightgray"

node_colors = []
for node in Connection_Graph.nodes():
  node_type = Connection_Graph.nodes[node].get("node_type")
  node_colors.append(type_to_color.get(node_type, unmapped_color))

# Create legend (plus an entry for unmapped types, only when any node uses it)
legend_patches = [mpatches.Patch(color=color, label=type_) for type_, color in type_to_color.items()]
if unmapped_color in node_colors:
  legend_patches.append(mpatches.Patch(color=unmapped_color, label="other / unmapped type"))

# Visualize large size image with high resolution
plt.figure(figsize=(64, 64), dpi=300)

# Adjust attributes and draw graph
nx.draw_networkx(
  Connection_Graph,
  # Reuse the precomputed layout for reproducibility
  pos=pos,
  # Adjust node size
  node_size=800,
  # Set node color
  node_color=node_colors,
  # Set edge color
  edge_color="gray",
  # Hide node labels
  with_labels=False,
  # Set edge width
  width=2
)
plt.legend(handles=legend_patches, loc='upper left', fontsize=50)
plt.title("SNA Connection Graph", fontsize=100)
plt.show()

Output hidden; open in https://colab.research.google.com to view.

### Display full corporation type

In [None]:
# Preview the table of unique (Sponsors, Type) pairs; in this section a sponsor
# with several types is expanded into one node per type
unique_type_entries.head()

Unnamed: 0,Sponsors,Type
0,A.T. Kearney,private
1,AK Consulting,private
2,AMD Pensando Systems,private
3,AT&T,private
4,African American Chamber of Commerce of New Je...,npo


In [None]:
# List every distinct type present in the table; the colour mapping used for
# drawing should have an entry for each of these values
unique_type_entries["Type"].unique()

In [None]:
# Group every type recorded for a sponsor once (in table order) instead of
# filtering the DataFrame twice for every edge
types_by_sponsor = {}
for sponsor, sponsor_type in zip(unique_type_entries["Sponsors"], unique_type_entries["Type"]):
    types_by_sponsor.setdefault(sponsor, []).append(sponsor_type)

# Set of all unique (node name, type) pairs
nodes = set()
# List of expanded edges between "<sponsor> (<type>)" node names
edges = []

for _, row in df_result.iterrows():
    # Convert stringified tuples to python tuples
    edge = row['Edge']
    if isinstance(edge, str):
        edge = ast.literal_eval(edge)
    source, target = edge

    # All types for source and target; a sponsor without any recorded type
    # yields an empty list, so its edges are skipped
    source_types = types_by_sponsor.get(source, [])
    target_types = types_by_sponsor.get(target, [])

    # One node per (sponsor, type) and one edge per (source_type, target_type) combination
    for s_type in source_types:
        for t_type in target_types:
            source_node = f"{source} ({s_type})"
            target_node = f"{target} ({t_type})"

            # Track the nodes and the edge
            nodes.add((source_node, s_type))
            nodes.add((target_node, t_type))
            edges.append((source_node, target_node))

# Create new graph
Connection_Graph = nx.Graph()

# Add nodes with node_type attribute, sorted by name so the node order (and
# anything derived from it) does not depend on set iteration order
for node_name, node_type in sorted(nodes, key=lambda item: item[0]):
    Connection_Graph.add_node(node_name, node_type=node_type)

# Add expanded edges
for source_node, target_node in edges:
    Connection_Graph.add_edge(source_node, target_node)

# Optionally print some info
print(f"Graph created with {Connection_Graph.number_of_nodes()} nodes and {Connection_Graph.number_of_edges()} edges.")

Graph created with 289 nodes and 2960 edges.


In [None]:
# Assign a colour to each node type.
# NOTE(review): confirm these keys match unique_type_entries["Type"].unique();
# there is no entry for the target corporation's own type here.
type_to_color = {
  "financial": "blue",
  "business": "red",
  "npo": "green",
  "medical": "cyan",
  "univ": "purple",
  "gov": "orange",
  "individual": "yellow",
}

# Types missing from the mapping get a neutral colour instead of None, which
# matplotlib cannot draw
unmapped_color = "lightgray"

node_colors = []
for node in Connection_Graph.nodes():
  node_type = Connection_Graph.nodes[node].get("node_type")
  node_colors.append(type_to_color.get(node_type, unmapped_color))

# Create legend (plus an entry for unmapped types, only when any node uses it)
legend_patches = [mpatches.Patch(color=color, label=type_) for type_, color in type_to_color.items()]
if unmapped_color in node_colors:
  legend_patches.append(mpatches.Patch(color=unmapped_color, label="other / unmapped type"))

# Visualize large size image with high resolution
plt.figure(figsize=(64, 64), dpi=300)

# Adjust attributes and draw graph
nx.draw_networkx(
  Connection_Graph,
  # This graph uses "<sponsor> (<type>)" node names, so it needs its own layout;
  # seed it so the figure is reproducible from run to run
  pos=nx.spring_layout(Connection_Graph, seed=42),
  # Adjust node size
  node_size=800,
  # Set node color
  node_color=node_colors,
  # Set edge color
  edge_color="gray",
  # Hide node labels
  with_labels=False,
  # Set edge width
  width=2
)
plt.legend(handles=legend_patches, loc='upper left', fontsize=50)
plt.title("SNA Connection Graph", fontsize=100)
plt.show()

## Simplified graph (only corporation types as nodes)

In [26]:
# Show the last rows of the (Sponsors, Type) table; the target corporation is
# appended at the end, so it should appear here before the graph is built
unique_type_entries.tail()

Unnamed: 0,Sponsors,Type
80,West Brighton Community LDC,nonprofit organization
81,Whistler Capital Partners,business
82,Women's Enterprise Development Center,nonprofit organization
83,YMCA of Brooklyn,nonprofit organization
84,Lead B,Lead B


In [27]:
# List every distinct type present in the table; each one becomes a node of the
# simplified type-level graph built below
unique_type_entries["Type"].unique()

array(['nonprofit organization', 'financial', 'university', 'government',
       'business', 'Lead B'], dtype=object)

In [28]:
from collections import Counter

# Number of connections observed between each unordered pair of types
type_edge_counter = Counter()

# Empty undirected graph; it is populated from type_edge_counter in a later cell
Connection_Graph = nx.Graph()

# Build the sponsor -> type lookup once instead of filtering the DataFrame for
# every edge. A sponsor with several types keeps only the first one listed.
first_type_by_sponsor = {}
for sponsor, sponsor_type in zip(unique_type_entries["Sponsors"], unique_type_entries["Type"]):
    first_type_by_sponsor.setdefault(sponsor, sponsor_type)

# Edges that cannot be counted because an endpoint has no known type
skipped_edges = 0

for _, row in df_result.iterrows():
    # Convert stringified tuples to python tuples
    edge = row['Edge']
    if isinstance(edge, str):
        edge = ast.literal_eval(edge)
    source, target = edge

    # .get returns None for an unknown sponsor, so the guard below can skip the
    # edge (indexing an empty list would raise before the guard was reached)
    source_type = first_type_by_sponsor.get(source)
    target_type = first_type_by_sponsor.get(target)

    if source_type and target_type:
        # Undirected graph: sort the pair so (a, b) and (b, a) share one counter key
        type_pair = tuple(sorted([source_type, target_type]))
        type_edge_counter[type_pair] += 1
    else:
        skipped_edges += 1

# Make any skipped data visible rather than dropping it silently
if skipped_edges:
    print(f"Skipped {skipped_edges} edge(s) whose endpoints have no known type.")

In [29]:
# Peek at the first ((type_a, type_b), count) entry to confirm the counter layout
list(type_edge_counter.items())[0]

(('nonprofit organization', 'nonprofit organization'), 39)

In [30]:
# One edge per pair of types, weighted by how many connections link the two types;
# a pair made of the same type twice becomes a self-loop
Connection_Graph.add_weighted_edges_from(
  (type_a, type_b, pair_count)
  for (type_a, type_b), pair_count in type_edge_counter.items()
)

# Summary of what was built
print(f"Graph created with {Connection_Graph.number_of_nodes()} nodes and {Connection_Graph.number_of_edges()} edges.")

Graph created with 6 nodes and 20 edges.


In [33]:
# Edge weights double as the text drawn on each edge
edge_weights = nx.get_edge_attributes(Connection_Graph, "weight")

# Node size: the summed weight of the edges touching the node, raised to the
# power 1.4 (a super-linear scaling, so heavily connected types stand out)
node_sizes = []
for node in Connection_Graph.nodes():
  weighted_degree = sum(attrs["weight"] for _, _, attrs in Connection_Graph.edges(node, data=True))
  node_sizes.append(weighted_degree ** 1.4)

# Edge width: square root of the weight, which damps very heavy edges
edge_widths = [np.sqrt(attrs["weight"]) for *_, attrs in Connection_Graph.edges(data=True)]

# Place the types evenly on a circle, in alphabetical order.
# Note: this rebinds the notebook-level `pos` to the circular layout.
sorted_nodes = sorted(Connection_Graph.nodes())
pos = nx.circular_layout(sorted_nodes)

# Very large, high-resolution canvas
plt.figure(figsize=(64, 64), dpi=300)

# Draw the type-level graph with labels on
nx.draw_networkx(
  Connection_Graph,
  pos=pos,
  with_labels=True,
  # Node appearance
  node_size=node_sizes,
  node_color='skyblue',
  # Label appearance
  font_size=60,
  font_color='black',
  # Edge appearance
  edge_color='gray',
  width=edge_widths,
)
# Print each edge's weight slightly off the midpoint of the edge
nx.draw_networkx_edge_labels(Connection_Graph, pos=pos, edge_labels=edge_weights, font_size=35, label_pos=0.45)
plt.title("SNA Connection Graph", fontsize=100)
plt.show()

Output hidden; open in https://colab.research.google.com to view.

In [32]:
# Inspect the circular layout: node -> (x, y) position, listed in the
# alphabetical order used to place the nodes around the circle
pos

{'Lead B': array([1.00000000e+00, 2.45045699e-08]),
 'business': array([0.49999998, 0.86602546]),
 'financial': array([-0.50000004,  0.8660254 ]),
 'government': array([-9.99999970e-01, -6.29182054e-08]),
 'nonprofit organization': array([-0.49999989, -0.86602541]),
 'university': array([ 0.49999992, -0.86602541])}