In [8]:
import pandas as pd
import glob
import os
import string
import datetime
import numpy as np
from joblib import Parallel, delayed
import seaborn as sns
import matplotlib.pyplot as plt
import networkx as nx
import warnings
from utils import *

# Notebook-wide display and warning configuration.
pd.set_option('display.max_columns', None)
plt.rcParams.update({'font.size': 14})
warnings.filterwarnings("ignore", category=DeprecationWarning)
# SettingWithCopyWarning is exposed in pandas.errors from pandas 1.5 onwards;
# older releases only had it in pandas.core.common. Look it up defensively so
# this cell does not raise AttributeError on whichever pandas is installed
# (and simply skips the filter if the class does not exist at all).
_setting_with_copy_warning = (
    getattr(pd.errors, 'SettingWithCopyWarning', None)
    or getattr(pd.core.common, 'SettingWithCopyWarning', None)
)
if _setting_with_copy_warning is not None:
    warnings.filterwarnings("ignore", category=_setting_with_copy_warning)

# Loading data

In [9]:
# Directory holding the raw CSV tables; change it here to point the whole
# loading cell at another location.
DATA_DIR = '../data'

# One DataFrame per raw CSV table.
df_org          = pd.read_csv(f'{DATA_DIR}/organizations.csv')
df_jobs         = pd.read_csv(f'{DATA_DIR}/jobs.csv')
df_people       = pd.read_csv(f'{DATA_DIR}/people.csv')
df_fund_rounds  = pd.read_csv(f'{DATA_DIR}/funding_rounds.csv')
df_ipos         = pd.read_csv(f'{DATA_DIR}/ipos.csv')
df_acquisitions = pd.read_csv(f'{DATA_DIR}/acquisitions.csv')

In [18]:
# Cleaned jobs table; the 'group' column is parsed by str_to_list
# (str_to_list is not defined in this notebook — presumably it comes from utils).
df_jobs_cleaned = pd.read_csv('../data/jobs_cleaned.csv',
                              converters={'group': str_to_list})

# Build the organisation table as one non-mutating chain: every step returns a
# new frame, so nothing is assigned into a slice of df_org (the pattern that
# SettingWithCopyWarning flags) and re-running the cell always starts from the
# untouched df_org.
df_org_foundation = (
    df_org[['uuid', 'founded_on', 'country_code', 'city', 'total_funding_usd']]
    # Convert 'founded_on' to datetime; unparseable / out-of-bounds values become NaT
    .assign(founded_on=lambda frame: pd.to_datetime(frame['founded_on'], errors='coerce'))
    # Drop rows lacking a usable foundation date, country or city
    .dropna(subset=['founded_on', 'country_code', 'city'])
    .rename(columns={'uuid': 'org_uuid'})
)

In [59]:
# Preview the cleaned jobs table (one row per job: start date, organisation,
# role groups and person).
df_jobs_cleaned

Unnamed: 0,started_on,org_uuid,group,person_uuid
0,2005-10-01,e1393508-30ea-8a36-3f96-dd3226033abd,"[ExCo, Founder]",ed13cd36-fe2b-3707-197b-0c2d56e37a71
1,2000-11-01,bf4d7b0e-b34d-2fd8-d292-6049c4f7efc7,[Other],9f99a98a-aa97-b30b-0d36-db67c1d277e0
2,2006-03-01,bf4d7b0e-b34d-2fd8-d292-6049c4f7efc7,[Leadership],6e1bca72-a865-b518-b305-31214ce2d1b0
3,2005-07-01,5f2b40b8-d1b3-d323-d81a-b7a8e89553d0,[ExCo],c92a1f00-8c19-bf2e-0f28-dbbd383dc968
4,2004-01-01,df662812-7f97-0b43-9d3e-12f64f504fbb,"[ExCo, Leadership, Founder]",a01b8d46-d311-3333-7c34-aa3ae9c03f22
...,...,...,...,...
2012047,2022-01-03,708a2573-690b-456f-9de4-53344e493104,[Ownership],3e8a1b46-9884-7f7c-6c36-19ec417291ee
2012048,2022-07-01,fbdacb6c-cd3e-49cc-82da-30eb9f3dffb0,"[Board, Other]",3e8a1b46-9884-7f7c-6c36-19ec417291ee
2012049,2024-01-01,cb9a8725-767c-4462-bbae-2f23f6887bb8,[Leadership],bc0f88f0-fcc0-ebac-0470-8939d76484bc
2012050,2024-01-01,cb9a8725-767c-4462-bbae-2f23f6887bb8,[Leadership],bc0f88f0-fcc0-ebac-0470-8939d76484bc


In [96]:
def extractBipartiteEdgelist(org,jobs,f1,f2,city=None,year='all'):
    """Build an organisation-to-organisation edge list from shared people.

    Two organisations are connected whenever the same person has a job row at
    both of them. Each edge is stamped with the later of the two job start
    years, i.e. the first year in which that person links the two
    organisations.

    Parameters
    ----------
    org : pandas.DataFrame
        Organisations; must contain the columns 'org_uuid' and 'city'.
    jobs : pandas.DataFrame
        Jobs; must contain 'person_uuid', 'org_uuid' and 'started_on'.
        'started_on' may hold datetimes or date strings.
    f1, f2 : str
        Column names handed to ``filter_up_to_year`` for ``org`` and ``jobs``
        respectively (only used when ``year != 'all'``).
    city : str, optional
        When given (and truthy), only organisations in this city are kept.
    year : int or 'all'
        When not 'all', both tables are first passed through
        ``filter_up_to_year`` (defined outside this cell; presumably it keeps
        rows dated up to ``year`` — confirm against utils).

    Returns
    -------
    pandas.DataFrame
        Columns ['org_uuid_1', 'org_uuid_2', 'city_1', 'city_2', 'year'] with
        ``org_uuid_1 < org_uuid_2``. There is one row per distinct
        (organisation pair, year), sorted by organisation pair and year so the
        output is identical from run to run. Job rows without a parseable
        start date are ignored because they cannot be placed in time.
    """
    # Imported here because the notebook never imports it explicitly; relying
    # on `from utils import *` to provide the name is fragile.
    from itertools import combinations

    if year != 'all':
        org = filter_up_to_year(org,f1,year)
        jobs = filter_up_to_year(jobs,f2,year)

    if city:
        org = org[org.city==city]

    # Keep only jobs at the organisations that survived the filters above.
    jobs = jobs[jobs.org_uuid.isin(org.org_uuid)]

    # Only the columns needed below are merged, which keeps the join small.
    merged = pd.merge(jobs[['person_uuid', 'org_uuid', 'started_on']],
                      org[['org_uuid', 'city']],
                      how='left', on='org_uuid')

    # Normalise the start date: this is a no-op for datetime columns and
    # parses string columns (as produced by read_csv) instead of failing on
    # `.year`. Rows with no usable date are dropped.
    started = pd.to_datetime(merged['started_on'], errors='coerce')
    dated = started.notna()
    merged = merged.loc[dated].assign(start_year=started[dated].dt.year.astype(int))

    # A person with fewer than two job rows cannot link two organisations.
    merged = merged[merged.person_uuid.notna() & merged.person_uuid.duplicated(keep=False)]

    # Collect every (organisation, city, start year) per person, keeping
    # repeated organisations (same-organisation pairs are skipped below).
    jobs_by_person = {}
    for person, org_id, org_city, start_year in zip(merged['person_uuid'].tolist(),
                                                    merged['org_uuid'].tolist(),
                                                    merged['city'].tolist(),
                                                    merged['start_year'].tolist()):
        jobs_by_person.setdefault(person, []).append((org_id, org_city, start_year))

    # A set removes duplicate edges created by different people (or by several
    # job rows of the same person).
    org_connections = set()
    for positions in jobs_by_person.values():
        for (org1, city1, year1), (org2, city2, year2) in combinations(positions, 2):
            if org1 == org2:
                continue
            # Store each undirected pair in one canonical orientation.
            if org2 < org1:
                org1, city1, org2, city2 = org2, city2, org1, city1
            org_connections.add((org1, org2, city1, city2, max(year1, year2)))

    # Set iteration order is arbitrary, so sort for a reproducible row order.
    ordered_edges = sorted(org_connections, key=lambda edge: (edge[0], edge[1], edge[4]))

    org_connections_df = pd.DataFrame(ordered_edges,
                                      columns=['org_uuid_1', 'org_uuid_2', 'city_1', 'city_2', 'year'])

    return org_connections_df


def createStartupNetwork(edgelist,source,target, city=None, year=None, edge_attr=None, create_using=None, edge_key=None):
    """Turn an (optionally filtered) edge list into a networkx graph.

    Parameters
    ----------
    edgelist : pandas.DataFrame
        Edge table; the filters below read its 'city_1', 'city_2' and 'year'
        columns.
    source, target : str
        Names of the columns holding the two endpoints of each edge.
    city : optional
        When truthy, keep only edges whose two endpoints are both in this city.
    year : optional
        When truthy, keep only edges whose 'year' is less than or equal to it.
    edge_attr, create_using, edge_key :
        Forwarded unchanged to ``nx.from_pandas_edgelist``.

    Returns
    -------
    The graph built by ``nx.from_pandas_edgelist`` from the remaining rows.
    """
    selected = edgelist

    if city:
        # Intra-city edges only: both endpoints must match.
        both_in_city = selected.city_1.eq(city) & selected.city_2.eq(city)
        selected = selected[both_in_city]

    if year:
        selected = selected[selected.year.le(year)]

    graph = nx.from_pandas_edgelist(
        selected,
        source=source,
        target=target,
        edge_attr=edge_attr,
        create_using=create_using,
        edge_key=edge_key,
    )
    return graph

    

In [107]:
# Organisation-to-organisation edge list built from the cleaned organisation
# and jobs tables, with the year filter set to 2024.
df = extractBipartiteEdgelist(
    org=df_org_foundation,
    jobs=df_jobs_cleaned,
    f1='founded_on',
    f2='started_on',
    year=2024,
)

In [125]:
def find_largest_component(G):
    """Return ``(number_of_components, largest_component)`` for graph ``G``.

    ``largest_component`` is the connected component (as yielded by
    ``nx.connected_components``) with the most nodes; on ties the first one
    encountered wins.
    """
    all_components = [*nx.connected_components(G)]
    biggest = max(all_components, key=len)
    return len(all_components), biggest

In [130]:
def networkMetrics(G):
    """Summarise ``G`` through its largest connected component.

    Returns a list with, in order:
    the number of connected components of the whole graph, followed by the
    node count, edge count, density, global efficiency, diameter and average
    shortest path length of the largest connected component only.
    """
    component_count, giant_nodes = find_largest_component(G)

    # All remaining metrics are restricted to the largest component.
    giant = G.subgraph(giant_nodes)

    return [
        component_count,
        giant.number_of_nodes(),
        giant.number_of_edges(),
        nx.density(giant),
        nx.global_efficiency(giant),
        nx.diameter(giant),
        nx.average_shortest_path_length(giant),
    ]


In [131]:
# Yearly evolution of the San Francisco network. Each row prints the year
# followed by the list returned by networkMetrics, so the header names every
# entry of that list (the previous header only mentioned three of them).
print('Year [Components, Nodes, Edges, Density, GlobalEfficiency, Diameter, AvgShortestPath]')
for year in range(2000,2020):
    G = createStartupNetwork(df,'org_uuid_1','org_uuid_2',city='San Francisco',year=year)
    print(year,networkMetrics(G))

Year,Nodes,Edges
2000 [44, 25, 28, 0.09333333333333334, 0.40316666666666756, 6, 3.0233333333333334]
2001 [49, 31, 34, 0.07311827956989247, 0.3640604198668709, 7, 3.382795698924731]
2002 [57, 34, 38, 0.0677361853832442, 0.3631928528987336, 8, 3.376114081996435]
2003 [66, 73, 88, 0.0334855403348554, 0.2750389851988083, 11, 4.5156012176560125]
2004 [79, 91, 114, 0.02783882783882784, 0.2761783807021887, 11, 4.3992673992674]
2005 [88, 191, 239, 0.013171672637090107, 0.2203190352404822, 14, 5.469440617249931]
2006 [108, 288, 384, 0.009291521486643438, 0.2215709162864395, 12, 5.229045683313976]
2007 [130, 405, 569, 0.006955139958440289, 0.2177800795953782, 12, 5.247451411807847]
2008 [142, 577, 900, 0.005415944540727903, 0.2174781797867157, 12, 5.174459609089158]
2009 [156, 761, 1315, 0.004547340756622173, 0.21575499422151354, 13, 5.167145030776679]
2010 [193, 1039, 2042, 0.003786804044944654, 0.22220056949722247, 13, 4.946586034815602]
2011 [233, 1435, 3078, 0.0029915589054276674, 0.22350186

(127, 108)