In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

Exploring our Data

In [2]:
startups_filename = os.path.join(os.getcwd(), "startups.csv")

df = pd.read_csv(startups_filename)

df.head(50)

Unnamed: 0.1,Unnamed: 0,Company,Valuation ($B),Date Joined,Country,City,Industry,Select Investors
0,0,Bytedance,$140,4/7/2017,China,Beijing,Artificial intelligence,"Sequoia Capital China, SIG Asia Investments, S..."
1,1,SpaceX,$100.3,12/1/2012,United States,Hawthorne,Other,"Founders Fund, Draper Fisher Jurvetson, Rothen..."
2,2,Stripe,$95,1/23/2014,United States,San Francisco,Fintech,"Khosla Ventures, LowercaseCapital, capitalG"
3,3,Klarna,$45.6,12/12/2011,Sweden,Stockholm,Fintech,"Institutional Venture Partners, Sequoia Capita..."
4,4,Canva,$40,1/8/2018,Australia,Surry Hills,Internet software & services,"Sequoia Capital China, Blackbird Ventures, Mat..."
5,5,Instacart,$39,12/30/2014,United States,San Francisco,"Supply chain, logistics, & delivery","Khosla Ventures, Kleiner Perkins Caufield & By..."
6,6,Databricks,$38,2/5/2019,United States,San Francisco,Data management & analytics,"Andreessen Horowitz, New Enterprise Associates..."
7,7,Revolut,$33,4/26/2018,United Kingdom,London,Fintech,"index Ventures, DST Global, Ribbit Capital"
8,8,Nubank,$30,3/1/2018,Brazil,Sao Paulo,Fintech,"Sequoia Capital, Redpoint e.ventures, Kaszek V..."
9,9,Epic Games,$28.7,10/26/2018,United States,Cary,Other,"Tencent Holdings, KKR, Smash Ventures"


In [3]:
df.describe()

df.describe(include='object')


Unnamed: 0,Company,Valuation ($B),Date Joined,Country,City,Industry,Select Investors
count,936,936,936,936,921,936,935
unique,935,192,589,47,239,17,920
top,Bolt,$1,7/13/2021,United States,San Francisco,Fintech,"Speedinvest, Valar Ventures, Uniqa Ventures"
freq,2,225,8,477,134,190,2


In [4]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 936 entries, 0 to 935
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        936 non-null    int64 
 1   Company           936 non-null    object
 2   Valuation ($B)    936 non-null    object
 3   Date Joined       936 non-null    object
 4   Country           936 non-null    object
 5   City              921 non-null    object
 6   Industry          936 non-null    object
 7   Select Investors  935 non-null    object
dtypes: int64(1), object(7)
memory usage: 58.6+ KB
None


In [5]:
# Check for missing values
print(df.isnull().sum())

Unnamed: 0           0
Company              0
Valuation ($B)       0
Date Joined          0
Country              0
City                15
Industry             0
Select Investors     1
dtype: int64


In [6]:
# Check how many values for each categorical column
for col in df.columns:
    print(col, ": ", len(df[col].unique()), "unique values")
    print(df[col].unique())
    print("\n")

Unnamed: 0 :  936 unique values
[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233
 234 235 236 237 23

In [7]:
df['Select Investors'].value_counts()
df['Company'].value_counts()
df['Industry'].value_counts()

Unnamed: 0_level_0,count
Industry,Unnamed: 1_level_1
Fintech,190
Internet software & services,167
E-commerce & direct-to-consumer,102
Artificial intelligence,69
Health,63
"Supply chain, logistics, & delivery",51
Other,51
Cybersecurity,41
Mobile & telecommunications,37
Data management & analytics,36


- get rid of "unnamed column"
- deal with 15 blanks in "city" col and 1 blank "investors"
- make valuatoin a number value


Feature Engineering

In [8]:
# drop irrelevant columns
df = df.drop('Unnamed: 0', axis=1)
print(df.columns)

Index(['Company', 'Valuation ($B)', 'Date Joined', 'Country', 'City',
       'Industry', 'Select Investors'],
      dtype='object')


In [None]:
# There are missing values in the 'city' column
df = df.drop('City')
print(df.columns)

In [9]:
df['Select Investors'] = df['Select Investors'].str.split(',')
expanded_data = df.explode('Select Investors')

# Trim any whitespace
expanded_data['Select Investors'] = expanded_data['Select Investors'].str.strip()

# There is now a row for each investor's investment into a firm
print(expanded_data.shape)
expanded_data.head()

(2647, 7)


Unnamed: 0,Company,Valuation ($B),Date Joined,Country,City,Industry,Select Investors
0,Bytedance,$140,4/7/2017,China,Beijing,Artificial intelligence,Sequoia Capital China
0,Bytedance,$140,4/7/2017,China,Beijing,Artificial intelligence,SIG Asia Investments
0,Bytedance,$140,4/7/2017,China,Beijing,Artificial intelligence,Sina Weibo
0,Bytedance,$140,4/7/2017,China,Beijing,Artificial intelligence,Softbank Group
1,SpaceX,$100.3,12/1/2012,United States,Hawthorne,Other,Founders Fund


In [12]:
investor_group = expanded_data.groupby('Select Investors')
investor_group.head()

Unnamed: 0,Company,Valuation ($B),Date Joined,Country,City,Industry,Select Investors
0,Bytedance,$140,4/7/2017,China,Beijing,Artificial intelligence,Sequoia Capital China
0,Bytedance,$140,4/7/2017,China,Beijing,Artificial intelligence,SIG Asia Investments
0,Bytedance,$140,4/7/2017,China,Beijing,Artificial intelligence,Sina Weibo
0,Bytedance,$140,4/7/2017,China,Beijing,Artificial intelligence,Softbank Group
1,SpaceX,$100.3,12/1/2012,United States,Hawthorne,Other,Founders Fund
...,...,...,...,...,...,...,...
934,ReliaQuest,$1,12/1/2021,United States,Tampa,Cybersecurity,FTV Capital
934,ReliaQuest,$1,12/1/2021,United States,Tampa,Cybersecurity,Ten Eleven Ventures
935,Pet Circle,$1,12/7/2021,Australia,Alexandria,E-commerce & direct-to-consumer,Prysm Capital
935,Pet Circle,$1,12/7/2021,Australia,Alexandria,E-commerce & direct-to-consumer,Baillie Gifford & Co.


In [14]:
print(investor_group.groups.keys())
investor_group.get_group('Sequoia Capital China')


dict_keys(['', '.406 Ventures', '/td>', '01 Advisors', '10T Fund', '14W', '14W. ForgeLight', '3G Capital Management', '3L', '3i Group', '3one4 Capital Partners', '468 Capital', '500 Global', '500 Startups', '58.com', '5Y Capital', '83North', '8VC', 'A&E Television Networks', 'A&NN', 'A91 Partners', 'ACE & Company', 'AME Cloud Ventures', 'ARCH Venture Partners', 'AU21', 'AXA Venture Partners', 'Accel', 'Accel India', 'Accel Partners', 'Accelm Scania Growth Capital', 'Access Industries', 'Accomplice', 'Acero Capital', 'Activant Capital', 'Activant Capital Group', 'Acton Capital Partners', 'Adams Street Partners', 'Addition', 'Addor Capital', 'Advance Venture Partners', 'Advancit Capital', 'Advantech Capital', 'Advent International', 'Affirma Capital', 'Afore Capital', 'Aglae Ventures', 'Ainge Advisory', 'Airbus Ventures', 'Ajinomoto', 'Aker', 'Alaska Permanent Fund', 'Aleph', 'Alexandria Venture Investments', 'Alibaba Entrepreneurs Fund', 'Alibaba Group', 'Alibaba Pictures Group', 'Alkeo

Unnamed: 0,Company,Valuation ($B),Date Joined,Country,City,Industry,Select Investors
0,Bytedance,$140,4/7/2017,China,Beijing,Artificial intelligence,Sequoia Capital China
4,Canva,$40,1/8/2018,Australia,Surry Hills,Internet software & services,Sequoia Capital China
14,J&T Express,$20,4/7/2021,Indonesia,Jakarta,"Supply chain, logistics, & delivery",Sequoia Capital China
18,SHEIN,$15,7/3/2018,China,Shenzhen,E-commerce & direct-to-consumer,Sequoia Capital China
28,Bitmain Technologies,$12,7/6/2018,China,Beijing,Hardware,Sequoia Capital China
46,HEYTEA,$9.28,7/1/2019,China,Shenzhen,Other,Sequoia Capital China
51,Chehaoduo,$9,3/12/2016,China,Beijing,E-commerce & direct-to-consumer,Sequoia Capital China
61,Xingsheng Selected,$8,7/22/2020,China,Changsha,E-commerce & direct-to-consumer,Sequoia Capital China
79,Ziroom,$6.6,1/17/2018,China,Beijing,E-commerce & direct-to-consumer,Sequoia Capital China
91,Yuanqi Senlin,$6,3/1/2020,China,Beijing,Consumer & retail,Sequoia Capital China


In [15]:
investor_group.size()

Unnamed: 0_level_0,0
Select Investors,Unnamed: 1_level_1
,2
.406 Ventures,1
/td>,1
01 Advisors,2
10T Fund,1
...,...
iTech Capital,1
iVision Ventures,1
index Ventures,1
next47,3


Now, the dataset has an entry for each investment made by an investor. It is grouped by investor.

In [20]:
print(expanded_data.isnull().sum())

Company              0
Valuation ($B)       0
Date Joined          0
Country              0
City                43
Industry             0
Select Investors     1
dtype: int64


In [23]:
# Make the Valuation column numerical
expanded_data['Valuation ($B)'] = expanded_data['Valuation ($B)'].replace({'\$': '', ',': ''}, regex=True).astype(float)
expanded_data['Valuation ($B)'] = pd.to_numeric(expanded_data['Valuation ($B)'], errors='coerce')
print(expanded_data['Valuation ($B)'])
expanded_data.info()

0      140.0
0      140.0
0      140.0
0      140.0
1      100.3
       ...  
934      1.0
934      1.0
935      1.0
935      1.0
935      1.0
Name: Valuation ($B), Length: 2647, dtype: float64
<class 'pandas.core.frame.DataFrame'>
Index: 2647 entries, 0 to 935
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Company           2647 non-null   object 
 1   Valuation ($B)    2647 non-null   float64
 2   Date Joined       2647 non-null   object 
 3   Country           2647 non-null   object 
 4   City              2604 non-null   object 
 5   Industry          2647 non-null   object 
 6   Select Investors  2646 non-null   object 
dtypes: float64(1), object(6)
memory usage: 230.0+ KB


In [26]:
# Create the pivot table for investor-startup matrix
investor_startup_matrix = expanded_data.pivot_table(index='Select Investors',
                                                    columns='Company',
                                                    aggfunc='size', fill_value=0)

# Display the matrix (each row is an investor, each column is a startup)
print(investor_startup_matrix.head())

Company           1  1047 Games  1KMXC  1Password  4Paradigm  56PINGTAI  \
Select Investors                                                          
                  0           0      0          0          0          0   
.406 Ventures     0           0      0          0          0          0   
/td>              0           0      0          0          0          0   
01 Advisors       0           0      0          0          0          0   
10T Fund          0           0      0          0          0          0   

Company           58 Daojia  6Sense  ABL Space Systems  AIWAYS  ...  ezCater  \
Select Investors                                                ...            
                          0       0                  0       0  ...        0   
.406 Ventures             0       0                  0       0  ...        0   
/td>                      0       0                  0       0  ...        0   
01 Advisors               0       0                  0       0  ...       

In [28]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357280 sha256=cdafefcaa83b732076ec05c8454aa015d5e4b1ff1722ec926d8661cc127859b3
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Succe

In [29]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import accuracy

# Convert the matrix into Surprise dataset format
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(expanded_data[['Select Investors', 'Company', 'Valuation ($B)']], reader)

# Split the data into training and test sets
trainset, testset = train_test_split(data, test_size=0.25)

# Initialize the SVD model
svd_model = SVD()

# Train the model on the training set
svd_model.fit(trainset)

# Test the model on the test set
predictions = svd_model.test(testset)

# Calculate accuracy
accuracy.rmse(predictions)


RMSE: 8.8820


8.882034627486377

In [31]:
from collections import defaultdict

def get_top_n(predictions, n=10):
    # Map the predictions to each investor
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Sort the predictions for each investor and return the top n
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

# Get the top 10 recommended startups for each investor
top_n_recommendations = get_top_n(predictions, n=10)

# Display the recommendations for a specific investor
investor = 'Sequoia Capital China'  # Replace with a real investor's name
print(f"Top 10 startup recommendations for {investor}:")
for startup, score in top_n_recommendations[investor]:
    print(f"{startup}: {score}")


Top 10 startup recommendations for Sequoia Capital China:
JOLLY Information Technology: 1
Tezign: 1
4Paradigm: 1
Miaoshou Doctor: 1
Poizon: 1
Zuoyebang: 1
Agile Robots: 1
EcoFlow: 1
Chehaoduo: 1
Ziroom: 1
