# Analyzing Startup Fundraising Deals from Cruchbase 

The dataset of investments is from October 2013. Souce of dataset is from [Github](https://github.com/datahoarder/crunchbase-october-2013/blob/master/crunchbase-investments.csv). Throughout this project, there is only 10MB of available memory. dataset consumes 10.3MB of disk space. 

## Explore data

In [51]:
import pandas as pd 
import numpy as np
pd.options.display.max_columns = 99
np.set_printoptions(suppress = True)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

invest = pd.read_csv('crunchbase-investments.csv', nrows = 5, encoding = 'ISO-8859-1')
invest

Unnamed: 0,company_permalink,company_name,company_category_code,company_country_code,company_state_code,company_region,company_city,investor_permalink,investor_name,investor_category_code,investor_country_code,investor_state_code,investor_region,investor_city,funding_round_type,funded_at,funded_month,funded_quarter,funded_year,raised_amount_usd
0,/company/advercar,AdverCar,advertising,USA,CA,SF Bay,San Francisco,/company/1-800-flowers-com,1-800-FLOWERS.COM,,USA,NY,New York,New York,series-a,2012-10-30,2012-10,2012-Q4,2012,2000000
1,/company/launchgram,LaunchGram,news,USA,CA,SF Bay,Mountain View,/company/10xelerator,10Xelerator,finance,USA,OH,Columbus,Columbus,other,2012-01-23,2012-01,2012-Q1,2012,20000
2,/company/utap,uTaP,messaging,USA,,United States - Other,,/company/10xelerator,10Xelerator,finance,USA,OH,Columbus,Columbus,other,2012-01-01,2012-01,2012-Q1,2012,20000
3,/company/zoopshop,ZoopShop,software,USA,OH,Columbus,columbus,/company/10xelerator,10Xelerator,finance,USA,OH,Columbus,Columbus,angel,2012-02-15,2012-02,2012-Q1,2012,20000
4,/company/efuneral,eFuneral,web,USA,OH,Cleveland,Cleveland,/company/10xelerator,10Xelerator,finance,USA,OH,Columbus,Columbus,other,2011-09-08,2011-09,2011-Q3,2011,20000


In [52]:
# memory of datafarme in 5000 rows 
memory_5000 = pd.read_csv('crunchbase-investments.csv', nrows = 5000, encoding = 'ISO-8859-1')
memory_5000.memory_usage(deep = True).sum()/(2**20)

5.579240798950195

In [53]:
# Number of rows 
invest_iter = pd.read_csv('crunchbase-investments.csv', chunksize = 5000, encoding = 'ISO-8859-1')

nrow = 0
for chunk in invest_iter : 
    nrow += len(chunk)
    ncol = len(chunk.columns)
print(nrow, ncol)

52870 20


## Check columns

### Columns missing value counts 

In [54]:
# Check columns have no missing values 
invest_iter = pd.read_csv('crunchbase-investments.csv', chunksize = 5000, encoding = 'ISO-8859-1')

missing = []
for chunk in invest_iter : 
    missing.append(chunk.isnull().sum())
    
missing_series = pd.concat(missing)
missing_series = missing_series.groupby(missing_series.index).sum().sort_values(ascending = False)
missing_series

investor_category_code    50427
investor_state_code       16809
investor_city             12480
investor_country_code     12001
raised_amount_usd          3599
company_category_code       643
company_city                533
company_state_code          492
funding_round_type            3
funded_year                   3
funded_month                  3
funded_at                     3
funded_quarter                3
investor_name                 2
investor_permalink            2
investor_region               2
company_region                1
company_permalink             1
company_name                  1
company_country_code          1
dtype: int64

### Unique values of object coulmns

In [55]:
# Check unique value in object columns 
invest_iter = pd.read_csv('crunchbase-investments.csv', chunksize = 5000, encoding = 'ISO-8859-1')

unique_series = {}
for chunk in invest_iter : 
    object_df = chunk.select_dtypes(include = ['object'])
    object_columns = object_df.columns
    for col in object_columns : 
        col_unique = chunk[col].value_counts() 
        if col in unique_series : 
            unique_series[col].append(col_unique)
        else : 
            unique_series[col] = [col_unique]

for col in unique_series : 
    col_concat = pd.concat(unique_series[col])
    col_group = col_concat.groupby(col_concat.index).sum()
    print(col_group)

/company/0xdata            1
/company/1                 8
/company/1-800-dentist     2
/company/1000memories     10
/company/100plus           4
                          ..
/company/zyngenia          1
/company/zyomyx-inc        2
/company/zypsee            1
/company/zyrra             1
57 Woodstock Road          1
Name: company_permalink, Length: 11573, dtype: int64
#waywire          5
0xdata            1
1-800-DENTIST     2
1000memories     10
100Plus           4
                 ..
yaM Labs          1
ybuy              4
zozi             38
zulily            6
zuuka!            3
Name: company_name, Length: 11573, dtype: int64
2/7/08                 1
advertising         3200
analytics           1863
automotive           164
biotech             4951
cleantech           1948
consulting           233
design                55
ecommerce           2168
education            783
enterprise          4489
fashion              368
finance              931
games_video         1893
government

In [56]:
nunique_series = {} 
for col in unique_series : 
    col_concat = pd.concat(unique_series[col])
    col_group = col_concat.groupby(col_concat.index).sum()
    nunique_series[col] = len(col_group)

nunique_series = pd.Series(nunique_series).sort_values(ascending = False)
nunique_series[nunique_series <= 50]

investor_state_code       50
company_state_code        50
company_category_code     43
investor_category_code    33
funding_round_type         9
company_country_code       2
dtype: int64

### Columns memory foot print 

In [57]:
# Check columns have no missing values 
invest_iter = pd.read_csv('crunchbase-investments.csv', chunksize = 5000, encoding = 'ISO-8859-1')

col_memory = {} 
for chunk in invest_iter : 
    for col in chunk.columns : 
        if col in col_memory : 
            col_memory[col] += chunk[col].memory_usage(deep = True)
        else : 
            col_memory[col] = chunk[col].memory_usage(deep = True)

col_memory_series = pd.Series(col_memory)
col_memory_series.sort_values(ascending = False)

investor_permalink        4981996
company_permalink         4059236
investor_name             3917114
company_name              3592774
funded_at                 3543633
company_city              3507374
company_category_code     3422552
company_region            3413033
funding_round_type        3412155
investor_region           3397729
funded_quarter            3385032
funded_month              3385032
company_country_code      3173624
company_state_code        3107499
investor_city             2886531
investor_country_code     2648740
investor_state_code       2478055
investor_category_code     623872
funded_year                424408
raised_amount_usd          424408
dtype: int64

### Total memory footprint 

In [58]:
# Check memory usage of total columns 
invest_iter = pd.read_csv('crunchbase-investments.csv', chunksize = 5000, encoding = 'ISO-8859-1')
object_memory = 0
numeric_memory = 0
total_memory = 0

for chunk in invest_iter :
    object_df = chunk.select_dtypes(include = ['object'])
    numeric_df = chunk.select_dtypes(exclude = ['object'])
    object_memory += object_df.memory_usage(deep = True).sum()/(2**20)
    numeric_memory += numeric_df.memory_usage(deep = True).sum()/(2**20)
    total_memory += chunk.memory_usage(deep = True).sum()/(2**20)
    
print(f"Memory usage of object columns : {round(object_memory, 3)}MB")
print(f"Memory usage of numeric columns : {round(numeric_memory, 3)}MB")
print(f"Memory usage of total columns : {round(total_memory, 3)}MB")

Memory usage of object columns : 55.637MB
Memory usage of numeric columns : 1.353MB
Memory usage of total columns : 56.989MB


## Choose columns for optimizing 

Because memory usage of object columns is 55.637MB, converting data types is needed. First of all, useful columns for analysis and corresponding types is below : 

1. company_name : object 
2. company_category_code : object
3. company_state_code : object
4. company_city : object
5. investor_name : object
6. investor_category_code : object
7. investor_state_code : object
8. investor_city : object
9. funding_round_type : object
10. funded_at : datetime
11. raised_amount_usd : numeric

In [59]:
col_use = ['company_name', 'company_category_code', 'company_state_code', 'company_city', 
           'investor_name', 'investor_category_code', 'investor_state_code', 'investor_city',
           'funding_round_type', 'funded_at', 'raised_amount_usd']

## Optimize columns 

In [60]:
# Optimize investor dataframe under 10 megabytes
print(f"Previous total memory : {round(total_memory,3)}MB")

invest_iter = pd.read_csv('crunchbase-investments.csv', chunksize = 5000, encoding = 'ISO-8859-1',
                          parse_dates = ['funded_at'], usecols = col_use)

total_memory = 0
for chunk in invest_iter :
    total_memory += chunk.memory_usage(deep = True).sum()/(2**20)
    
print(f"Memory usage of total columns : {round(total_memory, 3)}MB")

Previous total memory : 56.989MB
Memory usage of total columns : 26.495MB


## Load each chunk into a table in a SQLite databse 

In [61]:
import sqlite3
conn = sqlite3.connect('crunchbase.db')

invest_iter = pd.read_csv('crunchbase-investments.csv', chunksize = 5000, encoding = 'ISO-8859-1',
                          parse_dates = ['funded_at'], usecols = col_use)
for chunk in invest_iter :
    chunk.to_sql('investment_table', conn, if_exists = 'append', index = False)

## Retrieve data from SQLite 

In [72]:
# Which category of company attracted the most investments?
query = """
    SELECT company_category_code, 
           COUNT(company_category_code) AS 'num_investors', 
           AVG(raised_amount_usd) AS 'Average dollars'
        FROM investment_table 
    GROUP BY company_category_code
    ORDER BY 2 DESC, 3 DESC
    LIMIT 10;
"""
result_iter = pd.read_sql(query, conn, chunksize = 5000)
for chunk in result_iter : 
    print(chunk)

  company_category_code  num_investors  Average dollars
0              software          14486   10652166.84507
1                   web          10030    8902919.71368
2               biotech           9902   22715313.38724
3            enterprise           8978   10890745.01852
4                mobile           8134   17732652.54640
5           advertising           6400    8344978.99468
6             ecommerce           4336   11374606.89062
7             cleantech           3896   28566517.63035
8           games_video           3786   11464647.78145
9             analytics           3726    8118272.01949


In [67]:
# Which investor contributed the most money (across all startups)?
query = """
    SELECT investor_name, raised_amount_usd
        FROM investment_table
    ORDER BY raised_amount_usd DESC
    LIMIT 1; 
"""

result_iter = pd.read_sql(query, conn, chunksize = 5000)
for chunk in result_iter : 
    print(chunk)

  investor_name  raised_amount_usd
0   BrightHouse   3200000000.00000
