# Guided Project - Analyzing Startup Fundraising Deals from Crunchbase

In [4]:
import pandas as pd
pd.options.display.max_columns = 99

data = pd.read_csv('crunchbase-investments.csv',
                   encoding='ISO-8859-1',
                   low_memory=False)
data.head(5)


Unnamed: 0,company_permalink,company_name,company_category_code,company_country_code,company_state_code,company_region,company_city,investor_permalink,investor_name,investor_category_code,investor_country_code,investor_state_code,investor_region,investor_city,funding_round_type,funded_at,funded_month,funded_quarter,funded_year,raised_amount_usd
0,/company/advercar,AdverCar,advertising,USA,CA,SF Bay,San Francisco,/company/1-800-flowers-com,1-800-FLOWERS.COM,,USA,NY,New York,New York,series-a,2012-10-30,2012-10,2012-Q4,2012.0,2000000.0
1,/company/launchgram,LaunchGram,news,USA,CA,SF Bay,Mountain View,/company/10xelerator,10Xelerator,finance,USA,OH,Columbus,Columbus,other,2012-01-23,2012-01,2012-Q1,2012.0,20000.0
2,/company/utap,uTaP,messaging,USA,,United States - Other,,/company/10xelerator,10Xelerator,finance,USA,OH,Columbus,Columbus,other,2012-01-01,2012-01,2012-Q1,2012.0,20000.0
3,/company/zoopshop,ZoopShop,software,USA,OH,Columbus,columbus,/company/10xelerator,10Xelerator,finance,USA,OH,Columbus,Columbus,angel,2012-02-15,2012-02,2012-Q1,2012.0,20000.0
4,/company/efuneral,eFuneral,web,USA,OH,Cleveland,Cleveland,/company/10xelerator,10Xelerator,finance,USA,OH,Columbus,Columbus,other,2011-09-08,2011-09,2011-Q3,2011.0,20000.0


In [5]:
chunks = pd.read_csv('crunchbase-investments.csv',
                     encoding='ISO-8859-1',
                     chunksize=5000)

memory_col= dict()
missing_value = dict()

memory = list()

for index,chunk in enumerate(chunks):
    memory.append(chunk.memory_usage(deep=True).sum()/(1024*1024))
    
    if index==0:
        for col in chunk.columns.tolist():
            memory_col[col]=list()
            missing_value[col]=list()
    
    for col in chunk.columns.tolist():
        memory_col[col].append(chunk[col].memory_usage(deep=True)/(1024*1024))   
        missing_value[col].append(chunk[col].isna().sum())


In [9]:
print('--'*15)    

for key,value in missing_value.items():
    print('Column {} - {} missing value(s)'.format(key,sum(value)))

print('--'*15)    
    
for key,value in memory_col.items():
    print('Column {} - Memory footprint {} MB'.format(key,round(sum(value),2)))

print('--'*15)    

print('Total Memory size : {} MB'.format(round(sum(memory),2)))


------------------------------
Column raised_amount_usd - 3599 missing value(s)
Column funding_round_type - 3 missing value(s)
Column company_region - 1 missing value(s)
Column investor_state_code - 16809 missing value(s)
Column funded_year - 3 missing value(s)
Column company_city - 533 missing value(s)
Column investor_country_code - 12001 missing value(s)
Column company_category_code - 643 missing value(s)
Column investor_city - 12480 missing value(s)
Column investor_region - 2 missing value(s)
Column funded_at - 3 missing value(s)
Column investor_category_code - 50427 missing value(s)
Column funded_month - 3 missing value(s)
Column company_name - 1 missing value(s)
Column company_state_code - 492 missing value(s)
Column funded_quarter - 3 missing value(s)
Column company_permalink - 1 missing value(s)
Column investor_name - 2 missing value(s)
Column company_country_code - 1 missing value(s)
Column investor_permalink - 2 missing value(s)
------------------------------
Column raised_amo

## Selecting Data Types

In [16]:
col_types = list()

chunks = pd.read_csv('crunchbase-investments.csv',
                     encoding='ISO-8859-1',
                     chunksize=5000)

for index,chunk in enumerate(chunks):     
    if index==0:
        print(chunk.dtypes)
        col_types = chunk.columns.tolist()

company_permalink          object
company_name               object
company_category_code      object
company_country_code       object
company_state_code         object
company_region             object
company_city               object
investor_permalink         object
investor_name              object
investor_category_code     object
investor_country_code      object
investor_state_code        object
investor_region            object
investor_city              object
funding_round_type         object
funded_at                  object
funded_month               object
funded_quarter             object
funded_year                 int64
raised_amount_usd         float64
dtype: object


In [19]:
col_str_num = list()

for col in col_types:
    if col.endswith('_code'):
        col_str_num.append(col)
        

In [23]:
chunks = pd.read_csv('crunchbase-investments.csv',
                     encoding='ISO-8859-1',
                     chunksize=5000)

for index,chunk in enumerate(chunks):    
    for col in col_str_num:
        try:
            chunk[col] = pd.to_numeric(chunk[col])
        except:
            pass
        

In [24]:
chunks = pd.read_csv('crunchbase-investments.csv',
                     encoding='ISO-8859-1',
                     chunksize=5000)

candidates_category = list()

for index,chunk in enumerate(chunks):
    for col in chunk.columns.tolist():
        number_values_unique = len(chunk[col].unique())
        
        if number_values_unique <20:
            candidates_category.append(col)
            
category_cols = set(candidates_category)            

print(category_cols)

{'funding_round_type', 'funded_year', 'investor_country_code', 'investor_city', 'investor_region', 'investor_category_code', 'investor_state_code', 'company_country_code'}


In [25]:
convert_col_dtypes = dict()

for col in category_cols:
    convert_col_dtypes[col]='category'

chunks = pd.read_csv('crunchbase-investments.csv',
                     encoding='ISO-8859-1',
                     dtype=convert_col_dtypes, 
                     chunksize=5000)



In [26]:
memory = list()

for index,chunk in enumerate(chunks):
    memory.append(chunk.memory_usage(deep=True).sum()/(1024*1024))
    
print('Total Memory size : {} MB'.format(round(sum(memory),2)))


Total Memory size : 39.75 MB


In [27]:
# Drop columns representing URL's or containing way too many missing values (>90% missing)
drop_cols = ['investor_permalink', 'company_permalink', 'investor_category_code']

chunks = pd.read_csv('crunchbase-investments.csv',
                     encoding='ISO-8859-1',
                     dtype=convert_col_dtypes, 
                     chunksize=5000)

for chunk in chunks:
    chunk = chunk.drop(columns=drop_cols)

#keep_cols = chunk.columns.drop(drop_cols)

## Loading Chunks into SQLite

In [28]:
import sqlite3

conn =sqlite3.connect('crunchbase.db')

chunks = pd.read_csv('crunchbase-investments.csv',
                     encoding='ISO-8859-1',
                     dtype=convert_col_dtypes, 
                     chunksize=5000)

for chunk in chunks:
    
    chunk = chunk.drop(columns=drop_cols)
    
    for col in col_str_num:
        try:
            chunk[col] = pd.to_numeric(chunk[col])
        except:
            pass
        
    chunk.to_sql("investments",conn
                              ,index=False
                              ,if_exists='append')
    
    

In [29]:
!wc 'crunchbase.db'

   6096  236230 7520256 crunchbase.db


## Data Exploration and Analysis

In [30]:
query = """
SELECT company_name,
        (SUM(raised_amount_usd)
        /(SELECT SUM(raised_amount_usd) FROM investments))
        AS Proportion
        FROM investments
        GROUP BY company_name
        ORDER BY Proportion DESC;
"""

top = pd.read_sql(query,conn)

In [31]:
top

Unnamed: 0,company_name,Proportion
0,Clearwire,0.043536
1,Groupon,0.014940
2,Nanosolar,0.006608
3,Facebook,0.006093
4,SurveyMonkey,0.004767
5,Zynga,0.004233
6,Fisker Automotive,0.004090
7,Dropbox,0.004055
8,LivingSocial,0.003938
9,sigmacare,0.003814


In [38]:
query = """
SELECT company_category_code,
        SUM(raised_amount_usd)/1000000 AS invest 
        
        FROM investments
        
        GROUP BY company_category_code
        ORDER BY invest DESC;
"""

category_invest = pd.read_sql(query,conn)


In [39]:
category_invest

Unnamed: 0,company_category_code,invest
0,biotech,110396.423062
1,software,73084.516724
2,mobile,64777.379752
3,cleantech,52705.225028
4,enterprise,45860.927273
5,web,40143.264989
6,medical,25367.105281
7,advertising,25076.661879
8,ecommerce,22567.220071
9,network_hosting,22419.68384


In [40]:
query = """
SELECT investor_name,
        SUM(raised_amount_usd)/1000000 AS invest 
        
        FROM investments
        
        GROUP BY investor_name
        ORDER BY invest DESC;
"""

investor = pd.read_sql(query,conn)

In [41]:
investor

Unnamed: 0,investor_name,invest
0,Kleiner Perkins Caufield & Byers,11217.826376
1,New Enterprise Associates,9692.542344
2,Accel Partners,6472.126199
3,Goldman Sachs,6375.459000
4,Sequoia Capital,6039.402410
5,Intel,5969.200000
6,Google,5808.800000
7,Time Warner,5730.000000
8,Comcast,5669.000000
9,Greylock Partners,4960.982939


In [43]:
query = """
SELECT *
        
        FROM investments
        
        ORDER BY raised_amount_usd DESC
        LIMIT 1;
"""

top_round = pd.read_sql(query,conn)


In [44]:
top_round

Unnamed: 0,company_name,company_category_code,company_country_code,company_state_code,company_region,company_city,investor_name,investor_country_code,investor_state_code,investor_region,investor_city,funding_round_type,funded_at,funded_month,funded_quarter,funded_year,raised_amount_usd
0,Clearwire,mobile,USA,WA,Seattle,Kirkland,BrightHouse,USA,CA,Los Angeles,Santa Monica,post-ipo,2008-05-06,2008-05,2008-Q2,2008,3200000000.0


In [46]:
query = """
SELECT *
        
        FROM investments
        WHERE raised_amount_usd IS NOT NULL
        ORDER BY raised_amount_usd ASC
        LIMIT 1;
"""

worst_round = pd.read_sql(query,conn)


In [47]:
worst_round

Unnamed: 0,company_name,company_category_code,company_country_code,company_state_code,company_region,company_city,investor_name,investor_country_code,investor_state_code,investor_region,investor_city,funding_round_type,funded_at,funded_month,funded_quarter,funded_year,raised_amount_usd
0,PictureMe Universe,photo_video,USA,WI,Eau Claire,Eau Claire,UW-Eau Claire Office of Research and Sponsored...,USA,WI,Eau Claire,Eau Claire,other,2011-04-12,2011-04,2011-Q2,2011,1000.0
