# Read Data  
Dataquest instructions: 
1. Read in data set of investments from October 2013 (https://github.com/datahoarder/crunchbase-october-2013/blob/master/crunchbase-investments.csv)  
    * Assume we only have 10 megabytes of available memory (set chunksize to 5000).  
2. Become familiar with:  
    * Each column's missing value counts  
    * Each column's memory footprint  
    * The total memory footprint of all the chunks combined  
    * Which column(s) can be dropped because they aren't useful for analysis

In [86]:
import os
import pprint as pp

import pandas as pd
# Show up to 99 columns when a DataFrame is rendered, so wide frames are not truncated.
pd.set_option('display.max_columns', 99)

Calculate the number of missing values in each column

In [7]:
# Stream the CSV in 5000-row chunks and collect one Series of per-column
# null counts for every chunk.
missing_values = []
chunk_iter = pd.read_csv(
    'crunchbase-investments.csv',
    chunksize=5000,
    encoding='ISO-8859-1',
)
for chunk in chunk_iter:
    null_counts = chunk.isna().sum()
    missing_values.append(null_counts)

In [10]:
# Stack the per-chunk null-count Series end to end; each column name
# appears once per chunk in the resulting index.
combined_missing_values = pd.concat(missing_values)

In [24]:
# Collapse the repeated column labels by summing the per-chunk counts,
# then order columns from fewest to most missing values.
missing_per_column = combined_missing_values.groupby(level=0).sum()
unique_missing_values = missing_per_column.sort_values()

In [25]:
# Display the total missing-value count per column (sorted ascending).
unique_missing_values

company_country_code          1
company_name                  1
company_permalink             1
company_region                1
investor_region               2
investor_permalink            2
investor_name                 2
funded_quarter                3
funded_at                     3
funded_month                  3
funded_year                   3
funding_round_type            3
company_state_code          492
company_city                533
company_category_code       643
raised_amount_usd          3599
investor_country_code     12001
investor_city             12480
investor_state_code       16809
investor_category_code    50427
dtype: int64

Calculate the memory footprint of each column (bytes)

In [28]:
# Re-read the CSV chunk by chunk and record each chunk's per-column memory
# usage in bytes. deep=True also counts the contents of object columns.
memory_footprint = []
chunk_iter = pd.read_csv(
    'crunchbase-investments.csv',
    chunksize=5000,
    encoding='ISO-8859-1',
)
for chunk in chunk_iter:
    chunk_bytes = chunk.memory_usage(deep=True)
    memory_footprint.append(chunk_bytes)

In [30]:
# Stack the per-chunk memory-usage Series; labels repeat once per chunk.
combined_memory_footprint = pd.concat(memory_footprint)

In [36]:
# Sum the bytes per label across chunks, then discard the 'Index' entry so
# only real data columns remain.
bytes_per_label = combined_memory_footprint.groupby(level=0).sum()
column_memory_footprint = bytes_per_label.drop('Index')
column_memory_footprint

company_category_code     3421104
company_city              3505906
company_country_code      3172176
company_name              3591326
company_permalink         4057788
company_region            3411565
company_state_code        3106051
funded_at                 3542185
funded_month              3383584
funded_quarter            3383584
funded_year                422960
funding_round_type        3410707
investor_category_code     622424
investor_city             2885083
investor_country_code     2647292
investor_name             3915666
investor_permalink        4980548
investor_region           3396281
investor_state_code       2476607
raised_amount_usd          422960
dtype: int64

Calculate the total memory footprint of the data (megabytes)

In [38]:
# Total bytes across all columns, converted to megabytes (1 MB = 1024**2 bytes).
column_memory_footprint.sum() / 1024**2

56.987568855285645

Create a list with columns that won't be used in the analysis. These include:  
* Columns with URL data  
* Columns with many missing values  

In [41]:
# Columns excluded from the analysis: the two permalink (URL) columns and
# investor_category_code, which is almost entirely missing.
drop_cols = ['investor_permalink', 'company_permalink', 'investor_category_code']
# Read only the header row (nrows=0) to get the column names explicitly,
# instead of relying on whatever `chunk` was left bound by an earlier loop.
all_columns = pd.read_csv('crunchbase-investments.csv', nrows=0, encoding='ISO-8859-1').columns
keep_cols = all_columns.drop(drop_cols)

# Select Data Types  
Dataquest instructions:  
1. Identify the types for each column.  
2. Identify the numeric columns that can use more space-efficient data types.  
3. For text columns:  
    * Analyze the unique value counts across all of the chunks to see if they can be converted to a numeric type.  
    * See if any text columns can be cleaned and converted to numeric types.   

Identify the types for each column  

In [110]:
# For every column, record the dtype pandas inferred in each chunk.
# dtype_dict maps column name -> list of dtype names, one entry per chunk.
dtype_dict = {}
chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1')
for chunk in chunk_iter:
    for col, inferred_dtype in chunk.dtypes.items():
        dtype_dict.setdefault(col, []).append(str(inferred_dtype))

In [116]:
# Reduce each column's per-chunk dtype list to the set of distinct dtypes seen.
unique_dtype_dict = {
    column: set(chunk_dtypes)
    for column, chunk_dtypes in dtype_dict.items()
}

In [117]:
# Display the distinct dtypes per column; more than one entry means the
# inferred dtype differs between chunks.
unique_dtype_dict

{'company_permalink': {'object'},
 'company_name': {'object'},
 'company_category_code': {'object'},
 'company_country_code': {'object'},
 'company_state_code': {'object'},
 'company_region': {'object'},
 'company_city': {'object'},
 'investor_permalink': {'object'},
 'investor_name': {'object'},
 'investor_category_code': {'float64', 'object'},
 'investor_country_code': {'float64', 'object'},
 'investor_state_code': {'float64', 'object'},
 'investor_region': {'object'},
 'investor_city': {'float64', 'object'},
 'funding_round_type': {'object'},
 'funded_at': {'object'},
 'funded_month': {'object'},
 'funded_quarter': {'object'},
 'funded_year': {'float64', 'int64'},
 'raised_amount_usd': {'float64'}}

Investigate numeric columns to see which can be converted to more space efficient types

In [119]:
numeric_cols = ['investor_category_code', 'investor_country_code', 'investor_state_code', 'investor_city', 'funded_year', 'raised_amount_usd']
chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1')
# Print value counts of investor_category_code for the first two chunks only,
# to compare a chunk where the column holds text with one where it was
# inferred as float64.
inspected_col = numeric_cols[0]
for chunk_number, chunk in enumerate(chunk_iter):
    if chunk_number >= 2:
        # The third chunk has already been read here, so `chunk` stays bound
        # to it after the loop ends.
        break
    category_counts = chunk[inspected_col].groupby(chunk[inspected_col]).count()
    print(category_counts[:5])
    print('---------------------')

investor_category_code
advertising     22
biotech        161
cleantech       34
consulting      66
ecommerce       46
Name: investor_category_code, dtype: int64
---------------------
Series([], Name: investor_category_code, dtype: int64)
---------------------


Observations:  
1. investor_category_code is actually a string column. After the first chunk, this column contains missing values, which pandas reads in as float64.  

In [122]:
# Preview the candidate numeric columns in `chunk`, which is still the last
# chunk read by the loop above (the one it stopped on).
chunk[numeric_cols].head(5)

Unnamed: 0,investor_category_code,investor_country_code,investor_state_code,investor_city,funded_year,raised_amount_usd
10000,,USA,CT,Greenwich,2009,37400000.0
10001,,USA,CT,Greenwich,2009,6000000.0
10002,,USA,CT,Greenwich,2007,22300000.0
10003,,BEL,,Leuven,2007,8387128.0
10004,,BEL,,Leuven,2011,15700000.0


Observations continued...  
2. investor_country_code, investor_state_code, and investor_city also contain string data.  
3. funded_year contains the year which can be converted to a short integer type as long as rows where funded_year is missing are removed (integer type doesn't have the equivalent of NaN to fill in missing data).  
4. I'll use a float type for the amount raised in usd.

In [128]:
# Narrow the list to the two columns that really hold numbers; the other
# candidates turned out to contain string data.
numeric_cols = ['funded_year', 'raised_amount_usd']

In [129]:
# Every kept column that is not numeric is treated as a string column.
string_cols = keep_cols.drop(numeric_cols)

In [133]:
# Preview the string columns of the chunk left over from the earlier loop.
chunk[string_cols].head(5)

Unnamed: 0,company_name,company_category_code,company_country_code,company_state_code,company_region,company_city,investor_name,investor_country_code,investor_state_code,investor_region,investor_city,funding_round_type,funded_at,funded_month,funded_quarter
10000,Zag,search,USA,CA,Los Angeles,Santa Monica,Capricorn Management,USA,CT,New York,Greenwich,series-c+,2009-02-05,2009-02,2009-Q1
10001,FRX Polymers,cleantech,USA,MA,Boston,Chelmsford,Capricorn Management,USA,CT,New York,Greenwich,series-a,2009-07-27,2009-07,2009-Q3
10002,Targeted Growth,biotech,USA,WA,Seattle,Seattle,Capricorn Management,USA,CT,New York,Greenwich,venture,2007-02-07,2007-02,2007-Q1
10003,Mendel Biotechnology,biotech,USA,CA,SF Bay,Hayward,Capricorn Venture Partners,BEL,,Leuven,Leuven,private-equity,2007-06-13,2007-06,2007-Q2
10004,FRX Polymers,cleantech,USA,MA,Boston,Chelmsford,Capricorn Venture Partners,BEL,,Leuven,Leuven,series-b,2011-06-06,2011-06,2011-Q2


The following columns can be cleaned and converted to a numeric type

In [134]:
# Date-like text columns (e.g. '2009-02-05', '2009-02', '2009-Q1') whose
# components can be extracted as numbers.
cols_to_clean = ['funded_at', 'funded_month', 'funded_quarter']

In [137]:
# Display the columns retained for the analysis.
keep_cols

Index(['company_name', 'company_category_code', 'company_country_code',
       'company_state_code', 'company_region', 'company_city', 'investor_name',
       'investor_country_code', 'investor_state_code', 'investor_region',
       'investor_city', 'funding_round_type', 'funded_at', 'funded_month',
       'funded_quarter', 'funded_year', 'raised_amount_usd'],
      dtype='object')

# Write the data to a SQLite database  

In [159]:
# Connect to sqlite3
# `conn` is reused by every later cell that writes to or queries the
# database file 'crunchbase.db'.
import sqlite3
conn = sqlite3.connect('crunchbase.db')

Write the data to the newly created table.  
I'm going to use float for all numeric columns so I can avoid dropping rows with null values in numeric columns.

In [155]:
# Columns written to the database, in table order. 'funded_day' is derived
# from 'funded_at' when the chunks are written, so 'funded_at' is not listed.
final_cols = [
    'company_name',
    'company_category_code',
    'company_country_code',
    'company_state_code',
    'company_region',
    'company_city',
    'investor_name',
    'investor_country_code',
    'investor_state_code',
    'investor_region',
    'investor_city',
    'funding_round_type',
    'funded_day',
    'funded_month',
    'funded_quarter',
    'funded_year',
    'raised_amount_usd',
]

In [160]:
# Stream the CSV in chunks, convert the date-like text columns to numbers,
# and write the selected columns to the 'crunchbase_investments' table.
chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1')
for chunk_number, chunk in enumerate(chunk_iter):
    # 'YYYY-MM-DD' -> DD, 'YYYY-MM' -> MM, 'YYYY-Qn' -> n.
    # float64 is used so rows with missing dates stay as NaN instead of
    # having to be dropped.
    chunk['funded_day'] = chunk['funded_at'].str.split('-').str[-1].astype('float64')
    chunk['funded_month'] = chunk['funded_month'].str.split('-').str[-1].astype('float64')
    chunk['funded_quarter'] = chunk['funded_quarter'].str.split('Q').str[-1].astype('float64')
    for col in numeric_cols:
        chunk[col] = chunk[col].astype('float64')
    chunk = chunk.filter(final_cols)
    # Replace the table on the first chunk so re-running this cell (or the
    # whole notebook) rebuilds it instead of appending duplicate rows to an
    # existing table; append for all later chunks.
    write_mode = 'replace' if chunk_number == 0 else 'append'
    chunk.to_sql('crunchbase_investments', conn, if_exists=write_mode, index=False)

Query the table to make sure the data types match up to what I expect.

In [162]:
# Load the whole table back into pandas to inspect what was stored.
df = pd.read_sql('SELECT * FROM crunchbase_investments', conn)

In [165]:
# Display the dtype pandas assigns to each column read back from SQLite.
df.dtypes

company_name              object
company_category_code     object
company_country_code      object
company_state_code        object
company_region            object
company_city              object
investor_name             object
investor_country_code     object
investor_state_code       object
investor_region           object
investor_city             object
funding_round_type        object
funded_day               float64
funded_month             float64
funded_quarter           float64
funded_year              float64
raised_amount_usd        float64
dtype: object

Use the !wc IPython command to return the file size of the database.

This unix command doesn't work in my Jupyter notebook since I'm working with Windows. However, I can run it in a bash terminal and I get 6,197,248 bytes when I run the following:  
```
wc -c crunchbase.db
```

In [208]:
# Measure the database file directly rather than pasting a byte count from a
# terminal: this works on any OS and stays correct if the database is rebuilt.
db_size = os.path.getsize('crunchbase.db')
print('Database size (Mb):')
# 2**20 bytes per megabyte; rounded to the nearest whole megabyte.
print(round(db_size/(2**20)))

Database size (Mb):
6


# Investigate Data  
Use the pandas SQLite workflow to answer the following questions:  
1. What proportion of the total amount of funds did the top 10% raise? What about the top 1%? Compare these values to the proportions the bottom 10% and bottom 1% raised.  
2. Which category of company attracted the most investments?  
3. Which investor contributed the most money (across all startups)?  
4. Which investors contributed the most money per startup?  
5. Which funding round was the most popular? Which was the least popular?  


In [186]:
# Question 1: pull only the two columns needed from SQLite, then aggregate in pandas.
query = 'SELECT company_name, raised_amount_usd from crunchbase_investments'
funds_df = pd.read_sql(query, conn)
# One row per company holding its summed raised_amount_usd, largest first.
company_totals = funds_df.groupby('company_name').sum()
funds_unique = company_totals.sort_values(by='raised_amount_usd', ascending=False)
# Number of companies that make up 10% and 1% of all companies.
number_of_companies = funds_unique.shape[0]
ten_percent = round(0.1 * number_of_companies)
one_percent = round(0.01 * number_of_companies)

In [192]:
# funds_unique is sorted by amount raised, descending, so the head holds the
# top companies and the tail holds the bottom ones.
# head(n)/tail(n) are used instead of iloc slices because iloc[-0:] selects
# every row when the group size rounds down to 0; tail(0) selects none.
raised_by_company = funds_unique['raised_amount_usd']
top_10_sum = raised_by_company.head(ten_percent).sum()
top_1_sum = raised_by_company.head(one_percent).sum()
bottom_10_sum = raised_by_company.tail(ten_percent).sum()
bottom_1_sum = raised_by_company.tail(one_percent).sum()
total_raised = raised_by_company.sum()

What proportion of the total amount of funds did the top 10% raise? What about the top 1%? Compare these values to the proportions the bottom 10% and bottom 1% raised.

In [196]:
# Print each group's share of the total amount raised: heading line first,
# then the proportion.
proportion_report = [
    ('Proportion of the total funds raised by the top 10% of companies:', top_10_sum),
    ('\nProportion of the total funds raised by the top 1% of companies:', top_1_sum),
    ('\nProportion of the total funds raised by the bottom 10% of companies:', bottom_10_sum),
    ('\nProportion of the total funds raised by the bottom 1% of companies:', bottom_1_sum),
]
for heading, group_sum in proportion_report:
    print(heading)
    print(group_sum/total_raised)

Proportion of the total funds raised by the top 10% of companies:
0.6712772709906496

Proportion of the total funds raised by the top 1% of companies:
0.2631143683289076

Proportion of the total funds raised by the bottom 10% of companies:
0.0

Proportion of the total funds raised by the bottom 1% of companies:
0.0


Which category of company attracted the most investments?

In [198]:
# Question 2: pull only the two columns needed from SQLite, then aggregate in pandas.
query = 'SELECT company_category_code, raised_amount_usd from crunchbase_investments'
category_df = pd.read_sql(query, conn)
# Categories are ranked by total dollars raised (a sum, not a count of rows).
category_totals = category_df.groupby('company_category_code').sum()
category_df_grouped = category_totals.sort_values(by='raised_amount_usd', ascending=False)
category_df_grouped.head(5)

Unnamed: 0_level_0,raised_amount_usd
company_category_code,Unnamed: 1_level_1
biotech,110396400000.0
software,73084520000.0
mobile,64777380000.0
cleantech,52705230000.0
enterprise,45860930000.0


Which investor contributed the most money (across all startups)?

In [199]:
# Question 3: pull only the two columns needed from SQLite, then aggregate in pandas.
query = 'SELECT investor_name, raised_amount_usd from crunchbase_investments'
investor_df = pd.read_sql(query, conn)
# Total raised_amount_usd over all rows naming each investor, largest first.
investor_totals = investor_df.groupby('investor_name').sum()
investor_df_grouped = investor_totals.sort_values(by='raised_amount_usd', ascending=False)
investor_df_grouped.head(5)

Unnamed: 0_level_0,raised_amount_usd
investor_name,Unnamed: 1_level_1
Kleiner Perkins Caufield & Byers,11217830000.0
New Enterprise Associates,9692542000.0
Accel Partners,6472126000.0
Goldman Sachs,6375459000.0
Sequoia Capital,6039402000.0


Which investors contributed the most money per startup?

In [201]:
# Question 4: pull only the two columns needed from SQLite, then sort in pandas.
query = 'SELECT investor_name, raised_amount_usd from crunchbase_investments'
investor_df = pd.read_sql(query, conn)
# No aggregation happens here: individual rows are ranked by
# raised_amount_usd, so the same investor can appear more than once.
investor_df_sort = investor_df.sort_values('raised_amount_usd', ascending=False)
investor_df_sort.head()

Unnamed: 0,investor_name,raised_amount_usd
1350,Intel,3200000000.0
1172,Google,3200000000.0
716,Comcast,3200000000.0
522,BrightHouse,3200000000.0
2853,Time Warner,3200000000.0


Which funding round was the most popular? Which was the least popular?

In [202]:
# Question 5: pull only the two columns needed from SQLite, then aggregate in pandas.
query = 'SELECT funding_round_type, raised_amount_usd from crunchbase_investments'
funding_round_df = pd.read_sql(query, conn)
# "Popularity" is measured as total dollars raised per round type (a sum),
# not as the number of rounds of that type.
round_totals = funding_round_df.groupby('funding_round_type').sum()
funding_round_grouped = round_totals.sort_values(by='raised_amount_usd', ascending=False)
funding_round_grouped.head(5)

Unnamed: 0_level_0,raised_amount_usd
funding_round_type,Unnamed: 1_level_1
series-c+,265753500000.0
venture,130556500000.0
series-b,128326800000.0
series-a,86542150000.0
post-ipo,30917600000.0


In [204]:
# Same totals re-sorted smallest first (ascending is the default), showing
# the five round types with the least money raised.
funding_round_grouped.sort_values('raised_amount_usd').head()

Unnamed: 0_level_0,raised_amount_usd
funding_round_type,Unnamed: 1_level_1
crowdfunding,6491500.0
angel,4962075000.0
private-equity,16159880000.0
other,18507260000.0
post-ipo,30917600000.0
