In [47]:
import pandas as pd
import numpy as np
from itertools import groupby
import matplotlib.pyplot as plt

In [2]:
df_acquisitions = pd.read_csv('data/crunchbase-data-master/acquisitions.csv')
df_additions = pd.read_csv('data/crunchbase-data-master/additions.csv')
df_companies = pd.read_csv('data/crunchbase-data-master/companies.csv')
df_investments = pd.read_csv('data/crunchbase-data-master/investments.csv')
df_rounds = pd.read_csv('data/crunchbase-data-master/rounds.csv')

In [3]:
df_master = [df_acquisitions, df_additions, df_companies, df_investments, df_rounds]

In [4]:
for df in df_master:
    print(df.shape)
    print(df.columns)

(18968, 18)
Index(['company_permalink', 'company_name', 'company_category_list',
       'company_country_code', 'company_state_code', 'company_region',
       'company_city', 'acquirer_permalink', 'acquirer_name',
       'acquirer_category_list', 'acquirer_country_code',
       'acquirer_state_code', 'acquirer_region', 'acquirer_city',
       'acquired_at', 'acquired_month', 'price_amount', 'price_currency_code'],
      dtype='object')
(2213, 4)
Index(['content', 'month_str', 'year_str', 'value'], dtype='object')
(66368, 14)
Index(['permalink', 'name', 'homepage_url', 'category_list',
       'funding_total_usd', 'status', 'country_code', 'state_code', 'region',
       'city', 'funding_rounds', 'founded_at', 'first_funding_at',
       'last_funding_at'],
      dtype='object')
(168647, 18)
Index(['company_permalink', 'company_name', 'company_category_list',
       'company_country_code', 'company_state_code', 'company_region',
       'company_city', 'investor_permalink', 'investor_name',

**no information about gender of founders, but we can retrieve that from crunchbase and run an API that infers gender from names?**

## Country

In [5]:
df_companies.head(3)

Unnamed: 0,permalink,name,homepage_url,category_list,funding_total_usd,status,country_code,state_code,region,city,funding_rounds,founded_at,first_funding_at,last_funding_at
0,/organization/-fame,#fame,http://livfame.com,Media,10000000,operating,IND,16,Mumbai,Mumbai,1,,2015-01-05,2015-01-05
1,/organization/-qounter,:Qounter,http://www.qounter.com,Application Platforms|Real Time|Social Network...,700000,operating,USA,DE,DE - Other,Delaware City,2,2014-09-04,2014-03-01,2014-10-14
2,/organization/-the-one-of-them-inc-,"(THE) ONE of THEM,Inc.",http://oneofthem.jp,Apps|Games|Mobile,3406878,operating,,,,,1,,2014-01-30,2014-01-30


In [80]:
df_companies.groupby(['country_code']).count()['permalink'].sort_values(ascending=False)[:10]

country_code
USA    37601
GBR     3688
CAN     1925
IND     1596
CHN     1544
FRA     1135
DEU     1082
ISR      965
ESP      746
AUS      503
Name: permalink, dtype: int64

## Category

Many companies belong to more than one category. What is the best way to categorize and formulate?

In [119]:
# check how many categories there are in total
# drop_category_na = df_companies['category_list'][df_companies['category_list'].notna()]
df_companies['category_list'].fillna('N/A', inplace=True)
category_list_raw = list(df_companies['category_list'].str.split('|'))

In [42]:
# flatten list and get unique values
category_list = set([item for sublist in category_list_raw for item in sublist])

In [43]:
len(category_list)

858

In [54]:
category_list = sorted(category_list)

that's a lot of categories! but when you look into it some could probably be grouped together

In [59]:
grouped_category_list = [list(i) for j, i in groupby(category_list, lambda a: a.partition(' ')[0])]

In [61]:
len(grouped_category_list)

610

In [62]:
grouped_category_list

[['3D', '3D Printing', '3D Technology'],
 ['Accounting'],
 ['Active Lifestyle'],
 ['Ad Targeting'],
 ['Adaptive Equipment'],
 ['Advanced Materials'],
 ['Adventure Travel'],
 ['Advertising',
  'Advertising Exchanges',
  'Advertising Networks',
  'Advertising Platforms'],
 ['Advice'],
 ['Aerospace'],
 ['Agriculture'],
 ['Air Pollution Control'],
 ['Algorithms'],
 ['All Markets', 'All Students'],
 ['Alternative Medicine'],
 ['Alumni'],
 ['Analytics'],
 ['Android'],
 ['Angels'],
 ['Animal Feed'],
 ['Anything Capital Intensive'],
 ['App Discovery', 'App Marketing', 'App Stores'],
 ['Application Performance Monitoring', 'Application Platforms'],
 ['Apps'],
 ['Aquaculture'],
 ['Architecture'],
 ['Archiving'],
 ['Art'],
 ['Artificial Intelligence'],
 ['Artists Globally'],
 ['Assisitive Technology'],
 ['Assisted Living'],
 ['Auctions'],
 ['Audio'],
 ['Audiobooks'],
 ['Augmented Reality'],
 ['Auto'],
 ['Automated Kiosk'],
 ['Automotive'],
 ['B2B', 'B2B Express Delivery'],
 ['BPO Services'],
 ['B

that's still a lot of companies. in crunchbase there is a column called "industry groups", but it is not available in this dataset

right now, i can manually use [GICS](https://en.wikipedia.org/wiki/Global_Industry_Classification_Standard) to group these, but it would be a lot of work

In [63]:
# sectors -- industry groups
sectors = {
    'Energy': ['Energy'],
    'Materials': ['Materials'],
    'Industrials': ['Capital Goods', 'Commercial & Professional Services', 'Transportation'],
    'Consumer Discretionary': ['Automobiles & Components', 'Consumer Durables & Apparel', 'Consumer Services', 'Retailing'],
    'Consumer Staples': ['Food & Staples Retailing', 'Food, Beverage & Tobacco', 'Household & Personal Products'],
    'Health Care': ['Health Care Equipment & Services', 'Pharmaceuticals, Biotechnology & Life Sciences'],
    'Financials': ['Banks', 'Diversified Financials', 'Insurance'],
    'Information Technology': ['Software & Services', 'Technology Hardware & Equipment', 'Semiconductors & Semiconductor Equipment'],
    'Communication Services': ['Telecommunication Services', 'Media & Entertainment'],
    'Utilities': ['Utilities'],
    'Real Estate': ['Real Estate']
}

In [87]:
first_category_list = sorted(set(drop_category_na.str.split('|').str[0]))

In [82]:
len(first_category_list)

728

In [88]:
grouped_first_category_list = [list(i) for j, i in groupby(first_category_list, lambda a: a.partition(' ')[0])]

In [89]:
len(grouped_first_category_list)

518

In [92]:
grouped_first_category_list

[['3D', '3D Printing', '3D Technology'],
 ['Accounting'],
 ['Active Lifestyle'],
 ['Ad Targeting'],
 ['Adaptive Equipment'],
 ['Advanced Materials'],
 ['Adventure Travel'],
 ['Advertising',
  'Advertising Exchanges',
  'Advertising Networks',
  'Advertising Platforms'],
 ['Advice'],
 ['Aerospace'],
 ['Agriculture'],
 ['Air Pollution Control'],
 ['Algorithms'],
 ['All Markets', 'All Students'],
 ['Alternative Medicine'],
 ['Alumni'],
 ['Analytics'],
 ['Android'],
 ['Angels'],
 ['Animal Feed'],
 ['Anything Capital Intensive'],
 ['App Discovery', 'App Marketing', 'App Stores'],
 ['Application Performance Monitoring', 'Application Platforms'],
 ['Apps'],
 ['Aquaculture'],
 ['Architecture'],
 ['Archiving'],
 ['Art'],
 ['Artificial Intelligence'],
 ['Artists Globally'],
 ['Assisitive Technology'],
 ['Assisted Living'],
 ['Auctions'],
 ['Audio'],
 ['Audiobooks'],
 ['Augmented Reality'],
 ['Auto'],
 ['Automated Kiosk'],
 ['Automotive'],
 ['B2B', 'B2B Express Delivery'],
 ['BPO Services'],
 ['B

In [94]:
energy_list = ['Air Pollution Control', 'Batteries', 'Carbon', 'Clean Energy', 'Clean Technology', 'Clean Technology IT',
              'Commercial Solar', 'Concentrated Solar Power', 'Energy', 'Energy Efficiency', 'Energy IT',
               'Energy Management', 'Energy Storage', 'Environmental Innovation', 'Fuels',
              'Green', 'Green Consumer Goods', 'GreenTech', 'Natural Gas Uses', 'Natural Resources',
               'Oil', 'Oil & Gas', 'Oil and Gas', 'Renewable Energies', 'Renewable Tech', 'Solar']

material_list = ['Advanced Materials', 'Chemicals', 'Gold', 'Material Science', 'Recycling',
                'Specialty Chemicals', 'Waste Management', 'Water', 'Water Purification']

industrial_list = ['Aerospace', 'All Markets', 'Archiving', 'Automotive', 'Bicycles', 'Boating Industry', 
                  'Career Management', 'Career Planning', 'Cars', 'College Campuses', 'College Recruiting',
                  'Colleges', 'Consulting', 'Contact Centers', 'Contact Management', 'Education',
                   'Enterprise Purchasing', 'Enterprise Resource Planning', 'Enterprise Search', 'Enterprises',
                   'Event Management', 'Events', 'Fleet Management', 'Governance', 
                   'Government Innovation', 'Governments', 'Gps', 'Human Resources', 'Industrial', 
                   'Infrastructure', 'Infrastructure Builders', 'Journalism', 'K-12 Education', 'Knowledge Management',
                   'Language Learning', 'Lead Generation', 'Lead Management', 'Legal', 'Licensing',
                   'Local Based Services', 'Local Businesses', 'Local Commerce', 'Local Search',
                   'Location Based Services', 'Logistics', 'Logistics Company', 'Manufacturing',
                   'Market Research', 'Marketplaces', 'Mechanical Solutions', 'Physical Security',
                   'Plumbers', 'Private School', 'Postal and Courier Services', 'Professional Networking', 
                   'Professional Services', 'Project Management', 'Public Relations', 'Public Safety',
                   'Public Transportation', 'Social Buying', 'Social CRM', 'Social Commerce',
                   'Specialty Retail', 'Training', 'Transportation', 'Tutoring', 'Universities',
                    'Wholesale']

consumer_list = ['Adaptive Equipment', 'Adventure Travel', 'Advice', 'All Students', 'Alumni', 'Babies',
                'Baby Accessories', 'Baby Boomers', 'Baby Safety', 'Building Products', 'Business Development',
                'Business Services', 'Business Travelers', 'Commodities', 'Consumer Electronics','Consumer Goods',
                'Consumers','E-Commerce', 'E-Commerce Platforms', 'EBooks', 'Fashion', 'Furniture', 'Gadget', 
                 'Gay & Lesbian', 'General Public Worldwide', 'Generation Y-Z', 'Golf Equipment', 
                 'Healthcare Services', 'Health Services Industry', 'High School Students', 'High Schools',
                 'Home & Garden', 'Home Automation', 'Home Decor', 'Home Renovation', 'Hospitality', 'Hotels',
                 'Interior Design', 'Jewelry', 'Kids', 'Lingerie', 'Local', 'Low Bid Auctions', 'Marketplaces',
                 'Mens Specific', 'Personal Branding', 'Pets', 'Physicians', 'Restaurants', 'Social Travel',
                 'Travel', 'Travel & Tourism', 'Weddings', 'University Students', 'Women', 'Young Adults']

staple_list = ['Agriculture', 'Animal Feed', 'Aquaculture', 'Brewing', 'Cannabis', 'Farmers Market', 'Farming',
              'Fertility', 'Food Processing', 'Fruit', 'Groceries', 'Specialty Foods', 'Wine And Spirits']

healthcare_list = ['Active Lifestyle', 'Beauty', 'Bio-Pharm', 'Biofuels', 'Bioinformatics', 'Biomass Power Generation',
                   'Biometrics', 'Biotechnology', 'Biotechnology and Semiconductor', 'Corporate Training', 'Corporate Wellness',
                   'Cosmetic Surgery', 'Cosmetics', 'Diabetes', 'Diagnostics', 'Dietary Supplements', 'Exercise',
                   'Eyewear', 'Fitness', 'Fuel Cells', 'Genetic Testing', 'Health Care', 'Health Diagnostics', 
                   'Health and Wellness', 'Hospitals', 'Independent Pharmacies', 'Life Sciences', 'Lifestyle',
                   'Lifestyle Businesses', 'Lifestyle Products', 'Nutrition', 'Organic', 'Organic Food',
                   'Personal Health', 'Pharmaceuticals', 'mHealth']

finance_list = ['Accounting', 'Anything Capital Intensive', 'Banking', 'Billing', 'Brokers', 'Consumer Lending',
               'Credit', 'Credit Cards', 'Crowdfunding', 'Crowdsourcing', 'Debt Collecting', 'Estimation and Quoting',
                'Finance', 'Financial Exchanges', 'Financial Services', 'Fraud Detection', 'Health and Insurance',
                'Hedge Funds', 'Impact Investing', 'Insurance', 'Intellectual Asset Management', 
                'Intellectual Property', 'Investment Management', 'Payments', 'Personal Finance',
                'Risk Management', 'Social Fundraising', 'Venture Capital']

tech_list = ['3D', '3D Printing', '3D Technology', 'Algorithms', 'Analytics', 'Android', 'Angels', 'App Discovery',
             'App Marketing', 'App Stores', 'Application Platforms', 'Apps', 'Artificial Intelligence', 
             'Assisitive Technology', 'Assisted Living', 'Augmented Reality', 'Auto', 'Automated Kiosk', 
             'Big Data', 'Big Data Analytics', 'Bitcoin', 'Bridging Online and Offline', 'Browser Extensions',
             'Business Analytics', 'Business Information Systems', 'Business Intelligence',
             'Call Center Automation', 'China Internet', 'Cloud Computing', 'Cloud Data Services',
             'Cloud Infrastructure', 'Cloud Management', 'Cloud Security', 'Cloud-Based Music',
             'Computer Vision', 'Computers', 'Consumer Internet', 'Corporate IT', 'Cyber', 'Cyber Security',
             'Data Center Automation', 'Data Center Infrastructure', 'Data Centers', 'Data Integration', 
             'Data Mining', 'Data Privacy', 'Data Security', 'Data Visualization', 'Databases',
             'Deep Information Technology', 'Developer APIs', 'Developer Tools', 'Development Platforms',
             'Digital Entertainment', 'Digital Media', 'Digital Rights Management', 'Digital Signage', 'Experience Design',
             'EDA Tools', 'Early-Stage Technology', 'EdTech', 'Electric Vehicles', 'Electrical Distribution',
             'Electronic Health Records', 'Electronics', 'Email', 'Email Marketing', 'Email Newsletters',
             'Embedded Hardware and Software', 'Engineering Firms', 'Enterprise 2.0', 'Enterprise Application',
             'Enterprise Hardware', 'Enterprise Security', 'Enterprise Software', 'Entrepreneur',
             'Face Recognition', 'Facebook Applications', 'File Sharing', 'FinTech', 'Finance Technology', 
             'Flash Sales', 'Flash Storage', 'Group Buying', 'Group Email', 'Group SMS', 'Hardware', 
             'Hardware + Software', 'Health Care Information Technology', 'High Tech', 'Human Computer Interaction',
             'Human Resource Automation', 'ICT', 'IT Management', 'IT and Cybersecurity', 'IaaS', 'Identity', 
             'Identity Management', 'Image Recognition', 'Incubators', 'Industrial Automation', 
             'Information Security', 'Information Services', 'Information Technology', 'Innovation Engineering', 
             'Innovation Management', 'Intelligent Systems', 'Interest Graph', 'Interface Design', 'Internet',
             'Internet Infrastructure', 'Internet Marketing', 'Internet Radio Market', 'Internet Service Providers',
             'Internet TV', 'Internet Technology', 'Internet of Things', 'Linux', 'Machine Learning', 
             'Marketing Automation', 'Mass Customization', 'Match-Making', 'Meeting Software', 
             'Mobile', 'Mobile Advertising', 'Mobile Analytics', 'Mobile Commerce', 'Mobile Devices',
             'Mobile Emergency&Health', 'Mobile Enterprise', 'Mobile Games', 'Mobile Health',
             'Mobile Infrastructure', 'Mobile Payments', 'Mobile Security', 'Mobile Shopping',
             'Mobile Social', 'Mobile Software Tools', 'Mobile Video', 'Natural Language Processing', 
             'Network Security', 'New Product Development', 'New Technologies', 'Online Dating',
             'Online Education', 'Online Gaming', 'Online Identity', 'Online Rental', 'Online Reservations',
             'Online Scheduling', 'Online Shopping', 'Online Travel', 'Online Video Advertising', 
             'Open Source', 'P2P Money Transfer', 'PaaS', 'Peer-to-Peer', 'Personal Data', 'Predictive Analytics',
             'Price Comparison', 'Product Design', 'Product Development Services', 'Product Search',
             'QR Codes', 'Retail Technology', 'Ride Sharing', 'Robotics', 'Social + Mobile + Local',
             'Social Bookmarking', 'Social Media', 'Social Media Advertising', 'Social Media Management',
             'Social Media Marketing', 'Social Media Platforms', 'Social Network Media',  'Social News',
             'Social Recruiting', 'Software', 'Speech Recognition', 'Technology', 'User Experience Design', 
             'User Interface', 'Video Conferencing', 'Video Streaming', 'Video on Demand', 'Virtual Workforces'
             'Wearables', 'Web Browsers', 'Web Design', 'Web Development', 'Web Hosting', 'Web Tools'
             'Wireless', 'iOS', 'iPad', 'iPhone']

communication_list = ['Ad Targeting', 'Advertising', 'Advertising Networks', 'Advertising Platforms', 'Art', 
                      'Artists Globally', 'Blogging Platforms', 'Brand Marketing', 'Broadcasting',
                      'Casual Games', 'Cause Marketing', 'Communications Hardware', 'Communications Infrastructure',
                      'Concerts', 'Console Gaming', 'Consumer Behavior', 'Content', 'Content Creators',
                      'Content Delivery', 'Content Discovery', 'Content Syndication', 'Creative', 'Creative Industries',
                      'Educational Games', 'Edutainment', 'Entertainment', 'Entertainment Industry', 'Fantasy Sports',
                      'Film', 'Film Distribution', 'Film Production', 'Gambling', 'Game', 'Game Mechanics',
                      'Games', 'Gamification', 'Independent Music Labels', 'Leisure', 'Local Advertising',
                      'Media', 'Messaging', 'Music', 'Music Education', 'Music Services', 'Music Venues',
                      'Musical Instruments', 'Musicians', 'Optical Communications', 'Photo Editing', 'Photo Sharing',
                      'Photography', 'Private Social Networking', 'Social Business', 'Social Games',
                      'Social Television', 'Telecommunications', 'Television', 'Theatre', 'Video', 'Video Games'] 

utility_list = ['Business Productivity', 'Productivity', 'Productivity Software', 'Task Management', 'Utilities']

estate_list = ['Architecture', 'Building Owners', 'Commercial Real Estate', 'Home Owners', 'Property Management',
              'Real Estate', 'Real Estate Investors', 'Real Time', 'Realtors']

In [120]:
def groupto_sector(category):
    if category in energy_list:
        return 'Energy'
    elif category in material_list:
        return 'Materials'
    elif category in industrial_list:
        return 'Industrials'
    elif category in consumer_list:
        return 'Consumer Discretionary'
    elif category in staple_list:
        return 'Consumer Staples'
    elif category in healthcare_list:
        return 'Health Care'
    elif category in finance_list:
        return 'Financials'
    elif category in tech_list:
        return 'Information Technology'
    elif category in communication_list:
        return 'Communication Services'
    elif category in utility_list:
        return 'Utilities'
    elif category in estate_list:
        return 'Real Estate'
    elif category in ['N/A']:
        return 'N/A'
    else:
        return 'Other Sectors'

In [121]:
df_companies['sector'] = df_companies['category_list'].str.split('|').str[0].apply(groupto_sector)

In [122]:
df_companies['sector'].value_counts()

Information Technology    20895
Other Sectors              9423
Health Care                8255
Communication Services     7204
Consumer Discretionary     6135
Industrials                5599
N/A                        3148
Financials                 2445
Energy                     2035
Real Estate                 646
Consumer Staples            413
Materials                   102
Utilities                    68
Name: sector, dtype: int64

In [128]:
df_companies.groupby(['country_code', 'sector']).count()['permalink'].sort_values(ascending=False)[:20]

country_code  sector                
USA           Information Technology    12072
              Health Care                5892
              Other Sectors              5356
              Communication Services     3879
              Industrials                3192
              Consumer Discretionary     2966
              Financials                 1320
              Energy                     1213
GBR           Information Technology     1165
USA           N/A                         946
CAN           Information Technology      596
GBR           Other Sectors               517
CHN           Information Technology      465
GBR           Communication Services      456
IND           Information Technology      450
USA           Real Estate                 433
FRA           Information Technology      426
GBR           Consumer Discretionary      391
              Health Care                 391
ISR           Information Technology      366
Name: permalink, dtype: int64

Get founders column from crunchbase and use API to infer gender.