# Crunchbase data full pipeline
The full pipeline includes three main steps:
1. data preprocessing
2. adding gender info to preprocessed data
3. comparing industry diversification by gender

In [2]:
# import dependencies 
import pandas as pd
from ast import literal_eval
from collections import Counter

## I. Data Preprocessing

## #1 Get raw txt data from crunchbase

This is done manually by filtering, sorting, and copy-pasting from the Crunchbase Pro free trial.

Filtering:
- **Company Headquarters**: US, Europe/EU, China
- Financials:
    - **Last Funding Date**: Custom Date Range 2015-2019
    - **Last Funding Type**: Seed, Series A
- Company Status:
    - **Operating Status**: Active
    - **IPO Status**: Private
    
Sorting: by **Total Amount of Funding**

In [3]:
# Headquarters regions, typed in by hand to mirror the Crunchbase
# "Company Headquarters" filter. Each entry doubles as a folder/file prefix
# further down. Single countries ('Germany', 'France', 'Latvia') were
# considered but are not part of the list.
regions = [
    'US',
    'EU',
    'Europe',
    'China',
]

## #2 Cleaning raw text data

### Cleaning Function

In [3]:
def cb_txt_to_csv(filename, i):
    """Convert one copy-pasted Crunchbase results page into a CSV file.

    Reads ``../data/crunchbase-raw/{filename}.txt`` and writes
    ``../data/crunchbase-csv/{filename}.csv``.

    Parameters
    ----------
    filename : str
        Path stem (without extension) shared by the input and output file.
    i : int
        Row number expected on the first index line (``"{i}."``) of the page.

    Notes
    -----
    - A line equal to the next expected row number followed by a dot starts
      a new CSV row (a newline is written) and advances the expected number.
    - Lines containing ``Logo`` and blank lines are dropped.
    - Every other line becomes one cell followed by a comma, so each row
      ends with a trailing comma. Commas inside a cell are replaced by
      ``;`` so they cannot split the cell; this also rewrites commas in
      sentences, dates and money values.
    """
    # collect the output in a list and join once instead of growing a string
    pieces = []

    # next row number expected on an index line
    count = i

    with open(f'../data/crunchbase-raw/{filename}.txt', 'r') as f:
        for line in f:

            # index line: start a new row (the file's last line may have no newline)
            if line in (f'{count}.\n', f'{count}.'):
                count += 1
                pieces.append('\n')

            # logo captions (company or investor) and empty lines carry no data
            elif 'Logo' in line or line == '\n':
                continue

            # regular cell: protect embedded commas, then close the cell
            else:
                pieces.append(line.strip('\n').replace(',', ';'))
                pieces.append(',')

    with open(f'../data/crunchbase-csv/{filename}.csv', 'w') as f:
        f.write(''.join(pieces))

*note*:
Replacing every comma with a semicolon is not the best approach, though it works well for now for columns that hold a list of values. Be aware that sentences, dates, and money values also had their commas replaced, so those columns might need to be cleaned again before they are used.

### Run Script

In [4]:
# Crunchbase only shows 50 rows per page, so 1000 rows arrive as 20 pages:
# first_n holds the first row number of each page, last_n the last one.
first_n = [1 + 50 * page for page in range(20)]
last_n = [50 * (page + 1) for page in range(20)]

In [5]:
# Walk every (region, page) pair and build the page's file stem.
# The conversion call itself is commented out, so this cell currently only
# builds the names.
for region in regions:
    for i, (first, last) in enumerate(zip(first_n, last_n)):
        filename = f'{region}_{first}-{last}'
        # cb_txt_to_csv(f'{region}/{filename}', first)

In [6]:
# check: if nothing printed means everything is good!
# A page is reported when it cannot be read, or when its shape differs from
# the expected 50 rows x 108 columns.
for region in regions:
    for first, last in zip(first_n, last_n):
        filename = f'{region}_{first}-{last}'

        try:
            df = pd.read_csv(f'../data/crunchbase-csv/{region}/{filename}.csv')
        # `Exception` rather than a bare `except`, so Ctrl-C / SystemExit still
        # stop the check; the error is printed so the cause is not hidden
        except Exception as err:
            print(f'{filename}: {type(err).__name__}: {err}')
            continue

        # check correct format
        if df.shape != (50, 108):
            print(f'{filename}: {df.shape}')

## #3 Combine data files
Currently, each file only includes 50 rows. Combine to get one single file for each region with 1000 rows.

### Run Script

In [7]:
# Stitch the 50-row pages of each region into a single frame.
for region in regions:
    # read all pages first and concatenate once: concatenating inside the
    # loop copies every previously loaded row again on each iteration
    pages = [
        pd.read_csv(f'../data/crunchbase-csv/{region}/{region}_{first}-{last}.csv')
        for first, last in zip(first_n, last_n)
    ]
    df_1000 = pd.concat(pages, ignore_index=True)
    # export is commented out, so df_1000 only lives in memory here
    # df_1000.to_csv(f'../data/crunchbase-aggregated/{region}1000.csv', index=False)

In [8]:
# Sanity check on the aggregated files: no output means every region file
# has exactly 1000 rows and 108 columns.
for region in regions:
    df = pd.read_csv(f'../data/crunchbase-aggregated/{region}1000.csv')
    n_rows, n_cols = df.shape
    if not (n_rows == 1000 and n_cols == 108):
        print(f'{region}: {df.shape}')

## II. Gender Inference

## #4 Run gender API on Founder Names

In [9]:
# Load the API keys: one key per line, surrounding whitespace removed.
with open('../data/api_key.txt', 'r') as f:
    api_keys = [line.strip() for line in f]

### Run API

In [10]:
# for k in range(len(regions)):
    
#     # set up
#     df_region = pd.read_csv(f'../data/crunchbase-aggregated/{regions[k]}1000.csv')
#     key = api_keys[k+1]
    
#     # save api results
#     results_list = []
    
#     for i in range(1000):
#         founders_list = df_region['Founders'][i].split(';')
#         result_list = []
        
#         for name in founders_list:
#             name_block = name.split(' ')
#             first = ''.join(name_block[:-1])
#             last = name_block[-1]
#             result = !curl -X GET "https://v2.namsor.com/NamSorAPIv2/api2/json/gender/{first}/{last}" -H "accept: application/json" -H "X-API-KEY: {key}"
#             result_list.append(result)
#         results_list.append(result_list)
    
#     file = open(f'../data/api_results/{regions[k]}_api.txt','w')
#     file.writelines(str(results_list))
#     file.close()

### Extract important info from api result

In [5]:
def extract_api_results(df):
    """Add gender columns derived from the raw API responses to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column holds, for every company, a list with one
        raw response per founder. Element 5 of each response is parsed with
        ``literal_eval`` and must yield a mapping with the keys
        ``likelyGender`` and ``probabilityCalibrated``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place, with three new columns:

        - ``gender``: list of inferred genders, ``'N/A'`` where the response
          could not be parsed
        - ``prob``: list of calibrated probabilities, ``'N/A'`` likewise
        - ``#female``: number of founders inferred as female with
          probability above 0.5, or ``'N/A'`` when the row's only response
          was unparseable
    """
    gender_data = []
    prob_data = []  # kept so low-confidence calls (prob < 0.5) can be checked later
    female_data = []  # number of females per row

    # iterate over the rows that are actually present (not a fixed 1000),
    # reading the first column positionally
    for responses in df.iloc[:, 0]:

        gender_list = []
        prob_list = []
        count = 0

        for response in responses:

            # parse the response body once
            try:
                parsed = literal_eval(response[5])
            # a bad request returns HTML / non-literal text that
            # literal_eval rejects with SyntaxError or ValueError
            except (SyntaxError, ValueError):
                gender_list.append('N/A')
                prob_list.append('N/A')
                continue

            gender = parsed['likelyGender']
            prob = parsed['probabilityCalibrated']
            gender_list.append(gender)
            prob_list.append(prob)

            # count female founders
            if gender == 'female' and prob > 0.5:
                count += 1

        # a single unparseable response — presumably the founders cell was empty
        if gender_list == ['N/A']:
            count = 'N/A'

        gender_data.append(gender_list)
        prob_data.append(prob_list)
        female_data.append(count)

    # add api results to df
    df['gender'] = gender_data
    df['prob'] = prob_data
    df['#female'] = female_data

    return df

### Run Script: Aggregate API results to original dataframe

In [13]:
# For every region: attach the parsed gender-API output to the aggregated
# Crunchbase table and derive the share of female founders.
for region in regions:

    # aggregated Crunchbase table
    df = pd.read_csv(f'../data/crunchbase-aggregated/{region}1000.csv')

    # the API dump is stored as the text of a Python literal
    with open(f'../data/api_results/{region}_api.txt', 'r') as f:
        api_result_data = literal_eval(f.read())

    # one raw API entry per row, extended with the extracted gender columns
    df_gender = extract_api_results(pd.DataFrame({'api_raw': api_result_data}))

    # join both tables on their row index
    df = df.merge(df_gender, left_index=True, right_index=True)

    # '—' in the founder count becomes 0
    n_founders = df['Number of Founders'].replace('—', 0).astype(int)
    # missing and 'N/A' female counts become 0
    n_female = df['#female'].fillna(0).replace('N/A', 0).astype(int)

    df['Number of Founders'] = n_founders
    df['#female'] = n_female
    # share of female founders per company
    df['%female'] = n_female / n_founders

    # export is commented out
    # df.to_csv(f'../data/crunchbase-aggregated/{region}-gender.csv', index=False)

## III. Industry Diversification by Gender

### Count number of companies by industry & gender

In [8]:
def industry_gender(df, group=True, equal=True):
    '''
    Count companies per industry, overall and split by founder gender.

    A company counts as "female" when its `%female` value is above 0.5;
    every other row is tallied as "not female".

    - `df`: DataFrame with a `%female` column, the ';'-separated industry
      column selected by `group`, and (only when `equal` is True) a
      `Total Funding Amount` column
    - if `group`==False, use the detailed `Industries` column
    - if `equal`==True, get same number of male and female companies:
      the larger group is cut down to the size of the smaller one, keeping
      the rows that come first when sorting by `Total Funding Amount`
      in ascending order; rows whose `%female` is missing are dropped

    Returns `(industry_counts, industry_female, industry_not_female)`:
    a Counter over all industry names, and the lists of industry names
    (with repeats) of the female / not-female companies.
    '''

    # group flag
    col = 'Industry Groups' if group else 'Industries'

    # equal flag
    if equal:
        df_female = df[df['%female'] > 0.5]
        df_not_female = df[df['%female'] <= 0.5]

        # trim whichever group is larger down to the size of the smaller one
        if len(df_female) <= len(df_not_female):
            df_not_female = df_not_female.sort_values(by=['Total Funding Amount'])[:len(df_female)]
        else:
            df_female = df_female.sort_values(by=['Total Funding Amount'])[:len(df_not_female)]

        # pd.concat replaces DataFrame.append, which pandas 2.0 removed
        df = pd.concat([df_female, df_not_female], ignore_index=True)

    # initiate data
    industry_full = []
    industry_female = []
    industry_not_female = []

    # split the column once and walk the rows positionally, so the result
    # does not depend on the frame's index labels
    industry_lists = df[col].str.split(';')
    is_female = df['%female'] > 0.5

    for industry_list_raw, female in zip(industry_lists, is_female):

        # a missing industry cell splits to NaN, not a list: nothing to count
        if not isinstance(industry_list_raw, list):
            continue

        industry_list = [industry_name.strip() for industry_name in industry_list_raw]

        # industry counts
        industry_full += industry_list

        # gender counts
        if female:
            industry_female += industry_list
        else:
            industry_not_female += industry_list

    industry_counts = Counter(industry_full)

    return industry_counts, industry_female, industry_not_female

### Run Script

In [9]:
# For every region, export per-industry company counts and the female share,
# once per industry column (groups / detailed) and per sampling mode
# (equalized / full).
for region in regions:

    df = pd.read_csv(f'../data/crunchbase-aggregated/{region}-gender.csv')

    by_industry_group = [True, False]
    by_gender_equal = [True, False]

    for big in by_industry_group:
        for bge in by_gender_equal:

            # tallies from the helper
            industry_counts, industry_female, industry_not_female = industry_gender(df, group=big, equal=bge)

            # one row per industry, one column per tally
            df_metrics = pd.DataFrame({
                '#companies': industry_counts,
                '#female_co': Counter(industry_female),
                '#not_female_co': Counter(industry_not_female),
            })

            # industries without any female company get 0 instead of NaN
            female_co = df_metrics['#female_co'].fillna(0)
            df_metrics['#female_co'] = female_co

            # percentages
            pct_female = female_co / df_metrics['#companies'] * 100
            df_metrics['%female'] = pct_female
            df_metrics['%not_female'] = 100 - pct_female

            # move the industry names from the index into a column
            df_metrics = df_metrics.reset_index().rename(columns={'index': 'industry'})

            # file name: industry level, plus '_updated' for the equalized run
            # (commented-out code in this cell used '_equal' for that suffix)
            level = 'industry_groups' if big else 'industries'
            suffix = '_updated' if bge else ''
            df_metrics.to_csv(f'../data/crunchbase-aggregated/{region}-{level}{suffix}.csv', index=False)

### Run Script
Get industry counts for cropped/equalized dataframe

In [53]:
# Industry counts for each cropped per-gender sample
# ('f' / 'm', sample sizes 98 and 32).
for region in regions:
    for gender in ['f', 'm']:
        for n in [98, 32]:

            df = pd.read_csv(f'../data/crunchbase-cropped/{region}-{n}{gender}.csv')

            by_industry_group = [True, False]

            for big in by_industry_group:

                # get calculations
                # equal=False is passed explicitly: `bge` was never defined in
                # this cell; it only existed as a leftover loop variable
                # (final value False) of the metrics-export cell, so running
                # this cell on a fresh kernel raised NameError
                industry_counts, _, _ = industry_gender(df, group=big, equal=False)

                # create new df with metrics
                df_metrics = pd.DataFrame({'#companies': industry_counts})

                # move the industry names from the index into a column
                df_metrics.reset_index(inplace=True)
                df_metrics.rename(columns={'index': 'industry'}, inplace=True)

                # export is commented out
                # if big:
                #     df_metrics.to_csv(f'../data/crunchbase-cropped/{region}-{n}{gender}-industry_groups.csv', index=False)
                # elif not big:
                #     df_metrics.to_csv(f'../data/crunchbase-cropped/{region}-{n}{gender}-industries.csv', index=False)

### Run Script
Aggregate cropped gender dataframes by region

In [54]:
# Put the female ('f') and not-female ('m') industry counts of each cropped
# sample side by side, one table per region / industry level / sample size.
for region in regions:
    for big in ['industry_groups', 'industries']:

        for n in [98, 32]:
            df_agg = None
            for gender in ['f', 'm']:

                df = pd.read_csv(f'../data/crunchbase-cropped/{region}-{n}{gender}-{big}.csv')

                # name the count column after the gender it belongs to
                if gender == 'f':
                    df = df.rename(columns={'#companies': '#female_co'})
                elif gender == 'm':
                    df = df.rename(columns={'#companies': '#not_female_co'})

                # The first frame seeds the table and later frames are merged in.
                # This replaces a bare `except` around a merge that always failed
                # on the empty seed frame, with DataFrame.append (removed in
                # pandas 2.0) as the fallback.
                if df_agg is None:
                    df_agg = df
                else:
                    # NOTE(review): default inner join, as before — industries
                    # present for only one gender are dropped; confirm intended
                    df_agg = df_agg.merge(df, on='industry')

            # export is commented out
            # df_agg.to_csv(f'../data/crunchbase-aggregated/{region}-{n}-{big}.csv', index=False)