# feature engineering

if the target variable is changed to `total_funding_amount_usd`, then no need to drop data with no male/female information...? also, instead of encoding the `female_led` info, better to just keep the percentage??

In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# data
import pandas as pd
import numpy as np

# visualization
import matplotlib.pyplot as plt

# feature engineering
from numpy import asarray
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MultiLabelBinarizer

## 1. Aggregate Data
combine data of regions China, Europe, and the US.

In [2]:
regions = ['China', 'Europe', 'US']

In [3]:
df0 = pd.read_csv(f'../data/crunchbase-aggregated/{regions[0]}-gender.csv')
df1 = pd.read_csv(f'../data/crunchbase-aggregated/{regions[1]}-gender.csv')
df2 = pd.read_csv(f'../data/crunchbase-aggregated/{regions[2]}-gender.csv')

In [4]:
df = pd.concat([df0, df1, df2])
df.reset_index(inplace=True, drop=True)
df.shape

(3000, 113)

## 2. Feature Transformation
data scaling, discretization, dealing missing values etc.

In [5]:
df.head(1)

Unnamed: 0,Organization Name,Last Funding Date,IPO Status,Operating Status,Last Funding Type,Industries,Headquarters Location,Description,CB Rank (Company),Headquarters Regions,...,Most Recent Valuation Range,Date of Most Recent Valuation,Number of Private Notes,Tags,Unnamed: 107,api_raw,gender,prob,#female,%female
0,CMC,Jul 3; 2018,Private,Active,Series A,—,Shanghai; Shanghai; China,CMC Inc is a Chinese state-backed media group.,36;576,Asia-Pacific (APAC),...,—,—,—,—,,[[' % Total % Received % Xferd Average Sp...,['N/A'],['N/A'],0,0.0


## Defining column data types

Column Name | current dtype | new dtype
------------|---------------|-----------
'Organization Name' | text | -
'Last Funding Date' | text | date
'IPO Status' | text | category
'Operating Status' | text | category
'Last Funding Type' | text | category (ordinal)
'Industries' | text | multi-label categories
'Headquarters Location' | text | (city, state, country) categories
'Description' | text | NLP?
'CB Rank (Company)' | text | number
'Headquarters Regions' | text | category
'Diversity Spotlight (US Only)' | n/a | -
'Estimated Revenue Range' | text | category (ordinal)
'Founded Date' | text | date
'Exit Date'  | text | date
'Closed Date'  | text | date
'Company Type' | text | category
'Website' | text | n/a
'Twitter' | text | n/a
'Facebook' | text | n/a
'LinkedIn' | text | n/a
'Contact Email' | text | n/a
'Phone Number' | text | n/a
'Number of Articles' | text | number
'Hub Tags' | text | category
'Full Description' | text | NLP?
'Actively Hiring' | text | NULL
'Investor Type' | text | NULL
'Investment Stage' | text | NULL
'Number of Portfolio Organizations' | text | number
'Number of Investments' | text | number
'Number of Lead Investments' | text | number
'Number of Diversity Investments' | text | number
'Number of Exits' | text | number
'Number of Exits (IPO)' | text | number
'Accelerator Program Type' | text | NULL
'Accelerator Application Deadline' | text | NULL
'Accelerator Duration (in weeks)' | text | NULL
'School Type' | text | NULL
'School Program' | text | NULL
'Number of Enrollments' | text | number
'School Method' | text | NULL
'Number of Founders (Alumni)' | text | number
'Number of Alumni' | text | number
'Industry Groups' | text | multi-label categories
'Number of Founders' | text | number
'Founders' | text | text
'Number of Employees' | text | category (ordinal)
'Number of Funding Rounds' | text | number
'Funding Status' | text | category (ordinal)
'Last Funding Amount' | text | number (note currenncy)
'Last Equity Funding Amount' | text | number (note currenncy)
'Last Equity Funding Type'  | text | category (ordinal)
'Total Equity Funding Amount' | text | number (note currenncy)
'Total Funding Amount' | text | number (note currenncy)
'Top 5 Investors' | text | multi-label catgeories
'Number of Lead Investors' | text | number
'Number of Investors' | text | number
'Number of Acquisitions' | text | number
'Acquisition Status' | text | category
'Transaction Name' | text | text
'Acquired by' | text | text / category
'Announced Date' | text | date
'Price' | text | number (note currency)
'Acquisition Type' | text | category
'Acquisition Terms' | text | NULL
'IPO Date' | text | date
'Delisted Date' | text | date
'Money Raised at IPO' | text | number (note currency)
'Valuation at IPO' | text | NULL
'Stock Symbol' | text | NULL
'Stock Exchange' | text | NULL
'Last Leadership Hiring Date' | text | date
'Last Layoff Mention Date' | text | date
'Number of Events' | text | number
'CB Rank (Organization)' | text | number
'CB Rank (School)' | text | number
'Trend Score (7 Days)' | text | number
'Trend Score (30 Days)' | text | number
'Trend Score (90 Days)' | text | number
'Contact Job Departments' | text | multi-label categories
'Number of Contacts' | text | number
'Number of Private Contacts' | text | number
'Monthly Visits' | text | number
'Average Visits (6 months)' | text | number
'Monthly Visits Growth' | text | number (percentage)
'Visit Duration' | text | number
'Visit Duration Growth' | text | number (percentage)
'Page Views / Visit' | text | number
'Page Views / Visit Growth' | text | number (percentage)
'Bounce Rate' | text | number
'Bounce Rate Growth' | text | number (percentage)
'Global Traffic Rank' | text | number
'Monthly Rank Change (#)' | text | number
'Monthly Rank Growth' | text | number (percentage)
'Active Tech Count' | text | number
'Number of Apps' | text | number
'Downloads Last 30 Days' | text | number
'Total Products Active' | text | number
'Patents Granted' | text | number
'Trademarks Registered' | text | number
'Most Popular Patent Class' | text | catgeory?
'Most Popular Trademark Class' | text | catgeory?
'IT Spend' | text | NULL
'Most Recent Valuation Range' | text | NULL
'Date of Most Recent Valuation' | text | date
'Number of Private Notes' | text | number
'Tags' | text | NULL
'Unnamed: 107'  | NULL | NULL
'api_raw' | text | -
'gender' | text | -
'prob' | text | -
'#female' | number | int 
'%female' | number | float

## 2.1 skip rows

### rows already correctly labeled
- `Number of Founders`: int
- `Number of Funding Rounds`: int
- `Trend Score (7 Days)`: float
- `Trend Score (30 Days)`: float
- `Trend Score (90 Days)`: float
- `#female`: int
- `%female`: float

### all the same or too many NULLs
- `IPO Status` (all "private)
- `Operating Status` (all "active)
- `Diversity Spotlight (US Only)` (2821/3000 null)
- `Exit Date` (2546/3000 null)
- `Closed Date` (all null)
- `Company Type` (2977/3000 "for profit")
- `Hub Tags` (2971/3000 null)
- `Actively Hiring` (2757/3000 null)
- `Investor Type` (2990/3000 null)
- `Investment Stage` (2993/3000 null)
- `Number of Portfolio Organizations` (2931/3000 null)
- `Number of Investments` (2931/3000 null)
- `Number of Lead Investments` (2974/3000 null)
- `Number of Diversity Investments` (2994/3000 null)
- `Number of Exits` (2991/3000 null)
- `Number of Exits (IPO)` (2991/3000 null)
- `Accelerator Program Type` (all null)
- `Accelerator Application Deadline` (all null)
- `Accelerator Duration (in weeks)` (all null)
- `School Type` (all null)
- `School Program` (all null)
- `Number of Enrollments` (all null)
- `School Method` (all null)
- `Number of Founders (Alumni)` (all null)
- `Number of Alumni` (all null)
- `Acquired by` (2547/3000 null)
- `Announced Date` (2546/3000 null)
- `Price` (2928/3000 null)
- `Acquisition Type` (2566/3000 null)
- `Acquisition Terms` (2972/3000 null)
- `Acquisition Status` (2424/3000 null)
- `IPO Date` (all null)
- `Delisted Date` (all null)
- `Money Raised at IPO` (all null)
- `Valuation at IPO` (all null)
- `Stock Symbol` (all null)
- `Stock Exchange` (all null)
- `Number of Events` (2300/3000 null)
- `Last Leadership Hiring Date` (2961/3000 null)
- `Last Layoff Mention Date` (2999/3000 null)
- `IT Spend` (2609/3000 null)
- `Date of Most Recent Valuation` (2501/3000 null)
- `Number of Private Notes` (all null)
- `Most Popular Patent Class` (2537/3000 null)
- `Most Popular Trademark Class` (1956/3000 null)
- `Tags` (all null)
- `Unnamed: 107` (all null)

### equivalent to name
- 'Website'
- 'Twitter'
- 'Facebook'
- 'LinkedIn'
- 'Contact Email'
- 'Phone Number'
- 'Founders'
- 'Transaction Name'

### equivalent to total funding amount
*for early-stage startups, funding and equity funding amount is not too different, and because the number of funding rounds is also low, the last and total funding amount has no big difference, to simplify and avoid duplicate information in model, only input the most representative one in data*
- 'Last Funding Amount'
- 'Last Equity Funding Amount'
- 'Total Equity Funding Amount'

### irrelevant data
- 'Contact Job Departments'
- 'Number of Contacts'
- 'Number of Private Contacts'
- 'api_raw'
- 'gender'
- 'prob'

In [6]:
lower_cols = ['Number of Founders', 'Number of Funding Rounds', 
              'Trend Score (7 Days)', 'Trend Score (30 Days)', 'Trend Score (90 Days)']

In [7]:
df.rename(columns={'Number of Founders': 'number_of_founders',
                  'Number of Funding Rounds': 'number_of_funding_rounds',
                  'Trend Score (7 Days)': 'trend_score_7',
                  'Trend Score (30 Days)': 'trend_score_30',
                  'Trend Score (90 Days)': 'trend_score_90'}, inplace=True)

In [8]:
drop_cols = ['Description', 'Full Description', 
             'Website', 'Twitter', 'Facebook', 'LinkedIn',
             'Contact Email', 'Phone Number', 'Founders',
             'Transaction Name', 'Contact Job Departments',
             'Number of Contacts', 'Number of Private Contacts',
             'api_raw', 'gender', 'prob',
             'IPO Status', 'Operating Status', 'Diversity Spotlight (US Only)',
              'Exit Date', 'Closed Date', 'Company Type', 'Hub Tags',
              'Actively Hiring', 'Investor Type', 'Investment Stage',
              'Number of Portfolio Organizations','Number of Investments',
              'Number of Lead Investments', 'Number of Diversity Investments',
              'Number of Exits', 'Number of Exits (IPO)', 'Accelerator Program Type',
              'Accelerator Application Deadline', 'Accelerator Duration (in weeks)',
              'School Type', 'School Program', 'Number of Enrollments',
              'School Method', 'Number of Founders (Alumni)', 'Number of Alumni',
              'Acquired by', 'Announced Date', 'Price', 
              'Acquisition Type', 'Acquisition Terms', 'Acquisition Status',
              'IPO Date', 'Delisted Date', 'Money Raised at IPO',
              'Valuation at IPO', 'Stock Symbol', 'Stock Exchange', 'Number of Events',
              'Last Leadership Hiring Date', 'Last Layoff Mention Date',
              'IT Spend', 'Date of Most Recent Valuation', 'Number of Private Notes', 
              'Most Popular Trademark Class', 'Most Popular Patent Class',
              'Tags', 'Unnamed: 107', 
              'Industries', 'Funding Status',
              'Last Funding Amount', 'Last Equity Funding Amount', 'Total Equity Funding Amount']

In [9]:
df.drop(columns=drop_cols, inplace=True)
df.shape

(3000, 45)

## 2.2 encoding categorical data
### 2.2.1 convert text to equal categories

**are they equal or ordinal?**
- `Funding Status`: "Early Stage Venture", "Seed", "M&A" (overlaps with `Last Funding Type`)
- `Acquisition Status`: "Was Acquired", "Made Acquisitions", "Made Acquisitions; Was Acquired" (for early stage too many NULLs)
- `Headquarters Regions`: *after prepcocessing*

In [10]:
df['hq_region'] = df['Headquarters Regions'].str.lower().str.strip('').str.split('; ').str[-1]
df.drop(columns=['Headquarters Regions'], inplace=True)

In [11]:
def equal_cat(df, col):
    
    '''create new columns binary encoding each category'''
    
    # deal with NULL values
    new_col = col.lower().replace(' ', '_')
    df[new_col] = df[col].str.replace('—',f'{new_col}_null')
    
    # initiate binary encoder
    ohe = OneHotEncoder()
    
    # join original df with the created df with many new binary columns
    df_ohe = pd.DataFrame(ohe.fit_transform(asarray(df[new_col]).reshape(-1,1)).toarray(), 
                          columns=ohe.categories_, index=df.index)
    df_ohe.columns = df_ohe.columns.get_level_values(0)
    df = df.join(df_ohe)
    return df

In [12]:
def ohe_sanity_check(col):
    '''check correct conversion'''
    
    # deal with NULL values
    new_col = col.lower().replace(' ', '_')
    df[new_col] = df[col].str.replace('—',f'{new_col}_null')
    
    # convert data
    ohe = OneHotEncoder()
    df_ohe = pd.DataFrame(ohe.fit_transform(asarray(df[new_col]).reshape(-1,1)).toarray(), 
                          columns=ohe.categories_, index=df.index)
    return df_ohe

### 2.2.2 convert text to ORDINAL categories

- 'Last Funding Type'
- 'Estimated Revenue Range'
- 'Number of Employees'
- 'Last Equity Funding Type'
- 'Most Recent Valuation Range'

*Note: `.astype('category').cat.codes` is not a good method because it assigns the number in random order*

In [13]:
def ordinal_cat(df, col):
    
    '''create one new column with ordinal categories'''
    
    # get text for new column name
    new_col = col.lower().replace(' ', '_')
    
    
    # specify ordinal order
    if (col=='Last Funding Type') or (col=='Last Equity Funding Type'):
        labels = ['Seed', 'Series A']
    
    if (col=='Estimated Revenue Range') or (col=='Most Recent Valuation Range'):
        labels = ['—', 'Less than $1M', '$1M to $10M', '$10M to $50M', 
                  '$50M to $100M', '$100M to $500M', '$500M to $1B', 
                  '$1B to $10B', '$10B+']
    
    if col == 'Number of Employees':
        # some '1-10' were read incorrectly and automatically converted to date formats
        df['Number of Employees'] = df['Number of Employees'].str.replace('10-Jan', '1-10')
        labels = ['—', '1-10', '11-50', '51-100', '101-250', '251-500', 
                  '501-1000', '1001-5000', '5001-10000', '10001+']
    
    
    # convert text to ordinal categories
    cat = list(np.array(labels).reshape(1,len(labels)))
    oe = OrdinalEncoder(categories=cat)
    df[new_col] = oe.fit_transform(asarray(df[col]).reshape(-1, 1))
    df[new_col] = df[new_col].astype('int')

In [14]:
def oe_sanity_check(col, labels):
    '''check correct conversion'''
    cat = list(np.array(labels).reshape(1,len(labels)))
    oe = OrdinalEncoder(categories=cat)
    df[new_col] = oe.fit_transform(asarray(df[col]).reshape(-1, 1))
    df[new_col] = df[new_col].astype('int')
    return df[col].value_counts(), df[new_col].value_counts()

### 2.2.3 convert text list to MULTI-LABEL categories
- `Industries` (ignore because overlaps with `Industry Groups`)
- `Headquarters Location` **_(city, state, country) where many city=state (e.g. New York)_**
- `Headquarters Regions` (to avoid overlap with prev, only take last region and use OneHotEncoder)
- `Industry Groups` 
- `Top 5 Investors` (extremely sparse matrix but could be useful...?)

In [15]:
def multilabel_cat(df, col):
    '''create multiple one-hot encoded columns for each tag/label in a row'''
    
    # dealing with null valuess (so that null_cols for each newly created col is a different name)
    new_col = col.lower().replace(' ', '_')
    df[new_col] = df[col].str.replace('—', f'{new_col}_null')
    
    # get list of labels from text in each row
    df[f'{new_col}_lst'] = df[new_col].str.lower().str.strip('').str.split('; ')
    
    # initiate multi-label binary encoder
    mlb = MultiLabelBinarizer()
    
    # join original df with the created df with many new binary columns
    try:
        df = df.join(pd.DataFrame(mlb.fit_transform(df[f'{new_col}_lst']),
                                  columns=mlb.classes_, index=df.index))
    
    # add this exception to deal with when different columns have same category 
    except:
        df = df.join(pd.DataFrame(mlb.fit_transform(df[f'{new_col}_lst']),
                              columns=mlb.classes_, index=df.index),
                    lsuffix='', rsuffix=new_col)
    return df

In [16]:
def mlb_sanity_check(col):
    '''check sparsity of matrix'''
    
    # clean data
    try:
        df[f'{col}_lst'] = df[col].str.strip('').str.split(';')
    except:
        pass
    
    # convert data
    mlb = MultiLabelBinarizer()
    df_mlb = pd.DataFrame(mlb.fit_transform(df[f'{col}_lst']), columns=mlb.classes_, index=df.index)
    
    # count number of 0 in each category
    return (df_mlb == 0).sum().value_counts()

## 2.4 convert text to separate dates
(1) have full date (format, e.g. "Dec 31; 1999"), 
(2) some have full date but most only have year

- `Last Funding Date`: (1)
- `Founded Date`: (2)

In [17]:
def text_date(df, col):
    
    '''create new columns separating date into day, month, year'''
    
    # (1) have full date info (format, e.g. "Dec 31; 1999")
    if all(df[col].str.len()>10):
    
        # get text for new column name
        new_col1 = col.lower().replace(' ', '_').replace('date', 'day')
        new_col2 = col.lower().replace(' ', '_').replace('date', 'month')
        new_col3 = col.lower().replace(' ', '_').replace('date', 'year')

        # convert day and year
        df[new_col3] = df[col].str[-4:]
        df[new_col1] = df[col].str[3:5]

        # convert month
        # df[new_col2] = df[col].str[:3] #text
        df[new_col2] = pd.to_datetime(df[col].str[:3], format='%b').dt.month
    
    
    # (2) some rows have full date but most only have year info
    else:
        
        # get text for new column name
        new_col = col.lower().replace(' ', '_').replace('date', 'year')

        # convert day and year
        df[new_col] = df[col].str[-4:]

## 2.5 convert text to number

### (1) integer
- 'CB Rank (Company)'
- 'Number of Articles'
- 'Number of Lead Investors'
- 'Number of Investors'
- 'Number of Acquisitions'
- 'CB Rank (Organization)'
- 'CB Rank (School)'
- 'Monthly Visits'
- 'Visit Duration'
- 'Global Traffic Rank'
- 'Monthly Rank Change (#)'
- 'Active Tech Count'
- 'Number of Apps'
- 'Downloads Last 30 Days'
- 'Total Products Active'
- 'Patents Granted'
- 'Trademarks Registered'

### (2) float (percentage)
- 'Monthly Visits Growth'
- 'Visit Duration Growth'
- 'Page Views / Visit'
- 'Page Views / Visit Growth'
- 'Bounce Rate'
- 'Bounce Rate Growth'
- 'Monthly Rank Growth'
- 'Average Visits (6 months)'

### (3) currency (multiply and union)
- 'Last Funding Amount'
- 'Last Equity Funding Amount'
- 'Total Equity Funding Amount'
- 'Total Funding Amount'


In [18]:
def text_num(df, col, type='int'):
    
    '''update original column converting text to appropriate numerical format'''
    
    # get new column name
    new_col = col.lower().replace(' ', '_')
    
    # common cleaning: deal with NULL values
    df[new_col] = df[col].str.replace('—','0')
    
    # (1) integer
    if type=='int':
        
        # convert text to int
        df[new_col] = df[new_col].str.replace(';','').astype('int')
        
    # (2) float (percentage)
    if type=='float':
        
        # additional step to strip sign
        df[new_col] = df[new_col].str.replace('%','')
        
        # convert text to float
        df[new_col].str.replace(';','').astype('float')
        

In [19]:
def text_curr(df, col):
    '''create new column converting all amount to USD'''
    
    # get new column name
    new_col = col.lower().replace(' ', '_')
    
    # clean text
    df[new_col] = df[col].str.replace(';','')
    
    # add new col "conversion rate" of usd:currency = 1:x
    df['cvr'] = 0
    
    # strip currency signs and update conversion rate
    # us dollar
    df[new_col] = df[new_col].str.replace('$','')
    df.loc[df[col].str[0]=='$', 'cvr'] = 1
    
    # euro
    df[new_col] = df[new_col].str.replace('€','')
    df.loc[df[col].str[0]=='€', 'cvr'] = 1.1
    
    # uk pound
    df[new_col] = df[new_col].str.replace('£','')
    df.loc[df[col].str[0]=='£', 'cvr'] = 1.34
    
    # japanese yen
    df[new_col] = df[new_col].str.replace('¥','')
    df.loc[df[col].str[0]=='¥', 'cvr'] = 0.0087
    
    # chinese yuan ('CN¥')
    df[new_col] = df[new_col].str.replace('CN','')
    df.loc[df[col].str[0:2]=='CN', 'cvr'] = 0.16
    
    # canadian dollar ('CA$')
    df[new_col] = df[new_col].str.replace('CA','')
    df.loc[df[col].str[0:2]=='CA', 'cvr'] = 0.79
    
    # swiss franc
    df[new_col] = df[new_col].str.replace('CHF','')
    df.loc[df[col].str[0:3]=='CHF', 'cvr'] = 1.09
    
    # swedish krona
    df[new_col] = df[new_col].str.replace('SEK','')
    df.loc[df[col].str[0:3]=='SEK', 'cvr'] = 0.1
    
    # russian ruble
    df[new_col] = df[new_col].str.replace('RUB','')
    df.loc[df[col].str[0:3]=='RUB', 'cvr'] = 0.01
        
    # norwegian krone
    df[new_col] = df[new_col].str.replace('NOK','')
    df.loc[df[col].str[0:3]=='NOK', 'cvr'] = 0.11
    
    # new zealand dollar ('NZ$')
    df[new_col] = df[new_col].str.replace('NZ','')
    df.loc[df[col].str[0:2]=='NZ', 'cvr'] = 0.69
    
    # poland ztoty
    df[new_col] = df[new_col].str.replace('PLN','')
    df.loc[df[col].str[0:3]=='PLN', 'cvr'] = 0.24
        
    # icelandic krona
    df[new_col] = df[new_col].str.replace('ISK','')
    df.loc[df[col].str[0:3]=='ISK', 'cvr'] = 0.008
    
    # hungarian forint
    df[new_col] = df[new_col].str.replace('HUF','')
    df.loc[df[col].str[0:3]=='HUF', 'cvr'] = 0.003
    
    # null value
    df[new_col] = df[new_col].str.replace('—','0')
    
    
    '''cannot strip currency and convert to int the multipl only for parts of the data 
       so the best implementation is to split it into two steps'''
    
    # multiply number by conversion rate to get amount all in usd
    df[new_col] = df[new_col].astype('int')
    df[f'{new_col}_usd'] = df[new_col]*df['cvr']

## 2.6 convert text to NLP (bag of words?)
- `Description`
- `Full Description`

## run all conversions

In [20]:
equal_cats = ['hq_region']
for cat1 in equal_cats:
    df = equal_cat(df, cat1)
df.shape

(3000, 55)

In [21]:
ord_cats = ['Last Funding Type', 'Estimated Revenue Range', 'Number of Employees', 
                'Last Equity Funding Type', 'Most Recent Valuation Range']
for cat2 in ord_cats:
    ordinal_cat(df, cat2)
df.shape

(3000, 60)

In [22]:
multi_cats = ['Industry Groups', 'Headquarters Location', 'Top 5 Investors']
for cat3 in multi_cats:
    df = multilabel_cat(df, cat3)
df.shape

(3000, 5688)

In [23]:
date_cols = ['Last Funding Date', 'Founded Date']
for date_col in date_cols:
    text_date(df, date_col)
df.shape

(3000, 5692)

In [24]:
int_cols = ['CB Rank (Company)', 'Number of Articles', 'Number of Lead Investors', 
            'Number of Investors', 'Number of Acquisitions', 'CB Rank (Organization)', 
            'CB Rank (School)', 'Monthly Visits', 
            'Visit Duration', 'Global Traffic Rank', 'Monthly Rank Change (#)', 
            'Active Tech Count', 'Number of Apps', 'Downloads Last 30 Days',
            'Total Products Active', 'Patents Granted', 'Trademarks Registered']
for num1 in int_cols:
        text_num(df, num1, type='int')
df.shape

(3000, 5709)

In [25]:
float_cols = ['Monthly Visits Growth', 'Visit Duration Growth', 'Page Views / Visit', 
              'Page Views / Visit Growth', 'Bounce Rate', 'Bounce Rate Growth', 
              'Monthly Rank Growth', 'Average Visits (6 months)']
for num2 in float_cols:
    text_num(df, num2, type='float')
df.shape

(3000, 5717)

In [26]:
curr_cols = ['Total Funding Amount']
for num3 in curr_cols:
    text_curr(df, num3)
df.shape

(3000, 5720)

In [27]:
# redundant cols generated from feature engineering
multi_cats_lst = []
for col in multi_cats:
    new_col = col.lower().replace(' ', '_')
    multi_cats_lst.append(f'{new_col}_lst')

### quick sanity check region distribution

In [28]:
df['hq_region'].value_counts()#[:5]

asia-pacific (apac)    1000
european union (eu)     529
western us              424
hq_region_null          373
northeastern us         196
west coast              147
southern us             121
scandinavia              98
new england              62
midwestern us            50
Name: hq_region, dtype: int64

^ this means China (1000), Europe (627), US (1000), null (373)

### remove additional cols
- old cols that is no longer needed after new processing
- midway processing cols used to produce new cols

In [29]:
old_cols = equal_cats + ord_cats + multi_cats + multi_cats_lst + date_cols + int_cols + float_cols + curr_cols + ['cvr', 'total_funding_amount']

In [30]:
df.drop(columns=old_cols, inplace=True)
df.shape

(3000, 5678)

## 3. Data Post-Processing

In [31]:
# # boolean: 2 labels (female-led or not female-led, i.e. including co-led, male-led, and no info)
# df['female_led'] = (df['%female']>0.5).astype(int)

In [32]:
# # 3 labels
# conditions = [(df['%female']>0.5), (df['%female']==0.5), (df['%female']<0.5)]
# values = [0, 1, 2]
# # values = ['female-led', 'co-led', 'male-led']

# df['female_led'] = np.select(conditions, values)

since no longer a classification task, `%female` can be kept as variable!

In [33]:
# also drop the two columns that would give away
df.drop(columns=['#female'], inplace=True)

<span style="color:red">
encode no info as no female in company (need to justify this decision!!!)

In [34]:
df['%female'].fillna(0, inplace=True)

### dealing with missing data

In [35]:
df['total_funding_amount_usd'].isnull().sum()#value_counts()

0

In [36]:
df[df['total_funding_amount_usd']==0].shape

(0, 5677)

### set name as index 
so that the rest of the columns are all numerical data that could fit in the model

In [37]:
df.set_index('Organization Name', inplace=True)
num_cols = df.describe().columns #this takes awhile to load
new_df = df[num_cols]
new_df.shape

(3000, 5662)

### export data

In [38]:
# binary labels, dropna, improved processing
new_df.to_csv('../data/feature_engineering/combined_feng_reg.csv')

In [39]:
# new_df.to_csv('../data/feature_engineering/combined_feng.csv')
# new_df.to_csv('../data/feature_engineering/combined_feng_dropna.csv')
# new_df.to_csv('../data/feature_engineering/combined_feng3.csv')

In [40]:
# new_df['female_led'].value_counts()