In [None]:
import pandas as pd
import numpy as np

# Load the companies table used for feature engineering.
# NOTE(review): presumably this CSV is the output of the upstream cleaning
# step, as the file name suggests — confirm.
cleaned_csv_path = '../processed_data/companies_cleaned_data.csv'
df = pd.read_csv(cleaned_csv_path)

In [None]:
# Display the column labels of the loaded frame as this cell's output.
df.columns

In [None]:
# Funding velocity: total USD raised per year of company age.
# Dividing by an age of 0 would produce +/-inf (or NaN for 0/0), which then
# leaks into the exported features. Zero ages are masked to NaN first so the
# velocity is simply missing for those companies.
df['funding_velocity'] = (
    df['funding_total_usd'] / df['company_age_years'].replace(0, np.nan)
)

In [None]:
# Relative funding: each company's total funding divided by
#   (a) the mean funding over all companies      -> funding_vs_avg
#   (b) the mean funding of its own category     -> funding_vs_industry_avg
total_avg_funding = df['funding_total_usd'].mean()
category_avg_funding = df.groupby('category_code')['funding_total_usd'].transform('mean')

df['funding_vs_avg'] = df['funding_total_usd'].div(total_avg_funding)

# Rows without a category_code have no category mean to compare against,
# so they are measured against the overall mean instead.
df['funding_vs_industry_avg'] = df['funding_total_usd'].div(
    np.where(
        df['category_code'].notna(),
        category_avg_funding,
        total_avg_funding
    )
)

In [None]:
def categorize_founding_year(year):
    """Map a founding year to a coarse era label.

    Parameters
    ----------
    year : int or float
        Founding year. May be missing (None / NaN).

    Returns
    -------
    str
        "Pre-1990" for years up to and including 1990, "1991-2000",
        "2001-2008", "2009-2014" for any later year, or "Unknown" when
        the year is missing.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # year would fall through to the last branch and be labelled "2009-2014".
    if pd.isna(year):
        return "Unknown"

    if year <= 1990:
        return "Pre-1990"
    elif year <= 2000:
        return "1991-2000"
    elif year <= 2008:
        return "2001-2008"
    else:
        return "2009-2014"

# Label each company with the era bucket of its founding year.
df["founding_era"] = df["founded_year"].map(categorize_founding_year)

In [None]:
def funding_stage(row):
    """Estimate a company's funding stage from its status and total funding.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``row['status']`` and ``row['funding_total_usd']``.

    Returns
    -------
    str
        * 'Closed', 'Exit (Acquired)' or 'Exit (IPO)' when the status is
          'closed', 'acquired' or 'ipo' (status takes precedence over funding);
        * otherwise a label based on total funding in USD:
          'Pre-seed' (< 1M), 'Seed' (< 10M), 'Early' (< 15M),
          'Growth' (< 100M), 'Late' (>= 100M);
        * 'Unknown' when the funding amount is missing.
    """
    status_labels = {
        'closed': 'Closed',
        'acquired': 'Exit (Acquired)',
        'ipo': 'Exit (IPO)',
    }
    # (exclusive upper bound in USD, label), checked in ascending order.
    funding_tiers = (
        (1e6, 'Pre-seed'),
        (10e6, 'Seed'),
        (15e6, 'Early'),
        (100e6, 'Growth'),
    )

    status = row['status']
    funding = row['funding_total_usd']

    if status in status_labels:
        return status_labels[status]

    # NaN fails every "<" comparison, so a missing amount would otherwise
    # fall through all tiers and be reported as 'Late'.
    if pd.isna(funding):
        return 'Unknown'

    for upper_bound, label in funding_tiers:
        if funding < upper_bound:
            return label
    return 'Late'
    
# Row-wise apply: the stage label depends on two columns of each row
# (status and total funding), so the function receives the whole row.
stage_labels = df.apply(funding_stage, axis=1)
df['funding_stage'] = stage_labels


In [None]:
# Industry growth index
# ---------------------
# For every (category_code, founded_year) cohort, combine the number of
# startups founded and the total funding they raised into a single score.
# Both components are min-max scaled within their category and then averaged
# with equal weights, so the index lies in [0, 1].


def _min_max_scale(values):
    """Min-max scale a Series to [0, 1].

    A group whose values are all equal (including a group with a single row)
    has a zero range; dividing by it would turn the whole group into NaN.
    Such groups are mapped to 0.0 instead.
    """
    lowest = values.min()
    value_range = values.max() - lowest
    if value_range == 0:
        return (values - lowest).astype(float)
    return (values - lowest) / value_range


industry_stats = (
    df.groupby(['category_code', 'founded_year'])
      .agg(
          num_startups=('id', 'count'),
          total_funding=('funding_total_usd', 'sum')
      )
      .reset_index()
)

industry_stats['num_startups_norm'] = (
    industry_stats.groupby('category_code')['num_startups'].transform(_min_max_scale)
)
industry_stats['total_funding_norm'] = (
    industry_stats.groupby('category_code')['total_funding'].transform(_min_max_scale)
)

industry_stats['industry_growth_index'] = (
    0.5 * industry_stats['num_startups_norm'] +
    0.5 * industry_stats['total_funding_norm']
)

# Drop a column left over from a previous run of this cell; otherwise the
# merge would create 'industry_growth_index_x' / '_y' duplicates on re-run.
df = df.drop(columns=['industry_growth_index'], errors='ignore').merge(
    industry_stats[['category_code', 'founded_year', 'industry_growth_index']],
    on=['category_code', 'founded_year'],
    how='left'
)


In [None]:
# Industry growth rate: relative change of the growth index from one founding
# year to the next available one within the same category.
# - Rows are sorted explicitly so pct_change never depends on the incoming
#   row order of industry_stats; the result is re-aligned on the index.
# - A previous index of 0 makes the ratio +/-inf; those values are stored as
#   NaN (undefined growth) rather than exported as infinities.
industry_stats['industry_growth_rate'] = (
    industry_stats
    .sort_values(['category_code', 'founded_year'])
    .groupby('category_code')['industry_growth_index']
    .pct_change()
    .replace([np.inf, -np.inf], np.nan)
)

# Drop a column left over from a previous run of this cell; otherwise the
# merge would create 'industry_growth_rate_x' / '_y' duplicates on re-run.
df = df.drop(columns=['industry_growth_rate'], errors='ignore').merge(
    industry_stats[['category_code', 'founded_year', 'industry_growth_rate']],
    on=['category_code', 'founded_year'],
    how='left'
)

In [None]:
# Persist the frame with the engineered columns; the row index is not written.
features_csv_path = '../processed_data/companies_feature_engineering.csv'
df.to_csv(features_csv_path, index=False)