# Feature Engineering

**Features Added**
 - funding_velocity
 - funding_vs_avg
 - funding_vs_industry_avg
 - founding_era
 - funding_stage
 - industry_growth_index
 - industry_growth_rate

In [29]:
import pandas as pd
import numpy as np

# Read the input CSV into `df`; the cells below add their engineered columns
# to this frame. The path is relative to the current working directory.
df = pd.read_csv('./companies_optimized_targets.csv')

In [30]:
# Display the column labels of the loaded frame (inspection only, no state change).
df.columns

Index(['id', 'name', 'status', 'category_code', 'country_code', 'state_code',
       'region', 'founded_at', 'founded_year', 'company_age_years',
       'age_group', 'funding_total_usd', 'has_funding', 'failure_risk',
       'risk_tier', 'risk_tier_label'],
      dtype='object')

In [31]:
# funding_velocity: total funding divided by company age.
# A zero age is mapped to NaN before dividing; plain division would yield
# +/-inf (or NaN for 0/0), and those inf values would be written to the
# exported CSV and propagate into every feature derived from this column.
df['funding_velocity'] = (
    df['funding_total_usd'] / df['company_age_years'].replace(0, np.nan)
)

In [32]:
# funding_vs_avg: funding_total_usd / mean funding over ALL companies
total_avg_funding = df['funding_total_usd'].mean()
df['funding_vs_avg'] = df['funding_total_usd'].div(total_avg_funding)

# funding_vs_industry_avg: funding_total_usd / mean funding of the company's
# own category_code
category_avg_funding = (
    df.groupby('category_code')['funding_total_usd'].transform('mean')
)
# Keep the per-category mean where a category is present; rows with a missing
# category_code use the overall mean as the denominator instead.
df['funding_vs_industry_avg'] = df['funding_total_usd'].div(
    category_avg_funding.where(df['category_code'].notna(), total_avg_funding)
)

In [33]:
# bucket by founding year
def categorize_founding_year(year):
    """Map a founding year to a coarse era label.

    Args:
        year: Founding year as a number, or a missing value (None/NaN).

    Returns:
        "Pre-1990" for years up to and including 1990, "1991-2000",
        "2001-2008", "2009-2014" for every later year, or "Unknown" when
        the year is missing.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # year would fall through to the last branch and be labelled "2009-2014".
    if pd.isna(year):
        return "Unknown"
    if year <= 1990:
        return "Pre-1990"
    if year <= 2000:
        return "1991-2000"
    if year <= 2008:
        return "2001-2008"
    return "2009-2014"

# Label every company with its founding era, one founded_year value at a time.
df["founding_era"] = df["founded_year"].map(categorize_founding_year)



In [34]:
# estimate funding stage by total amount of funding and company status
def funding_stage(row):
    """Estimate a funding-stage label from company status and total funding.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the keys
            'status' and 'funding_total_usd'.

    Returns:
        'Closed', 'Exit (Acquired)' or 'Exit (IPO)' when the status is
        'closed', 'acquired' or 'ipo'. Otherwise a label chosen by funding
        amount: 'Pre-seed' (< 1e6), 'Seed' (< 10e6), 'Early' (< 15e6),
        'Growth' (< 100e6), 'Late' (>= 100e6), or 'Unknown' when the
        funding amount is missing.
    """
    # Status takes precedence over the funding amount.
    status_labels = {
        'closed': 'Closed',
        'acquired': 'Exit (Acquired)',
        'ipo': 'Exit (IPO)',
    }
    status = row['status']
    if status in status_labels:
        return status_labels[status]

    funding = row['funding_total_usd']
    # Comparisons against NaN are always False, so a missing amount would
    # otherwise fall through every threshold and be labelled 'Late'.
    if pd.isna(funding):
        return 'Unknown'
    if funding < 1e6:
        return 'Pre-seed'
    if funding < 10e6:
        return 'Seed'
    if funding < 15e6:
        return 'Early'
    if funding < 100e6:
        return 'Growth'
    return 'Late'
    
# Row-wise apply (axis=1): funding_stage reads both 'status' and
# 'funding_total_usd' from each row.
df['funding_stage'] = df.apply(funding_stage, axis=1)


In [35]:
# industry growth index
#
# For every (category_code, founded_year) pair: count the companies and sum
# their funding, min-max scale both figures within the category, and average
# the two scaled values with equal weight.


def _min_max_scale(values):
    """Min-max scale a Series to [0, 1]; a constant Series maps to all 0.0.

    Without the constant-series guard, a category with a single founding
    year (or identical values in every year) would divide 0 by 0 and produce
    a NaN index for all of its companies.
    """
    low = values.min()
    span = values.max() - low
    if span == 0:
        return pd.Series(0.0, index=values.index)
    return (values - low) / span


# groupby drops rows whose category_code or founded_year is missing (pandas
# default dropna=True); those companies get NaN from the left merge below.
industry_stats = (
    df.groupby(['category_code', 'founded_year'])
      .agg(
          num_startups=('id', 'count'),
          total_funding=('funding_total_usd', 'sum')
      )
      .reset_index()
)

industry_stats['num_startups_norm'] = (
    industry_stats.groupby('category_code')['num_startups'].transform(_min_max_scale)
)
industry_stats['total_funding_norm'] = (
    industry_stats.groupby('category_code')['total_funding'].transform(_min_max_scale)
)

industry_stats['industry_growth_index'] = (
    0.5 * industry_stats['num_startups_norm'] +
    0.5 * industry_stats['total_funding_norm']
)

# Attach the per-(category, year) index to each company.
df = df.merge(
    industry_stats[['category_code', 'founded_year', 'industry_growth_index']],
    on=['category_code', 'founded_year'],
    how='left'
)


In [36]:
# industry growth rate
#
# Relative change of industry_growth_index versus the previous row of the same
# category in industry_stats. Rows are compared in table order, so this is the
# change versus the previous founding year PRESENT for that category, not
# necessarily the previous calendar year.
# NOTE(review): relies on industry_stats being ordered by founded_year within
# each category -- confirm nothing re-sorts it before this cell.
industry_stats['industry_growth_rate'] = (
    industry_stats.groupby('category_code')['industry_growth_index']
    .pct_change()
    # A previous index of exactly 0 makes the relative change +/-inf; store
    # NaN instead so no infinite values reach the merged features or the CSV.
    .replace([np.inf, -np.inf], np.nan)
)

df = df.merge(
    industry_stats[['category_code', 'founded_year', 'industry_growth_rate']],
    on=['category_code', 'founded_year'],
    how='left'
)

In [37]:
# Calculate risk by country/region
# Per-country mean, standard deviation and non-null count of failure_risk.
country_risk = df.groupby('country_code')['failure_risk'].agg(['mean', 'std', 'count']).reset_index()
country_risk.columns = ['country_code', 'country_risk_mean', 'country_risk_std', 'country_count']
# "Confidence" = this country's count as a fraction of the largest country's
# count, so the best-represented country gets 1.0.
country_risk['country_risk_confidence'] = country_risk['country_count'] / country_risk['country_count'].max()
# Unweighted mean of the per-country means.
# NOTE(review): not read again in any cell visible here -- confirm it is needed.
country_risk_mean = country_risk['country_risk_mean'].mean()

# Left merge keeps every company; rows whose country_code has no entry in
# country_risk come back with NaN in the two new columns.
df = df.merge(country_risk[['country_code', 'country_risk_mean', 'country_risk_confidence']], 
              on='country_code', how='left')

# Fill missing values with global average
# global_risk_mean is reused by the industry and funding-stage risk cells below.
global_risk_mean = df['failure_risk'].mean()
# NOTE(review): the merge above normally creates this column, so the `not in`
# branch only runs if the merged column ended up under a different name
# (e.g. suffixed because the column already existed) -- confirm intent.
if 'country_risk_mean' not in df.columns:
    df['country_risk_mean'] = global_risk_mean
else:
    df['country_risk_mean'] = df['country_risk_mean'].fillna(global_risk_mean)

# Same pattern for the confidence column; unmatched rows get a fixed 0.1.
if 'country_risk_confidence' not in df.columns:
    df['country_risk_confidence'] = 0.1
else:
    df['country_risk_confidence'] = df['country_risk_confidence'].fillna(0.1)


In [38]:
# Calculate risk by industry
# Per-category mean, standard deviation and non-null count of failure_risk,
# named directly in the aggregation.
industry_risk = (
    df.groupby('category_code')['failure_risk']
      .agg(
          industry_risk_mean='mean',
          industry_risk_std='std',
          industry_count='count',
      )
      .reset_index()
)
# Confidence = category count as a fraction of the largest category's count.
industry_risk['industry_risk_confidence'] = industry_risk['industry_count'].div(
    industry_risk['industry_count'].max()
)

df = df.merge(
    industry_risk[['category_code', 'industry_risk_mean', 'industry_risk_confidence']],
    how='left',
    on='category_code',
)

# Fill missing values: unmatched rows get the global mean risk and a fixed
# 0.1 confidence.
df['industry_risk_confidence'] = df['industry_risk_confidence'].fillna(0.1)
df['industry_risk_mean'] = df['industry_risk_mean'].fillna(global_risk_mean)

In [39]:
# Calculate risk by funding stage
# Mean and non-null count of failure_risk for each funding_stage label.
stage_risk = (
    df.groupby('funding_stage')['failure_risk']
      .agg(stage_risk_mean='mean', stage_count='count')
      .reset_index()
)

# Attach the stage mean to every company; rows left without a value fall back
# to the global mean risk.
df = df.merge(
    stage_risk[['funding_stage', 'stage_risk_mean']],
    how='left',
    on='funding_stage',
)
df['stage_risk_mean'] = df['stage_risk_mean'].fillna(global_risk_mean)

In [40]:
# Age × Funding interactions
# NOTE(review): despite its name, age_funding_ratio is a product (age times
# funding relative to the overall mean), not a ratio.
df['age_funding_ratio'] = df['company_age_years'] * df['funding_vs_avg']
# NOTE(review): funding_velocity was defined earlier as
# funding_total_usd / company_age_years, so this product reduces to
# funding_total_usd wherever the age is non-zero -- likely a redundant feature.
df['age_funding_velocity'] = df['company_age_years'] * df['funding_velocity']

In [41]:
# Geographic × Industry interactions
# Product of the country-level and category-level mean failure risk.
df['geo_industry_risk'] = df['country_risk_mean'].mul(df['industry_risk_mean'])

In [42]:
# Funding × Industry interactions
# Funding relative to the category average, weighted by the growth index of
# the company's (category, founding year) cell.
df['funding_industry_fit'] = df['funding_vs_industry_avg'].mul(
    df['industry_growth_index']
)

In [43]:
# Age × Industry maturity
# Company age multiplied by the growth rate of its (category, founding year)
# cell; missing growth rates propagate as NaN.
df['age_industry_maturity'] = df['company_age_years'].mul(
    df['industry_growth_rate']
)

In [44]:
# Experience-based risk score
# Weighted sum of three terms:
#   0.5 x age as a fraction of the maximum age in the data (normalized age)
#   0.3 x country_risk_confidence   (geographic experience)
#   0.2 x industry_risk_confidence  (industry experience)
df['experience_risk_score'] = (
    df['company_age_years'].div(df['company_age_years'].max()).mul(0.5)
    .add(df['country_risk_confidence'].mul(0.3))
    .add(df['industry_risk_confidence'].mul(0.2))
)

In [45]:
# Years since founding (for survival analysis perspective)
# NOTE(review): the reference year is hard-coded, so the result depends on this
# constant rather than on when the notebook runs. The frame already has a
# company_age_years column -- confirm the two are meant to differ.
current_year = 2025
df['years_since_founding'] = current_year - df['founded_year']

In [46]:
# Funding efficiency relative to company age
# The denominator is age + 1 so that an age of zero cannot divide by zero.
df['funding_efficiency'] = df['funding_total_usd'].div(
    df['company_age_years'].add(1)
)

In [47]:
# Funding momentum (how much above/below expected for age)
# "Expected" is the median funding of all companies with the same
# company_age_years value, broadcast back to one value per row.
age_funding_median = (
    df.groupby('company_age_years')['funding_total_usd'].transform('median')
)
# +1 keeps the denominator non-zero when the group median is 0.
df['funding_momentum'] = df['funding_total_usd'].div(age_funding_median.add(1))

In [48]:
# Funding relative to founding era
# Median funding of all companies sharing the row's founding_era label.
era_funding_median = (
    df.groupby('founding_era')['funding_total_usd'].transform('median')
)
# +1 keeps the denominator non-zero when the era median is 0.
df['era_adjusted_funding'] = df['funding_total_usd'].div(era_funding_median.add(1))

In [49]:
# Competitive landscape intensity
# Number of rows sharing the same category_code and founded_year; the count
# includes the company itself.
industry_competition = df.groupby(['category_code', 'founded_year']).size().reset_index(name='industry_competition')
df = df.merge(industry_competition, on=['category_code', 'founded_year'], how='left')
# Rows left without a count after the merge get the median of the counts that
# were assigned.
df['industry_competition'] = df['industry_competition'].fillna(df['industry_competition'].median())

In [50]:
# Geographic market saturation
# Number of rows sharing the same country_code and founded_year; the count
# includes the company itself.
geo_saturation = df.groupby(['country_code', 'founded_year']).size().reset_index(name='geo_market_saturation')
df = df.merge(geo_saturation, on=['country_code', 'founded_year'], how='left')
# Rows left without a count after the merge get the median of the counts that
# were assigned.
df['geo_market_saturation'] = df['geo_market_saturation'].fillna(df['geo_market_saturation'].median())

In [51]:
# Risk deviation from peers
# Each column is the company's own failure_risk minus the mean failure_risk of
# its country / category / funding-stage group (positive = riskier than peers).
# NOTE(review): these are computed directly from failure_risk. If failure_risk
# is the prediction target (the file names suggest so -- TODO confirm), these
# columns leak the target and should not be used as model inputs.
df['risk_vs_country_peers'] = df['failure_risk'] - df['country_risk_mean']
df['risk_vs_industry_peers'] = df['failure_risk'] - df['industry_risk_mean']
df['risk_vs_stage_peers'] = df['failure_risk'] - df['stage_risk_mean']

In [52]:
# Funding deviation from expectations
# Absolute gap between a company's funding and the median funding of its
# same-age group and of its founding-era group (medians computed in the
# funding-momentum and era-adjusted-funding cells).
df['funding_vs_age_expectation'] = df['funding_total_usd'].sub(age_funding_median)
df['funding_vs_era_expectation'] = df['funding_total_usd'].sub(era_funding_median)

In [53]:
# Composite peer comparison score
# Weighted sum where a higher score is better:
#   0.4 x negated risk gap vs category peers (lower risk vs peers = better)
#   0.3 x funding relative to the category average (higher = better)
#   0.3 x negated risk gap vs country peers (lower risk vs country = better)
df['peer_performance_score'] = (
    (-df['risk_vs_industry_peers']).mul(0.4)
    .add(df['funding_vs_industry_avg'].mul(0.3))
    .add((-df['risk_vs_country_peers']).mul(0.3))
)

In [54]:
# Write the frame, including every engineered column, to CSV; index=False
# leaves the row index out of the file.
df.to_csv('./companies_featured_targets.csv', index=False)