# Feature Engineering

In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('../processed_data/companies_cleaned_data.csv')

In [2]:
df.shape

(196530, 47)

In [3]:
df.columns

Index(['id', 'name', 'normalized_name', 'country_code', 'state_code', 'city',
       'region', 'lat', 'lng', 'status', 'category_code',
       'category_code_clean', 'description', 'overview', 'tag_list',
       'founded_year', 'founding_era', 'company_age_years', 'age_group',
       'funding_total_usd', 'funding_rounds', 'has_funding',
       'first_funding_at', 'last_funding_at', 'days_to_first_funding',
       'months_to_first_funding', 'months_since_last_funding',
       'funding_velocity_category', 'funding_recency', 'milestones',
       'first_milestone_at', 'last_milestone_at', 'investment_rounds',
       'invested_companies', 'relationships', 'homepage_url', 'domain',
       'twitter_username', 'logo_url', 'failure_risk', 'risk_tier',
       'risk_tier_label', 'under_capitalized', 'created_at', 'created_by',
       'updated_at', 'closed_at'],
      dtype='object')

In [4]:
print(df.columns)
df['funding_velocity'] = df['funding_total_usd'] / df['company_age_years']

Index(['id', 'name', 'normalized_name', 'country_code', 'state_code', 'city',
       'region', 'lat', 'lng', 'status', 'category_code',
       'category_code_clean', 'description', 'overview', 'tag_list',
       'founded_year', 'founding_era', 'company_age_years', 'age_group',
       'funding_total_usd', 'funding_rounds', 'has_funding',
       'first_funding_at', 'last_funding_at', 'days_to_first_funding',
       'months_to_first_funding', 'months_since_last_funding',
       'funding_velocity_category', 'funding_recency', 'milestones',
       'first_milestone_at', 'last_milestone_at', 'investment_rounds',
       'invested_companies', 'relationships', 'homepage_url', 'domain',
       'twitter_username', 'logo_url', 'failure_risk', 'risk_tier',
       'risk_tier_label', 'under_capitalized', 'created_at', 'created_by',
       'updated_at', 'closed_at'],
      dtype='object')


In [5]:
# funding_total_usd / avg_funding_in_category
total_avg_funding = df['funding_total_usd'].mean()
df['funding_vs_avg'] = df['funding_total_usd'] / total_avg_funding

category_avg_funding = df.groupby('category_code')['funding_total_usd'].transform('mean')
df['funding_vs_industry_avg'] = df['funding_total_usd'] / np.where(
    df['category_code'].isna(),
    total_avg_funding,
    category_avg_funding
)

In [6]:
# bucket by founding year
def categorize_founding_year(year):
    if year <= 1990:
        return "Pre-1990"
    elif year <= 2000:
        return "1991-2000"
    elif year <= 2008:
        return "2001-2008"
    else:
        return "2009-2014"

df["founding_era"] = df["founded_year"].apply(categorize_founding_year)



In [7]:
# estimate funding stage by total amount of funding and company status
def funding_stage(row):
    f = row['funding_total_usd']
    s = row['status']
    
    if s == 'closed':
        return 'Closed'
    elif s == 'acquired':
        return 'Exit (Acquired)'
    elif s == 'ipo':
        return 'Exit (IPO)'
    
    if f < 1e6:
        return 'Pre-seed'
    elif f < 10e6:
        return 'Seed'
    elif f < 15e6:
        return 'Early'
    elif f < 100e6:
        return 'Growth'
    else:
        return 'Late'
    
df['funding_stage'] = df.apply(funding_stage, axis=1)


In [8]:
# industry growth index

industry_stats = (
    df.groupby(['category_code', 'founded_year'])
      .agg(
          num_startups=('id', 'count'),
          total_funding=('funding_total_usd', 'sum')
      )
      .reset_index()
)

industry_stats['num_startups_norm'] = industry_stats.groupby('category_code')['num_startups'].transform(
    lambda x: (x - x.min()) / (x.max() - x.min())
)
industry_stats['total_funding_norm'] = industry_stats.groupby('category_code')['total_funding'].transform(
    lambda x: (x - x.min()) / (x.max() - x.min())
)

industry_stats['industry_growth_index'] = (
    0.5 * industry_stats['num_startups_norm'] +
    0.5 * industry_stats['total_funding_norm']
)

df = df.merge(
    industry_stats[['category_code', 'founded_year', 'industry_growth_index']],
    on=['category_code', 'founded_year'],
    how='left'
)


In [9]:
# industry growth rate
industry_stats['industry_growth_rate'] = industry_stats.groupby('category_code')['industry_growth_index'].pct_change()

df = df.merge(
    industry_stats[['category_code', 'founded_year', 'industry_growth_rate']],
    on=['category_code', 'founded_year'],
    how='left'
)

In [10]:
# Calculate risk by country/region
country_risk = df.groupby('country_code')['failure_risk'].agg(['mean', 'std', 'count']).reset_index()
country_risk.columns = ['country_code', 'country_risk_mean', 'country_risk_std', 'country_count']
country_risk['country_risk_confidence'] = country_risk['country_count'] / country_risk['country_count'].max()
country_risk_mean = country_risk['country_risk_mean'].mean()

df = df.merge(country_risk[['country_code', 'country_risk_mean', 'country_risk_confidence']], 
              on='country_code', how='left')

# Fill missing values with global average
global_risk_mean = df['failure_risk'].mean()
if 'country_risk_mean' not in df.columns:
    df['country_risk_mean'] = global_risk_mean
else:
    df['country_risk_mean'] = df['country_risk_mean'].fillna(global_risk_mean)

if 'country_risk_confidence' not in df.columns:
    df['country_risk_confidence'] = 0.1
else:
    df['country_risk_confidence'] = df['country_risk_confidence'].fillna(0.1)


In [11]:
# Calculate risk by industry
industry_risk = df.groupby('category_code')['failure_risk'].agg(['mean', 'std', 'count']).reset_index()
industry_risk.columns = ['category_code', 'industry_risk_mean', 'industry_risk_std', 'industry_count']
industry_risk['industry_risk_confidence'] = industry_risk['industry_count'] / industry_risk['industry_count'].max()

df = df.merge(industry_risk[['category_code', 'industry_risk_mean', 'industry_risk_confidence']], 
              on='category_code', how='left')

# Fill missing values
df['industry_risk_mean'] = df['industry_risk_mean'].fillna(global_risk_mean)
df['industry_risk_confidence'] = df['industry_risk_confidence'].fillna(0.1)

In [12]:
# Calculate risk by funding stage
stage_risk = df.groupby('funding_stage')['failure_risk'].agg(['mean', 'count']).reset_index()
stage_risk.columns = ['funding_stage', 'stage_risk_mean', 'stage_count']

df = df.merge(stage_risk[['funding_stage', 'stage_risk_mean']], on='funding_stage', how='left')
df['stage_risk_mean'] = df['stage_risk_mean'].fillna(global_risk_mean)

In [13]:
# Age × Funding interactions
df['age_funding_ratio'] = df['company_age_years'] * df['funding_vs_avg']
df['age_funding_velocity'] = df['company_age_years'] * df['funding_velocity']

In [14]:
# Geographic × Industry interactions
df['geo_industry_risk'] = df['country_risk_mean'] * df['industry_risk_mean']

In [15]:
# Funding × Industry interactions
df['funding_industry_fit'] = df['funding_vs_industry_avg'] * df['industry_growth_index']

In [16]:
# Age × Industry maturity
df['age_industry_maturity'] = df['company_age_years'] * df['industry_growth_rate']

In [17]:
# Experience-based risk score
df['experience_risk_score'] = (
    0.5 * (df['company_age_years'] / df['company_age_years'].max()) +  # Normalized age
    0.3 * df['country_risk_confidence'] +  # Geographic experience
    0.2 * df['industry_risk_confidence']   # Industry experience
)

In [18]:
# Years since founding (for survival analysis perspective)
current_year = 2025
df['years_since_founding'] = current_year - df['founded_year']

In [19]:
# Funding efficiency relative to company age
df['funding_efficiency'] = df['funding_total_usd'] / (df['company_age_years'] + 1)  # +1 to avoid division by zero

In [20]:
# Funding momentum (how much above/below expected for age)
age_funding_median = df.groupby('company_age_years')['funding_total_usd'].transform('median')
df['funding_momentum'] = df['funding_total_usd'] / (age_funding_median + 1)

In [21]:
# Funding relative to founding era
era_funding_median = df.groupby('founding_era')['funding_total_usd'].transform('median')
df['era_adjusted_funding'] = df['funding_total_usd'] / (era_funding_median + 1)

In [22]:
# Competitive landscape intensity
industry_competition = df.groupby(['category_code', 'founded_year']).size().reset_index(name='industry_competition')
df = df.merge(industry_competition, on=['category_code', 'founded_year'], how='left')
df['industry_competition'] = df['industry_competition'].fillna(df['industry_competition'].median())

In [23]:
# Geographic market saturation
geo_saturation = df.groupby(['country_code', 'founded_year']).size().reset_index(name='geo_market_saturation')
df = df.merge(geo_saturation, on=['country_code', 'founded_year'], how='left')
df['geo_market_saturation'] = df['geo_market_saturation'].fillna(df['geo_market_saturation'].median())

In [24]:
# Risk deviation from peers
df['risk_vs_country_peers'] = df['failure_risk'] - df['country_risk_mean']
df['risk_vs_industry_peers'] = df['failure_risk'] - df['industry_risk_mean']
df['risk_vs_stage_peers'] = df['failure_risk'] - df['stage_risk_mean']

In [25]:
# Funding deviation from expectations
df['funding_vs_age_expectation'] = df['funding_total_usd'] - age_funding_median
df['funding_vs_era_expectation'] = df['funding_total_usd'] - era_funding_median

In [26]:
# Composite peer comparison score
df['peer_performance_score'] = (
    0.4 * (-df['risk_vs_industry_peers']) +  # Lower risk vs peers = better
    0.3 * df['funding_vs_industry_avg'] +   # Higher funding vs industry = better
    0.3 * (-df['risk_vs_country_peers'])    # Lower risk vs country = better
)

In [27]:
# Investment activity level
df['investment_activity_score'] = (
    0.6 * (df['investment_rounds'] / (df['investment_rounds'].max() + 1)) +
    0.4 * (df['invested_companies'] / (df['invested_companies'].max() + 1))
)

# Investment to funding ratio (investment activity vs seeking funding)
df['investment_to_funding_ratio'] = df['investment_rounds'] / (df['funding_rounds'] + 1)

# Network effect proxy - convert relationships to numeric first
df['relationships_count'] = pd.to_numeric(df['relationships'], errors='coerce').fillna(0)
df['network_connectivity'] = df['investment_rounds'] + df['invested_companies'] + df['relationships_count']

In [28]:
# Milestones per year of existence
df['milestones_per_year'] = df['milestones'] / (df['company_age_years'] + 1)

# Companies with high milestone activity
df['is_milestone_active'] = (df['milestones'] >= df['milestones'].quantile(0.75)).astype(int)

# Achievement score (combination of milestones and funding success)
df['achievement_score'] = (
    0.4 * (df['milestones'] / (df['milestones'].max() + 1)) +
    0.3 * (df['funding_rounds'] / (df['funding_rounds'].max() + 1))
)

In [29]:
# Basic digital presence indicators
df['has_domain'] = (df['domain'] != 'None').astype(int)
df['has_twitter'] = (df['twitter_username'] != 'None').astype(int)
df['has_logo'] = (df['logo_url'].notna()).astype(int)

# Digital presence score
df['digital_presence_score'] = (
    0.4 * df['has_domain'] +
    0.3 * df['has_twitter'] +
    0.3 * df['has_logo']
)

# Extract domain insights
def extract_domain_features(row):
    domain = row['domain']
    if domain == 'None' or pd.isna(domain):
        return 'none', 0, 0
    
    # Domain type
    if any(ext in domain.lower() for ext in ['.gov', '.edu', '.org']):
        domain_type = 'institutional'
    elif any(ext in domain.lower() for ext in ['.com', '.net', '.biz']):
        domain_type = 'commercial'
    else:
        domain_type = 'other'
    
    # Domain complexity
    domain_length = len(domain)
    subdomain_count = domain.count('.') - 1  # Subtract 1 for the main domain
    
    return domain_type, domain_length, subdomain_count

# Apply domain feature extraction
domain_features = df.apply(extract_domain_features, axis=1, result_type='expand')
df['domain_type'] = domain_features[0]
df['domain_length'] = domain_features[1]
df['subdomain_count'] = domain_features[2]

print(f"  - Companies with domains: {df['has_domain'].sum():,}")
print(f"  - Companies with Twitter: {df['has_twitter'].sum():,}")

  - Companies with domains: 196,530
  - Companies with Twitter: 196,530


In [30]:
# Content availability
df['has_description'] = (df['description'] != 'Unknown').astype(int)
df['has_overview'] = (df['overview'] != 'Unknown').astype(int)
df['has_tags'] = (df['tag_list'] != 'Unknown').astype(int)

# Content richness score
df['content_richness'] = df['has_description'] + df['has_overview'] + df['has_tags']

# Description length analysis
def safe_len(text):
    if text == 'Unknown' or pd.isna(text):
        return 0
    return len(str(text))

df['description_length'] = df['description'].apply(safe_len)
df['overview_length'] = df['overview'].apply(safe_len)

# Tag analysis
def analyze_tags(tag_string):
    if tag_string == 'Unknown' or pd.isna(tag_string):
        return 0, 0
    
    tags = str(tag_string).split(',')
    tag_count = len(tags)
    avg_tag_length = sum(len(tag.strip()) for tag in tags) / tag_count if tag_count > 0 else 0
    
    return tag_count, avg_tag_length

tag_features = df['tag_list'].apply(analyze_tags)
df['tag_count'] = [x[0] for x in tag_features]
df['avg_tag_length'] = [x[1] for x in tag_features]

# Text sophistication score
df['text_sophistication'] = (
    0.3 * (df['description_length'] / (df['description_length'].max() + 1)) +
    0.3 * (df['overview_length'] / (df['overview_length'].max() + 1)) +
    0.4 * (df['tag_count'] / (df['tag_count'].max() + 1))
)

In [31]:
# Convert date columns to datetime
date_cols = ['first_funding_at', 'last_funding_at', 'first_milestone_at', 'last_milestone_at', 'created_at', 'updated_at', 'closed_at']
for col in date_cols:
    if col in df.columns:
        df[col] = pd.to_datetime(df[col], errors='coerce')

# Funding timeline features
df['funding_duration_days'] = (df['last_funding_at'] - df['first_funding_at']).dt.days
df['funding_duration_days'] = df['funding_duration_days'].fillna(0)

# Time to first funding (from founding) - check if founded_at exists, otherwise use founded_year
if 'founded_at' in df.columns:
    df['founded_at'] = pd.to_datetime(df['founded_at'], errors='coerce')
    df['time_to_first_funding_days'] = (df['first_funding_at'] - df['founded_at']).dt.days
else:
    # Create founded_at from founded_year
    df['founded_at_estimated'] = pd.to_datetime(df['founded_year'], format='%Y', errors='coerce')
    df['time_to_first_funding_days'] = (df['first_funding_at'] - df['founded_at_estimated']).dt.days

df['time_to_first_funding_years'] = df['time_to_first_funding_days'] / 365.25

# Milestone timeline features
df['milestone_duration_days'] = (df['last_milestone_at'] - df['first_milestone_at']).dt.days
df['milestone_duration_days'] = df['milestone_duration_days'].fillna(0)

# Activity recency (days since last update)
current_date = pd.Timestamp.now()
df['days_since_last_update'] = (current_date - df['updated_at']).dt.days
df['days_since_last_funding'] = (current_date - df['last_funding_at']).dt.days
df['days_since_last_milestone'] = (current_date - df['last_milestone_at']).dt.days

# Fill NaN values for companies without funding/milestones
df['days_since_last_funding'] = df['days_since_last_funding'].fillna(df['days_since_last_update'])
df['days_since_last_milestone'] = df['days_since_last_milestone'].fillna(df['days_since_last_update'])

# Activity frequency (avoid division by zero)
df['funding_frequency'] = df['funding_rounds'] / (df['funding_duration_days'] / 365.25 + 1)
df['milestone_frequency'] = df['milestones'] / (df['milestone_duration_days'] / 365.25 + 1)

# Recently active flags
df['recently_funded'] = (df['days_since_last_funding'] <= 365).astype(int)  # Funded in last year
df['recently_milestone'] = (df['days_since_last_milestone'] <= 365).astype(int)  # Milestone in last year
df['recently_updated'] = (df['days_since_last_update'] <= 90).astype(int)  # Updated in last 3 months

In [32]:
# Overall business maturity score
df['business_maturity_score'] = (
    0.2 * df['digital_presence_score'] +
    0.2 * df['achievement_score'] +
    0.2 * (df['recently_funded'] + df['recently_milestone'] + df['recently_updated']) / 3 +
    0.2 * df['text_sophistication'] +
    0.2 * df['investment_activity_score']
)

# Ecosystem engagement (how connected/active the company is)
df['ecosystem_engagement'] = (
    0.3 * (df['network_connectivity'] / (df['network_connectivity'].max() + 1)) +
    0.2 * df['has_twitter'] +
    0.3 * (df['content_richness'] / 3)
)

# Strategic positioning (investor appeal factors)
df['strategic_positioning'] = (
    0.25 * df['funding_vs_industry_avg'] +
    0.25 * df['digital_presence_score'] +
    0.25 * df['achievement_score'] +
    0.25 * df['ecosystem_engagement']
) 

# Operational sophistication (professional setup indicators)
df['operational_sophistication'] = (
    0.3 * df['has_domain'] +
    0.2 * df['has_twitter'] +
    0.3 * (df['content_richness'] / 3)
)

In [33]:
df.to_csv('../processed_data/companies_feature_engineering.csv', index=False)

# 1. Under-Capitalized Startup Features


## Step 1: Under-Capitalization Identification & Core Features

```
ALGORITHM: Under-Cap Population Feature Engineering
1. Load funding percentiles from data_cleaning phase:
   - Use 2% threshold established in previous notebook
   - Validate under_capitalized flag exists and is accurate
   - Create under-cap specific feature derivatives

2. Core Under-Cap Features:
   - undercap_survival_months = company_age_years * 12 (survival with minimal funding)
   - bootstrap_efficiency = milestones / (funding_total_usd + 1) (achievements per dollar)
   - capital_efficiency = revenue_proxy / (funding_total_usd + 1) (value creation per dollar)
   - undercap_longevity_score = survival_months / industry_avg_survival

EXPECTED OUTPUT:
- under_capitalized flag validated: "X,XXX companies (X.X%)"
- Bootstrap efficiency distribution analysis
- Capital efficiency metrics by industry
- Survival analysis features for under-cap population
```

In [34]:
# Ensure under_capitalized flag exists
if 'under_capitalized' not in df.columns:
    df['under_capitalized'] = 0  # default to 0 if missing

# Step 1: Under-Cap Core Features
df['undercap_survival_months'] = df['company_age_years'] * 12
df['bootstrap_efficiency'] = df['milestones'].fillna(0) / (df['funding_total_usd'].fillna(0) + 1)

# Capital efficiency: handle missing revenue_proxy
if 'revenue_proxy' in df.columns:
    df['capital_efficiency'] = df['revenue_proxy'] / (df['funding_total_usd'] + 1)
else:
    df['capital_efficiency'] = np.nan

# Industry average survival proxy (or global if unavailable)
if 'category_code' in df.columns:
    industry_avg_survival = df.groupby('category_code')['company_age_years'].transform('mean')
else:
    industry_avg_survival = df['company_age_years'].mean()

df['undercap_longevity_score'] = df['undercap_survival_months'] / (industry_avg_survival * 12 + 1e-6)

# Step 2: Print summary stats
total_undercap = df['under_capitalized'].sum()
pct_undercap = df['under_capitalized'].mean() * 100

print(f"Total under-capitalized companies: {total_undercap:,} ({pct_undercap:.1f}%)")
print(f"Bootstrap efficiency (mean): {df['bootstrap_efficiency'].mean():.2f}")
if df['capital_efficiency'].notna().any():
    print(f"Capital efficiency (mean): {df['capital_efficiency'].mean():.2f}")
else:
    print("Capital efficiency: N/A (no revenue_proxy available)")
print(f"Undercap longevity score (mean): {df['undercap_longevity_score'].mean():.2f}")

Total under-capitalized companies: 169,216 (86.1%)
Bootstrap efficiency (mean): 0.43
Capital efficiency: N/A (no revenue_proxy available)
Undercap longevity score (mean): 1.00


## Step 2: Under-Cap Peer Comparison Features

```
ALGORITHM: Under-Cap Comparative Analysis Features
1. Create under-cap peer groups:
   - Group by [category_code, founding_era] for under-cap companies
   - Calculate under-cap specific benchmarks and percentiles
   - Generate peer comparison metrics

2. Under-Cap Peer Features:
   - undercap_peer_success_rate = success_rate_in_undercap_peer_group
   - undercap_peer_survival_comparison = survival_vs_undercap_peers
   - undercap_sector_density = undercap_companies_in_sector / total_sector_companies
   - undercap_geographic_density = undercap_companies_in_region / total_region_companies

EXPECTED OUTPUT:
- Peer group success rate benchmarks by sector
- Under-cap concentration metrics by geography/industry
- Relative positioning within under-cap ecosystem
- Competitive analysis within funding-constrained environment
```

In [35]:
# Create under-cap peer dataset
undercap_df = df[df['under_capitalized'] == 1].copy()

# Determine grouping columns
merge_cols = [c for c in ['category_code', 'founding_era'] if c in df.columns]

# Initialize peer columns
peer_cols = [
    'undercap_peer_count', 
    'undercap_peer_success_rate',
    'undercap_peer_avg_survival', 
    'undercap_peer_median_milestones',
    'undercap_peer_avg_efficiency',
    'undercap_sector_density',
    'undercap_geographic_density'
]

# Drop existing peer columns to avoid merge conflicts
df = df.drop(columns=[c for c in peer_cols if c in df.columns], errors='ignore')

# Compute peer stats if possible
if len(merge_cols) > 0 and len(undercap_df) > 0:
    # Ensure 'id' exists, else use a surrogate
    id_col = 'id' if 'id' in df.columns else df.index.name if df.index.name else df.index
    undercap_peer_stats = (
        undercap_df.groupby(merge_cols)
        .agg(
            undercap_peer_count=(id_col, 'count'),
            undercap_peer_success_rate=('failure_risk', lambda x: 1 - x.mean() if len(x) > 0 else np.nan),
            undercap_peer_avg_survival=('company_age_years', 'mean'),
            undercap_peer_median_milestones=('milestones', 'median'),
            undercap_peer_avg_efficiency=('bootstrap_efficiency', 'mean')
        )
        .reset_index()
    )
    df = df.merge(undercap_peer_stats, on=merge_cols, how='left')

# Compute sector and geographic density
if 'category_code' in df.columns:
    df['undercap_sector_density'] = df.groupby('category_code')['under_capitalized'].transform('mean')
else:
    df['undercap_sector_density'] = np.nan

if 'country_code' in df.columns:
    df['undercap_geographic_density'] = df.groupby('country_code')['under_capitalized'].transform('mean')
else:
    df['undercap_geographic_density'] = np.nan

# Fill missing values with global averages
global_success = 1 - undercap_df['failure_risk'].mean() if len(undercap_df) > 0 else 0
global_survival = undercap_df['company_age_years'].mean() if len(undercap_df) > 0 else 0
global_efficiency = undercap_df['bootstrap_efficiency'].mean() if len(undercap_df) > 0 else 0
global_milestones = undercap_df['milestones'].median() if len(undercap_df) > 0 else 0

df['undercap_peer_success_rate'] = df.get('undercap_peer_success_rate', np.nan).fillna(global_success)
df['undercap_peer_avg_survival'] = df.get('undercap_peer_avg_survival', np.nan).fillna(global_survival)
df['undercap_peer_count'] = df.get('undercap_peer_count', 0).fillna(0)
df['undercap_peer_median_milestones'] = df.get('undercap_peer_median_milestones', np.nan).fillna(global_milestones)
df['undercap_peer_avg_efficiency'] = df.get('undercap_peer_avg_efficiency', np.nan).fillna(global_efficiency)
df['undercap_sector_density'] = df['undercap_sector_density'].fillna(0)
df['undercap_geographic_density'] = df['undercap_geographic_density'].fillna(0)

# Comparative feature: survival vs undercap peers
df['undercap_peer_survival_comparison'] = df['company_age_years'] - df['undercap_peer_avg_survival']

# Print summary
print(f"Total under-cap companies: {len(undercap_df):,}")
print(f"Average under-cap peer success rate: {df['undercap_peer_success_rate'].mean():.2%}")
print(f"Average under-cap peer survival (years): {df['undercap_peer_avg_survival'].mean():.2f}")
print(f"Average under-cap bootstrap efficiency: {df['undercap_peer_avg_efficiency'].mean():.2f}")
print(f"Average under-cap sector density: {df['undercap_sector_density'].mean():.2%}")
print(f"Average under-cap geographic density: {df['undercap_geographic_density'].mean():.2%}")

Total under-cap companies: 169,216
Average under-cap peer success rate: 0.24%
Average under-cap peer survival (years): 15.67
Average under-cap bootstrap efficiency: 0.49
Average under-cap sector density: 86.10%
Average under-cap geographic density: 86.10%


## Step 3: Bootstrap Success Indicators

```
ALGORITHM: Zero/Minimal Funding Success Features
1. Identify bootstrap success patterns:
   - Companies with $0 funding but high milestones
   - Long survival (3+ years) with minimal funding
   - Growth indicators without traditional VC funding

2. Bootstrap Features:
   - zero_funding_survivor = (funding == 0) & (age > 3 years)
   - bootstrap_milestone_velocity = milestones / months_without_funding
   - organic_growth_indicator = milestone_growth_rate without funding_events
   - self_sustaining_score = operational_longevity without external_capital

EXPECTED OUTPUT:
- Bootstrap success identification: "X companies survived 3+ years with $0 funding"
- Milestone velocity for unfunded companies
- Organic growth pattern analysis
- Self-sustainability scoring framework
```

In [36]:
# Zero funding survivors (3+ years)
df['zero_funding_survivor'] = ((df['funding_total_usd'] == 0) & (df['company_age_years'] >= 3)).astype(int)

# Minimal funding long survivors (under-cap + 5+ years)
df['minimal_funding_long_survivor'] = ((df['under_capitalized'] == 1) & (df['company_age_years'] >= 5)).astype(int)

# High milestones among minimal funding (75th percentile)
milestone_75th = df['milestones'].quantile(0.75)
df['high_milestone_minimal_funding'] = ((df['under_capitalized'] == 1) & (df['milestones'] >= milestone_75th)).astype(int)

# Months to first funding (use survival months for missing)
if 'time_to_first_funding_days' in df.columns:
    df['months_to_first_funding'] = df['time_to_first_funding_days'] / 30.44
else:
    df['months_to_first_funding'] = np.nan
df['months_to_first_funding'] = df['months_to_first_funding'].fillna(df.get('undercap_survival_months', 36))  # default 36 months

# Milestone velocity per month
df['bootstrap_milestone_velocity'] = df['milestones'] / (df['months_to_first_funding'] + 1)

# Organic growth indicator (milestones per year without recent funding)
if 'recently_funded' in df.columns and 'milestones_per_year' in df.columns:
    df['organic_growth_indicator'] = np.where(df['recently_funded'] == 0, df['milestones_per_year'], 0)
else:
    df['organic_growth_indicator'] = 0

# Growth without VC (minimal funding but above median milestones)
milestone_median = df['milestones'].median()
funding_25th = df['funding_total_usd'].quantile(0.25)
df['growth_without_vc'] = ((df['funding_total_usd'] < funding_25th) & (df['milestones'] > milestone_median)).astype(int)

# Self-sustaining score (composite metric)
df['self_sustaining_score'] = (
    0.3 * (df.get('undercap_survival_months', 0) / (df.get('undercap_survival_months', 36).max() + 1)) +
    0.3 * np.clip(df.get('bootstrap_efficiency', 0) / df.get('bootstrap_efficiency', pd.Series([1])).quantile(0.99), 0, 1) +
    0.2 * df['zero_funding_survivor'] +
    0.2 * np.clip(df.get('undercap_longevity_score', 0), 0, 1)
)
# Clip to 0–1
df['self_sustaining_score'] = np.clip(df['self_sustaining_score'], 0, 1)

# Sustainability tiers
df['sustainability_tier'] = pd.cut(
    df['self_sustaining_score'],
    bins=[0, 0.25, 0.5, 0.75, 1.0],
    labels=['Low', 'Medium', 'High', 'Very High']
)

# Print summary outputs
print(f"Companies survived 3+ years with $0 funding: {df['zero_funding_survivor'].sum():,}")
print(f"Under-cap long survivors (5+ years): {df['minimal_funding_long_survivor'].sum():,}")
print(f"High milestone under-cap companies (>= 75th percentile): {df['high_milestone_minimal_funding'].sum():,}")
print(f"Average bootstrap milestone velocity: {df['bootstrap_milestone_velocity'].mean():.2f} per month")
print(f"Average organic growth indicator: {df['organic_growth_indicator'].mean():.2f}")
print(f"Companies showing growth without VC: {df['growth_without_vc'].sum():,}")
print(f"Average self-sustaining score: {df['self_sustaining_score'].mean():.2f}")

tier_counts = df['sustainability_tier'].value_counts().reindex(['Low', 'Medium', 'High', 'Very High'], fill_value=0)
print("Sustainability tiers distribution:")
for tier, count in tier_counts.items():
    print(f"  {tier}: {count:,}")

Companies survived 3+ years with $0 funding: 0
Under-cap long survivors (5+ years): 169,216
High milestone under-cap companies (>= 75th percentile): 76,693
Average bootstrap milestone velocity: 0.00 per month
Average organic growth indicator: 0.04
Companies showing growth without VC: 3,276
Average self-sustaining score: 0.34
Sustainability tiers distribution:
  Low: 50,188
  Medium: 136,398
  High: 9,651
  Very High: 293


# 2. Founder Team Features

## Step 1: Team Size Extraction from Available Data

```
ALGORITHM: Founder Team Feature Engineering
1. Parse relationships column for team insights:
   - Extract founder, co-founder, employee counts from relationships string
   - Parse created_by field for founding team information
   - Use text analysis to identify team size indicators

2. Team Size Features:
   - founder_count = extracted_founder_count from relationships
   - cofounder_count = extracted_cofounder_count from relationships  
   - team_size_proxy = founder_count + cofounder_count + key_employees
   - solo_founder = (founder_count == 1) & (cofounder_count == 0)

EXPECTED OUTPUT:
- Team size distribution: "Solo: X%, Small team (2-3): X%, Large team (4+): X%"
- Founder team extraction success rate
- Team composition analysis by industry/success rate
```

In [37]:
# Ensure relationships column exists
if 'relationships' not in df.columns:
    df['relationships'] = None

# Initialize founder and cofounder counts
df['founder_count'] = 0
df['cofounder_count'] = 0

# Extract founder/cofounder counts
def extract_founder_counts(rel):
    founder = 0
    cofounder = 0
    if isinstance(rel, list):
        for r in rel:
            role = r.get('title', '').lower() if isinstance(r, dict) else ''
            if 'founder' in role and 'co' not in role:
                founder += 1
            elif 'co-founder' in role or 'cofounder' in role:
                cofounder += 1
    return pd.Series([founder, cofounder])

df[['founder_count', 'cofounder_count']] = df['relationships'].apply(extract_founder_counts)

# Employee count handling
if 'employee_count' in df.columns:
    df['employee_count'] = df['employee_count'].fillna(0).clip(lower=0)
    df['team_size_proxy'] = df['founder_count'] + df['cofounder_count'] + df['employee_count']
else:
    df['team_size_proxy'] = df['founder_count'] + df['cofounder_count']

# Solo founder flag
df['solo_founder'] = ((df['founder_count'] == 1) & (df['cofounder_count'] == 0)).astype(int)

# Team size categories
df['team_size_category'] = pd.cut(
    df['team_size_proxy'],
    bins=[0, 1, 3, 1000],
    labels=['Solo', 'Small (2-3)', 'Large (4+)']
)

# Summary statistics
print(f"Total companies: {len(df):,}")
print(f"Total solo founder companies: {df['solo_founder'].sum():,}")
print(f"Average number of founders: {df['founder_count'].mean():.2f}")
print(f"Average number of cofounders: {df['cofounder_count'].mean():.2f}")
print(f"Average team size proxy: {df['team_size_proxy'].mean():.2f}")

print("\nTeam size category distribution:")
team_size_counts = df['team_size_category'].value_counts().reindex(['Solo', 'Small (2-3)', 'Large (4+)'], fill_value=0)
for category, count in team_size_counts.items():
    print(f"  {category}: {count:,}")

Total companies: 196,530
Total solo founder companies: 0
Average number of founders: 0.00
Average number of cofounders: 0.00
Average team size proxy: 0.00

Team size category distribution:
  Solo: 0
  Small (2-3): 0
  Large (4+): 0



## Step 2: Team-Based Success Analysis

```
ALGORITHM: Team Dynamics Impact Features
1. Analyze team size impact on success:
   - Success rates by team size categories
   - Funding patterns by team composition
   - Industry preferences for team sizes

2. Team Impact Features:
   - team_size_success_correlation = success_rate_for_team_size_category
   - team_funding_advantage = funding_vs_avg by team_size
   - team_survival_benefit = survival_rate by team_composition
   - optimal_team_size_flag = team_size in optimal_range_for_industry

EXPECTED OUTPUT:
- Team size success correlation by industry
- Optimal team size recommendations by sector
- Team composition impact on funding acquisition
- Solo founder success rate analysis
```

In [38]:
# Ensure 'team_size_category' exists
if 'team_size_category' not in df.columns:
    df['team_size_category'] = np.nan

# Convert to string for consistent comparison
df['team_size_category'] = df['team_size_category'].astype(str)

# Compute optimal team size per industry
if 'category_code' in df.columns:
    optimal_team_size = (
        df.groupby(['category_code', 'team_size_category'], observed=True)
        .agg(success_rate=('failure_risk', lambda x: 1 - x.mean() if len(x.dropna()) > 0 else np.nan))
        .reset_index()
    )

    if not optimal_team_size.empty:
        # Get the team_size_category with max success_rate per industry
        optimal_team_size_idx = optimal_team_size.groupby('category_code')['success_rate'].idxmax()
        optimal_team_df = optimal_team_size.loc[optimal_team_size_idx, ['category_code', 'team_size_category']].copy()
        optimal_team_df = optimal_team_df.rename(columns={'team_size_category': 'optimal_team_size_category'})
        df = df.merge(optimal_team_df, on='category_code', how='left')
    else:
        df['optimal_team_size_category'] = 'None'
else:
    df['optimal_team_size_category'] = 'None'

# Ensure optimal team size column is string
df['optimal_team_size_category'] = df['optimal_team_size_category'].astype(str)

# Create flag
df['optimal_team_size_flag'] = (df['team_size_category'] == df['optimal_team_size_category']).astype(int)

# Summary statistics
print(f"Total industries analyzed: {df['category_code'].nunique():,}")
print("Optimal team size category per industry:")
print(df[['category_code', 'optimal_team_size_category']].drop_duplicates().sort_values('category_code').to_string(index=False))
print(f"\nCompanies matching optimal team size: {df['optimal_team_size_flag'].sum():,} ({df['optimal_team_size_flag'].mean():.2%})")

Total industries analyzed: 42
Optimal team size category per industry:
   category_code optimal_team_size_category
     advertising                        nan
       analytics                        nan
      automotive                        nan
         biotech                        nan
       cleantech                        nan
      consulting                        nan
          design                        nan
       ecommerce                        nan
       education                        nan
      enterprise                        nan
         fashion                        nan
         finance                        nan
     games_video                        nan
      government                        nan
        hardware                        nan
          health                        nan
     hospitality                        nan
           legal                        nan
           local                        nan
   manufacturing                        nan
     

## Step 3: Team Quality Proxy Features 

```
ALGORITHM: Team Experience and Quality Indicators
1. Create team quality proxies from available data:
   - Network connectivity as team experience indicator
   - Digital sophistication as team professionalism
   - Content quality as team communication skills

2. Team Quality Features:
   - team_network_strength = relationships_count / team_size_proxy
   - team_digital_maturity = digital_presence_score / team_size_proxy
   - team_communication_quality = text_sophistication / team_size_proxy
   - founding_team_experience = years_experience_proxy from network_data

EXPECTED OUTPUT:
- Team quality scoring methodology
- Experience proxy validation metrics
- Team competency assessment framework
- Founding team strength indicators
```

In [39]:
# Ensure necessary columns exist
for col in ['relationships_count', 'team_size_proxy', 'digital_presence_score', 
            'text_sophistication', 'years_experience_proxy']:
    if col not in df.columns:
        df[col] = 0  # fallback default

# Compute team quality proxies
df['team_network_strength'] = df['relationships_count'] / df['team_size_proxy'].replace(0, 1)
df['team_digital_maturity'] = df['digital_presence_score'] / df['team_size_proxy'].replace(0, 1)
df['team_communication_quality'] = df['text_sophistication'] / df['team_size_proxy'].replace(0, 1)
df['founding_team_experience'] = df['years_experience_proxy']

# Fill any remaining NaNs with 0
team_cols = ['team_network_strength', 'team_digital_maturity', 
             'team_communication_quality', 'founding_team_experience']
df[team_cols] = df[team_cols].fillna(0)

# Composite team score
experience_max = df['founding_team_experience'].max() if df['founding_team_experience'].max() > 0 else 1
df['team_composite_score'] = (
    0.25 * df['team_network_strength'] +
    0.25 * df['team_digital_maturity'] +
    0.25 * df['team_communication_quality'] +
    0.25 * (df['founding_team_experience'] / experience_max)
)

# Optional: clip composite score between 0 and 1
df['team_composite_score'] = df['team_composite_score'].clip(0, 1)

# Print summary statistics
print(f"Average team network strength: {df['team_network_strength'].mean():.2f}")
print(f"Average team digital maturity: {df['team_digital_maturity'].mean():.2f}")
print(f"Average team communication quality: {df['team_communication_quality'].mean():.2f}")
print(f"Average founding team experience (years): {df['founding_team_experience'].mean():.2f}")
print(f"Average composite team score: {df['team_composite_score'].mean():.2f}")

Average team network strength: 1.88
Average team digital maturity: 0.87
Average team communication quality: 0.13
Average founding team experience (years): 0.00
Average composite team score: 0.53


# 3. Stage Transition Dynamics

## Step 1: Dynamic Stage Progression Analysis

```
ALGORITHM: Stage Transition Feature Engineering
1. Enhance existing funding_stage with transition dynamics:
   - Calculate stage progression velocity (stages_per_year)
   - Identify stage transition patterns and timing
   - Measure time spent in each funding stage

2. Stage Transition Features:
   - funding_stage_numeric = map stages to numeric scale (0-6)
   - stage_progression_rate = funding_stage_numeric / (company_age_years + 1)
   - time_in_current_stage = estimated_time_in_current_funding_stage
   - stage_transition_velocity = stages_progressed / years_active

EXPECTED OUTPUT:
- Stage progression rate distribution
- Average time spent in each funding stage
- Stage transition success patterns
- Velocity benchmarks by industry
```

In [40]:
# Funding stage mapping (numeric scale 0-6)
funding_stage_mapping = {
    'pre-seed': 0,
    'seed': 1,
    'angel': 2,
    'series_a': 3,
    'series_b': 4,
    'series_c': 5,
    'series_d+': 5,
    'ipo': 6,
    'acquired': 6,
    'closed': 0
}

# Ensure funding_stage column exists
if 'funding_stage' not in df.columns:
    df['funding_stage'] = np.nan

df['funding_stage_clean'] = df['funding_stage'].astype(str).str.strip().str.lower()
df['funding_stage_numeric'] = df['funding_stage_clean'].map(funding_stage_mapping)

# Inspect unmapped stages
unmapped_stages = df[df['funding_stage_numeric'].isna()]['funding_stage'].unique()
if len(unmapped_stages) > 0:
    print("Warning: Unmapped funding stages:", unmapped_stages)

# Safe company age to avoid division by zero
df['company_age_years_safe'] = df['company_age_years'].replace(0, np.nan)

# Stage progression metrics
df['stage_progression_rate'] = df['funding_stage_numeric'] / df['company_age_years_safe']

# Stage transition velocity: can be refined if previous stage data exists
df['stage_transition_velocity'] = df['funding_stage_numeric'] / df['company_age_years_safe']

# Time in current stage (months)
if 'months_since_last_funding' in df.columns:
    df['time_in_current_stage'] = df['months_since_last_funding']
elif 'last_funding_at' in df.columns and 'current_date' not in locals():
    current_date = pd.Timestamp.now()
    df['time_in_current_stage'] = ((current_date - df['last_funding_at']).dt.days / 30.44).fillna(np.nan)
else:
    df['time_in_current_stage'] = np.nan

# Summary statistics
valid_numeric = df['funding_stage_numeric'].notna().sum()
print(f"Valid companies with numeric stage: {valid_numeric:,}")
print(f"Average funding stage numeric: {df['funding_stage_numeric'].mean(skipna=True):.2f}")
print(f"Average stage progression rate: {df['stage_progression_rate'].mean(skipna=True):.2f} stages/year")
print(f"Median stage progression rate: {df['stage_progression_rate'].median(skipna=True):.2f} stages/year")
print(f"Average time in current stage (months): {df['time_in_current_stage'].mean(skipna=True):.2f}")
print(f"Median time in current stage (months): {df['time_in_current_stage'].median(skipna=True):.2f}")
print(f"Average stage transition velocity: {df['stage_transition_velocity'].mean(skipna=True):.2f} stages/year")

Valid companies with numeric stage: 19,995
Average funding stage numeric: 0.47
Average stage progression rate: 0.03 stages/year
Median stage progression rate: 0.00 stages/year
Average time in current stage (months): 174.65
Median time in current stage (months): 166.00
Average stage transition velocity: 0.03 stages/year


## Step 2: Stage Transition Risk Indicators

```
ALGORITHM: Stage Stagnation and Progression Analysis
1. Identify stage transition risk patterns:
   - Companies stuck in early stages too long
   - Rapid progression indicating hot startups
   - Stage regression indicators (funding decline)

2. Stage Risk Features:
   - stuck_in_early_stage = (stage <= Seed) & (age > 5 years)
   - rapid_progression = (stage >= Growth) & (age <= 3 years)
   - stage_appropriate_age = age_matches_expected_stage_timeline
   - progression_risk_score = deviation_from_normal_progression_pattern

EXPECTED OUTPUT:
- Early stage stagnation identification: "X companies stuck in seed stage 5+ years"
- Rapid progression patterns: "X companies reached growth stage in <3 years"
- Stage-age alignment analysis
- Progression risk scoring distribution
```

In [41]:
# Ensure funding_stage_numeric exists
if 'funding_stage_numeric' not in df.columns:
    raise ValueError("Column 'funding_stage_numeric' must exist. Run Step 1 first.")

# Safe company age to avoid division by zero
df['company_age_years_safe'] = df['company_age_years'].replace(0, np.nan)

# Define stage thresholds
early_stage_threshold = 1  # Seed/Angel or lower
growth_stage_threshold = 2  # Series A or above

# 1. Stuck in early stage (>5 years in Seed/Angel)
df['stuck_in_early_stage'] = ((df['funding_stage_numeric'] <= early_stage_threshold) &
                              (df['company_age_years_safe'] > 5)).astype(int)

# 2. Rapid progression (Growth stage reached <=3 years)
df['rapid_progression'] = ((df['funding_stage_numeric'] >= growth_stage_threshold) &
                           (df['company_age_years_safe'] <= 3)).astype(int)

# 3. Stage-appropriate age (heuristic)
expected_age_per_stage = {0: 0.5, 1: 2, 2: 3, 3: 5, 4: 6, 5: 8, 6: 10}  # Adjusted minimums for early stages
df['expected_age_for_stage'] = df['funding_stage_numeric'].map(expected_age_per_stage)

# Flag if company age is within 50-150% of expected stage age
df['stage_appropriate_age'] = ((df['company_age_years_safe'] >= 0.5 * df['expected_age_for_stage']) &
                               (df['company_age_years_safe'] <= 1.5 * df['expected_age_for_stage'])).astype(int)

# 4. Progression risk score: deviation from expected age
df['progression_risk_score'] = abs(df['company_age_years_safe'] - df['expected_age_for_stage']) / \
                               df['expected_age_for_stage'].replace(0, np.nan)

# Print summary statistics
print(f"Companies stuck in early stage (>5 years): {df['stuck_in_early_stage'].sum():,}")
print(f"Companies with rapid progression (Growth stage <=3 years): {df['rapid_progression'].sum():,}")
print(f"Average progression risk score: {df['progression_risk_score'].mean(skipna=True):.2f}")
print(f"Companies with stage-appropriate age: {df['stage_appropriate_age'].sum():,}")

Companies stuck in early stage (>5 years): 19,995
Companies with rapid progression (Growth stage <=3 years): 0
Average progression risk score: 19.68
Companies with stage-appropriate age: 0


## Step 3: Industry-Specific Stage Benchmarks

```
ALGORITHM: Industry Stage Transition Benchmarks
1. Create industry-specific stage progression norms:
   - Calculate industry average progression timelines
   - Identify sector-specific stage transition patterns
   - Generate industry stage transition benchmarks

2. Industry Stage Features:
   - industry_stage_norm = normal_stage_for_age_in_industry
   - stage_vs_industry_expectation = current_stage - expected_stage_for_industry
   - industry_progression_percentile = percentile_ranking_in_industry_progression
   - sector_stage_advantage = stage_advancement_vs_industry_avg

EXPECTED OUTPUT:
- Industry-specific stage progression timelines
- Sector stage transition benchmark tables
- Stage advancement percentile rankings
- Industry progression advantage analysis
```

In [42]:
# Ensure necessary columns exist
required_cols = ['category_code', 'funding_stage_numeric', 'company_age_years_safe']
missing_cols = [c for c in required_cols if c not in df.columns]
if missing_cols:
    raise ValueError(f"Missing required columns for industry benchmarking: {missing_cols}")

# Compute industry-specific average stage per company age
industry_stage_stats = (
    df.groupby('category_code', observed=True)
      .agg(
          industry_avg_stage=('funding_stage_numeric', 'mean'),
          industry_avg_age=('company_age_years_safe', 'mean')
      )
      .reset_index()
)

# Merge industry benchmarks back to main df
df = df.merge(industry_stage_stats, on='category_code', how='left')

# Stage vs industry expectation
df['stage_vs_industry_expectation'] = df['funding_stage_numeric'] - df['industry_avg_stage']

# Industry progression percentile (rank within category by funding_stage_numeric)
df['industry_progression_percentile'] = df.groupby('category_code')['funding_stage_numeric']\
                                         .rank(pct=True)

# Sector stage advantage: positive if ahead of industry average
df['sector_stage_advantage'] = df['funding_stage_numeric'] - df['industry_avg_stage']

# Print validation outputs
print("Sample industry stage benchmarking metrics:")
print(df[['category_code', 'funding_stage_numeric', 'industry_avg_stage', 
          'stage_vs_industry_expectation', 'industry_progression_percentile', 
          'sector_stage_advantage']].head(10).to_string(index=False))

# Summary statistics
print(f"\nAverage stage vs industry expectation: {df['stage_vs_industry_expectation'].mean():.2f}")
print(f"Average industry progression percentile: {df['industry_progression_percentile'].mean():.2f}")
print(f"Average sector stage advantage: {df['sector_stage_advantage'].mean():.2f}")

Sample industry stage benchmarking metrics:
  category_code  funding_stage_numeric  industry_avg_stage  stage_vs_industry_expectation  industry_progression_percentile  sector_stage_advantage
            web                    NaN            0.304680                            NaN                              NaN                     NaN
    games_video                    NaN            0.398089                            NaN                              NaN                     NaN
    games_video                    NaN            0.398089                            NaN                              NaN                     NaN
network_hosting                    NaN            0.474510                            NaN                              NaN                     NaN
    games_video                    NaN            0.398089                            NaN                              NaN                     NaN
    advertising                    NaN            0.464043                

# 4. Bias Detection Features

## Step 1: Geographic Funding Bias Analysis

```
ALGORITHM: Geographic Bias Feature Engineering
1. Calculate geographic funding bias indicators:
   - State-level funding gaps vs national averages
   - Regional funding bias patterns
   - Urban vs rural funding accessibility

2. Geographic Bias Features:
   - state_funding_bias = company_funding - state_median_funding
   - regional_funding_disadvantage = funding_gap_vs_regional_average
   - geographic_funding_percentile = funding_percentile_within_state
   - urban_rural_bias = funding_advantage_based_on_location_type

EXPECTED OUTPUT:
- State-level funding bias analysis
- Regional funding gap identification
- Geographic disadvantage scoring
- Urban/rural funding accessibility metrics
```

In [43]:
# Ensure geographic columns exist
if 'state' not in df.columns:
    df['state'] = 'unknown'
else:
    df['state'] = df['state'].fillna('unknown')

if 'region' not in df.columns:
    df['region'] = 'unknown'
else:
    df['region'] = df['region'].fillna('unknown')

if 'location_type' not in df.columns:
    df['location_type'] = 'unknown'
else:
    df['location_type'] = df['location_type'].fillna('unknown')

# Only compute metrics for rows with valid funding
funding_mask = df['funding_total_usd'].notna()

# State median funding
state_median = (
    df[funding_mask]
    .groupby('state', observed=True)['funding_total_usd']
    .median()
    .reset_index()
    .rename(columns={'funding_total_usd': 'state_median_funding'})
)
df = df.merge(state_median, on='state', how='left')

# State funding bias
df['state_funding_bias'] = df['funding_total_usd'] - df['state_median_funding']

# Regional funding disadvantage
region_avg = (
    df[funding_mask]
    .groupby('region', observed=True)['funding_total_usd']
    .mean()
    .reset_index()
    .rename(columns={'funding_total_usd': 'region_avg_funding'})
)
df = df.merge(region_avg, on='region', how='left')
df['regional_funding_disadvantage'] = df['region_avg_funding'] - df['funding_total_usd']

# Geographic funding percentile within state
df['geographic_funding_percentile'] = df.groupby('state')['funding_total_usd']\
                                        .rank(pct=True, method='max')

# Urban/rural bias
df['urban_rural_bias'] = df.groupby('location_type')['funding_total_usd']\
                           .transform(lambda x: x - x.median())

# Print validation
print(df[['state', 'funding_total_usd', 'state_funding_bias', 
          'region', 'regional_funding_disadvantage',
          'geographic_funding_percentile', 'location_type', 'urban_rural_bias']].head(10))

     state  funding_total_usd  state_funding_bias       region  \
0  unknown         39750000.0          37185000.0      seattle   
1  unknown                NaN                 NaN  los angeles   
2  unknown                NaN                 NaN       sf bay   
3  unknown                NaN                 NaN      unknown   
4  unknown                NaN                 NaN      unknown   
5  unknown                NaN                 NaN       agadir   
6  unknown                NaN                 NaN     vadodara   
7  unknown                NaN                 NaN      unknown   
8  unknown                NaN                 NaN     new york   
9  unknown                NaN                 NaN      unknown   

   regional_funding_disadvantage  geographic_funding_percentile location_type  \
0                  -1.782740e+07                       0.915976       unknown   
1                            NaN                            NaN       unknown   
2                            N

## Step 2: Industry Sector Bias Analysis

```
ALGORITHM: Industry Funding Bias Feature Engineering
1. Identify industry-specific funding bias patterns:
   - Sector funding preferences and discrimination
   - Industry funding accessibility gaps
   - Technology vs traditional industry bias

2. Industry Bias Features:
   - industry_funding_bias = company_funding - industry_median_funding
   - sector_discrimination_score = funding_gap_vs_similar_companies
   - tech_industry_advantage = funding_premium_for_tech_sectors
   - traditional_industry_penalty = funding_disadvantage_for_traditional_sectors

EXPECTED OUTPUT:
- Industry funding bias matrix
- Sector discrimination identification
- Technology sector funding advantages
- Traditional industry funding challenges
```

In [44]:
# Ensure columns exist
if 'category_code' not in df.columns:
    df['category_code'] = 'Unknown'
else:
    df['category_code'] = df['category_code'].fillna('Unknown')

if 'funding_total_usd' not in df.columns:
    df['funding_total_usd'] = 0
else:
    df['funding_total_usd'] = df['funding_total_usd'].fillna(0)

# Industry median funding
industry_median = df.groupby('category_code')['funding_total_usd'].median()
df['industry_median_funding'] = df['category_code'].map(industry_median)

# Industry funding bias
df['industry_funding_bias'] = df['funding_total_usd'] - df['industry_median_funding']
df['sector_discrimination_score'] = df['industry_funding_bias'] / (df['industry_median_funding'] + 1)

# Define categories
tech_categories = ['software', 'internet', 'biotech', 'hardware']
traditional_categories = ['manufacturing', 'retail', 'construction']

# Tech advantage and traditional penalty
df['tech_industry_advantage'] = np.where(
    df['category_code'].isin(tech_categories),
    df['funding_total_usd'] - df['industry_median_funding'],
    0
)

df['traditional_industry_penalty'] = np.where(
    df['category_code'].isin(traditional_categories),
    df['funding_total_usd'] - df['industry_median_funding'],
    0
)

# --- Print validation ---
print(df[['category_code', 'funding_total_usd', 'industry_median_funding', 
          'industry_funding_bias', 'sector_discrimination_score', 
          'tech_industry_advantage', 'traditional_industry_penalty']].head(10))

print(f"\nAverage industry funding bias: {df['industry_funding_bias'].mean():.2f}")
print(f"Average sector discrimination score: {df['sector_discrimination_score'].mean():.2f}")
print(f"Companies with tech advantage >0: {(df['tech_industry_advantage'] > 0).sum():,}")
print(f"Companies with traditional penalty <0: {(df['traditional_industry_penalty'] < 0).sum():,}")

     category_code  funding_total_usd  industry_median_funding  \
0              web         39750000.0                      0.0   
1      games_video                0.0                      0.0   
2      games_video                0.0                      0.0   
3  network_hosting                0.0                      0.0   
4      games_video                0.0                      0.0   
5      advertising                0.0                      0.0   
6        cleantech                0.0                  50000.0   
7            other                0.0                      0.0   
8      advertising                0.0                      0.0   
9       enterprise                0.0                      0.0   

   industry_funding_bias  sector_discrimination_score  \
0             39750000.0                 3.975000e+07   
1                    0.0                 0.000000e+00   
2                    0.0                 0.000000e+00   
3                    0.0                 0.00

## Step 3: Temporal and Systemic Bias Analysis

```
ALGORITHM: Era and Systemic Bias Feature Engineering
1. Analyze temporal funding bias patterns:
   - Founding era bias (pre vs post financial crisis)
   - Economic cycle impact on funding accessibility
   - Systemic bias accumulation over time

2. Temporal Bias Features:
   - era_funding_bias = company_funding - era_median_funding
   - economic_cycle_impact = funding_advantage_disadvantage_by_era
   - systemic_bias_accumulation = combined_bias_score_across_dimensions
   - funding_era_disadvantage = penalty_for_unfavorable_founding_timing

EXPECTED OUTPUT:
- Era-based funding bias analysis
- Economic cycle impact assessment
- Systemic bias accumulation scoring
- Temporal disadvantage identification
```

In [45]:
# Ensure founding_era and funding_total_usd exist
df['founding_era'] = df.get('founding_era', 'Unknown')
df['funding_total_usd'] = df.get('funding_total_usd', 0)

# Compute era median funding
era_median_funding = df.groupby('founding_era')['funding_total_usd'].median()
df['era_median_funding'] = df['founding_era'].map(era_median_funding)

# Era funding bias
df['era_funding_bias'] = df['funding_total_usd'] - df['era_median_funding']

# Economic cycle impact
# Example: Define eras by economic cycles, then compute relative deviation
economic_cycles = {
    'pre_2008': ['pre_2000', '2000-2007'],
    'post_2008': ['2008-2015', '2016+']
}

# Map eras to cycles
def map_cycle(era):
    for cycle, eras in economic_cycles.items():
        if era in eras:
            return cycle
    return 'other'

df['economic_cycle'] = df['founding_era'].map(map_cycle)

# Compute average funding per cycle
cycle_median_funding = df.groupby('economic_cycle')['funding_total_usd'].median()
df['economic_cycle_impact'] = df['economic_cycle'].map(cycle_median_funding)
df['economic_cycle_impact'] = df['funding_total_usd'] - df['economic_cycle_impact']

# Systemic bias accumulation
# Sum normalized biases across dimensions (industry, era, sector)
df['systemic_bias_accumulation'] = (
    df.get('industry_funding_bias', 0)/df['industry_median_funding'].replace(0,1) +
    df['era_funding_bias']/df['era_median_funding'].replace(0,1) +
    df.get('sector_discrimination_score', 0)
)

# Funding era disadvantage
# Penalize eras with median funding below overall median
overall_median = df['funding_total_usd'].median()
df['funding_era_disadvantage'] = df['era_median_funding'].apply(lambda x: max(0, overall_median - x))

print(df[['founding_era', 'funding_total_usd', 'era_median_funding', 
          'era_funding_bias', 'economic_cycle', 'economic_cycle_impact', 
          'systemic_bias_accumulation', 'funding_era_disadvantage']].head(10))

# Summary statistics
print(f"\nAverage era funding bias: {df['era_funding_bias'].mean():.2f}")
print(f"Average economic cycle impact: {df['economic_cycle_impact'].mean():.2f}")
print(f"Average systemic bias accumulation: {df['systemic_bias_accumulation'].mean():.2f}")
print(f"Average funding era disadvantage: {df['funding_era_disadvantage'].mean():.2f}")

# Optional: counts per era and cycle
print("\nCompany counts by founding era:")
print(df['founding_era'].value_counts())
print("\nCompany counts by economic cycle:")
print(df['economic_cycle'].value_counts())

  founding_era  funding_total_usd  era_median_funding  era_funding_bias  \
0    2001-2008         39750000.0                 0.0        39750000.0   
1    2001-2008                0.0                 0.0               0.0   
2    2001-2008                0.0                 0.0               0.0   
3    2001-2008                0.0                 0.0               0.0   
4    2001-2008                0.0                 0.0               0.0   
5    2001-2008                0.0                 0.0               0.0   
6    2001-2008                0.0                 0.0               0.0   
7    2009-2014                0.0                 0.0               0.0   
8    2001-2008                0.0                 0.0               0.0   
9    2009-2014                0.0                 0.0               0.0   

  economic_cycle  economic_cycle_impact  systemic_bias_accumulation  \
0          other             39750000.0                1.192500e+08   
1          other                

# 5. Geographic Heatmap-Ready Features

## Step 1: State-Level Aggregation Features

```
ALGORITHM: State-Level Heatmap Data Preparation
1. Create state-level startup ecosystem metrics:
   - Startup density per state (companies per capita)
   - State success rate aggregations
   - State funding concentration metrics
   - State risk profile aggregations

2. State Aggregation Features:
   - state_startup_density = companies_in_state / state_population_proxy
   - state_success_rate = success_rate_for_companies_in_state
   - state_avg_funding = average_funding_amount_in_state
   - state_risk_score = average_risk_score_for_state

EXPECTED OUTPUT:
- State-level startup ecosystem metrics
- Success rate heatmap data by state
- Funding concentration heatmap data
- Risk profile geographic distribution
```

In [46]:
# Ensure state_code exists
df['state_code'] = df.get('state_code', 'Unknown')

# Example: mock state population proxy if not available
# Replace with real population data if available
state_population = {
    'CA': 39538223, 'NY': 20201249, 'TX': 29145505, 'Unknown': 1
}
df['state_population'] = df['state_code'].map(lambda x: state_population.get(x, 1))

# Aggregate state-level metrics
state_stats = df.groupby('state_code').agg(
    companies_in_state=('id', 'count'),
    state_success_rate=('failure_risk', lambda x: 1 - x.mean() if len(x) > 0 else np.nan),
    state_avg_funding=('funding_total_usd', 'mean'),
    state_risk_score=('risk_tier', 'mean')
).reset_index()

# Compute density per population proxy
state_stats['state_startup_density'] = state_stats['companies_in_state'] / state_stats['state_code'].map(state_population)

# Merge back to main dataframe
df = df.merge(
    state_stats[['state_code', 'state_startup_density', 'state_success_rate', 'state_avg_funding', 'state_risk_score']],
    on='state_code',
    how='left'
)

print(state_stats.head(10))

# Summary statistics
print(f"\nAverage state startup density: {state_stats['state_startup_density'].mean():.6f}")
print(f"Average state success rate: {state_stats['state_success_rate'].mean():.2%}")
print(f"Average state funding: ${state_stats['state_avg_funding'].mean():,.2f}")
print(f"Average state risk score: {state_stats['state_risk_score'].mean():.2f}")

# Optional: distribution of companies per state
print("\nCompanies per state:")
print(state_stats[['state_code', 'companies_in_state']].sort_values('companies_in_state', ascending=False))

  state_code  companies_in_state  state_success_rate  state_avg_funding  \
0         ak                  16            0.187500       3.188881e+05   
1         al                 178            0.286517       8.323243e+06   
2         ar                  85            0.270588       2.522311e+06   
3         az                 760            0.223684       2.821740e+06   
4         ca               16488            0.372877       8.354919e+06   
5         co                1173            0.367434       5.286005e+06   
6         ct                 537            0.374302       4.368240e+06   
7         dc                 373            0.227882       4.059957e+06   
8         de                 181            0.171271       1.920272e+06   
9         fl                2154            0.208914       2.738450e+06   

   state_risk_score  state_startup_density  
0          1.687500                    NaN  
1          1.494382                    NaN  
2          1.541176                    

## Step 2: Regional and Metropolitan Area Features

```
ALGORITHM: Regional Heatmap Granularity Enhancement
1. Create multi-level geographic aggregations:
   - Regional startup ecosystem analysis
   - Metropolitan area startup concentrations
   - Rural vs urban startup patterns

2. Regional Features:
   - region_startup_concentration = startup_density_by_region
   - metro_area_advantage = funding_advantage_in_major_cities
   - rural_startup_challenges = disadvantages_outside_major_metros
   - geographic_cluster_strength = startup_ecosystem_network_effects

EXPECTED OUTPUT:
- Regional startup concentration analysis
- Metropolitan area ecosystem strength
- Rural startup ecosystem challenges
- Geographic cluster network effects
```

In [47]:
# Ensure region and city exist
df['region'] = df.get('region', 'Unknown')
df['city'] = df.get('city', 'Unknown')

# Regional startup concentration and metrics
region_stats = df.groupby('region').agg(
    region_startup_count=('id', 'count'),
    region_avg_funding_region=('funding_total_usd', 'mean'),
    region_success_rate=('failure_risk', lambda x: 1 - x.mean() if len(x) > 0 else np.nan)
).reset_index()

# Max region count for normalization
max_region_count = region_stats['region_startup_count'].max()
region_stats['geographic_cluster_strength'] = region_stats['region_startup_count'] / max_region_count

# Merge regional metrics back to main dataframe
df = df.merge(region_stats, on='region', how='left')

# Metro area flag
major_metros = ['San Francisco', 'New York', 'Los Angeles', 'Boston', 'Chicago']
df['metro_area_flag'] = df['city'].isin(major_metros).astype(int)

# Metro area funding advantage (vectorized)
metro_avg_funding = df.loc[df['metro_area_flag'] == 1, 'funding_total_usd'].mean()
df['metro_area_advantage'] = np.where(df['metro_area_flag'] == 1,
                                      df['funding_total_usd'] - metro_avg_funding,
                                      0)

# Rural startup challenges (vectorized)
df['rural_startup_challenges'] = np.where(df['metro_area_flag'] == 0,
                                          metro_avg_funding - df['funding_total_usd'],
                                          0)

## Step 3: Geographic Investment Opportunity Mapping

```
ALGORITHM: Investment Opportunity Geographic Features
1. Create investment opportunity geographic indicators:
   - Under-capitalized geographic concentrations
   - High-potential low-funded geographic areas
   - Geographic arbitrage opportunities

2. Geographic Opportunity Features:
   - state_undercap_concentration = undercap_companies_per_state / total_state_companies
   - geographic_opportunity_score = high_potential_low_funded_areas
   - state_investment_gap = funding_opportunity_vs_current_investment
   - regional_arbitrage_potential = undervalued_geographic_markets

EXPECTED OUTPUT:
- Under-capitalized geographic concentration heatmap data
- Investment opportunity geographic scoring
- Geographic funding gap analysis
- Regional investment arbitrage identification
```

In [48]:
# Ensure necessary columns exist
df['state_code'] = df.get('state_code', 'Unknown')
df['under_capitalized'] = df.get('under_capitalized', 0)

# First, remove any existing geographic columns to avoid duplicates
columns_to_remove = ['state_undercap_concentration', 'state_avg_funding', 
                    'geographic_opportunity_score', 'state_investment_gap', 
                    'regional_arbitrage_potential']
df = df.drop(columns=[col for col in columns_to_remove if col in df.columns])

# State-level under-capitalized concentration
state_undercap_stats = df.groupby('state_code').agg(
    total_state_companies=('id', 'count'),
    undercap_companies=('under_capitalized', 'sum')
).reset_index()

state_undercap_stats['state_undercap_concentration'] = (
    state_undercap_stats['undercap_companies'] / state_undercap_stats['total_state_companies']
)

# Merge back
df = df.merge(state_undercap_stats[['state_code', 'state_undercap_concentration']], 
              on='state_code', how='left')

# State average funding
state_avg_funding = df.groupby('state_code')['funding_total_usd'].mean().reset_index()
state_avg_funding = state_avg_funding.rename(columns={'funding_total_usd': 'state_avg_funding'})

# Merge state average funding with suffixes to handle duplicates
df = df.merge(state_avg_funding, on='state_code', how='left', suffixes=('', '_state'))

# Now create the geographic features
df['geographic_opportunity_score'] = df['state_undercap_concentration'] * (
    df['state_avg_funding'].max() - df['state_avg_funding']
)

# State investment gap
df['state_investment_gap'] = df['state_avg_funding'].max() - df['state_avg_funding']

# Regional arbitrage potential (handle division by zero)
df['regional_arbitrage_potential'] = df['state_undercap_concentration'] / df['state_avg_funding'].replace(0, 1)

# Validation
print("Geographic features created successfully:")
print(f"Average state under-cap concentration: {df['state_undercap_concentration'].mean():.3f}")
print(f"Average geographic opportunity score: {df['geographic_opportunity_score'].mean():.2f}")
print(f"Average state investment gap: ${df['state_investment_gap'].mean():,.2f}")
print(f"Average regional arbitrage potential: {df['regional_arbitrage_potential'].mean():.4f}")

# Check top opportunity states
top_opportunity = df.groupby('state_code').agg({
    'state_undercap_concentration': 'mean',
    'geographic_opportunity_score': 'mean'
}).nlargest(10, 'geographic_opportunity_score')

print(f"\nTop 10 states by geographic opportunity:")
print(top_opportunity)

Geographic features created successfully:
Average state under-cap concentration: 0.861
Average geographic opportunity score: 7154564.67
Average state investment gap: $7,930,207.16
Average regional arbitrage potential: 0.0000

Top 10 states by geographic opportunity:
            state_undercap_concentration  geographic_opportunity_score
state_code                                                            
unknown                         0.934203                  8.706411e+06
ak                              0.750000                  7.284568e+06
wy                              0.714286                  6.729475e+06
nv                              0.760599                  6.671900e+06
de                              0.812155                  6.587690e+06
ky                              0.711009                  6.374389e+06
wv                              0.692308                  6.294219e+06
nd                              0.774194                  6.252254e+06
sd                     

# 6. Feature Quality Validation

## Step 1: Feature Quality Assessment

```
ALGORITHM: Feature Engineering Quality Control
1. Validate all engineered features for ML readiness:
   - Check for infinite values and extreme outliers
   - Assess feature correlation with target variables
   - Identify multicollinearity issues
   - Validate feature distributions and scaling needs

2. Quality Control Checks:
   - infinite_values_check = identify_features_with_infinite_values
   - target_correlation_analysis = correlation_with_failure_risk_and_success
   - multicollinearity_assessment = identify_highly_correlated_features
   - feature_distribution_analysis = assess_scaling_and_normalization_needs

EXPECTED OUTPUT:
- Feature quality report: "X features ready, X need attention"
- Target correlation ranking: Top 20 most predictive features
- Multicollinearity identification: Features with >0.9 correlation
- Scaling requirements: Features needing normalization
```

In [49]:
# Identify numeric features
numeric_features = df.select_dtypes(include=['float64', 'int64']).columns.tolist()

# Infinite values check
infinite_values_check = df[numeric_features].isin([np.inf, -np.inf]).sum()
print("Features with infinite values:")
print(infinite_values_check[infinite_values_check > 0])

# Replace infinite values with NaN
df[numeric_features] = df[numeric_features].replace([np.inf, -np.inf], np.nan)

# Extreme outliers check using IQR method
outlier_counts = {}
for col in numeric_features:
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    outliers = ((df[col] < (q1 - 3*iqr)) | (df[col] > (q3 + 3*iqr))).sum()
    if outliers > 0:
        outlier_counts[col] = outliers
print("\nFeatures with extreme outliers (IQR > 3*IQR rule):")
print(outlier_counts)

# Target correlation analysis
if 'failure_risk' in df.columns:
    # Get correlation with target
    corr_with_target = df[numeric_features + ['failure_risk']].corr()['failure_risk'].drop('failure_risk')
    
    # Convert to Series if it's a DataFrame
    if isinstance(corr_with_target, pd.DataFrame):
        corr_with_target = corr_with_target.iloc[:, 0] if len(corr_with_target.columns) > 0 else corr_with_target.iloc[0]
    
    # Sort by absolute correlation
    top_corr_indices = corr_with_target.abs().sort_values(ascending=False).head(20).index
    top_corr = corr_with_target[top_corr_indices]
    
    print("\nTop 20 features correlated with failure_risk:")
    print(top_corr)
else:
    print("\nTarget column 'failure_risk' not found. Skipping correlation analysis.")

# Alternative simpler approach for correlation analysis:
if 'failure_risk' in df.columns:
    print("\nAlternative approach - Top 20 features correlated with failure_risk:")
    corr_results = df[numeric_features].corrwith(df['failure_risk']).sort_values(key=abs, ascending=False).head(20)
    print(corr_results)

# Multicollinearity assessment (|corr| > 0.9)
high_corr_pairs = []
corr_matrix_abs = df[numeric_features].corr().abs()
for i in range(len(corr_matrix_abs.columns)):
    for j in range(i + 1, len(corr_matrix_abs.columns)):
        if corr_matrix_abs.iloc[i, j] > 0.9:
            high_corr_pairs.append((corr_matrix_abs.columns[i], corr_matrix_abs.columns[j], corr_matrix_abs.iloc[i, j]))
print("\nHighly correlated feature pairs (|corr| > 0.9):")
for f1, f2, val in high_corr_pairs:
    print(f"{f1} & {f2} = {val:.2f}")

# Feature distribution analysis using skewness
feature_skewness = df[numeric_features].skew()
print("\nFeatures with high skewness (|skew| > 1):")
print(feature_skewness[abs(feature_skewness) > 1])

# Summary of feature quality
ready_features = [f for f in numeric_features if f not in infinite_values_check[infinite_values_check > 0].index.tolist()]
need_attention = list(set(numeric_features) - set(ready_features))
print("\nFeature quality summary:")
print(f"Features ready for modeling: {len(ready_features)}")
print(f"Features needing attention: {len(need_attention)}")

Features with infinite values:
industry_growth_rate     547
age_industry_maturity    547
dtype: int64

Features with extreme outliers (IQR > 3*IQR rule):
{'lat': np.int64(2647), 'founded_year': np.int64(8209), 'company_age_years': np.int64(8209), 'funding_total_usd': np.int64(27873), 'funding_rounds': np.int64(31705), 'has_funding': np.int64(27873), 'months_to_first_funding': np.int64(35020), 'months_since_last_funding': np.int64(100), 'milestones': np.int64(217), 'investment_rounds': np.int64(2589), 'invested_companies': np.int64(2589), 'failure_risk': np.int64(25999), 'risk_tier': np.int64(25999), 'funding_velocity': np.int64(2010), 'funding_vs_avg': np.int64(2093), 'funding_vs_industry_avg': np.int64(1940), 'industry_growth_rate': np.int64(448), 'stage_risk_mean': np.int64(35941), 'age_funding_ratio': np.int64(2266), 'age_funding_velocity': np.int64(2093), 'funding_industry_fit': np.int64(2162), 'age_industry_maturity': np.int64(1249), 'years_since_founding': np.int64(8209), 'fundin

  c /= stddev[:, None]
  c /= stddev[None, :]


failure_risk                     1.000000
risk_tier                        0.968730
has_funding                     -0.960477
minimal_funding_long_survivor    0.957318
stage_risk_mean                  0.939940
risk_vs_country_peers            0.905835
risk_vs_industry_peers           0.899575
funding_frequency               -0.854133
funding_rounds                  -0.748466
stuck_in_early_stage            -0.727542
months_to_first_funding          0.527836
geo_industry_risk                0.492037
region_success_rate             -0.488723
achievement_score               -0.467986
industry_risk_mean               0.436767
undercap_sector_density          0.436495
funding_duration_days           -0.433460
country_risk_mean                0.423630
undercap_geographic_density      0.423400
progression_risk_score           0.411169
dtype: float64

Highly correlated feature pairs (|corr| > 0.9):
founded_year & company_age_years = 0.94
founded_year & years_since_founding = 1.00
founded_year 

## Step 2: Project-Specific Feature Validation

```
ALGORITHM: Project Goal Alignment Validation
1. Validate features support project objectives:
   - Under-capitalized startup analysis capability
   - Risk profiling and clustering readiness
   - Geographic heatmap data completeness
   - Month 2 ML model preparation

2. Project Alignment Checks:
   - undercap_feature_coverage = features_supporting_undercap_analysis
   - risk_profiling_readiness = features_for_clustering_and_personas
   - heatmap_data_completeness = geographic_features_for_dashboard
   - ml_model_preparation = target_variables_and_feature_quality

EXPECTED OUTPUT:
- Under-cap analysis feature inventory: "X features ready for under-cap focus"
- Risk profiling feature set: "X features ready for clustering"
- Heatmap data validation: "Geographic coverage for X% of companies"
- ML readiness assessment: "Dataset ready for Month 2 modeling"
```

In [50]:
# Under-capitalized analysis features
undercap_features = [f for f in df.columns if 'under_cap' in f or 'geographic_opportunity' in f or 'regional_arbitrage' in f]
undercap_feature_coverage = len(undercap_features)
print(f"Features ready for under-cap focus: {undercap_feature_coverage}")
print("List of under-cap related features:", undercap_features)

# Risk profiling and clustering readiness
risk_features = [f for f in df.columns if 'risk' in f or 'failure_risk' in f or 'risk_tier' in f or 'state_risk_score' in f]
risk_profiling_readiness = len(risk_features)
print(f"\nFeatures ready for clustering and risk profiling: {risk_profiling_readiness}")
print("List of risk-related features:", risk_features)

# Heatmap and geographic coverage validation
geo_features = [f for f in df.columns if 'state' in f or 'region' in f or 'city' in f or 'metro' in f]
heatmap_data_completeness = df[geo_features].notna().all(axis=1).mean() * 100
print(f"\nGeographic coverage for dashboard: {heatmap_data_completeness:.2f}% of companies")
print("Geographic features included:", geo_features)

# ML readiness assessment
# Check that target exists and numeric features are prepared
target_ready = 'failure_risk' in df.columns
numeric_ready = len([f for f in numeric_features if f not in infinite_values_check[infinite_values_check > 0].index.tolist()]) > 0
ml_model_preparation = target_ready and numeric_ready
print(f"\nDataset ready for Month 2 modeling: {ml_model_preparation}")

Features ready for under-cap focus: 3
List of under-cap related features: ['under_capitalized', 'geographic_opportunity_score', 'regional_arbitrage_potential']

Features ready for clustering and risk profiling: 15
List of risk-related features: ['failure_risk', 'risk_tier', 'risk_tier_label', 'country_risk_mean', 'country_risk_confidence', 'industry_risk_mean', 'industry_risk_confidence', 'stage_risk_mean', 'geo_industry_risk', 'experience_risk_score', 'risk_vs_country_peers', 'risk_vs_industry_peers', 'risk_vs_stage_peers', 'progression_risk_score', 'state_risk_score']

Geographic coverage for dashboard: 0.00% of companies
Geographic features included: ['state_code', 'city', 'region', 'funding_velocity_category', 'funding_velocity', 'age_funding_velocity', 'bootstrap_milestone_velocity', 'stage_transition_velocity', 'state', 'state_median_funding', 'state_funding_bias', 'region_avg_funding', 'regional_funding_disadvantage', 'state_population', 'state_startup_density', 'state_success_r

## Step 3: Enhanced Dataset Output & Documentation

```
ALGORITHM: Feature Engineering Output Generation
1. Create comprehensive feature-engineered dataset:
   - Include all original features + new engineered features
   - Add feature documentation and metadata
   - Generate feature importance preliminary rankings
   - Create handoff documentation for EDA phase

2. Output Generation:
   - enhanced_feature_dataset = original_features + engineered_features
   - feature_documentation = descriptions_and_calculation_methods
   - feature_importance_preview = correlation_based_initial_rankings
   - eda_handoff_requirements = next_phase_analysis_requirements

EXPECTED OUTPUT:
- companies_feature_engineering.csv with 100+ features
- Feature documentation reference guide
- Preliminary feature importance rankings
- EDA phase requirements and recommendations
```

Again this all should be documented in markdown format that is included in this repo. Please do so ASAP.

In [51]:
import os 

# Ensure the target directory exists
output_dir = '../processed_data/'
os.makedirs(output_dir, exist_ok=True)

# Create enhanced dataset: original + engineered features
enhanced_feature_dataset = df.copy()

# Generate feature metadata/documentation
feature_documentation = pd.DataFrame({
    'feature_name': df.columns,
    'data_type': df.dtypes.astype(str),
    'description': ['' for _ in df.columns],           # Fill manually or later programmatically
    'calculation_method': ['' for _ in df.columns]     # Fill manually or later
})

# Preliminary feature importance using correlation with target
if 'failure_risk' in df.columns:
    numeric_features = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
    numeric_features = [f for f in numeric_features if f != 'failure_risk']
    feature_documentation['prelim_target_correlation'] = feature_documentation['feature_name'].map(
        df[numeric_features].corrwith(df['failure_risk']).to_dict()
    )
else:
    feature_documentation['prelim_target_correlation'] = None

# Save outputs to processed_data folder
enhanced_feature_dataset.to_csv(os.path.join(output_dir, 'companies_feature_engineering.csv'), index=False)
feature_documentation.to_csv(os.path.join(output_dir, 'feature_documentation_reference.csv'), index=False)

  c /= stddev[:, None]
  c /= stddev[None, :]
