# Feature Engineering

In [206]:
import pandas as pd
import numpy as np

df = pd.read_csv('./companies_cleaned_targets.csv')

In [207]:
df.shape

(196530, 41)

In [208]:
df.columns

Index(['id', 'name', 'normalized_name', 'category_code', 'status', 'closed_at',
       'domain', 'homepage_url', 'twitter_username', 'logo_url', 'logo_width',
       'logo_height', 'description', 'overview', 'tag_list', 'country_code',
       'state_code', 'city', 'region', 'investment_rounds',
       'invested_companies', 'first_funding_at', 'last_funding_at',
       'funding_rounds', 'funding_total_usd', 'first_milestone_at',
       'last_milestone_at', 'milestones', 'relationships', 'created_by',
       'created_at', 'updated_at', 'lat', 'lng', 'founded_year',
       'company_age_years', 'has_funding', 'failure_risk', 'risk_tier',
       'risk_tier_label', 'age_group'],
      dtype='object')

In [209]:
df['funding_velocity'] = df['funding_total_usd'] / df['company_age_years']

In [210]:
# funding_total_usd / avg_funding_in_category
total_avg_funding = df['funding_total_usd'].mean()
df['funding_vs_avg'] = df['funding_total_usd'] / total_avg_funding

category_avg_funding = df.groupby('category_code')['funding_total_usd'].transform('mean')
df['funding_vs_industry_avg'] = df['funding_total_usd'] / np.where(
    df['category_code'].isna(),
    total_avg_funding,
    category_avg_funding
)

In [211]:
# bucket by founding year
def categorize_founding_year(year):
    if year <= 1990:
        return "Pre-1990"
    elif year <= 2000:
        return "1991-2000"
    elif year <= 2008:
        return "2001-2008"
    else:
        return "2009-2014"

df["founding_era"] = df["founded_year"].apply(categorize_founding_year)



In [212]:
# estimate funding stage by total amount of funding and company status
def funding_stage(row):
    f = row['funding_total_usd']
    s = row['status']
    
    if s == 'closed':
        return 'Closed'
    elif s == 'acquired':
        return 'Exit (Acquired)'
    elif s == 'ipo':
        return 'Exit (IPO)'
    
    if f < 1e6:
        return 'Pre-seed'
    elif f < 10e6:
        return 'Seed'
    elif f < 15e6:
        return 'Early'
    elif f < 100e6:
        return 'Growth'
    else:
        return 'Late'
    
df['funding_stage'] = df.apply(funding_stage, axis=1)


In [213]:
# industry growth index

industry_stats = (
    df.groupby(['category_code', 'founded_year'])
      .agg(
          num_startups=('id', 'count'),
          total_funding=('funding_total_usd', 'sum')
      )
      .reset_index()
)

industry_stats['num_startups_norm'] = industry_stats.groupby('category_code')['num_startups'].transform(
    lambda x: (x - x.min()) / (x.max() - x.min())
)
industry_stats['total_funding_norm'] = industry_stats.groupby('category_code')['total_funding'].transform(
    lambda x: (x - x.min()) / (x.max() - x.min())
)

industry_stats['industry_growth_index'] = (
    0.5 * industry_stats['num_startups_norm'] +
    0.5 * industry_stats['total_funding_norm']
)

df = df.merge(
    industry_stats[['category_code', 'founded_year', 'industry_growth_index']],
    on=['category_code', 'founded_year'],
    how='left'
)


In [214]:
# industry growth rate
industry_stats['industry_growth_rate'] = industry_stats.groupby('category_code')['industry_growth_index'].pct_change()

df = df.merge(
    industry_stats[['category_code', 'founded_year', 'industry_growth_rate']],
    on=['category_code', 'founded_year'],
    how='left'
)

In [215]:
# Calculate risk by country/region
country_risk = df.groupby('country_code')['failure_risk'].agg(['mean', 'std', 'count']).reset_index()
country_risk.columns = ['country_code', 'country_risk_mean', 'country_risk_std', 'country_count']
country_risk['country_risk_confidence'] = country_risk['country_count'] / country_risk['country_count'].max()
country_risk_mean = country_risk['country_risk_mean'].mean()

df = df.merge(country_risk[['country_code', 'country_risk_mean', 'country_risk_confidence']], 
              on='country_code', how='left')

# Fill missing values with global average
global_risk_mean = df['failure_risk'].mean()
if 'country_risk_mean' not in df.columns:
    df['country_risk_mean'] = global_risk_mean
else:
    df['country_risk_mean'] = df['country_risk_mean'].fillna(global_risk_mean)

if 'country_risk_confidence' not in df.columns:
    df['country_risk_confidence'] = 0.1
else:
    df['country_risk_confidence'] = df['country_risk_confidence'].fillna(0.1)


In [216]:
# Calculate risk by industry
industry_risk = df.groupby('category_code')['failure_risk'].agg(['mean', 'std', 'count']).reset_index()
industry_risk.columns = ['category_code', 'industry_risk_mean', 'industry_risk_std', 'industry_count']
industry_risk['industry_risk_confidence'] = industry_risk['industry_count'] / industry_risk['industry_count'].max()

df = df.merge(industry_risk[['category_code', 'industry_risk_mean', 'industry_risk_confidence']], 
              on='category_code', how='left')

# Fill missing values
df['industry_risk_mean'] = df['industry_risk_mean'].fillna(global_risk_mean)
df['industry_risk_confidence'] = df['industry_risk_confidence'].fillna(0.1)

In [217]:
# Calculate risk by funding stage
stage_risk = df.groupby('funding_stage')['failure_risk'].agg(['mean', 'count']).reset_index()
stage_risk.columns = ['funding_stage', 'stage_risk_mean', 'stage_count']

df = df.merge(stage_risk[['funding_stage', 'stage_risk_mean']], on='funding_stage', how='left')
df['stage_risk_mean'] = df['stage_risk_mean'].fillna(global_risk_mean)

In [218]:
# Age × Funding interactions
df['age_funding_ratio'] = df['company_age_years'] * df['funding_vs_avg']
df['age_funding_velocity'] = df['company_age_years'] * df['funding_velocity']

In [219]:
# Geographic × Industry interactions
df['geo_industry_risk'] = df['country_risk_mean'] * df['industry_risk_mean']

In [220]:
# Funding × Industry interactions
df['funding_industry_fit'] = df['funding_vs_industry_avg'] * df['industry_growth_index']

In [221]:
# Age × Industry maturity
df['age_industry_maturity'] = df['company_age_years'] * df['industry_growth_rate']

In [222]:
# Experience-based risk score
df['experience_risk_score'] = (
    0.5 * (df['company_age_years'] / df['company_age_years'].max()) +  # Normalized age
    0.3 * df['country_risk_confidence'] +  # Geographic experience
    0.2 * df['industry_risk_confidence']   # Industry experience
)

In [223]:
# Years since founding (for survival analysis perspective)
current_year = 2025
df['years_since_founding'] = current_year - df['founded_year']

In [224]:
# Funding efficiency relative to company age
df['funding_efficiency'] = df['funding_total_usd'] / (df['company_age_years'] + 1)  # +1 to avoid division by zero

In [225]:
# Funding momentum (how much above/below expected for age)
age_funding_median = df.groupby('company_age_years')['funding_total_usd'].transform('median')
df['funding_momentum'] = df['funding_total_usd'] / (age_funding_median + 1)

In [226]:
# Funding relative to founding era
era_funding_median = df.groupby('founding_era')['funding_total_usd'].transform('median')
df['era_adjusted_funding'] = df['funding_total_usd'] / (era_funding_median + 1)

In [227]:
# Competitive landscape intensity
industry_competition = df.groupby(['category_code', 'founded_year']).size().reset_index(name='industry_competition')
df = df.merge(industry_competition, on=['category_code', 'founded_year'], how='left')
df['industry_competition'] = df['industry_competition'].fillna(df['industry_competition'].median())

In [228]:
# Geographic market saturation
geo_saturation = df.groupby(['country_code', 'founded_year']).size().reset_index(name='geo_market_saturation')
df = df.merge(geo_saturation, on=['country_code', 'founded_year'], how='left')
df['geo_market_saturation'] = df['geo_market_saturation'].fillna(df['geo_market_saturation'].median())

In [229]:
# Risk deviation from peers
df['risk_vs_country_peers'] = df['failure_risk'] - df['country_risk_mean']
df['risk_vs_industry_peers'] = df['failure_risk'] - df['industry_risk_mean']
df['risk_vs_stage_peers'] = df['failure_risk'] - df['stage_risk_mean']

In [230]:
# Funding deviation from expectations
df['funding_vs_age_expectation'] = df['funding_total_usd'] - age_funding_median
df['funding_vs_era_expectation'] = df['funding_total_usd'] - era_funding_median

In [231]:
# Composite peer comparison score
df['peer_performance_score'] = (
    0.4 * (-df['risk_vs_industry_peers']) +  # Lower risk vs peers = better
    0.3 * df['funding_vs_industry_avg'] +   # Higher funding vs industry = better
    0.3 * (-df['risk_vs_country_peers'])    # Lower risk vs country = better
)

In [232]:
# Investment activity level
df['investment_activity_score'] = (
    0.6 * (df['investment_rounds'] / (df['investment_rounds'].max() + 1)) +
    0.4 * (df['invested_companies'] / (df['invested_companies'].max() + 1))
)

# Investment to funding ratio (investment activity vs seeking funding)
df['investment_to_funding_ratio'] = df['investment_rounds'] / (df['funding_rounds'] + 1)

# Network effect proxy - convert relationships to numeric first
df['relationships_count'] = pd.to_numeric(df['relationships'], errors='coerce').fillna(0)
df['network_connectivity'] = df['investment_rounds'] + df['invested_companies'] + df['relationships_count']

In [233]:
# Milestones per year of existence
df['milestones_per_year'] = df['milestones'] / (df['company_age_years'] + 1)

# Companies with high milestone activity
df['is_milestone_active'] = (df['milestones'] >= df['milestones'].quantile(0.75)).astype(int)

# Achievement score (combination of milestones and funding success)
df['achievement_score'] = (
    0.4 * (df['milestones'] / (df['milestones'].max() + 1)) +
    0.3 * (df['funding_rounds'] / (df['funding_rounds'].max() + 1))
)

In [234]:
# Basic digital presence indicators
df['has_domain'] = (df['domain'] != 'None').astype(int)
df['has_twitter'] = (df['twitter_username'] != 'None').astype(int)
df['has_logo'] = (df['logo_url'].notna()).astype(int)

# Digital presence score
df['digital_presence_score'] = (
    0.4 * df['has_domain'] +
    0.3 * df['has_twitter'] +
    0.3 * df['has_logo']
)

# Extract domain insights
def extract_domain_features(row):
    domain = row['domain']
    if domain == 'None' or pd.isna(domain):
        return 'none', 0, 0
    
    # Domain type
    if any(ext in domain.lower() for ext in ['.gov', '.edu', '.org']):
        domain_type = 'institutional'
    elif any(ext in domain.lower() for ext in ['.com', '.net', '.biz']):
        domain_type = 'commercial'
    else:
        domain_type = 'other'
    
    # Domain complexity
    domain_length = len(domain)
    subdomain_count = domain.count('.') - 1  # Subtract 1 for the main domain
    
    return domain_type, domain_length, subdomain_count

# Apply domain feature extraction
domain_features = df.apply(extract_domain_features, axis=1, result_type='expand')
df['domain_type'] = domain_features[0]
df['domain_length'] = domain_features[1]
df['subdomain_count'] = domain_features[2]

print(f"  - Companies with domains: {df['has_domain'].sum():,}")
print(f"  - Companies with Twitter: {df['has_twitter'].sum():,}")

  - Companies with domains: 196,530
  - Companies with Twitter: 196,530


In [235]:
# Content availability
df['has_description'] = (df['description'] != 'Unknown').astype(int)
df['has_overview'] = (df['overview'] != 'Unknown').astype(int)
df['has_tags'] = (df['tag_list'] != 'Unknown').astype(int)

# Content richness score
df['content_richness'] = df['has_description'] + df['has_overview'] + df['has_tags']

# Description length analysis
def safe_len(text):
    if text == 'Unknown' or pd.isna(text):
        return 0
    return len(str(text))

df['description_length'] = df['description'].apply(safe_len)
df['overview_length'] = df['overview'].apply(safe_len)

# Tag analysis
def analyze_tags(tag_string):
    if tag_string == 'Unknown' or pd.isna(tag_string):
        return 0, 0
    
    tags = str(tag_string).split(',')
    tag_count = len(tags)
    avg_tag_length = sum(len(tag.strip()) for tag in tags) / tag_count if tag_count > 0 else 0
    
    return tag_count, avg_tag_length

tag_features = df['tag_list'].apply(analyze_tags)
df['tag_count'] = [x[0] for x in tag_features]
df['avg_tag_length'] = [x[1] for x in tag_features]

# Text sophistication score
df['text_sophistication'] = (
    0.3 * (df['description_length'] / (df['description_length'].max() + 1)) +
    0.3 * (df['overview_length'] / (df['overview_length'].max() + 1)) +
    0.4 * (df['tag_count'] / (df['tag_count'].max() + 1))
)

In [236]:
# Convert date columns to datetime
date_cols = ['first_funding_at', 'last_funding_at', 'first_milestone_at', 'last_milestone_at', 'created_at', 'updated_at', 'closed_at']
for col in date_cols:
    if col in df.columns:
        df[col] = pd.to_datetime(df[col], errors='coerce')

# Funding timeline features
df['funding_duration_days'] = (df['last_funding_at'] - df['first_funding_at']).dt.days
df['funding_duration_days'] = df['funding_duration_days'].fillna(0)

# Time to first funding (from founding) - check if founded_at exists, otherwise use founded_year
if 'founded_at' in df.columns:
    df['founded_at'] = pd.to_datetime(df['founded_at'], errors='coerce')
    df['time_to_first_funding_days'] = (df['first_funding_at'] - df['founded_at']).dt.days
else:
    # Create founded_at from founded_year
    df['founded_at_estimated'] = pd.to_datetime(df['founded_year'], format='%Y', errors='coerce')
    df['time_to_first_funding_days'] = (df['first_funding_at'] - df['founded_at_estimated']).dt.days

df['time_to_first_funding_years'] = df['time_to_first_funding_days'] / 365.25

# Milestone timeline features
df['milestone_duration_days'] = (df['last_milestone_at'] - df['first_milestone_at']).dt.days
df['milestone_duration_days'] = df['milestone_duration_days'].fillna(0)

# Activity recency (days since last update)
current_date = pd.Timestamp.now()
df['days_since_last_update'] = (current_date - df['updated_at']).dt.days
df['days_since_last_funding'] = (current_date - df['last_funding_at']).dt.days
df['days_since_last_milestone'] = (current_date - df['last_milestone_at']).dt.days

# Fill NaN values for companies without funding/milestones
df['days_since_last_funding'] = df['days_since_last_funding'].fillna(df['days_since_last_update'])
df['days_since_last_milestone'] = df['days_since_last_milestone'].fillna(df['days_since_last_update'])

# Activity frequency (avoid division by zero)
df['funding_frequency'] = df['funding_rounds'] / (df['funding_duration_days'] / 365.25 + 1)
df['milestone_frequency'] = df['milestones'] / (df['milestone_duration_days'] / 365.25 + 1)

# Recently active flags
df['recently_funded'] = (df['days_since_last_funding'] <= 365).astype(int)  # Funded in last year
df['recently_milestone'] = (df['days_since_last_milestone'] <= 365).astype(int)  # Milestone in last year
df['recently_updated'] = (df['days_since_last_update'] <= 90).astype(int)  # Updated in last 3 months

In [237]:
# Overall business maturity score
df['business_maturity_score'] = (
    0.2 * df['digital_presence_score'] +
    0.2 * df['achievement_score'] +
    0.2 * (df['recently_funded'] + df['recently_milestone'] + df['recently_updated']) / 3 +
    0.2 * df['text_sophistication'] +
    0.2 * df['investment_activity_score']
)

# Ecosystem engagement (how connected/active the company is)
df['ecosystem_engagement'] = (
    0.3 * (df['network_connectivity'] / (df['network_connectivity'].max() + 1)) +
    0.2 * df['has_twitter'] +
    0.3 * (df['content_richness'] / 3)
)

# Strategic positioning (investor appeal factors)
df['strategic_positioning'] = (
    0.25 * df['funding_vs_industry_avg'] +
    0.25 * df['digital_presence_score'] +
    0.25 * df['achievement_score'] +
    0.25 * df['ecosystem_engagement']
)

# Activity momentum (recent activity across multiple dimensions)
df['activity_momentum'] = (
    0.4 * df['recently_funded'] +
    0.3 * df['recently_milestone'] +
    0.3 * df['recently_updated']
)

# Operational sophistication (professional setup indicators)
df['operational_sophistication'] = (
    0.3 * df['has_domain'] +
    0.2 * df['has_twitter'] +
    0.3 * (df['content_richness'] / 3)
)

In [238]:
df.to_csv('./companies_featured_targets.csv', index=False)