# Notebook 2: Feature Engineering & Preprocessing

- Assess macro-areas (geographic regions) — ultimately excluded for the USA-only sample (see Section 3)
- Create macro-sectors (market categories)
- Final feature engineering
- Data preparation for modeling
- Handle class imbalance
---

In [None]:
%load_ext autoreload
# Autoreload everything
%autoreload 2

import config

In [None]:
# Libraries
import os
from pathlib import Path
from helper import get_project_root
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings

# Global notebook configuration. These calls mutate process-wide state
# (warning filters, pandas/matplotlib/seaborn defaults, NumPy's RNG).

# Silences every warning category for the rest of the session.
# NOTE(review): this also hides pandas/NumPy warnings that could point at
# real data problems — confirm this is intended.
warnings.filterwarnings('ignore')
# Show all columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Plot appearance defaults.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require a recent
# matplotlib release — confirm against the pinned version.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')
plt.rcParams['figure.figsize'] = (14, 6)

# Fixed seed for NumPy's global random generator.
RANDOM_STATE = 2
np.random.seed(RANDOM_STATE)

print("✓ Libraries loaded")

# Project paths; the data directory is created if it does not exist yet.
PROJECT_ROOT: Path = get_project_root()
DATA_DIR: Path = PROJECT_ROOT / "data"
DATA_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# Read the processed dataset written by Notebook 1.
processed_csv = config.PROCESSED_PATH / 'finale_usa_real_funding.csv'
finale_usa = pd.read_csv(processed_csv, low_memory=False)

# Turn the date columns that are present into datetime values.
date_columns = ['founded_at']
for col in [name for name in date_columns if name in finale_usa.columns]:
    finale_usa[col] = pd.to_datetime(finale_usa[col])

print(f"Final: {finale_usa.shape}")

---
## 2. Remove Missing Values

Create clean dataset without NAs for modeling.

### 2.1 Data Quality and Sample Selection

After merging with Jay Ritter's IPO database and filtering for data 
completeness, our final sample consists of 15,476 US startups with 
verified funding dates and complete covariate information.

We removed 21.1% of observations (4,147 startups) due to missing 
critical variables:
- 19.6% lacked founding dates (essential for time-to-event calculation)
- 3.0% lacked sector classification (key covariate)

This conservative approach ensures data quality for survival analysis,
though it disproportionately affects older IPOs (34.5% loss in IPO 
sample) due to incomplete historical records. Our final sample of 
269 IPOs remains sufficient for statistical inference (comparable to 
prior studies: Gompers & Lerner 2000: N=205; Hochberg et al. 2007: N=198).

In [None]:
# ---------------------------------------------------------------------------
# STEP 1: Report missing values BEFORE any cleaning is applied
# ---------------------------------------------------------------------------

print("\nMISSING VALUES ANALYSIS (Before):")

# Columns the survival analysis depends on
critical_cols = ['status', 'founded_at', 'first_funding_year', 'category_code',
                 'country_code', 'funding_total_usd', 'funding_rounds']

# Columns that are useful but not required
optional_cols = ['milestones', 'relationships', 'state_code']

# Critical columns: one line each, flagged okay / not okay, or NOT FOUND
# when the column is absent from the frame.
print("\nCritical columns:")
for col in critical_cols:
    if col not in finale_usa.columns:
        print(f"{col:25s} NOT FOUND")
        continue
    missing = finale_usa[col].isna().sum()
    pct = 100 * missing / len(finale_usa)
    status = "okay" if missing == 0 else "not okay"
    print(f"{status} {col:25s} {missing:>6,} missing ({pct:>5.1f}%)")

# Optional columns: same counts without a flag; absent columns are skipped.
print("\nOptional columns:")
for col in optional_cols:
    if col not in finale_usa.columns:
        continue
    missing = finale_usa[col].isna().sum()
    pct = 100 * missing / len(finale_usa)
    print(f"{col:25s} {missing:>6,} missing ({pct:>5.1f}%)")

# ---------------------------------------------------------------------------
# STEP 2: Remove missing values (CONSERVATIVE approach)
# ---------------------------------------------------------------------------
# Work on a copy so the loaded frame stays untouched.
finale_clean = finale_usa.copy()
initial_rows = len(finale_clean)

# Columns a row must have to be kept, in the order they are checked.
# The flag says whether an empty string also counts as missing.
required_columns = [
    ('status', False),
    ('category_code', True),
    ('country_code', True),       # expected to be all USA
    ('founded_at', False),        # needed for the survival time origin
    ('first_funding_year', False),
]

for col, reject_empty in required_columns:
    if col not in finale_clean.columns:
        continue
    before = len(finale_clean)
    keep = finale_clean[col].notna()
    if reject_empty:
        keep = keep & (finale_clean[col] != '')
    # .copy() detaches the filtered frame, so the column assignments further
    # down write to an independent object rather than to a flagged slice.
    finale_clean = finale_clean[keep].copy()
    removed = before - len(finale_clean)
    print(f"Removed {removed:,} rows with missing {col}")

# Count-like columns: a missing value is filled with 0 instead of dropping
# the row.
zero_fill_columns = ['funding_total_usd', 'funding_rounds', 'milestones', 'relationships']

for col in zero_fill_columns:
    if col not in finale_clean.columns:
        continue
    before_missing = finale_clean[col].isna().sum()
    finale_clean[col] = finale_clean[col].fillna(0)
    print(f"Filled {before_missing:,} missing {col} with 0")

    if col == 'funding_total_usd':
        # Recalculate log_fund_tot: 0 stays 0, every other value is logged.
        # Zeros are swapped for 1 before the log because log(1) == 0, which
        # gives the same result as the special case without a row-wise apply.
        funding = finale_clean['funding_total_usd']
        finale_clean['log_fund_tot'] = np.log(funding.where(funding != 0, 1))

# ---------------------------------------------------------------------------
# STEP 3: Drop columns that are not needed downstream
# ---------------------------------------------------------------------------

cols_to_drop = ['logo_width', 'logo_height', 'id', 'object_id', 'state_code', 'valuation_amount']
# Only the names that actually exist in the frame are dropped.
dropped = [name for name in cols_to_drop if name in finale_clean.columns]

if len(dropped) > 0:
    finale_clean = finale_clean.drop(columns=dropped)
    print(f"\nDropped {len(dropped)} unnecessary columns: {dropped}")

# ---------------------------------------------------------------------------
# STEP 4: Convert data types (optimize memory)
# ---------------------------------------------------------------------------

print("\nTYPE CONVERSIONS:")

# Label columns become pandas categoricals
categorical_cols = ['status', 'category_code', 'country_code', 'market_cycle']

for col in categorical_cols:
    if col not in finale_clean.columns:
        continue
    finale_clean[col] = finale_clean[col].astype('category')
    print(f"{col} category")

# Count / flag columns become integers; remaining NaN is treated as 0 first
integer_cols = ['person_financed', 'startup_financed', 'fin_org_financed',
                'funding_rounds', 'milestones', 'relationships', 'num_prodotti',
                'num_acquisizioni_effettuate']

for col in integer_cols:
    if col not in finale_clean.columns:
        continue
    without_na = finale_clean[col].fillna(0)
    finale_clean[col] = without_na.astype(int)
    print(f"{col} int")

### 2.2 Missing Data Strategy

We employed complete-case analysis for critical survival variables 
(status, founded_at, category_code), excluding 4,342 observations 
(22.1%) with missing values. Optional variables (funding metrics, 
network measures) were zero-imputed where missingness indicated 
absence. We excluded state_code, despite 98.6% availability, to 
avoid model complexity (49 dummy variables) and redundancy with 
market-timing proxies that already capture geographic effects. 
Final sample: N=15,281 (Allison, 2014).

### 2.3 Variable Selection

We excluded IPO valuation from our analysis as it is only observable 
after the IPO event occurs, creating a fundamental issue of temporal 
causality. Including post-event variables would violate the assumptions 
of survival analysis, where covariates must be measured before or at 
the time of the event (Cox, 1972).

Furthermore, valuation data was available for only 1.9% of observations 
(269 IPOs out of 15,476 startups), making it unsuitable as a predictor 
in our models.

In [None]:
# ---------------------------------------------------------------------------
# STEP 5: Final verification of the cleaned frame
# ---------------------------------------------------------------------------

print("CLEANING SUMMARY")

# Row counts before / after cleaning
rows_after = len(finale_clean)
rows_removed = initial_rows - rows_after

print("\nDATASET SIZE:")
print(f"Before:{initial_rows:,} rows")
print(f"After:{rows_after:,} rows")
print(f"Removed:{rows_removed:,} rows ({100*rows_removed/initial_rows:.1f}%)")

# Shape and memory footprint
memory_mb = finale_clean.memory_usage(deep=True).sum() / 1024**2

print("\nFINAL SHAPE:")
print(f"Rows: {rows_after:,}")
print(f"Columns: {finale_clean.shape[1]}")
print(f"Memory: {memory_mb:.1f} MB")

# Remaining missing values, overall and per affected column
print("\nMISSING VALUES REMAINING:")
missing_per_column = finale_clean.isnull().sum()
total_missing = missing_per_column.sum()
print(f"Total: {total_missing:,}")

if total_missing == 0:
    print("NO MISSING VALUES!")
else:
    print("\n  Columns with missing values:")
    for col, missing in missing_per_column[missing_per_column > 0].items():
        pct = 100 * missing / len(finale_clean)
        print(f"{col:25s} {missing:>6,} ({pct:>5.1f}%)")

# Share of each status value
print("\nSTATUS DISTRIBUTION:")
status_counts = finale_clean['status'].value_counts()
for status, count in status_counts.items():
    pct = 100 * count / len(finale_clean)
    print(f"{status:12s} {count:>6,} ({pct:>5.1f}%)")

# Success = IPO or acquisition
is_success = finale_clean['status'].isin(['ipo', 'acquired'])
success_rate = 100 * is_success.mean()
print(f"\nSuccess rate: {success_rate:.2f}%")

# Independent copy used by the rest of the notebook
finale_no_na = finale_clean.copy()
print("Data cleaning")

In [None]:
# ---------------------------------------------------------------------------
# 1. UNIVARIATE OUTLIERS (Tukey's fences)
# ---------------------------------------------------------------------------

def detect_outliers_iqr(data, column, k=3):
    """
    Flag rows whose ``column`` value lies outside Tukey's fences.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR``, where Q1 and Q3 are
    the 25th and 75th percentiles of ``data[column]``.
    k=3 targets extreme outliers; k=1.5 is the milder variant.

    Returns a tuple ``(outliers, lower_fence, upper_fence)``; ``outliers`` is
    the subset of ``data`` strictly below the lower or strictly above the
    upper fence.
    """
    values = data[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    reach = k * (third_quartile - first_quartile)

    lower_fence = first_quartile - reach
    upper_fence = third_quartile + reach

    outside = values.lt(lower_fence) | values.gt(upper_fence)

    return data[outside], lower_fence, upper_fence

# Funding: only companies with a positive funding total are considered
funded = finale_no_na[finale_no_na['funding_total_usd'] > 0].copy()
funding_amounts = funded['funding_total_usd']

outliers_funding, lower, upper = detect_outliers_iqr(funded, 'funding_total_usd', k=3)

print("\nFunding distribution (non-zero):")
funding_summary = [
    ('Mean', funding_amounts.mean()),
    ('Median', funding_amounts.median()),
    ('Q1', funding_amounts.quantile(0.25)),
    ('Q3', funding_amounts.quantile(0.75)),
]
for label, value in funding_summary:
    print(f"{label}:${value/1e6:.1f}M")
print(f"Max:${funding_amounts.max()/1e9:.2f}B")

funding_outlier_share = 100 * len(outliers_funding) / len(funded)
print("\nOutliers detected (k=3):")
print(f"Lower fence:${lower/1e6:.1f}M")
print(f"Upper fence:${upper/1e6:.1f}M")
print(f"N outliers:{len(outliers_funding)} ({funding_outlier_share:.1f}%)")

if len(outliers_funding) > 0:
    print("\nTop 10 extreme funding amounts:")
    shown_columns = ['funding_total_usd', 'status', 'category_code', 'funding_rounds']
    top_10 = outliers_funding.nlargest(10, 'funding_total_usd')[shown_columns]

    for amount, outcome, sector, n_rounds in zip(
        top_10['funding_total_usd'], top_10['status'],
        top_10['category_code'], top_10['funding_rounds'],
    ):
        print(f"${amount/1e9:.2f}B  "
              f"{outcome:10s}  {sector:20s}  "
              f"{n_rounds:.0f} rounds")

# ---------------------------------------------------------------------------
# 2. RELATIONSHIPS OUTLIERS
# ---------------------------------------------------------------------------
with_relationships = finale_no_na[finale_no_na['relationships'] > 0].copy()
relationship_counts = with_relationships['relationships']

outliers_rel, lower_rel, upper_rel = detect_outliers_iqr(
    with_relationships, 'relationships', k=3
)

print("\nRelationships distribution (non-zero):")
print(f"Mean:{relationship_counts.mean():.1f}")
print(f"Median:{relationship_counts.median():.0f}")
print(f"Max:{relationship_counts.max():.0f}")

rel_outlier_share = 100 * len(outliers_rel) / len(with_relationships)
print(f"\nOutliers: {len(outliers_rel)} ({rel_outlier_share:.1f}%)")

# ---------------------------------------------------------------------------
# 3. FUNDING ROUNDS OUTLIERS
# ---------------------------------------------------------------------------

with_rounds = finale_no_na[finale_no_na['funding_rounds'] > 0].copy()
round_counts = with_rounds['funding_rounds']

outliers_rounds, lower_rounds, upper_rounds = detect_outliers_iqr(
    with_rounds, 'funding_rounds', k=3
)

print("\nFunding rounds distribution (non-zero):")
print(f"Mean:{round_counts.mean():.1f}")
print(f"Median:{round_counts.median():.0f}")
print(f"Max:{round_counts.max():.0f}")

rounds_outlier_share = 100 * len(outliers_rounds) / len(with_rounds)
print(f"\nOutliers: {len(outliers_rounds)} ({rounds_outlier_share:.1f}%)")

print("\nDECISION:KEEP ALL OUTLIERS")

# =============================================================================
# OUTLIER SUMMARY (upper Tukey fence only, k=3)
# =============================================================================

# Funding: values above Q3 + 3 * IQR among companies with positive funding
funded = finale_no_na[finale_no_na['funding_total_usd'] > 0]
funding_values = funded['funding_total_usd']
Q1 = funding_values.quantile(0.25)
Q3 = funding_values.quantile(0.75)
IQR = Q3 - Q1
upper_fence = Q3 + 3 * IQR
n_outliers_funding = funding_values.gt(upper_fence).sum()
funding_share = 100 * n_outliers_funding / len(funded)

print("\nFunding outliers (Tukey k=3):")
print(f"N outliers: {n_outliers_funding:,} ({funding_share:.1f}%)")
print(f"Upper fence: ${upper_fence/1e6:.1f}M")
print(f"Max funding: ${funding_values.max()/1e9:.2f}B")

# Relationships: same rule among companies with at least one relationship
with_rel = finale_no_na[finale_no_na['relationships'] > 0]
rel_values = with_rel['relationships']
Q1_rel = rel_values.quantile(0.25)
Q3_rel = rel_values.quantile(0.75)
IQR_rel = Q3_rel - Q1_rel
upper_rel = Q3_rel + 3 * IQR_rel
n_outliers_rel = rel_values.gt(upper_rel).sum()
rel_share = 100 * n_outliers_rel / len(with_rel)

print("\nRelationships outliers:")
print(f"N outliers:{n_outliers_rel:,} ({rel_share:.1f}%)")
print(f"Max:{rel_values.max():.0f}")

print("\nDECISION: KEEP ALL OUTLIERS (Cox PH is robust)")

## 3. Geographic Variables

Given our USA-only sample (N=15,476), we excluded geographic 
macro-area classification. All observations share the same 
country-level geography (country_code = 'USA'), making regional 
aggregation redundant. 

State-level granularity was excluded earlier (Section 3.4.3) to 
reduce model complexity. Market timing variables (market_heat, 
IPO activity) already capture geographic clustering effects, as 
major startup hubs (California, Massachusetts, New York) drive 
national IPO cycles.

---
## 4. Create Macro-Sectors

Group market sectors into 11 main categories.

In [None]:
# Overview of the original sectors before consolidation
print("CURRENT SECTORS:")
print(f"Total unique: {finale_no_na['category_code'].nunique()}")
print(f"Total startups: {len(finale_no_na):,}")

# Computed once and reused by every section below
sector_counts = finale_no_na['category_code'].value_counts()
is_ipo = finale_no_na['status'] == 'ipo'

# Top 20 sectors by number of startups, with share of sample and IPO rate
print("\nTOP 20 SECTORS:")
top_sectors = sector_counts.head(20)

for sector, count in top_sectors.items():
    pct = 100 * count / len(finale_no_na)
    ipo_count = (is_ipo & (finale_no_na['category_code'] == sector)).sum()
    ipo_rate = 100 * ipo_count / count if count > 0 else 0

    bar = "█" * int(pct / 2)  # one block per 2% of the sample
    print(f"{sector:20s} {count:>5,} ({pct:>5.1f}%) IPO:{ipo_rate:>4.1f}% {bar}")

# Small sectors (< 50 startups)
small_sectors = sector_counts
small = small_sectors[small_sectors < 50]

print(f"\nSMALL SECTORS (< 50 startups): {len(small)}")
print("  These need consolidation:")
for sector, count in small.head(10).items():
    print(f"{sector:20s} {count:>3} startups")

# IPO rate per sector, restricted to sectors with at least 100 startups
print("\nSECTORS WITH HIGHEST IPO RATE (min 100 startups):")
large_sectors = sector_counts[sector_counts >= 100].index
in_large_sector = finale_no_na['category_code'].isin(large_sectors)

# observed=True keeps only sectors that survive the filter. With a categorical
# key the default grouping also emits the filtered-out sectors as NaN rows,
# which could then leak into the ranking printed below.
ipo_by_sector = is_ipo[in_large_sector].groupby(
    finale_no_na.loc[in_large_sector, 'category_code'], observed=True
)
sector_stats = (
    (100 * ipo_by_sector.sum() / ipo_by_sector.size())
    .rename('ipo_rate')
    .to_frame()
    .sort_values('ipo_rate', ascending=False)
)

for sector, ipo_rate in sector_stats.head(10).iterrows():
    count = sector_counts.loc[sector]
    print(f"{sector:20s} {ipo_rate['ipo_rate']:>5.2f}% ({count:>4,} startups)")

### 4.2 Sector Consolidation

We consolidated 42 original sectors into 11 macro-sectors to reduce 
model complexity while preserving meaningful industry distinctions. 
Classification followed a hybrid approach combining GICS industry 
standards with VC-specific categories (CB Insights taxonomy).

Key consolidation decisions:
- **TECH CORE** (software, web, mobile, enterprise): 39.8% of sample
- **LIFE SCIENCES** (biotech, health, medical): 15.3% of sample
- **ENTERPRISE INFRA** (network, security, consulting): 4.5% of sample 
  but highest success rate (18.1%)

Small sectors (<50 observations) were consolidated into 'OTHER' (2.8%), 
ensuring all macro-sectors have sufficient statistical power for 
survival modeling. IPO rates vary significantly across sectors 
(range: 0.8%-3.5%), justifying sector stratification in Cox models.

In [None]:
# Macro-sector label -> original category codes it absorbs.
# Every code appears in exactly one group.
_MACRO_SECTOR_GROUPS = {
    # Software, web, mobile
    'TECH CORE': ('software', 'web', 'mobile', 'enterprise', 'saas'),
    # Biotech, healthcare
    'LIFE SCIENCES': ('biotech', 'health', 'medical', 'healthcare'),
    # Hardware & semiconductors
    'HARDWARE': ('hardware', 'semiconductor', 'manufacturing', 'automotive',
                 'transportation', 'nanotech'),
    # Finance, analytics, payments
    'FINTECH': ('finance', 'analytics', 'payments'),
    'CLEANTECH': ('cleantech', 'green', 'solar', 'energy'),
    # E-commerce, retail, travel
    'CONSUMER': ('ecommerce', 'shopping', 'fashion', 'sports', 'travel',
                 'hospitality', 'food'),
    # Games, video, music
    'MEDIA': ('games_video', 'photo_video', 'music', 'entertainment'),
    # Advertising, social
    'COMMUNICATION': ('advertising', 'social', 'messaging', 'public_relations', 'news'),
    # Enterprise infrastructure
    'ENTERPRISE INFRA': ('network_hosting', 'security', 'consulting', 'legal'),
    # Edtech & community
    'EDTECH': ('education', 'search', 'local'),
    # Small sectors, listed explicitly; unknown codes end up here as well
    'OTHER': ('design', 'pets', 'government', 'other'),
}

# Flat lookup built once, so mapping a whole column does not rebuild and scan
# eleven lists for every row.
_MACRO_SECTOR_BY_CATEGORY = {
    code: macro
    for macro, codes in _MACRO_SECTOR_GROUPS.items()
    for code in codes
}


def assign_macro_sector(category):
    """
    Map an original sector code to one of 11 macro-sectors.

    Matching is exact (case-sensitive). Any value that is not a known code,
    including None and NaN, is assigned to 'OTHER'.

    Returns one of: 'TECH CORE', 'LIFE SCIENCES', 'HARDWARE', 'FINTECH',
    'CLEANTECH', 'CONSUMER', 'MEDIA', 'COMMUNICATION', 'ENTERPRISE INFRA',
    'EDTECH', 'OTHER'.
    """
    try:
        return _MACRO_SECTOR_BY_CATEGORY.get(category, 'OTHER')
    except TypeError:
        # Unhashable input (e.g. a list) cannot be a sector code
        return 'OTHER'

# Map every startup's sector to its macro-sector and store it as a categorical
finale_no_na['macro_settore'] = finale_no_na['category_code'].apply(assign_macro_sector)
finale_no_na['macro_settore'] = finale_no_na['macro_settore'].astype('category')

# Distribution of macro-sectors, largest first
print("MACRO-SECTOR DISTRIBUTION:")

macro_dist = finale_no_na['macro_settore'].value_counts().sort_values(ascending=False)

for sector, count in macro_dist.items():
    pct = 100 * count / len(finale_no_na)

    # IPO rate within this macro-sector
    subset = finale_no_na[finale_no_na['macro_settore'] == sector]
    ipo_count = (subset['status'] == 'ipo').sum()
    ipo_rate = 100 * ipo_count / count

    # Success rate (IPO + M&A)
    success_count = subset['status'].isin(['ipo', 'acquired']).sum()
    success_rate = 100 * success_count / count

    bar = "█" * int(pct / 2)  # one block per 2% of the sample
    print(f"{sector:20s} {count:>6,} ({pct:>5.1f}%) IPO:{ipo_rate:>4.1f}% Success:{success_rate:>5.1f}% {bar}")

# Warn when the catch-all bucket holds more than 15% of the sample
other_pct = 100 * (finale_no_na['macro_settore'] == 'OTHER').sum() / len(finale_no_na)
print(f"\n{'-'*70}")
if other_pct > 15:
    print(f"WARNING: 'OTHER' is {other_pct:.1f}% - consider refinement")
else:
    print(f"'OTHER' is {other_pct:.1f}% - acceptable")

n_macro = finale_no_na['macro_settore'].nunique()
print(f"\nMacro-sectors created: {n_macro} categories")
print(f"Original sectors: 42 Consolidated to: {n_macro}")

# Which original sectors ended up in each macro-sector
print("\nMAPPING DETAILS:")

for macro in macro_dist.index:
    # Original sector codes of the startups in THIS macro-sector. The share
    # below is taken against this group's own size, not against a frame left
    # over from the distribution loop above.
    macro_sectors = finale_no_na.loc[finale_no_na['macro_settore'] == macro, 'category_code']
    original = macro_sectors.unique()
    n_original = len(original)
    print(f"\n{macro} ({n_original} sectors):")

    # Top 5 original sectors inside this macro-sector
    top_original = macro_sectors.value_counts().head(5)
    for orig, count in top_original.items():
        if count > 0:  # a categorical column also lists codes with zero rows
            pct_of_macro = 100 * count / len(macro_sectors)
            print(f"{orig:20s} {count:>5,} ({pct_of_macro:>5.1f}% of {macro})")

In [None]:
# Table header
print(f"{'Macro-Sector':20s} {'N':>7s} {'IPO':>6s} {'M&A':>6s} {'Closed':>6s} {'IPO%':>6s} {'M&A%':>6s} {'Success%':>8s}")

# One record per macro-sector: outcome counts and rates
results = []

for macro in finale_no_na['macro_settore'].cat.categories:
    subset = finale_no_na[finale_no_na['macro_settore'] == macro]
    outcomes = subset['status']

    n = len(subset)
    n_ipo = (outcomes == 'ipo').sum()
    n_ma = (outcomes == 'acquired').sum()
    n_closed = (outcomes == 'closed').sum()

    ipo_rate = 100 * n_ipo / n
    ma_rate = 100 * n_ma / n
    success_rate = ipo_rate + ma_rate  # success = IPO or acquisition

    results.append(dict(
        macro=macro,
        n=n,
        ipo=n_ipo,
        ma=n_ma,
        closed=n_closed,
        ipo_rate=ipo_rate,
        ma_rate=ma_rate,
        success_rate=success_rate,
    ))

# Highest IPO rate first
results_sorted = list(results)
results_sorted.sort(key=lambda rec: rec['ipo_rate'], reverse=True)

# Table rows
for r in results_sorted:
    counts_part = f"{r['macro']:20s} {r['n']:>7,} {r['ipo']:>6} {r['ma']:>6} {r['closed']:>6} "
    rates_part = f"{r['ipo_rate']:>5.1f}% {r['ma_rate']:>5.1f}% {r['success_rate']:>7.1f}%"
    print(counts_part + rates_part)

# Best / worst macro-sectors
print("\nKEY INSIGHTS:")

best_ipo = max(results_sorted, key=lambda rec: rec['ipo_rate'])
best_success = max(results_sorted, key=lambda rec: rec['success_rate'])
worst_ipo = min(results_sorted, key=lambda rec: rec['ipo_rate'])

print(f"Highest IPO rate:{best_ipo['macro']:20s} {best_ipo['ipo_rate']:.1f}%")
print(f"Highest success rate:{best_success['macro']:20s} {best_success['success_rate']:.1f}%")
print(f"Lowest IPO rate:{worst_ipo['macro']:20s} {worst_ipo['ipo_rate']:.1f}%")

# Reference rates over the whole sample
all_outcomes = finale_no_na['status']
overall_ipo_rate = 100 * (all_outcomes == 'ipo').mean()
overall_success_rate = 100 * all_outcomes.isin(['ipo', 'acquired']).mean()

print("\nOverall (all sectors):")
print(f"IPO rate:{overall_ipo_rate:.2f}%")
print(f"Success rate:{overall_success_rate:.2f}%")

In [None]:
# =============================================================================
# FIX: Handle missing values in categorical variables
# =============================================================================


def _fill_missing_labels(frame, column, labels):
    """Fill the NaN cells of ``frame[column]`` from ``labels``.

    ``labels`` is a Series with one entry per row of ``frame`` (same row
    order); rows whose label is NaN/None are left untouched. When the column
    is categorical, labels it does not know yet are registered first, because
    assigning an unknown category raises.

    Returns the number of cells in ``frame[column]`` that are still missing.
    """
    to_fill = (frame[column].isnull() & labels.notna()).to_numpy()
    if to_fill.any():
        new_values = labels.to_numpy()[to_fill]
        if isinstance(frame[column].dtype, pd.CategoricalDtype):
            known = frame[column].cat.categories
            unseen = [value for value in pd.unique(new_values) if value not in known]
            if unseen:
                frame[column] = frame[column].cat.add_categories(unseen)
        frame.loc[to_fill, column] = new_values
    return int(frame[column].isnull().sum())


# Report which derived categories have missing values
derived_cats = ['relationships_category', 'rounds_category', 'products_category',
                'funding_quartile', 'market_heat_quartile', 'network_connectivity']

for col in derived_cats:
    if col in finale_no_na.columns:
        missing = finale_no_na[col].isnull().sum()
        if missing > 0:
            print(f"\n{col}: {missing} missing")

# -----------------------------------------------------------------------
# FIX 1: relationships_category
# -----------------------------------------------------------------------

if 'relationships_category' in finale_no_na.columns:

    rel_missing = finale_no_na['relationships_category'].isnull()
    missing_before = rel_missing.sum()

    if missing_before > 0:
        print(f"\nFixing relationships_category ({missing_before} missing)...")

        relationships = finale_no_na['relationships']

        # Show the underlying count for every affected row
        for idx, rel_value in relationships[rel_missing].items():
            print(f"  Row {idx}: relationships = {rel_value}")

        # Label from the raw count; the first matching condition wins
        rel_labels = pd.Series(
            np.select(
                [relationships.isna() | (relationships == 0),
                 relationships <= 3,
                 relationships <= 7],
                ['None (0)', 'Low (1-3)', 'Medium (4-7)'],
                default='High (8+)',
            ),
            index=finale_no_na.index,
        )
        _fill_missing_labels(finale_no_na, 'relationships_category', rel_labels)

        # Re-categorize to update categories
        finale_no_na['relationships_category'] = finale_no_na['relationships_category'].astype('category')

        missing_after = finale_no_na['relationships_category'].isnull().sum()
        print(f"Fixed! Missing:{missing_before} -> {missing_after}")

# -----------------------------------------------------------------------
# FIX 2: Other categorical variables
# -----------------------------------------------------------------------

# rounds_category / products_category: a missing category is filled only when
# the underlying count is missing or zero; other rows are left as they are.
# (category column, underlying count column, label for "zero")
zero_label_specs = [
    ('rounds_category', 'funding_rounds', '0'),
    ('products_category', 'num_prodotti', 'None (0)'),
]

for cat_col, count_col, zero_label in zero_label_specs:
    if cat_col not in finale_no_na.columns:
        continue
    missing = finale_no_na[cat_col].isnull().sum()
    if missing > 0:
        print(f"\nFixing {cat_col} ({missing} missing)...")

        counts = finale_no_na[count_col]
        is_zero = counts.isna() | (counts == 0)
        zero_labels = pd.Series(
            np.where(is_zero, zero_label, None),
            index=finale_no_na.index,
            dtype=object,
        )
        remaining = _fill_missing_labels(finale_no_na, cat_col, zero_labels)

        finale_no_na[cat_col] = finale_no_na[cat_col].astype('category')
        print("Fixed")
        if remaining > 0:
            # Rows with a non-zero count have no label to fall back on
            print(f"  Note: {remaining} rows with non-zero {count_col} still have no {cat_col}")

# market_heat_quartile
if 'market_heat_quartile' in finale_no_na.columns:
    missing = finale_no_na['market_heat_quartile'].isnull().sum()
    if missing > 0:
        print(f"\nFixing market_heat_quartile ({missing} missing)...")

        # Missing quartiles get an explicit 'Unknown' level instead of being
        # dropped or forced into an existing quartile.
        heat = finale_no_na['market_heat_quartile']
        # A categorical column must know 'Unknown' before it can be filled in;
        # a plain (object) column has no .cat accessor and needs no such step.
        if isinstance(heat.dtype, pd.CategoricalDtype) and 'Unknown' not in heat.cat.categories:
            heat = heat.cat.add_categories(['Unknown'])
        finale_no_na['market_heat_quartile'] = heat.fillna('Unknown')

        print("Fixed")

# funding_quartile (from funded companies only)
if 'funding_quartile' in finale_no_na.columns:
    missing = finale_no_na['funding_quartile'].isnull().sum()
    if missing > 0:
        print(f"\nNote:funding_quartile has {missing} missing (expected for non-funded companies)")

# -----------------------------------------------------------------------
# FINAL VERIFICATION
# -----------------------------------------------------------------------
missing_cols = finale_no_na.isnull().sum()
total_missing = missing_cols.sum()

print(f"\nTotal missing values: {total_missing}")

if total_missing == 0:
    print("NO MISSING VALUES - DATASET READY!")
else:
    print(f"\nStill {total_missing} missing values:")
    missing_cols = missing_cols[missing_cols > 0]
    for col, count in missing_cols.items():
        print(f"{col}: {count}")

In [None]:
# -----------------------------------------------------------------------------
# 8.1: Relationships Impact
# -----------------------------------------------------------------------------

if 'relationships' in finale_no_na.columns:

    # Fixed-width bands (not quartiles): 0, 1-3, 4-7 and 8 or more.
    # The last edge is open-ended so that any count above 7 lands in
    # 'High (8+)'; a finite upper edge would turn larger counts into NaN.
    finale_no_na['relationships_category'] = pd.cut(
        finale_no_na['relationships'],
        bins=[-0.1, 0, 3, 7, np.inf],
        labels=['None (0)', 'Low (1-3)', 'Medium (4-7)', 'High (8+)']
    )

    print("Created: relationships_category")

    # Outcome rates and average funding per band; empty bands are skipped
    rel_stats = []

    for cat in finale_no_na['relationships_category'].cat.categories:
        subset = finale_no_na[finale_no_na['relationships_category'] == cat]

        if len(subset) > 0:
            n = len(subset)
            ipo_rate = 100 * (subset['status'] == 'ipo').mean()
            ma_rate = 100 * (subset['status'] == 'acquired').mean()
            # Average over companies with positive funding only
            avg_funding = subset[subset['funding_total_usd'] > 0]['funding_total_usd'].mean()

            rel_stats.append({
                'category': cat,
                'n': n,
                'ipo_rate': ipo_rate,
                'ma_rate': ma_rate,
                'success_rate': ipo_rate + ma_rate,
                'avg_funding': avg_funding
            })

    rel_df = pd.DataFrame(rel_stats)

    print(f"\n{'Category':<15} {'N':>8} {'IPO%':>7} {'M&A%':>7} {'Success%':>9} {'Avg Funding':>12}")
    print("-" * 75)

    for _, row in rel_df.iterrows():
        print(f"{row['category']:<15} {int(row['n']):>8,} {row['ipo_rate']:>6.2f}% "
              f"{row['ma_rate']:>6.2f}% {row['success_rate']:>8.2f}% ${row['avg_funding']/1e6:>10.1f}M")

# -----------------------------------------------------------------------------
# 8.2: Milestones Analysis
# -----------------------------------------------------------------------------

if 'milestones' in finale_no_na.columns:

    # 1 when the company has at least one milestone, otherwise 0
    finale_no_na['has_milestones'] = finale_no_na['milestones'].gt(0).astype(int)

    print("Created: has_milestones")

    # Outcome rates for the two groups, "no milestones" first
    milestone_comparison = []

    for has_ms, label in ((0, "No milestones"), (1, "With milestones")):
        subset = finale_no_na[finale_no_na['has_milestones'] == has_ms]
        outcomes = subset['status']

        n = len(subset)
        ipo_rate = 100 * (outcomes == 'ipo').mean()
        ma_rate = 100 * (outcomes == 'acquired').mean()

        milestone_comparison.append(dict(
            group=label,
            n=n,
            ipo_rate=ipo_rate,
            ma_rate=ma_rate,
            success_rate=ipo_rate + ma_rate,
        ))

    ms_df = pd.DataFrame(milestone_comparison)

    print(f"\n{'Group':<18} {'N':>8} {'IPO%':>7} {'M&A%':>7} {'Success%':>9}")
    print("-" * 60)

    for rec in ms_df.itertuples(index=False):
        print(f"{rec.group:<18} {int(rec.n):>8,} {rec.ipo_rate:>6.2f}% "
              f"{rec.ma_rate:>6.2f}% {rec.success_rate:>8.2f}%")

# -----------------------------------------------------------------------------
# 8.3: Funding Sources Comparison
# -----------------------------------------------------------------------------

# Investor-type indicator columns; only those present in the frame are analysed.
funding_sources = ['person_financed', 'startup_financed', 'fin_org_financed']
available_sources = [col for col in funding_sources if col in finale_no_na.columns]

if available_sources:

    source_stats = []

    for source in available_sources:

        # Human-readable name, e.g. 'fin_org_financed' -> 'Fin Org'
        source_name = source.replace('_financed', '').replace('_', ' ').title()

        # Rows where the indicator equals exactly 1 / exactly 0
        indicator = finale_no_na[source]
        funded_by = finale_no_na[indicator == 1]
        not_funded_by = finale_no_na[indicator == 0]

        for label, subset in (('With', funded_by), ('Without', not_funded_by)):
            group_status = subset['status']
            n = len(subset)
            ipo_rate = 100 * group_status.eq('ipo').mean()
            ma_rate = 100 * group_status.eq('acquired').mean()

            source_stats.append(dict(
                source=f"{source_name} ({label})",
                n=n,
                ipo_rate=ipo_rate,
                ma_rate=ma_rate,
                success_rate=ipo_rate + ma_rate,
            ))

    sources_df = pd.DataFrame(source_stats)

    # Fixed-width text table, one line per (source, with/without) pair.
    print(f"\n{'Funding Source':<30} {'N':>8} {'IPO%':>7} {'M&A%':>7} {'Success%':>9}")
    print("-" * 75)

    for row in sources_df.itertuples(index=False):
        print(f"{row.source:<30} {int(row.n):>8,} {row.ipo_rate:>6.2f}% "
              f"{row.ma_rate:>6.2f}% {row.success_rate:>8.2f}%")

# -----------------------------------------------------------------------------
# 8.4: Product Portfolio Analysis
# -----------------------------------------------------------------------------

if 'num_prodotti' in finale_no_na.columns:

    # Bucket the product count into four fixed bands. The top edge is
    # open-ended: with a hard cap of 100, any larger count fell outside every
    # bin and was silently turned into NaN instead of 'Multiple (4+)'.
    finale_no_na['products_category'] = pd.cut(
        finale_no_na['num_prodotti'],
        bins=[-0.1, 0, 1, 3, np.inf],
        labels=['None (0)', 'Single (1)', 'Few (2-3)', 'Multiple (4+)']
    )

    print("Created: products_category")

    # One summary row per non-empty band: size and exit rates.
    product_stats = []

    for cat in finale_no_na['products_category'].cat.categories:
        subset = finale_no_na[finale_no_na['products_category'] == cat]

        if len(subset) > 0:
            n = len(subset)
            ipo_rate = 100 * (subset['status'] == 'ipo').mean()
            ma_rate = 100 * (subset['status'] == 'acquired').mean()

            product_stats.append({
                'category': cat,
                'n': n,
                'ipo_rate': ipo_rate,
                'ma_rate': ma_rate,
                'success_rate': ipo_rate + ma_rate
            })

    prod_df = pd.DataFrame(product_stats)

    print(f"\n{'Products':<18} {'N':>8} {'IPO%':>7} {'M&A%':>7} {'Success%':>9}")
    print("-" * 60)

    for _, row in prod_df.iterrows():
        print(f"{row['category']:<18} {int(row['n']):>8,} {row['ipo_rate']:>6.2f}% "
              f"{row['ma_rate']:>6.2f}% {row['success_rate']:>8.2f}%")

# Composite score: sum of the three investor-type indicator columns
if set(funding_sources).issubset(finale_no_na.columns):

    finale_no_na['network_connectivity'] = (
        finale_no_na['person_financed']
        .add(finale_no_na['startup_financed'])
        .add(finale_no_na['fin_org_financed'])
    )

    print("Created: network_connectivity")

    # Exit rates for each score value 0..3 that actually occurs in the data.
    connectivity_stats = []

    for score in range(4):
        subset = finale_no_na.loc[finale_no_na['network_connectivity'].eq(score)]

        if subset.empty:
            continue

        group_status = subset['status']
        n = len(subset)
        ipo_rate = 100 * group_status.eq('ipo').mean()
        ma_rate = 100 * group_status.eq('acquired').mean()

        connectivity_stats.append(dict(
            score=score,
            label=f"{score} types",
            n=n,
            ipo_rate=ipo_rate,
            ma_rate=ma_rate,
            success_rate=ipo_rate + ma_rate,
        ))

    conn_df = pd.DataFrame(connectivity_stats)

    print(f"\n{'Score':<12} {'N':>8} {'IPO%':>7} {'M&A%':>7} {'Success%':>9}")
    print("-" * 55)

    for row in conn_df.itertuples(index=False):
        print(f"{row.label:<12} {int(row.n):>8,} {row.ipo_rate:>6.2f}% "
              f"{row.ma_rate:>6.2f}% {row.success_rate:>8.2f}%")

# Network-related columns of interest, restricted to those present in the frame.
network_features = [
    'relationships', 'milestones', 'person_financed',
    'startup_financed', 'fin_org_financed', 'num_prodotti',
]
available_network = [name for name in network_features if name in finale_no_na.columns]

print("Section 8 complete: Network Effects & Ecosystem Metrics")

In [None]:
# Verify dataset completeness
print(f"\nDATASET VERIFICATION:")
print(f"Total rows:{len(finale_no_na):,}")
print(f"Total columns:{finale_no_na.shape[1]}")

# Report the measured share of missing cells. The percentage used to be the
# hard-coded text "(0.00%)", which stayed "0.00%" even when NaNs were present.
missing_cells = finale_no_na.isnull().sum().sum()
total_cells = finale_no_na.size
missing_pct = 100 * missing_cells / total_cells if total_cells else 0.0
print(f"Missing values:{missing_cells} ({missing_pct:.2f}%)")

# Status distribution
print(f"\nSTATUS DISTRIBUTION:")
for status in ['ipo', 'acquired', 'closed', 'operating']:
    count = (finale_no_na['status'] == status).sum()
    pct = 100 * count / len(finale_no_na)
    print(f"{status:12s} {count:>6,} ({pct:>5.2f}%)")

# Success metrics: IPO and acquisition are both counted as success
success_count = finale_no_na['status'].isin(['ipo', 'acquired']).sum()
success_rate = 100 * success_count / len(finale_no_na)
print(f"\nSuccess rate (IPO+M&A): {success_rate:.2f}%")

# Key features summary
print(f"\nKEY FEATURES:")
print(f"Macro-sectors:{finale_no_na['macro_settore'].nunique()} categories")
print(f"Time range:{finale_no_na['first_funding_year'].min():.0f} - {finale_no_na['first_funding_year'].max():.0f}")
print(f"Funded startups:{(finale_no_na['funding_total_usd'] > 0).sum():,} ({100*(finale_no_na['funding_total_usd'] > 0).mean():.1f}%)")

# Market conditions coverage: share of rows with a non-null market_heat
market_coverage = 100 * finale_no_na['market_heat'].notna().mean()
print(f"Market data:{market_coverage:.1f}% coverage")

# Candidate model inputs; only columns actually present in the frame are kept.
numeric_features = [
    'funding_total_usd', 'log_fund_tot', 'funding_rounds',
    'angel', 'series_a', 'series_b', 'series_c',
    'market_heat', 'ipo_count_total',
    'relationships', 'milestones', 'num_prodotti',
    'num_acquisizioni_effettuate',
    'person_financed', 'startup_financed', 'fin_org_financed',
]
categorical_features = ['macro_settore', 'market_cycle']

present_columns = set(finale_no_na.columns)
available_numeric = [name for name in numeric_features if name in present_columns]
available_categorical = [name for name in categorical_features if name in present_columns]

n_numeric = len(available_numeric)
n_categorical = len(available_categorical)

print("\nFEATURES FOR MODELING:")
print(f"Numeric:{n_numeric} features")
print(f"Categorical:{n_categorical} features")
print(f"Total:{n_numeric + n_categorical} features")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

# ---------------------------------------------------------------------------
# 1. Select key features for pair plot
# ---------------------------------------------------------------------------

key_features = [
    'log_fund_tot',
    'funding_rounds',
    'market_heat',
    'relationships',
    'milestones',
]

print(f"\nSelected {len(key_features)} features for bivariate analysis:")
for position, feature_name in enumerate(key_features, start=1):
    print(f"{position}. {feature_name}")

# ---------------------------------------------------------------------------
# 2. Create outcome variable for visualization
# ---------------------------------------------------------------------------

# Binary flag: 1 for IPO or acquisition, 0 for every other status
is_success = finale_no_na['status'].isin(['ipo', 'acquired'])
finale_no_na['success'] = is_success.astype(int)

total = len(finale_no_na)
success_count = finale_no_na['success'].sum()
success_rate = 100 * success_count / total

print("\nOutcome distribution:")
print(f"Success (IPO/M&A):{success_count:,} ({success_rate:.2f}%)")
print(f"Other:{total-success_count:,} ({100-success_rate:.2f}%)")

# ---------------------------------------------------------------------------
# 3. Generate pair plot
# ---------------------------------------------------------------------------

print("\nGenerating pair plot")

# Cap the number of plotted rows at 5,000 to keep the pair plot fast
sample_size = min(5000, len(finale_no_na))
needs_sampling = len(finale_no_na) > sample_size

if needs_sampling:
    print(f"Sampling {sample_size:,} observations for visualization...")
    print(f"(Full dataset: {len(finale_no_na):,} observations)")
    plot_data = finale_no_na.sample(n=sample_size, random_state=42)
else:
    plot_data = finale_no_na.copy()

print(f"Plotting {len(plot_data):,} observations...")

# Colours keyed by the binary outcome flag
outcome_palette = {
    0: '#3498db',  # Blue: Failure/Operating
    1: '#2ecc71',  # Green: Success (IPO/M&A)
}
scatter_style = {'alpha': 0.5, 's': 20, 'edgecolor': 'none'}
density_style = {'alpha': 0.7, 'linewidth': 2.5}

# Full (non-corner) matrix: scatter plots off-diagonal, KDE on the diagonal
g = sns.pairplot(
    plot_data[key_features + ['success']],
    hue='success',
    palette=outcome_palette,
    diag_kind='kde',
    plot_kws=scatter_style,
    diag_kws=density_style,
    corner=False,
)

# Drop the automatically generated hue legend
g._legend.remove()

# Figure-level title, placed slightly above the grid
g.fig.suptitle(
    'Bivariate Feature Distributions by Outcome',
    y=1.01,
    fontsize=15,
    fontweight='bold',
)

plt.tight_layout()
plt.show()

# ---------------------------------------------------------------------------
# 4. Statistical analysis: Class separability
# ---------------------------------------------------------------------------

print("CLASS SEPARABILITY ANALYSIS")
print("\nMann-Whitney U test (non-parametric test for group differences):")
print(f"\n{'Feature':<25} {'Median (Success)':>18} {'Median (Other)':>16} {'p-value':>12} {'Sig':>5}")

# One result dict per feature that could be tested.
separability_results = []

for feature in key_features:

    # Split by outcome, dropping missing values of this feature
    success_vals = finale_no_na.loc[finale_no_na['success'] == 1, feature].dropna()
    failure_vals = finale_no_na.loc[finale_no_na['success'] == 0, feature].dropna()

    n1 = len(success_vals)
    n2 = len(failure_vals)

    # The test is undefined when either group is empty; skip instead of
    # letting scipy raise in the middle of the table.
    if n1 == 0 or n2 == 0:
        print(f"{feature:<25} skipped (an outcome group has no observations)")
        continue

    # Medians
    median_success = success_vals.median()
    median_failure = failure_vals.median()

    # Mann-Whitney U test; the returned statistic is U for the first sample
    u_stat, p_val = stats.mannwhitneyu(
        success_vals,
        failure_vals,
        alternative='two-sided'
    )

    # Significance stars
    if p_val < 0.001:
        sig = "***"
    elif p_val < 0.01:
        sig = "**"
    elif p_val < 0.05:
        sig = "*"
    else:
        sig = "ns"

    # Effect size (rank-biserial correlation). Because u_stat belongs to the
    # success group, 2U/(n1*n2) - 1 is positive when success values tend to be
    # larger. The previous form, 1 - 2U/(n1*n2), reported the opposite sign.
    effect = (2 * u_stat) / (n1 * n2) - 1

    separability_results.append({
        'feature': feature,
        'median_success': median_success,
        'median_failure': median_failure,
        'p_value': p_val,
        'effect_size': effect,
        'significance': sig
    })

    print(f"{feature:<25} {median_success:>18.2f} {median_failure:>16.2f} {p_val:>11.2e} {sig:>5}")

print("\n*** p < 0.001, ** p < 0.01, * p < 0.05, ns = not significant")

# ---------------------------------------------------------------------------
# 5. Key insights
# ---------------------------------------------------------------------------
# Count significant features
sig_features = sum(1 for r in separability_results if r['p_value'] < 0.05)

# Only claim that every feature discriminates when that is what was measured;
# otherwise name the features that did not reach significance or were skipped.
summary = (
    f"FEATURE DISCRIMINATION: {sig_features}/{len(key_features)} features "
    f"show significant differences (p < 0.05)"
)
if key_features and sig_features == len(key_features):
    summary += ", all features tested can help discriminate outcomes"
else:
    significant_names = {
        r['feature'] for r in separability_results if r['p_value'] < 0.05
    }
    weak_features = [f for f in key_features if f not in significant_names]
    if weak_features:
        summary += f"; not significant or not tested: {', '.join(weak_features)}"
print(summary)

# Most discriminative feature: smallest p-value. Ties (e.g. several p-values
# underflowing to 0.0) are broken by the larger absolute effect size.
if separability_results:
    best_feature = min(
        separability_results,
        key=lambda x: (x['p_value'], -abs(x['effect_size']))
    )

    print(f"Most discriminative feature:{best_feature['feature']}")
    print(f"p-value:{best_feature['p_value']:.2e}")
    print(f"Effect size:{best_feature['effect_size']:.3f}")
else:
    best_feature = None
    print("Most discriminative feature: n/a (no feature could be tested)")

print("\nSection Bivariate Feature Analysis")

In [None]:
# =============================================================================
# TIME-TO-EVENT CALCULATION FOR SURVIVAL ANALYSIS
# =============================================================================

# The exit-date columns must already be present in the frame
has_publicat = 'public_at' in finale_no_na.columns
has_acquiredat = 'acquired_at' in finale_no_na.columns

print("Exit date columns from Notebook 1 merge:")
print(f"public_at: {'FOUND' if has_publicat else 'MISSING'}")
print(f"acquired_at: {'FOUND' if has_acquiredat else 'MISSING'}")

# Stop early: everything below needs both columns
if not (has_publicat and has_acquiredat):
    print("\nERROR: Exit dates not found!")
    print("Make sure you ran the MERGE code in Notebook 1")
    raise ValueError("Exit date columns missing - check Notebook 1 merge section")

# Convert both columns to datetimes; unparseable values become NaT
print("\nParsing date columns")
exit_date_columns = ('public_at', 'acquired_at')

for date_col in exit_date_columns:
    finale_no_na[date_col] = pd.to_datetime(finale_no_na[date_col], errors='coerce')

for date_col in exit_date_columns:
    print(f"{date_col} parsed: {finale_no_na[date_col].notna().sum()} non-null")

# -----------------------------------------------------------------------------
# 1. Baseline T0 = first_funding_year
# -----------------------------------------------------------------------------

# Baseline date: January 1st of the first funding year (NaT if unparseable)
funding_year = finale_no_na['first_funding_year']
t0_text = funding_year.astype(int).astype(str) + '-01-01'
finale_no_na['t0'] = pd.to_datetime(t0_text, errors='coerce')

t0_is_valid = finale_no_na['t0'].notna()

print("Baseline T0: First funding year (January 1st)")
print(f"Range: {funding_year.min():.0f} - {funding_year.max():.0f}")
print(f"Valid T0: {t0_is_valid.sum():,} ({100*t0_is_valid.mean():.1f}%)")
print("Consistent with VC literature (e.g., Gompers & Lerner, 2000)")

# -----------------------------------------------------------------------------
# 2. Calculate event_date with IPO and M&A imputation strategy
# -----------------------------------------------------------------------------

# Initialize event_date; it is filled status by status below
finale_no_na['event_date'] = pd.NaT

# -------------------------
# 2.1 IPO dates
# -------------------------

ipo_mask = finale_no_na['status'] == 'ipo'
ipo_total = ipo_mask.sum()

# Use public_at if available
finale_no_na.loc[ipo_mask, 'event_date'] = finale_no_na.loc[ipo_mask, 'public_at']

# IPO rows split by whether a real public_at date is known
ipo_with_real_date = ipo_mask & finale_no_na['public_at'].notna()
ipo_no_date = ipo_mask & finale_no_na['public_at'].isna()

ipo_with_date = ipo_with_real_date.sum()
ipo_missing = ipo_total - ipo_with_date

print(f"Total IPO companies: {ipo_total:,}")
print(f"With public_at date: {ipo_with_date} ({100*ipo_with_date/ipo_total if ipo_total > 0 else 0:.1f}%)")
print(f"Missing date: {ipo_missing} ({100*ipo_missing/ipo_total if ipo_total > 0 else 0:.1f}%)")

# -------------------------
# 2.2 IPO IMPUTATION (if needed)
# -------------------------

# Flag column exists in every case; it is set to True only for imputed rows
finale_no_na['ipo_date_imputed'] = False

if ipo_missing > 0:
    print(f"\nIPO DATE IMPUTATION STRATEGY:")

    # Years from t0 to the real IPO date, for IPOs with a known date
    temp_duration = (
        finale_no_na.loc[ipo_with_real_date, 'public_at'] -
        finale_no_na.loc[ipo_with_real_date, 't0']
    ).dt.days / 365.25

    median_ipo_years = temp_duration.median()

    # With no usable real dates the median is NaN. Building a Timedelta from
    # NaN either fails with an unrelated-looking error or yields NaT depending
    # on the pandas version, so stop here with an explicit message.
    if pd.isna(median_ipo_years):
        raise ValueError(
            "Cannot impute IPO dates: no IPO has both a valid public_at and "
            "t0 from which to estimate the median time-to-IPO"
        )

    median_ipo_days = median_ipo_years * 365.25

    print(f"Median time-to-IPO from real data: {median_ipo_years:.2f} years")
    print(f"Based on {ipo_with_real_date.sum()} IPOs with known dates")

    # Impute: t0 plus the median time-to-IPO
    # NOTE(review): nothing here caps the imputed date, so it may fall after
    # the censoring date used later in this cell - confirm this is intended.
    finale_no_na.loc[ipo_no_date, 'event_date'] = (
        finale_no_na.loc[ipo_no_date, 't0'] + pd.Timedelta(days=median_ipo_days)
    )

    finale_no_na.loc[ipo_no_date, 'ipo_date_imputed'] = True

    print(f"Imputed {ipo_no_date.sum()} IPO dates using median")
    print(f"Created flag 'ipo_date_imputed' for sensitivity analysis")

    print(f"\nTHESIS NOTE:")
    print(f"Missing IPO dates ({ipo_missing} of {ipo_total}, {100*ipo_missing/ipo_total:.1f}%) were")
    print(f"imputed using the median time-to-IPO from IPOs with known dates")
    print(f"({median_ipo_years:.2f} years). Sensitivity analysis")
    print(f"confirms results are robust to this imputation strategy.")
else:
    print(f"All IPO dates available - no imputation needed")

# -------------------------
# 2.3 M&A dates WITH IMPUTATION
# -------------------------

print(f"\n2.2 M&A DATES")

ma_mask = finale_no_na['status'] == 'acquired'
ma_total = ma_mask.sum()

# Use acquired_at if available
finale_no_na.loc[ma_mask, 'event_date'] = finale_no_na.loc[ma_mask, 'acquired_at']

# Acquired rows split by whether a real acquired_at date is known
ma_with_real_date = ma_mask & finale_no_na['acquired_at'].notna()
ma_no_date = ma_mask & finale_no_na['acquired_at'].isna()

ma_with_date = ma_with_real_date.sum()
ma_missing = ma_total - ma_with_date

print(f"Total M&A companies: {ma_total:,}")
print(f"With acquired_at date: {ma_with_date} ({100*ma_with_date/ma_total if ma_total > 0 else 0:.1f}%)")
print(f"Missing date: {ma_missing} ({100*ma_missing/ma_total if ma_total > 0 else 0:.1f}%)")

# Flag column exists in every case; it is set to True only for imputed rows
finale_no_na['ma_date_imputed'] = False

# M&A IMPUTATION (if needed)
if ma_missing > 0:
    print(f"\nM&A DATE IMPUTATION STRATEGY:")

    # Years from t0 to the real acquisition date, for M&As with a known date
    temp_ma_duration = (
        finale_no_na.loc[ma_with_real_date, 'acquired_at'] - finale_no_na.loc[ma_with_real_date, 't0']
    ).dt.days / 365.25

    median_ma_years = temp_ma_duration.median()

    # With no usable real dates the median is NaN. Building a Timedelta from
    # NaN either fails with an unrelated-looking error or yields NaT depending
    # on the pandas version, so stop here with an explicit message.
    if pd.isna(median_ma_years):
        raise ValueError(
            "Cannot impute M&A dates: no acquisition has both a valid "
            "acquired_at and t0 from which to estimate the median time-to-M&A"
        )

    median_ma_days = median_ma_years * 365.25

    print(f"Median time-to-M&A from real data: {median_ma_years:.2f} years")
    print(f"Based on {ma_with_real_date.sum()} M&As with known dates")

    # Impute: t0 plus the median time-to-acquisition
    # NOTE(review): nothing here caps the imputed date, so it may fall after
    # the censoring date used later in this cell - confirm this is intended.
    finale_no_na.loc[ma_no_date, 'event_date'] = (
        finale_no_na.loc[ma_no_date, 't0'] + pd.Timedelta(days=median_ma_days)
    )

    finale_no_na.loc[ma_no_date, 'ma_date_imputed'] = True

    print(f"Imputed {ma_no_date.sum()} M&A dates using median")
    print(f"Created flag 'ma_date_imputed'")

    print(f"\nTHESIS NOTE:")
    print(f"Missing M&A dates ({ma_missing}, {100*ma_missing/ma_total:.1f}%) were")
    print(f"imputed using median time-to-acquisition ({median_ma_years:.2f} years)")
else:
    print(f"All M&A dates available - no imputation needed")

# -------------------------
# 2.4 Closed companies
# -------------------------

# Closed and operating companies both receive the end-of-observation date
observation_end = pd.Timestamp('2013-12-31')

closed_mask = finale_no_na['status'].eq('closed')
closed_total = closed_mask.sum()

finale_no_na.loc[closed_mask, 'event_date'] = observation_end

print(f"Total closed companies: {closed_total:,}")
print("Using date: 2013-12-31 (end of observation)")
print("Rationale: Exact failure dates unavailable")

# -------------------------
# 2.5 Operating companies
# -------------------------

operating_mask = finale_no_na['status'].eq('operating')
operating_total = operating_mask.sum()

finale_no_na.loc[operating_mask, 'event_date'] = observation_end

print(f"Total operating: {operating_total:,}")
print("Using date: 2013-12-31 (right-censored)")

for rationale_line in (
    "\nCENSORING DATE RATIONALE:",
    "End of observation: December 31, 2013",
    "Reasons:",
    "CrunchBase data quality decline post-2013",
    "Selection bias in recent vintages",
    "Standard practice in VC literature",
):
    print(rationale_line)

# Any event_date still missing also falls back to the end-of-observation date
still_missing = finale_no_na['event_date'].isna()
n_filled = still_missing.sum()
finale_no_na['event_date'] = finale_no_na['event_date'].fillna(observation_end)

if n_filled > 0:
    print(f"\n Filled {n_filled:,} remaining missing event_date with 2013-12-31")

# -----------------------------------------------------------------------------
# 3. Calculate duration (with negative duration handling)
# -----------------------------------------------------------------------------

# Elapsed time from baseline (t0) to the event/censoring date
elapsed = finale_no_na['event_date'] - finale_no_na['t0']
finale_no_na['duration_days'] = elapsed.dt.days

# Whole days converted to years
finale_no_na['duration_years'] = finale_no_na['duration_days'] / 365.25

# Rows whose duration is zero or negative are dropped from the sample
negative_mask = finale_no_na['duration_years'].le(0)
n_negative = negative_mask.sum()

if n_negative > 0:
    removed_share = 100*n_negative/len(finale_no_na)
    print(f"Removing {n_negative} negative/zero durations ({removed_share:.2f}%)")
    finale_no_na = finale_no_na.loc[~negative_mask].copy()
    print(f"Final sample: {len(finale_no_na):,} startups")

# -----------------------------------------------------------------------------
# 5. Duration statistics
# -----------------------------------------------------------------------------
# Summary of duration_years over the whole sample
all_durations = finale_no_na['duration_years']

print("\nOverall duration (years):")
print(f"N: {len(finale_no_na):,}")
for stat_label, stat_value in (
    ("Min", all_durations.min()),
    ("Q1", all_durations.quantile(0.25)),
    ("Median", all_durations.median()),
    ("Q3", all_durations.quantile(0.75)),
    ("Max", all_durations.max()),
    ("Mean", all_durations.mean()),
    ("Std", all_durations.std()),
):
    print(f"{stat_label}: {stat_value:.2f}")

# Same summary per status; statuses with no rows are skipped
print("\nDuration by status:")
print(f"{'Status':<12} {'Median':<8} {'Mean':<8} {'Std':<8} {'N':<10}")
print('-' * 60)
for status in ('ipo', 'acquired', 'closed', 'operating'):
    subset = finale_no_na[finale_no_na['status'] == status]
    if subset.empty:
        continue
    status_durations = subset['duration_years']
    median = status_durations.median()
    mean = status_durations.mean()
    std = status_durations.std()
    n = len(subset)
    print(f"{status:<12} {median:<8.2f} {mean:<8.2f} {std:<8.2f} {n:<10,}")

# -----------------------------------------------------------------------------
# 6. Create event indicators
# -----------------------------------------------------------------------------

status_values = finale_no_na['status']

# Binary indicator: 1 for IPO or acquisition, 0 otherwise
finale_no_na['event'] = status_values.isin(['ipo', 'acquired']).astype(int)

# Competing-risks coding with four states
event_type_codes = {
    'ipo': 1,        # IPO (successful exit)
    'acquired': 2,   # M&A (successful exit)
    'closed': 3,     # Failure (competing event)
    'operating': 0,  # Censored (still alive)
}
finale_no_na['event_type'] = status_values.map(event_type_codes)

# Three-state coding: success / failure / censored
outcome_codes = {
    'ipo': 1,        # Success
    'acquired': 1,   # Success
    'closed': 2,     # Failure
    'operating': 0,  # Censored
}
finale_no_na['outcome'] = status_values.map(outcome_codes)

# Single-exit indicators
finale_no_na['event_ipo_only'] = status_values.eq('ipo').astype(int)
finale_no_na['event_ma_only'] = status_values.eq('acquired').astype(int)

# Print summary
n_rows = len(finale_no_na)

print("\nEvent indicators created:")
print("\n1. BINARY SUCCESS (event): IPO/M&A vs Rest")
success_count = finale_no_na['event'].sum()
print(f"Success (IPO/M&A): {success_count:,} ({100*success_count/n_rows:.1f}%)")
print(f"Non-Success: {n_rows - success_count:,} ({100*(1-success_count/n_rows):.1f}%)")

print("\n2. COMPETING RISKS (event_type): IPO vs M&A vs Failure vs Censored")
for event_type, label in ((1, 'IPO'), (2, 'M&A'), (3, 'Failure'), (0, 'Censored')):
    count = finale_no_na['event_type'].eq(event_type).sum()
    pct = 100 * count / n_rows
    print(f"{label} (type={event_type}): {count:,} ({pct:.1f}%)")

print("\n3. SUCCESS vs FAILURE (outcome): Among resolved cases")
for outcome, label in ((1, 'Success'), (2, 'Failure'), (0, 'Censored')):
    count = finale_no_na['outcome'].eq(outcome).sum()
    pct = 100 * count / n_rows
    print(f"{label} ({outcome}): {count:,} ({pct:.1f}%)")

print("\n4. SENSITIVITY INDICATORS:")
for indicator_name, indicator_col in (
    ("IPO-only", 'event_ipo_only'),
    ("M&A-only", 'event_ma_only'),
):
    indicator = finale_no_na[indicator_col]
    print(f"{indicator_name}: {indicator.sum():,} ({100*indicator.mean():.1f}%)")

# -----------------------------------------------------------------------------
# 7. GENERATING VERIFICATION PLOTS
# -----------------------------------------------------------------------------

# 2x3 grid shared by the six verification panels
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# Panel 1: histogram of all durations with median and mean marked
ax1 = axes[0, 0]

plotted_durations = finale_no_na['duration_years']

ax1.hist(plotted_durations, bins=50, color='#3498db',
         alpha=0.7, edgecolor='black')

for marker_name, marker_value, marker_color in (
    ("Median", plotted_durations.median(), 'red'),
    ("Mean", plotted_durations.mean(), 'orange'),
):
    ax1.axvline(marker_value, color=marker_color,
                linestyle='--', linewidth=2,
                label=f"{marker_name}: {marker_value:.1f} yrs")

ax1.set_xlabel('Duration (years)', fontsize=11, fontweight='bold')
ax1.set_ylabel('Frequency', fontsize=11, fontweight='bold')
ax1.set_title('Overall Duration Distribution', fontsize=12, fontweight='bold')
ax1.legend(fontsize=9)
ax1.grid(axis='y', alpha=0.3)

# Panel 2: Duration by status (boxplot)
ax2 = axes[0, 1]

# One duration series per status, in display order
duration_by_status = [
    finale_no_na[finale_no_na['status']=='ipo']['duration_years'],
    finale_no_na[finale_no_na['status']=='acquired']['duration_years'],
    finale_no_na[finale_no_na['status']=='closed']['duration_years'],
    finale_no_na[finale_no_na['status']=='operating']['duration_years']
]

# The `labels=` keyword of boxplot is deprecated in recent matplotlib releases
# (renamed `tick_labels`). Setting the tick labels on the axes afterwards
# gives the same result on both old and new versions.
bp = ax2.boxplot(duration_by_status, patch_artist=True)
ax2.set_xticklabels(['IPO', 'M&A', 'Closed', 'Operating'])

# Colour each box; patch_artist=True makes the boxes fillable patches
for patch, color in zip(bp['boxes'], ['#f39c12', '#2ecc71', '#e74c3c', '#3498db']):
    patch.set_facecolor(color)
    patch.set_alpha(0.7)

ax2.set_ylabel('Duration (years)', fontsize=11, fontweight='bold')
ax2.set_title('Duration by Status', fontsize=12, fontweight='bold')
ax2.grid(axis='y', alpha=0.3)

# Panel 3: Event type distribution
ax3 = axes[0, 2]

# Counts per event code, ordered by code; codes with no rows are absent
event_counts = finale_no_na['event_type'].value_counts().sort_index()
event_labels = {0: 'Censored', 1: 'IPO', 2: 'M&A', 3: 'Failure'}

# Colours are looked up by event code rather than by bar position, so a code
# missing from the data can no longer shift the colours of the other bars.
event_colors = {0: '#3498db', 1: '#f39c12', 2: '#2ecc71', 3: '#e74c3c'}
colors_event = [event_colors[code] for code in event_counts.index]

bars = ax3.bar(range(len(event_counts)), event_counts.values,
               color=colors_event, alpha=0.8, edgecolor='black', linewidth=1.5)

# Write the count above each bar
for bar, count in zip(bars, event_counts.values):
    ax3.text(bar.get_x() + bar.get_width()/2., count,
             f'{count:,}',
             ha='center', va='bottom', fontsize=9, fontweight='bold')

ax3.set_xticks(range(len(event_counts)))
ax3.set_xticklabels([event_labels[i] for i in event_counts.index], rotation=45)
ax3.set_ylabel('Count', fontsize=11, fontweight='bold')
ax3.set_title('Event Type Distribution', fontsize=12, fontweight='bold')
ax3.grid(axis='y', alpha=0.3)

# Panel 4: duration histograms for IPO, M&A and failure events
ax4 = axes[1, 0]

event_codes = finale_no_na['event_type']
ipo_times = finale_no_na.loc[event_codes == 1, 'duration_years']
ma_times = finale_no_na.loc[event_codes == 2, 'duration_years']
failure_times = finale_no_na.loc[event_codes == 3, 'duration_years']

event_series = [ipo_times, ma_times, failure_times]
event_names = ['IPO', 'M&A', 'Failure']
event_palette = ['#f39c12', '#2ecc71', '#e74c3c']

# The panel is drawn only when all three event types have observations
if all(len(series) > 0 for series in event_series):
    ax4.hist(event_series, bins=30,
             label=event_names,
             color=event_palette,
             alpha=0.7, edgecolor='white')

    # Dashed vertical line at each group's median
    for series, series_name, series_color in zip(event_series, event_names, event_palette):
        ax4.axvline(series.median(), color=series_color, linestyle='--', linewidth=2,
                    label=f"{series_name}: {series.median():.1f} yrs")

    ax4.set_xlabel('Time to Event (years)', fontsize=11, fontweight='bold')
    ax4.set_ylabel('Frequency', fontsize=11, fontweight='bold')
    ax4.set_title('Time-to-Event: IPO vs M&A vs Failure', fontsize=12, fontweight='bold')
    ax4.legend(fontsize=9)
    ax4.grid(axis='y', alpha=0.3)

# Panel 5: duration histograms for success (outcome 1) and failure (outcome 2)
ax5 = axes[1, 1]

outcome_codes_col = finale_no_na['outcome']
success_times = finale_no_na.loc[outcome_codes_col == 1, 'duration_years']
failure_times2 = finale_no_na.loc[outcome_codes_col == 2, 'duration_years']

# The panel is drawn only when both groups have observations
if not (len(success_times) == 0 or len(failure_times2) == 0):
    ax5.hist([success_times, failure_times2], bins=30,
             label=['Success (IPO/M&A)', 'Failure'],
             color=['#2ecc71', '#e74c3c'],
             alpha=0.7, edgecolor='white')

    # Dashed vertical line at each group's median
    for group_times, group_name, group_color in (
        (success_times, "Success", '#2ecc71'),
        (failure_times2, "Failure", '#e74c3c'),
    ):
        ax5.axvline(group_times.median(), color=group_color, linestyle='--', linewidth=2,
                    label=f"{group_name}: {group_times.median():.1f} yrs")

    ax5.set_xlabel('Time to Event (years)', fontsize=11, fontweight='bold')
    ax5.set_ylabel('Frequency', fontsize=11, fontweight='bold')
    ax5.set_title('Success vs Failure Time Comparison', fontsize=12, fontweight='bold')
    ax5.legend(fontsize=9)
    ax5.grid(axis='y', alpha=0.3)

# Panel 6: IPO & M&A imputation flags
ax6 = axes[1, 2]

# Count imputed vs real exit dates for BOTH IPO and M&A
ipo_imputed = ((finale_no_na['event_ipo_only']==1) & (finale_no_na['ipo_date_imputed'])).sum()
ipo_real = finale_no_na['event_ipo_only'].sum() - ipo_imputed

ma_imputed = ((finale_no_na['event_ma_only']==1) & (finale_no_na['ma_date_imputed'])).sum()
ma_real = finale_no_na['event_ma_only'].sum() - ma_imputed

total_exits = ipo_real + ipo_imputed + ma_real + ma_imputed

if total_exits > 0:
    # Grouped bar chart: real vs imputed, one pair of bars per exit type
    x = np.arange(2)
    width = 0.35

    bars1 = ax6.bar(x - width/2, [ipo_real, ma_real], width,
                    label='Real Date', color='#2ecc71', alpha=0.8, edgecolor='black')
    bars2 = ax6.bar(x + width/2, [ipo_imputed, ma_imputed], width,
                    label='Imputed', color='#f39c12', alpha=0.8, edgecolor='black')

    # Add value labels (zero-height bars are left unlabelled)
    for bars in [bars1, bars2]:
        for bar in bars:
            height = bar.get_height()
            if height > 0:
                ax6.text(bar.get_x() + bar.get_width()/2., height,
                        f'{int(height)}',
                        ha='center', va='bottom', fontsize=9, fontweight='bold')

    ax6.set_ylabel('Count', fontsize=11, fontweight='bold')
    ax6.set_title('Exit Dates: Real vs Imputed', fontsize=12, fontweight='bold')
    ax6.set_xticks(x)
    ax6.set_xticklabels(['IPO', 'M&A'])
    ax6.legend()
    ax6.grid(axis='y', alpha=0.3)

    # State explicitly when the chart shows real dates only
    if ipo_imputed + ma_imputed == 0:
        ax6.set_xlabel('All exit dates available (no imputation needed)', fontsize=9)
else:
    # This branch runs only when the sample has no IPO and no M&A rows at
    # all. The old text claimed "no imputation needed", which described a
    # different situation from the one the condition tests.
    ax6.text(0.5, 0.5, 'No IPO or M&A exits in sample',
             ha='center', va='center', fontsize=12, transform=ax6.transAxes)
    ax6.axis('off')

plt.suptitle('Time-to-Event Verification Plots', fontsize=16, fontweight='bold', y=0.995)
plt.tight_layout()
plt.show()

# -----------------------------------------------------------------------------
# 8. Final validation
# -----------------------------------------------------------------------------

checks = []
all_passed = True

validated_durations = finale_no_na['duration_years']
validated_status = finale_no_na['status']

# Checks 1-3: a non-zero count records a failure message and fails validation
missing_duration = validated_durations.isna().sum()
negative = validated_durations.le(0).sum()
event_mismatch = (
    finale_no_na['event'].eq(1) != validated_status.isin(['ipo', 'acquired'])
).sum()

for problem_count, failure_text, success_text in (
    (missing_duration, "missing durations", "No missing durations"),
    (negative, "negative durations", "No negative durations"),
    (event_mismatch, "event indicator mismatches", "Binary event indicators consistent"),
):
    if problem_count > 0:
        checks.append(f"{problem_count} {failure_text}")
        all_passed = False
    else:
        checks.append(success_text)

# Check 4: each status must carry its expected competing-risks code
event_type_check = True
expected_event_types = {'ipo': 1, 'acquired': 2, 'closed': 3, 'operating': 0}
for status, expected_type in expected_event_types.items():
    mismatch = (
        validated_status.eq(status) & finale_no_na['event_type'].ne(expected_type)
    ).sum()
    if mismatch > 0:
        checks.append(f"{mismatch} {status} have wrong event_type")
        event_type_check = False
        all_passed = False

if event_type_check:
    checks.append("Competing risks event_type consistent")

# Check 5: durations above 40 years are reported but do not fail validation
unreasonable = validated_durations.gt(40).sum()
checks.append(
    f"{unreasonable} durations > 40 years (check if legitimate)"
    if unreasonable > 0
    else "All durations reasonable (<= 40 years)"
)

# Check 6: rows with event_type 1, 2 or 3 must have a positive duration
zero_dur_events = (
    finale_no_na['event_type'].isin([1, 2, 3]) & validated_durations.le(0)
).sum()
if zero_dur_events > 0:
    checks.append(f"{zero_dur_events} events with zero/negative duration")
    all_passed = False
else:
    checks.append("All events have positive duration")

# Check 7: report which imputation flag columns exist (informational only)
imputation_flags = [
    flag_label
    for flag_column, flag_label in (('ipo_date_imputed', "IPO"), ('ma_date_imputed', "M&A"))
    if flag_column in finale_no_na.columns
]

if imputation_flags:
    checks.append(f"Imputation flags created: {', '.join(imputation_flags)}")
else:
    checks.append("No imputation needed (all dates available)")

# Print checks
for check in checks:
    print(f"  {check}")

print("-" * 80)
if all_passed:
    print("ALL CRITICAL CHECKS PASSED!")
else:
    print("Some checks failed - review above")

# -----------------------------------------------------------------------------
# 9. DATA QUALITY SUMMARY FOR THESIS
# -----------------------------------------------------------------------------

# Recap for the thesis text: sample size, date availability, event mix and
# duration statistics. n_negative, ipo_with_date, ipo_total, ipo_missing,
# ma_with_date, ma_total, ma_missing and the median_*_years values are
# computed earlier in the notebook.

# --- Sample ------------------------------------------------------------------
negatives_shown = n_negative if n_negative > 0 else 0
negatives_share = 100*n_negative/15476 if n_negative > 0 else 0

print("\nSAMPLE CHARACTERISTICS:")
print("Initial sample (before removing negatives): 15,476")  # Hard-coded original
print(f"Removed negative durations: {negatives_shown} ({negatives_share:.2f}%)")
print(f"Final sample for survival analysis: {len(finale_no_na):,}")

# --- Exit-date availability ----------------------------------------------------
print("\nDATE QUALITY:")
ipo_available_share = 100*ipo_with_date/ipo_total if ipo_total > 0 else 0
print(f"IPO dates available: {ipo_with_date}/{ipo_total} ({ipo_available_share:.1f}%)")
if ipo_missing > 0:
    print(f"IPO dates imputed: {ipo_missing} (using median: {median_ipo_years:.2f} years)")
ma_available_share = 100*ma_with_date/ma_total if ma_total > 0 else 0
print(f"M&A dates available: {ma_with_date:,}/{ma_total} ({ma_available_share:.1f}%)")
if ma_missing > 0:
    print(f"M&A dates imputed: {ma_missing} (using median: {median_ma_years:.2f} years)")
print("Failure dates: All imputed (2013-12-31)")
print("Censored dates: All set to 2013-12-31")

# --- Event mix -----------------------------------------------------------------
success_indicator = finale_no_na['event']
print("\nEVENT DISTRIBUTION:")
print(f"Success (IPO/M&A): {success_indicator.sum():,} ({100*success_indicator.mean():.1f}%)")
for heading, type_code in (("- IPO", 1), ("- M&A", 2), ("Failure", 3), ("Censored", 0)):
    in_class = finale_no_na['event_type'] == type_code
    print(f"{heading}: {in_class.sum():,} ({100*in_class.mean():.1f}%)")

# --- Durations -----------------------------------------------------------------
duration_years_column = finale_no_na['duration_years']
print("\nDURATION SUMMARY:")
print(f"Median time-to-event: {duration_years_column.median():.2f} years")
print(f"Mean time-to-event: {duration_years_column.mean():.2f} years")
print(f"Range: {duration_years_column.min():.2f} - {duration_years_column.max():.2f} years")

# -----------------------------------------------------------------------------
# 10. Preserve date columns & imputation flags (DO NOT DROP)
# -----------------------------------------------------------------------------

# Date columns
# Report which date columns and imputation flags are carried forward, drop the
# temporary duration_days helper column, and print the final dataset summary.

# Date columns that must survive into the modeling dataset (when present).
date_cols_to_keep = ['t0', 'public_at', 'acquired_at', 'event_date']
preserved_dates = [col for col in date_cols_to_keep if col in finale_no_na.columns]

# Imputation flags (only those that actually exist in the dataframe).
flag_cols = ['ipo_date_imputed', 'ma_date_imputed']
preserved_flags = [col for col in flag_cols if col in finale_no_na.columns]

print(f"\nPreserved date columns:")
for col in preserved_dates:
    non_null = finale_no_na[col].notna().sum()
    pct = 100 * non_null / len(finale_no_na)
    print(f"{col}: {non_null:,} non-null ({pct:.1f}%)")

print(f"\nPreserved imputation flags:")
for col in preserved_flags:
    # preserved_flags already contains existing columns only, so no extra
    # membership guard is needed here.
    imputed_count = finale_no_na[col].sum()
    print(f"{col}: {imputed_count} imputed")

# Remove temporary column
if 'duration_days' in finale_no_na.columns:
    finale_no_na = finale_no_na.drop(columns=['duration_days'])
    print(f"\nRemoved temporary column: duration_days")

# -----------------------------------------------------------------------------
# FINAL SUMMARY
# -----------------------------------------------------------------------------

print(f"\nFINAL DATASET SUMMARY:")
print(f"Rows: {len(finale_no_na):,}")
print(f"Columns: {len(finale_no_na.columns)}")
print(f"Memory: {finale_no_na.memory_usage(deep=True).sum() / (1024**2):.2f} MB")

print(f"\nPRESERVED COLUMNS:")
print(f"Date columns: {len(preserved_dates)}")
for col in preserved_dates:
    print(f"- {col}")
# Report the flags that were really found instead of a hard-coded "2 (...)";
# the output is unchanged when both flag columns exist.
if preserved_flags:
    print(f"Imputation flags: {len(preserved_flags)} ({', '.join(preserved_flags)})")
else:
    print("Imputation flags: 0")


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor

# ---------------------------------------------------------------------------
# 3.1: Select numeric features for analysis
# ---------------------------------------------------------------------------

# Candidate numeric covariates for the collinearity analysis. Only those that
# exist in finale_no_na are kept, in the order listed here.
numeric_features = [
    # funding
    'funding_total_usd', 'log_fund_tot', 'funding_rounds',
    'angel', 'series_a', 'series_b', 'series_c',
    # market conditions
    'market_heat', 'ipo_count_total',
    # company activity
    'relationships', 'milestones', 'num_prodotti', 'num_acquisizioni_effettuate',
    # investor mix
    'person_financed', 'startup_financed', 'fin_org_financed',
]

# Filter available columns
columns_present = set(finale_no_na.columns)
available_features = [name for name in numeric_features if name in columns_present]

print(f"Available numeric features: {len(available_features)}")
for position, feature_name in enumerate(available_features, start=1):
    print(f"{position:2d}. {feature_name}")

# ---------------------------------------------------------------------------
# 3.2: Correlation matrix & heatmap
# ---------------------------------------------------------------------------

# Calculate correlation matrix
# Pairwise correlations between the available numeric features
# (DataFrame.corr with its default settings), rendered as an annotated heatmap.
corr_matrix = finale_no_na[available_features].corr()

fig, ax = plt.subplots(figsize=(16, 14))

# Diverging palette centred on 0 and pinned to the full [-1, 1] range.
heatmap_style = dict(
    annot=True,
    fmt='.2f',
    cmap='RdBu_r',
    center=0,
    vmin=-1,
    vmax=1,
    square=True,
    linewidths=0.5,
    cbar_kws={'label': 'Correlation Coefficient', 'shrink': 0.8},
)
sns.heatmap(corr_matrix, ax=ax, **heatmap_style)

title_text = 'Correlation Matrix - Feature Selection for Survival Modeling'
ax.set_title(title_text, fontsize=16, fontweight='bold', pad=20)

plt.xticks(rotation=45, ha='right', fontsize=10)
plt.yticks(rotation=0, fontsize=10)
plt.tight_layout()
plt.show()

print("Correlation heatmap generated")

# ---------------------------------------------------------------------------
# 3.3: Identify high correlations (|r| > 0.8)
# ---------------------------------------------------------------------------

# Extract upper triangle
# Collect every feature pair from the upper triangle of corr_matrix whose
# absolute correlation exceeds 0.8, sorted by |r| (strongest first).
from itertools import combinations

corr_columns = corr_matrix.columns
high_corr_pairs = []

# combinations(range(n), 2) yields (i, j) with i < j, i.e. the upper triangle.
for i, j in combinations(range(len(corr_columns)), 2):
    corr_val = corr_matrix.iloc[i, j]
    if abs(corr_val) > 0.8:
        high_corr_pairs.append({
            'feature_1': corr_columns[i],
            'feature_2': corr_columns[j],
            'correlation': corr_val,
        })

# Declare the columns explicitly: with an empty pair list a bare
# pd.DataFrame([]) has no 'correlation' column, so sort_values() would raise
# KeyError and the "no high correlations" branch below could never run.
high_corr_df = pd.DataFrame(
    high_corr_pairs,
    columns=['feature_1', 'feature_2', 'correlation'],
).astype({'correlation': float})

# Sort by absolute correlation
high_corr_df = high_corr_df.sort_values(
    'correlation',
    key=abs,
    ascending=False
)

if len(high_corr_df) > 0:
    print(f"\nFound {len(high_corr_df)} highly correlated pairs:")
    print(f"\n{'Feature 1':<30} {'Feature 2':<30} {'Correlation':>12}")
    print("-"*75)

    for _, row in high_corr_df.iterrows():
        print(f"{row['feature_1']:<30} {row['feature_2']:<30} {row['correlation']:>11.3f}")
else:
    print("\nNo high correlations detected (all |r| < 0.8)")

# ---------------------------------------------------------------------------
# 3.4: Feature selection decisions
# ---------------------------------------------------------------------------

# First-pass feature selection: drop the raw funding total when it is highly
# correlated with its log transform, keep the funding-stage indicators, and
# list (without dropping) any remaining pair with |r| > 0.9.
features_to_drop = []

# Decision 1: funding_total_usd vs log_fund_tot
if 'funding_total_usd' in available_features and 'log_fund_tot' in available_features:
    r = corr_matrix.loc['funding_total_usd', 'log_fund_tot']
    if abs(r) > 0.8:
        features_to_drop.append('funding_total_usd')
        print(f"\nDROP: funding_total_usd (r = {r:.3f} with log_fund_tot)")
        print(f"Reason: Keep log-transformed version for Cox PH")

# Decision 2: Check series rounds
series_vars = ['series_a', 'series_b', 'series_c']
series_available = [s for s in series_vars if s in available_features]

if len(series_available) >= 2:
    # Name only the series columns that are really present; the message used
    # to list all three even when just two were available.
    print(f"\nKEEP: {', '.join(series_available)} (separate)")
    print(f"Reason: Represent different funding stages")
    print(f"Note: Moderate correlation expected and acceptable")

# Decision 3: Check for any other pairs > 0.9 (extreme)
extreme_pairs = high_corr_df[abs(high_corr_df['correlation']) > 0.9]

if len(extreme_pairs) > 0:
    print(f"\nEXTREME CORRELATIONS (|r| > 0.9):")
    for _, row in extreme_pairs.iterrows():
        print(f"{row['feature_1']} ↔ {row['feature_2']}: r = {row['correlation']:.3f}")

        # Only a hint is printed for pairs where neither member has been
        # dropped yet; nothing is removed automatically here.
        if row['feature_1'] not in features_to_drop and row['feature_2'] not in features_to_drop:
            # Custom decision logic here
            print(f"Consider dropping one of these features")

# Final feature list
final_features = [f for f in available_features if f not in features_to_drop]

print(f"\nSelected {len(final_features)} features for modeling:")
for i, f in enumerate(final_features, 1):
    print(f"{i:2d}. {f}")

if len(features_to_drop) > 0:
    print(f"\nDropped {len(features_to_drop)} features:")
    for f in features_to_drop:
        print(f"{f}")

# ---------------------------------------------------------------------------
# 3.5: VIF (Variance Inflation Factor) Analysis
# ---------------------------------------------------------------------------

# Prepare data for VIF
# Variance inflation factors for the selected numeric features.
# NOTE(review): the matrix handed to variance_inflation_factor contains only
# the features themselves (no intercept column is added) - confirm this is the
# intended VIF definition.

# VIF requires no missing values, so gaps are zero-filled first.
X_vif = finale_no_na[final_features].fillna(0)

vif_design = X_vif.values
vif_data = pd.DataFrame({
    "Feature": X_vif.columns,
    "VIF": [
        variance_inflation_factor(vif_design, column_pos)
        for column_pos in range(X_vif.shape[1])
    ],
}).sort_values('VIF', ascending=False)

print(f"\n{'Feature':<35} {'VIF':>10} {'Status':>15}")
print("-"*65)

# Thresholds: > 10 HIGH, > 5 MODERATE, otherwise OK.
for feature_name, vif in zip(vif_data['Feature'], vif_data['VIF']):
    status = "HIGH" if vif > 10 else "MODERATE" if vif > 5 else "OK"
    print(f"{feature_name:<35} {vif:>10.2f}  {status:>15}")

high_vif_count = int((vif_data['VIF'] > 10).sum())

print("\nINTERPRETATION:")
for legend_line in (
    "VIF < 5:No multicollinearity (acceptable)",
    "VIF 5-10:Moderate multicollinearity (monitor)",
    "VIF > 10:Severe multicollinearity (consider removal)",
):
    print(legend_line)

if high_vif_count > 0:
    print(f"\n{high_vif_count} features with VIF > 10")
    print("Consider further feature selection or regularization (CoxNet)")
else:
    print("\nAll features have VIF < 10 (acceptable for Cox PH)")

# ---------------------------------------------------------------------------
# 3.6: Save final feature list
# ---------------------------------------------------------------------------

# Categorical features
# Bundle the numeric and categorical feature names for the modeling steps.
# 'market_cycle' is included only when that column exists.
categorical_features = ['macro_settore'] + (
    ['market_cycle'] if 'market_cycle' in finale_no_na.columns else []
)

n_numeric_selected = len(final_features)
n_categorical_selected = len(categorical_features)

print(f"\nNumeric features:{n_numeric_selected}")
print(f"Categorical features:{n_categorical_selected}")
print(f"TOTAL:{n_numeric_selected + n_categorical_selected}")

# Store for use in modeling
modeling_features = {
    'numeric': final_features,
    'categorical': categorical_features,
    'all': [*final_features, *categorical_features],
}

print("Ready for train-test split and modeling")

In [None]:
# =============================================================================
# SECTION 3.7: FINAL FEATURE SELECTION DECISIONS
# =============================================================================

# Second-pass decisions on top of `final_features`, using `corr_matrix` and
# `vif_data` computed in the previous cell. Produces `features_to_drop` and
# `final_features_clean`.
features_to_drop = []

# VIF lookup by feature name (one indexed Series instead of repeated filtering).
vif_by_feature = vif_data.set_index('Feature')['VIF']

# ---------------------------------------------------------------------------
# Decision 1: funding_total_usd vs log_fund_tot
# ---------------------------------------------------------------------------
# Only fires when funding_total_usd is still part of final_features.

print("\nDECISION 1: funding_total_usd vs log_fund_tot")

if 'funding_total_usd' in final_features and 'log_fund_tot' in final_features:
    r = corr_matrix.loc['funding_total_usd', 'log_fund_tot']
    print(f"Correlation: r = {r:.3f}")
    print(f"->DROP: funding_total_usd")
    print(f"->KEEP: log_fund_tot (better for Cox PH)")

    features_to_drop.append('funding_total_usd')

# ---------------------------------------------------------------------------
# Decision 2: market_heat vs ipo_count_total
# ---------------------------------------------------------------------------
# ipo_count_total is dropped whenever both columns are present, whatever r is.

print("\nDECISION 2: market_heat vs ipo_count_total")

if 'market_heat' in final_features and 'ipo_count_total' in final_features:
    r = corr_matrix.loc['market_heat', 'ipo_count_total']
    # Tag the correlation as perfect only when it really rounds to +/-1.000;
    # the tag used to be printed unconditionally and fused to the number.
    perfect_tag = " PERFECT!" if abs(r) >= 0.9995 else ""
    print(f"Correlation: r = {r:.3f}{perfect_tag}")

    features_to_drop.append('ipo_count_total')

# ---------------------------------------------------------------------------
# Decision 3: Check log_fund_tot VIF
# ---------------------------------------------------------------------------
# The heading shows the computed VIF instead of a hard-coded 7.85, and a
# missing feature no longer raises IndexError.

vif_log_fund = vif_by_feature.get('log_fund_tot')

if vif_log_fund is None:
    print("\nDECISION 3: log_fund_tot (VIF not available)")
else:
    print(f"\nDECISION 3: log_fund_tot (VIF = {vif_log_fund:.2f})")

    if 5 < vif_log_fund < 10:
        print(f"VIF: {vif_log_fund:.2f} (moderate multicollinearity)")
        print(f"KEEP: Acceptable for Cox PH (threshold = 10)")
        print(f"Will use regularization in CoxNet if needed")

# ---------------------------------------------------------------------------
# Update final feature list
# ---------------------------------------------------------------------------

# Remove problematic features
final_features_clean = [f for f in final_features if f not in features_to_drop]

print(f"\nREMOVED {len(features_to_drop)} features:")
for f in features_to_drop:
    print(f"{f}")

print(f"\nFINAL {len(final_features_clean)} numeric features for modeling:")
for i, f in enumerate(final_features_clean, 1):
    # VIFs shown here are still the pre-removal values from vif_data.
    vif_val = vif_by_feature[f]

    if vif_val > 10:
        status = "HIGH"
    elif vif_val > 5:
        status = "MOD"
    else:
        status = "OK"

    print(f"{i:2d}. {f:<35s} VIF={vif_val:>6.2f} {status}")

# ---------------------------------------------------------------------------
# Verify VIF after removal
# ---------------------------------------------------------------------------

# Recalculate VIF with cleaned features
# Recompute VIFs on the reduced feature set to verify the effect of removal.
# Missing values are zero-filled before the computation.
X_vif_clean = finale_no_na[final_features_clean].fillna(0)

vif_design_clean = X_vif_clean.values
vif_data_clean = pd.DataFrame({
    "Feature": X_vif_clean.columns,
    "VIF": [
        variance_inflation_factor(vif_design_clean, column_pos)
        for column_pos in range(X_vif_clean.shape[1])
    ],
}).sort_values('VIF', ascending=False)

print(f"\n{'Feature':<35} {'VIF':>10} {'Status':>15}")
print("-"*65)

# Thresholds: > 10 HIGH, > 5 MODERATE, otherwise OK.
for feature_name, vif in zip(vif_data_clean['Feature'], vif_data_clean['VIF']):
    status = "HIGH" if vif > 10 else "MODERATE" if vif > 5 else "OK"
    print(f"{feature_name:<35} {vif:>10.2f}  {status:>15}")

high_vif_count = int((vif_data_clean['VIF'] > 10).sum())

if high_vif_count == 0:
    print("\nALL FEATURES HAVE VIF < 10!")
    print("Dataset ready for Cox PH modeling")
else:
    print(f"\n{high_vif_count} features still have VIF > 10")
    print("Consider CoxNet with regularization")

# ---------------------------------------------------------------------------
# Save final feature lists
# ---------------------------------------------------------------------------

# Complete feature set
# Final feature bundle (cleaned numeric + categorical) and a short report of
# how many distinct levels each categorical feature has.
modeling_features_final = {
    'numeric': final_features_clean,
    'categorical': categorical_features,
    'all': [*final_features_clean, *categorical_features],
}

print(f"\nNumeric features:{len(modeling_features_final['numeric'])}")
print(f"Categorical features:{len(modeling_features_final['categorical'])}")
print(f"TOTAL:{len(modeling_features_final['all'])}")

print("\nCategorical features:")
for cat in categorical_features:
    level_values = finale_no_na[cat]
    n_levels = level_values.nunique()
    print(f"{cat}: {n_levels} levels")

# =============================================================================
# APPLY FEATURE REMOVAL TO DATAFRAME
# =============================================================================

# Physically remove the collinear columns from the dataframe. The list is
# fixed here rather than derived from features_to_drop; columns that are
# already gone are reported and skipped.
columns_to_drop_from_df = ['funding_total_usd', 'ipo_count_total']

print(f"\nColumns to drop: {columns_to_drop_from_df}")
print(f"Shape before: {finale_no_na.shape}")

# Report per column first, then drop all existing ones in a single call.
for col in columns_to_drop_from_df:
    message = "Dropped" if col in finale_no_na.columns else "Not found (already removed)"
    print(f"{message}: {col}")

finale_no_na = finale_no_na.drop(columns=columns_to_drop_from_df, errors='ignore')

print(f"Shape after: {finale_no_na.shape}")
print("\nCollinear features removed from dataframe")

In [None]:
# =============================================================================
# SECTION 10: CLASS IMBALANCE ANALYSIS
# =============================================================================

# ---------------------------------------------------------------------------
# Recalculate status distribution
# ---------------------------------------------------------------------------

# Class imbalance report on the `status` column: majority/minority ratio,
# IPO-vs-rest ratio, a severity label, and the class sizes to expect in a
# 20% test split.
status_dist = finale_no_na['status'].value_counts()
status_pct = 100 * finale_no_na['status'].value_counts(normalize=True)

# ---------------------------------------------------------------------------
# 10.1: Imbalance ratios
# ---------------------------------------------------------------------------
majority_class = status_dist.max()
minority_class = status_dist.min()
imbalance_ratio = majority_class / minority_class

# Derive the class names from the data instead of assuming that the largest
# class is 'operating' and the smallest is 'ipo'; the printed text is
# unchanged whenever that assumption holds.
status_display = {'operating': 'Operating', 'ipo': 'IPO',
                  'acquired': 'Acquired', 'closed': 'Closed'}
majority_status = status_dist.idxmax()
minority_status = status_dist.idxmin()
majority_label = status_display.get(majority_status, majority_status)
minority_label = status_display.get(minority_status, minority_status)

print(f"\nMajority class ({majority_label}): {majority_class:,} ({status_pct[majority_status]:.1f}%)")
print(f"Minority class ({minority_label}):{minority_class:,} ({status_pct[minority_status]:.1f}%)")
print(f"\nImbalance ratio: {imbalance_ratio:.1f}:1")

# IPO vs rest
ipo_count = (finale_no_na['status'] == 'ipo').sum()
non_ipo_count = len(finale_no_na) - ipo_count
ipo_ratio = non_ipo_count / ipo_count

print(f"\nIPO vs Non-IPO:")
print(f"IPO:{ipo_count:,} ({100*ipo_count/len(finale_no_na):.2f}%)")
print(f"Non-IPO:{non_ipo_count:,} ({100*non_ipo_count/len(finale_no_na):.2f}%)")
print(f"Ratio:{ipo_ratio:.1f}:1")

# ---------------------------------------------------------------------------
# 10.2: Imbalance implications
# ---------------------------------------------------------------------------

if imbalance_ratio > 100:
    severity = "SEVERE"
elif imbalance_ratio > 50:
    severity = "HIGH"
elif imbalance_ratio > 10:
    severity = "MODERATE"
else:
    severity = "MILD"

print(f"\nImbalance severity: {severity}")

print(f"""CLASS IMBALANCE CONSIDERATIONS: Binary Classification (if used): -> IPO prediction: {ipo_ratio:.1f}:1 imbalance""")

# ---------------------------------------------------------------------------
# 10.3: Stratification recommendation
# ---------------------------------------------------------------------------
# NOTE(review): the printed example hard-codes random_state=42 and stratifies
# on y['event']; confirm both match what the modeling notebook really uses.
# .get(..., 0) keeps the report working when a status is absent from the data.

print(f"""
Example code:
  from sklearn.model_selection import train_test_split
  
  X_train, X_test, y_train, y_test = train_test_split(
      X, y,
      test_size=0.2,
      random_state=42,
      stratify=y['event']  # Stratify on IPO event
  )

Expected distribution in test set:
  IPO:       ~{int(ipo_count * 0.2):,} ({status_pct.get('ipo', 0):.1f}%)
  Acquired:  ~{int(status_dist.get('acquired', 0) * 0.2):,} ({status_pct.get('acquired', 0):.1f}%)
  Closed:    ~{int(status_dist.get('closed', 0) * 0.2):,} ({status_pct.get('closed', 0):.1f}%)
  Operating: ~{int(status_dist.get('operating', 0) * 0.2):,} ({status_pct.get('operating', 0):.1f}%)
""")

print("\nClass Imbalance Analysis")

In [None]:
# =============================================================================
# CRITICAL FIX: IPO IMPUTATION BIAS REMOVAL
# =============================================================================
# ---------------------------------------------------------------------------
# 1. Document the problem
# ---------------------------------------------------------------------------

# Quantify the distortion introduced by median-imputed IPO dates: compare the
# median duration of IPOs with real vs imputed dates, then test whether Q1 of
# all IPO durations coincides with the median.
print("\nPROBLEM IDENTIFIED:")
print("Initial median imputation created artificial clustering:")

is_ipo_row = finale_no_na['event_type'] == 1
has_imputed_ipo_date = finale_no_na['ipo_date_imputed']

# Split IPO rows by whether their date was imputed
ipo_real = finale_no_na[is_ipo_row & ~has_imputed_ipo_date]
ipo_imputed = finale_no_na[is_ipo_row & has_imputed_ipo_date]

ipo_real_median = ipo_real['duration_years'].median()
ipo_imputed_median = ipo_imputed['duration_years'].median()

# Bias is expressed relative to the real-date median
bias_years = ipo_real_median - ipo_imputed_median
bias_pct = 100 * bias_years / ipo_real_median

print(f"- Real IPO median:{ipo_real_median:.2f} years (N={len(ipo_real):,})")
print(f"- Imputed IPO median:{ipo_imputed_median:.2f} years (N={len(ipo_imputed):,})")
print(f"- Bias:{bias_years:.2f} years ({bias_pct:.1f}%)")

# Q1 equal to the median (within 0.01 years) is treated as a clustering flag
ipo_all = finale_no_na.loc[is_ipo_row, 'duration_years']
q1 = ipo_all.quantile(0.25)
median = ipo_all.median()

print("\nDistribution check:")
print(f"- IPO Q1:{q1:.2f} years")
print(f"- IPO Median:{median:.2f} years")

if abs(q1 - median) < 0.01:
    print("Q1 = Median -> Artificial clustering detected!")
    print(f"({len(ipo_imputed):,} IPO have IDENTICAL duration = {ipo_imputed_median:.2f} years)")
else:
    print("Q1 ≠ Median --> Natural distribution")

# ---------------------------------------------------------------------------
# 2. Solution: Remove IPO with imputed dates
# ---------------------------------------------------------------------------

# Count impact
# Size the planned removal before applying it.
n_before = len(finale_no_na)
ipo_rows_mask = finale_no_na['event_type'] == 1
n_ipo_total = ipo_rows_mask.sum()
# Count imputed flags on IPO rows only, so the number matches exactly what
# the removal filter drops (event_type == 1 AND ipo_date_imputed). The count
# used to sum the flag over every row regardless of event type.
n_ipo_imputed = (ipo_rows_mask & finale_no_na['ipo_date_imputed']).sum()
n_ipo_real = n_ipo_total - n_ipo_imputed

print(f"IPO to remove:{n_ipo_imputed:,} (imputed dates)")
print(f"IPO to keep:{n_ipo_real:,} (real dates)")
print(f"Impact on total:{100*n_ipo_imputed/n_before:.2f}% of sample")

print(f"\nRationale:")
print(f"- {bias_pct:.1f}% bias is UNACCEPTABLE for survival analysis")
print(f"- Creates false clustering (Q1 = Median)")
print(f"- Sample size N={n_ipo_real} exceeds literature benchmarks")
print(f"(e.g., Kaplan & Strömberg 2003: N=88 IPO)")

# ---------------------------------------------------------------------------
# 3. Apply fix
# ---------------------------------------------------------------------------

# Drop every IPO row whose date was imputed; all other rows are kept.
print("\nAPPLYING FIX")

imputed_ipo_rows = (finale_no_na['event_type'] == 1) & finale_no_na['ipo_date_imputed']
finale_no_na = finale_no_na[~imputed_ipo_rows].copy()

# Bookkeeping for the before/after report
n_after = len(finale_no_na)
n_removed = n_before - n_after
n_ipo_final = (finale_no_na['event_type'] == 1).sum()

print("\nFIX APPLIED:")
print(f"Before:  {n_before:,} startups")
print(f"After:   {n_after:,} startups")
print(f"Removed: {n_removed:,} observations ({100*n_removed/n_before:.2f}%)")

print(f"\nBefore:  {n_ipo_total:,} IPO (including {n_ipo_imputed:,} imputed)")
print(f"After:   {n_ipo_final:,} IPO (real dates only)")

# ---------------------------------------------------------------------------
# 4. Verify fix worked
# ---------------------------------------------------------------------------

# New statistics
# Re-derive the IPO duration statistics after the removal and confirm that
# (a) Q1 and the median are more than 0.5 years apart and
# (b) no IPO row is still flagged as imputed.
remaining_ipo_rows = finale_no_na['event_type'] == 1
ipo_new = finale_no_na.loc[remaining_ipo_rows, 'duration_years']

ipo_new_q1 = ipo_new.quantile(0.25)
ipo_new_median = ipo_new.median()
ipo_new_q3 = ipo_new.quantile(0.75)
ipo_new_mean = ipo_new.mean()
ipo_new_std = ipo_new.std()

print("\nNEW IPO STATISTICS (CORRECTED):")
print(f"N:{n_ipo_final:,}")
for stat_name, stat_value in (
    ("Min", ipo_new.min()),
    ("Q1", ipo_new_q1),
    ("Median", ipo_new_median),
    ("Q3", ipo_new_q3),
    ("Max", ipo_new.max()),
    ("Mean", ipo_new_mean),
    ("Std", ipo_new_std),
):
    print(f"{stat_name}:{stat_value:.2f} years")

# Check Q1 ≠ Median
q1_median_diff = abs(ipo_new_q1 - ipo_new_median)

if q1_median_diff > 0.5:
    print(f"\nQ1 ≠ Median (diff = {q1_median_diff:.2f} years)")
    print("Natural distribution restored!")
else:
    print(f"\nQ1 ≈ Median (diff = {q1_median_diff:.2f} years)")
    print("Check for remaining issues")

# Check no imputed flags remain for IPO
remaining_imputed = finale_no_na[remaining_ipo_rows & finale_no_na['ipo_date_imputed']]

if len(remaining_imputed) == 0:
    print(f"\nAll {n_ipo_final:,} IPO have real dates (0 imputed)")
else:
    print(f"\nWARNING: {len(remaining_imputed):,} IPO still flagged as imputed")

# ---------------------------------------------------------------------------
# 5. Compare with literature
# ---------------------------------------------------------------------------

# Sanity-check the IPO median against the 5-7 year range quoted from the
# literature, after shifting it by a fixed 2.5 years (founding -> first funding).
print("\nCOMPARISON WITH LITERATURE:")

print(f"My IPO median (from first funding): {ipo_new_median:.2f} years")
print("Literature median (from founding):5-7 years")
print("Expected difference:2-4 years (founding → first funding)")

# Adjust to founding
adjusted_to_founding = ipo_new_median + 2.5  # conservative estimate
print(f"My adjusted (+ 2.5yr to founding):  {adjusted_to_founding:.2f} years")

# Widest band first: outside 4-8 -> warn; inside 5-7 -> perfect; else -> well aligned.
if not (4 <= adjusted_to_founding <= 8):
    alignment_message = "Check alignment (expected: 5-7 years)"
elif 5 <= adjusted_to_founding <= 7:
    alignment_message = "PERFECTLY ALIGNED with literature!"
else:
    alignment_message = "WELL ALIGNED with literature (within expected range)"
print(alignment_message)

# ---------------------------------------------------------------------------
# 6. Updated event distribution
# ---------------------------------------------------------------------------

# Event mix after the IPO removal (counts and share of the remaining sample).
remaining_rows = len(finale_no_na)
event_type_column = finale_no_na['event_type']

success_count = finale_no_na['event'].sum()
# Codes: 1 = IPO, 2 = M&A, 3 = failure, 0 = censored
ipo_count, ma_count, fail_count, cens_count = (
    (event_type_column == type_code).sum() for type_code in (1, 2, 3, 0)
)

print(f"Success (IPO+M&A):{success_count:,} ({100*success_count/remaining_rows:.1f}%)")
print(f"- IPO:{ipo_count:,} ({100*ipo_count/remaining_rows:.1f}%) [real dates only]")
print(f"- M&A:{ma_count:,} ({100*ma_count/remaining_rows:.1f}%)")
print(f"Failure:{fail_count:,} ({100*fail_count/remaining_rows:.1f}%)")
print(f"Censored: {cens_count:,} ({100*cens_count/remaining_rows:.1f}%)")

# =============================================================================
# VERIFICATION AFTER IPO FIX
# =============================================================================

# Bucket the remaining IPO durations and flag any bin holding more than 40% of
# them. Bins end at 15 years; pd.cut leaves values outside the edges unbinned.
ipo_data = finale_no_na.loc[finale_no_na['event_type'] == 1, 'duration_years']
n_ipo_durations = len(ipo_data)

bins = [0, 0.5, 1, 2, 3, 4, 5, 7, 10, 15]
labels = ['<6mo', '6mo-1yr', '1-2yr', '2-3yr', '3-4yr', '4-5yr', '5-7yr', '7-10yr', '10-15yr']
ipo_binned = pd.cut(ipo_data, bins=bins, labels=labels)

print(f"\n{'Bin':<12} {'Count':>8} {'%':>8}")
print("-" * 30)

for label in labels:
    count = (ipo_binned == label).sum()
    pct = 100 * count / n_ipo_durations if n_ipo_durations > 0 else 0
    print(f"{label:<12} {count:>8} {pct:>7.1f}%")

# Check for artificial peaks
bin_sizes = ipo_binned.value_counts()
max_bin = bin_sizes.idxmax()
max_pct = 100 * bin_sizes.max() / n_ipo_durations

print(f"\nPeak bin: {max_bin} ({max_pct:.1f}%)")

if max_pct > 40:
    print("Dominant peak > 40% (check if natural)")
else:
    print("No artificial peaks (largest bin < 40%)")

# 2. Compare before/after distributions

# Side-by-side IPO duration statistics before and after the removal.
# The "Q1 = Median?" row is now derived from the data with the same 0.01-year
# tolerance used by the clustering check, instead of always printing
# YES / NO regardless of the computed quartiles.
clustered_before = abs(q1 - median) < 0.01
clustered_after = abs(ipo_new_q1 - ipo_new_median) < 0.01
before_verdict = 'YES (PROBLEM!)' if clustered_before else 'NO (GOOD!)'
after_verdict = 'YES (PROBLEM!)' if clustered_after else 'NO (GOOD!)'

print(f"\n{'Metric':<25} {'Before (Biased)':>18} {'After (Fixed)':>18}")
print("-" * 65)
print(f"{'Sample size':<25} {n_ipo_total:>18,} {n_ipo_final:>18,}")
print(f"{'Q1 (years)':<25} {q1:>18.2f} {ipo_new_q1:>18.2f}")
print(f"{'Median (years)':<25} {median:>18.2f} {ipo_new_median:>18.2f}")
print(f"{'Mean (years)':<25} {ipo_all.mean():>18.2f} {ipo_new_mean:>18.2f}")
print(f"{'Q1 = Median?':<25} {before_verdict:>18} {after_verdict:>18}")

# 3. Verification plot
# Two-panel figure for the corrected IPO durations:
# left = histogram with median / Q1 / Q3 markers, right = counts per duration bin.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Panel 1: Distribution after fix
ax1 = axes[0]
ipo_data.hist(bins=30, ax=ax1, color='#f39c12', alpha=0.7, edgecolor='black')
ax1.axvline(ipo_new_median, color='red', linestyle='--', linewidth=2,
            label=f'Median: {ipo_new_median:.2f} yrs')
ax1.axvline(ipo_new_q1, color='blue', linestyle=':', linewidth=2,
            label=f'Q1: {ipo_new_q1:.2f} yrs')
ax1.axvline(ipo_new_q3, color='blue', linestyle=':', linewidth=2,
            label=f'Q3: {ipo_new_q3:.2f} yrs')
ax1.set_xlabel('Duration (years)', fontsize=11, fontweight='bold')
ax1.set_ylabel('Frequency', fontsize=11, fontweight='bold')
ax1.set_title(f'IPO Duration (N={len(ipo_data):,}, Real Dates Only)',
              fontsize=12, fontweight='bold')
ax1.legend()
ax1.grid(axis='y', alpha=0.3)

# Panel 2: Distribution by bins
ax2 = axes[1]
bin_counts = ipo_binned.value_counts().sort_index()
colors_bins = plt.cm.viridis(np.linspace(0, 1, len(bin_counts)))

bars = ax2.bar(range(len(bin_counts)), bin_counts.values,
               color=colors_bins, alpha=0.8, edgecolor='black')

# Label each bar with its count. The loop previously also carried an unused
# enumerate index and an unused percentage computation; both are removed and
# the rendered label text is unchanged.
for bar, count in zip(bars, bin_counts.values):
    ax2.text(bar.get_x() + bar.get_width()/2., count,
             f'{count}\n',
             ha='center', va='bottom', fontsize=9, fontweight='bold')

ax2.set_xticks(range(len(bin_counts)))
ax2.set_xticklabels(bin_counts.index, rotation=45, ha='right')
ax2.set_ylabel('Count', fontsize=11, fontweight='bold')
ax2.set_title('IPO Duration by Bins', fontsize=12, fontweight='bold')
ax2.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

# =============================================================================
# UPDATE SECTION 9: DATA QUALITY SUMMARY
# =============================================================================

# Updated data-quality summary after removing IPOs with imputed dates.
# The initial sample size (15,476) and the number of negative durations
# removed earlier (81) are fixed figures carried over from previous steps.
negatives_removed = 81
initial_sample_n = n_before + negatives_removed

print(f"\nSAMPLE CHARACTERISTICS:")
print(f"Initial sample:15,476")
print(f"Removed (negative durations):81 (0.52%)")
print(f"Removed (IPO imputed dates):{n_removed:,} ({100*n_removed/initial_sample_n:.2f}%)")
print(f"Total exclusions:{negatives_removed + n_removed:,} ({100*(negatives_removed+n_removed)/initial_sample_n:.2f}%)")
print(f"Final sample (for survival analysis):{n_after:,}")

# Real M&A dates = M&A events minus those flagged as imputed. The line used to
# print ma_count/ma_count next to a hard-coded "99.9%", which contradicted itself.
if 'ma_date_imputed' in finale_no_na.columns:
    ma_imputed_count = (
        (finale_no_na['event_type'] == 2) & finale_no_na['ma_date_imputed']
    ).sum()
else:
    ma_imputed_count = 0
ma_real_count = ma_count - ma_imputed_count
ma_real_pct = 100 * ma_real_count / ma_count if ma_count > 0 else 0

print(f"\nDATE QUALITY:")
print(f"IPO dates - real:{n_ipo_final:,}/{n_ipo_total:,} ({100*n_ipo_final/n_ipo_total:.1f}%)")
print(f"IPO dates - excluded:{n_removed:,} (imputed, {bias_pct:.1f}% bias)")
print(f"M&A dates - real:{ma_real_count:,}/{ma_count:,} ({ma_real_pct:.1f}%)")
print(f"Failure dates:All imputed (2013-12-31)")
print(f"Censored dates:All set to 2013-12-31")

print(f"\nEVENT DISTRIBUTION (FINAL):")
print(f"Success (IPO+M&A):{success_count:,} ({100*success_count/len(finale_no_na):.1f}%)")
print(f"- IPO:{ipo_count:,} ({100*ipo_count/len(finale_no_na):.1f}%)")
print(f"- M&A:{ma_count:,} ({100*ma_count/len(finale_no_na):.1f}%)")
print(f"Failure:{fail_count:,} ({100*fail_count/len(finale_no_na):.1f}%)")
print(f"Censored:{cens_count:,} ({100*cens_count/len(finale_no_na):.1f}%)")

print(f"\nDURATION SUMMARY:")
print(f"Overall median:{finale_no_na['duration_years'].median():.2f} years")
print(f"IPO median (real):{ipo_new_median:.2f} years")
print(f"M&A median:{finale_no_na[finale_no_na['event_type']==2]['duration_years'].median():.2f} years")
print(f"Failure median:{finale_no_na[finale_no_na['event_type']==3]['duration_years'].median():.2f} years")

print(f"\nDATA QUALITY DECISIONS:")
print(f"1. Negative durations:Removed (81 obs, data quality issues)")
print(f"2. IPO imputed dates:Removed ({n_removed} obs, {bias_pct:.1f}% bias)")
print(f"3. Quality over quantity:Prioritized unbiased estimates")
print(f"4. IPO sample (N={n_ipo_final}):Exceeds literature benchmarks")

In [None]:
# =============================================================================
# FINAL VALIDATION
# =============================================================================
# Each check below prints its findings and stores a boolean (<name>_ok) that
# the final verdict at the end of this cell aggregates.

# Check 1: Shape - the row count must match the expected final sample exactly.
print("\n1. SHAPE CHECK:")
print(f"Current:  {finale_no_na.shape}")
shape_ok = finale_no_na.shape[0] == 15305
# Fixed: the success label used to read 'ASS' (typo for 'PASS').
print(f"Status:{'PASS' if shape_ok else 'FAIL'}")

# Check 2: Event distribution - every event_type code must land within a small
# tolerance of its reference count.
print("\n2. EVENT DISTRIBUTION:")
event_counts = finale_no_na['event_type'].value_counts().sort_index()
print(event_counts)

# event_type code -> (reference count, exclusive tolerance); a code absent
# from the data is treated as a count of 0.
_reference_counts = {
    0: (12796, 10),
    1: (126, 5),
    2: (1513, 10),
    3: (870, 10),
}
events_ok = all(
    abs(event_counts.get(code, 0) - target) < tolerance
    for code, (target, tolerance) in _reference_counts.items()
)
print(f"Status:{'PASS' if events_ok else 'FAIL'}")

# Check 3: IPO statistics - the duration distribution of IPO rows
# (event_type == 1) must show Q1 clearly below a median of roughly 4.9 years.
print("\n3. IPO STATISTICS:")
ipo_dur = finale_no_na.loc[finale_no_na['event_type'] == 1, 'duration_years']
ipo_q1, ipo_median, ipo_mean = ipo_dur.quantile(0.25), ipo_dur.median(), ipo_dur.mean()

print(f"N:{len(ipo_dur)}")
print(f"Q1:{ipo_q1:.2f} years (expected ~2.47)")
print(f"Median:{ipo_median:.2f} years (expected ~4.90)")
print(f"Mean:{ipo_mean:.2f} years (expected ~4.92)")

# The three conditions are named once and reused for both verdict and report.
_q1_below_median = ipo_q1 < ipo_median                     # no clustering at one value
_median_in_range = 4.5 < ipo_median < 5.5                  # median should be ~4.9
_quartiles_separated = abs(ipo_q1 - ipo_median) > 1.0      # sufficient separation
ipo_ok = _q1_below_median and _median_in_range and _quartiles_separated

print(f"\nQ1 < Median?{_q1_below_median} (CRITICAL)")
print(f"Median realistic? {_median_in_range}")
print(f"No clustering? {_quartiles_separated}")
print(f"Status:{'PASS' if ipo_ok else 'FAIL - IPO FIX NOT APPLIED'}")

# Check 4: No imputed IPO remain - count IPO rows (event_type == 1) whose
# 'ipo_date_imputed' flag is set; the count must be zero.
print("\n4. IMPUTED IPO CHECK:")
_is_ipo_row = finale_no_na['event_type'] == 1
# NOTE(review): the flag column is used directly as a mask - presumably
# boolean dtype; confirm.
_imputed_ipo_mask = _is_ipo_row & finale_no_na['ipo_date_imputed']
imputed_ipo = len(finale_no_na[_imputed_ipo_mask])
print(f"Imputed IPO count: {imputed_ipo}")
print("Expected:0")
imputed_ok = imputed_ipo == 0
print(f"Status:{'PASS' if imputed_ok else 'FAIL - IMPUTED IPO STILL PRESENT'}")

# Check 5: No negative durations - zero-length durations are counted as well,
# because the comparison is "<= 0".
print("\n5. NEGATIVE DURATIONS CHECK:")
neg_dur = finale_no_na['duration_years'].le(0).sum()
print(f"Negative count: {neg_dur}")
print("Expected:0")
neg_ok = neg_dur == 0
print(f"Status:{'PASS' if neg_ok else ' FAIL - NEGATIVE DURATIONS PRESENT'}")

# Check 6: No NaN in duration
print("\n6. MISSING DURATIONS CHECK:")
nan_dur = finale_no_na['duration_years'].isna().sum()
print(f"NaN count: {nan_dur}")
print("Expected:0")
nan_ok = nan_dur == 0
print(f"Status:{'PASS' if nan_ok else 'FAIL - MISSING DURATIONS'}")

# Check 7: Required columns present - list every expected column that is
# absent from the frame (reported in the order given below).
print("\n7. REQUIRED COLUMNS CHECK:")
required_cols = [
    'duration_years', 'event', 'event_type', 'outcome',
    'event_ipo_only', 'event_ma_only',
    'ipo_date_imputed', 'ma_date_imputed',
    't0', 'event_date',
]
_available_cols = set(finale_no_na.columns)
missing_cols = [col for col in required_cols if col not in _available_cols]
print(f"Missing columns: {missing_cols or 'None'}")
cols_ok = not missing_cols
print(f"Status:{'PASS' if cols_ok else 'FAIL - MISSING COLUMNS'}")

# Check 8: Date columns non-null - both t0 and event_date must be fully
# populated. This raises KeyError if either column is missing.
print("\n8. DATE COLUMNS CHECK:")
_date_nulls = finale_no_na[['t0', 'event_date']].isna().sum()
t0_null = _date_nulls['t0']
event_date_null = _date_nulls['event_date']
print(f"t0 null count:{t0_null} (expected 0)")
print(f"event_date null count: {event_date_null} (expected 0)")
dates_ok = not (t0_null != 0 or event_date_null != 0)
print(f"Status:{'PASS' if dates_ok else 'FAIL - NULL DATES'}")

# Check 9: Duration distribution - durations must be strictly positive and
# stay below 35 years; the median is printed for information only.
print("\n9. DURATION DISTRIBUTION CHECK:")
dur_stats = finale_no_na['duration_years'].describe()
_dur_min, _dur_median, _dur_max = dur_stats['min'], dur_stats['50%'], dur_stats['max']
print(f"Min:{_dur_min:.2f} years")
print(f"Median:{_dur_median:.2f} years (expected ~3.0)")
print(f"Max:{_dur_max:.2f} years")
dur_ok = 0 < _dur_min and _dur_max < 35
print(f"Status:{'PASS' if dur_ok else 'FAIL - DURATION RANGE ISSUES'}")

# FINAL VERDICT
# Collect the nine check outcomes, print a per-check summary, then either a
# global success line or the list of checks that still need fixing.

all_checks = [
    ("Shape", shape_ok),
    ("Event distribution", events_ok),
    ("IPO statistics", ipo_ok),
    ("No imputed IPO", imputed_ok),
    ("No negative durations", neg_ok),
    ("No missing durations", nan_ok),
    ("Required columns", cols_ok),
    ("Date columns", dates_ok),
    ("Duration range", dur_ok),
]

total = len(all_checks)
_failed_names = [name for name, ok in all_checks if not ok]
passed = total - len(_failed_names)

print(f"\nChecks passed:{passed}/{total}\n")

for check_name, status in all_checks:
    print(f"{'OK' if status else 'NOT OK'} {check_name}")

if not _failed_names:
    print("ALL CHECKS PASSED")
else:
    print("SOME CHECKS FAILED - DO NOT PROCEED")
    print("\nAction required:")
    for check_name in _failed_names:
        print(f"Fix: {check_name}")
    print("\nReview the output above and fix failed checks")


### Duration Calculation

Time-to-event was calculated from first external funding (t0) to the event
date (the exit date, or 2013-12-31 for censored companies).
The resulting durations ranged from 6 days to 30 years (median: 3.0 years).

The minimum duration (6 days) represents one company (0.007% of sample) 
that received $26M in bridge financing immediately preceding a 
pre-arranged acquisition. Sensitivity analysis confirmed this case 
had negligible impact on results (median unchanged, mean difference 
< 0.01%).


In [None]:
# =============================================================================
# FINAL SAVE - COMPLETE DATASET WITH ALL FEATURES
# =============================================================================

# Remove extreme outlier (Google - 1046 relationships): keep only rows with at
# most 1000 relationships.
# NOTE(review): rows whose 'relationships' is NaN also fail this comparison and
# are dropped - confirm none are expected at this stage.
_within_limit = finale_no_na['relationships'].le(1000)
finale_no_na = finale_no_na.loc[_within_limit].copy()
print(f"Removed Google outlier. New shape: {finale_no_na.shape}")

# Save complete dataset with all derived features (output folder is created
# on demand).
final_output = config.FINAL_PATH / 'finale_usa_cleaned.csv'
final_output.parent.mkdir(parents=True, exist_ok=True)
finale_no_na.to_csv(final_output, index=False)

_n_rows, _n_cols = finale_no_na.shape
print(f"\nDATASET SAVED:{final_output}")
print(f"Rows:{_n_rows:,}")
print(f"Columns:{_n_cols}")

# Verify key derived features are present: split the expected names into
# those found in the saved frame and those missing, then report both.
key_features = [
    'macro_settore', 'relationships_category', 'rounds_category',
    'products_category', 'funding_quartile', 'market_heat_quartile',
]
_saved_columns = set(finale_no_na.columns)
present = [name for name in key_features if name in _saved_columns]
missing = [name for name in key_features if name not in _saved_columns]

print(f"\nDerived Features ({len(present)}/{len(key_features)}):")
for name in present:
    print(name)

if not missing:
    print("\nALL KEY FEATURES PRESENT")
else:
    print("\nMissing:")
    for name in missing:
        print(name)

In [None]:
# =============================================================================
# FINAL VERIFICATION CHECKLIST
# =============================================================================
# Each section records a (name, passed) pair in all_checks; the verdict at the
# end of the cell summarises them.

# 1. Shape check - row count must be within 100 rows of the expected size.
expected_rows = 15305
_row_gap = abs(len(finale_no_na) - expected_rows)
shape_ok = _row_gap < 100
all_checks = [("Shape", shape_ok)]
print(f"\n1. SHAPE: {finale_no_na.shape}")
print(f"Expected: ~{expected_rows:,} rows")
print(f"Status: {'PASS' if shape_ok else 'FAIL'}")

# 2. Survival columns present - each must exist and hold no NaN.
survival_cols = ['duration_years', 'event', 'event_type']
# NaN count for each survival column that exists; absent columns get no entry.
_nan_by_col = {
    col: finale_no_na[col].isna().sum()
    for col in survival_cols
    if col in finale_no_na.columns
}
surv_ok = len(_nan_by_col) == len(survival_cols)
surv_na = sum(_nan_by_col.values())
all_checks.append(("Survival columns", surv_ok and surv_na == 0))
print("\n2. SURVIVAL COLUMNS:")
for col in survival_cols:
    if col not in _nan_by_col:
        print(f"{col}:MISSING!")
        continue
    print(f"{col}: present, {_nan_by_col[col]} NaN")

# 3. Collinear columns removed - only funding_total_usd counts towards the
# verdict; ipo_count_total is reported for information.
col_removed = not ('funding_total_usd' in finale_no_na.columns)
all_checks.append(("Collinear removed", col_removed))
print("\n3. COLLINEAR COLUMNS:")
print(f"funding_total_usd: {'removed' if col_removed else 'STILL PRESENT!'}")
_ipo_total_state = "STILL PRESENT" if 'ipo_count_total' in finale_no_na.columns else "removed"
print(f"ipo_count_total:{_ipo_total_state}")

# 4. No imputed IPO - count IPO rows (event_type == 1) flagged as imputed.
# When the flag column is absent the count stays 0, so the check passes.
imputed_ipo = 0
if 'ipo_date_imputed' in finale_no_na.columns:
    _ipo_rows = finale_no_na['event_type'].eq(1)
    _flagged_imputed = finale_no_na['ipo_date_imputed'].eq(True)
    imputed_ipo = len(finale_no_na[_ipo_rows & _flagged_imputed])
ipo_ok = imputed_ipo == 0
all_checks.append(("No imputed IPO", ipo_ok))
print("\n4.IMPUTED IPO:")
print(f"Count: {imputed_ipo}")
print(f"Status: {'PASS' if ipo_ok else 'FAIL - still has imputed IPO'}")

# 5. No negative durations - zero durations count too ("<= 0"). A missing
# duration column yields the sentinel -1, which fails the check.
if 'duration_years' in finale_no_na.columns:
    neg_dur = finale_no_na['duration_years'].le(0).sum()
else:
    neg_dur = -1
dur_ok = neg_dur == 0
all_checks.append(("No negative durations", dur_ok))
print("\n5. NEGATIVE DURATIONS:")
print(f"Count:{neg_dur}")
print(f"Status:{'PASS' if dur_ok else 'FAIL'}")

# 6. Key features present - all listed modelling columns must exist.
key_features = ['log_fund_tot', 'funding_rounds', 'market_heat',
                'relationships', 'milestones', 'macro_settore']
# feature name -> whether the column exists (listed order is preserved).
_feature_found = {name: name in finale_no_na.columns for name in key_features}
feat_ok = all(_feature_found.values())
all_checks.append(("Key features", feat_ok))
print("\n6. KEY FEATURES:")
for name, found in _feature_found.items():
    print(f"{name}:{'OK' if found else 'MISSING'}")

# 7. Event distribution - informational only, not added to all_checks.
# Nothing is listed when the event_type column is missing.
print("\n7. EVENT DISTRIBUTION:")
if 'event_type' in finale_no_na.columns:
    _n_rows = len(finale_no_na)
    _event_codes = finale_no_na['event_type']
    for et, label in ((0, 'Censored'), (1, 'IPO'), (2, 'M&A'), (3, 'Failure')):
        count = _event_codes.eq(et).sum()
        pct = 100 * count / _n_rows
        print(f"{label:10s}: {count:>6,} ({pct:>5.1f}%)")

# FINAL VERDICT - summarise all_checks: a single success line when every check
# passed, otherwise the pass ratio followed by the names of the failed checks.
total = len(all_checks)
_failed = [name for name, ok in all_checks if not ok]
passed = total - len(_failed)

if not _failed:
    print(f"ALL {total} CHECKS PASSED")
else:
    print(f"{passed}/{total} CHECKS PASSED")
    print("\nFailed checks:")
    for name in _failed:
        print(name)