# Task
Clean the dataset by handling missing values, standardizing categorical variables, and removing duplicates.

In [1]:
import pandas as pd

In [2]:
# Load the original data
df = pd.read_csv('../data/companies.csv')

# Basic preprocessing
df['founded_at'] = pd.to_datetime(df['founded_at'], errors='coerce')
df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce')

current_year = pd.Timestamp.now().year
df['founded_year'] = df['founded_at'].dt.year
df['company_age_years'] = current_year - df['founded_year']

# Handle missing founded years
missing_founded = df['founded_year'].isna()
df.loc[missing_founded, 'founded_year'] = df.loc[missing_founded, 'created_at'].dt.year
df.loc[missing_founded, 'company_age_years'] = current_year - df.loc[missing_founded, 'founded_year']
df['company_age_years'] = df['company_age_years'].clip(upper=50, lower=0)

In [3]:
 # Funding features
df['funding_total_usd'] = pd.to_numeric(df['funding_total_usd'], errors='coerce')
df['has_funding'] = (~df['funding_total_usd'].isna() & (df['funding_total_usd'] > 0)).astype(int)

In [4]:
# Initialize targets
df['failure_risk'] = 0
df['risk_tier'] = 1  # 0=Low, 1=Medium, 2=High

# TIER 0: LOW RISK (Successful/Strong Companies)
low_risk_conditions = [
    # Successful exits - ALWAYS low risk
    df['status'].isin(['ipo', 'acquired']),

    # Well-funded operating companies
    (df['status'] == 'operating') & (df['funding_total_usd'] > 500000),

    # Young companies with decent funding
    (df['company_age_years'] <= 3) & (df['funding_total_usd'] > 100000),

    # Companies with significant funding regardless of age
    (df['funding_total_usd'] > 2000000)
]

low_risk_mask = pd.concat(low_risk_conditions, axis=1).any(axis=1)
df.loc[low_risk_mask, 'risk_tier'] = 0
df.loc[low_risk_mask, 'failure_risk'] = 0

# TIER 2: HIGH RISK (Clear Failure Signals)
high_risk_conditions = [
        # Explicitly closed
    df['status'] == 'closed',

        # Very old with no funding (true zombies)
    (df['company_age_years'] > 10) & (df['has_funding'] == 0),

        # Old with extremely low funding
    (df['company_age_years'] > 8) & (df['funding_total_usd'] < 10000),
]

high_risk_mask = pd.concat(high_risk_conditions, axis=1).any(axis=1)
df.loc[high_risk_mask, 'risk_tier'] = 2
df.loc[high_risk_mask, 'failure_risk'] = 1

    # TIER 1: MEDIUM RISK (Everything else - the uncertain middle)
    # This is automatic based on the initialization

    # Map to readable labels
risk_labels = {0: 'low_risk', 1: 'medium_risk', 2: 'high_risk'}
df['risk_tier_label'] = df['risk_tier'].map(risk_labels)

    # Validation
print("\n=== OPTIMIZED TARGET DISTRIBUTION ===")
print(df['risk_tier_label'].value_counts().sort_index())
print(f"High risk rate: {df['failure_risk'].mean():.1%}")

    # Validate key segments
print("\n=== VALIDATION BY STATUS ===")
status_risk = pd.crosstab(df['status'], df['risk_tier_label'], normalize='index')
print(status_risk)

    # Analyze risk by age and funding
print("\n=== RISK BY AGE GROUPS ===")
df['age_group'] = pd.cut(df['company_age_years'],
                            bins=[0, 3, 7, 15, 50],
                            labels=['0-3y', '4-7y', '8-15y', '15+y'],
                            right=False)
age_risk = pd.crosstab(df['age_group'], df['risk_tier_label'], normalize='index')
print(age_risk)

    # Save optimized version
output_columns = [
        'id', 'name', 'status', 'category_code', 'country_code', 'state_code', 'region',
        'founded_at', 'founded_year', 'company_age_years', 'age_group',
        'funding_total_usd', 'has_funding',
        'failure_risk', 'risk_tier', 'risk_tier_label'
]

    # Only include existing columns
existing_columns = [col for col in output_columns if col in df.columns]
df_output = df[existing_columns]

df_output.to_csv('../processed_data/companies_optimized_targets.csv', index=False)
print(f"\nSaved optimized targets to companies_optimized_targets.csv")
print(f"Final dataset: {len(df_output):,} companies with {len(existing_columns)} columns")


=== OPTIMIZED TARGET DISTRIBUTION ===
risk_tier_label
high_risk      170553
low_risk        19976
medium_risk      6024
Name: count, dtype: int64
High risk rate: 86.8%

=== VALIDATION BY STATUS ===
risk_tier_label  high_risk  low_risk  medium_risk
status                                           
acquired          0.751756  0.248244     0.000000
closed            1.000000  0.000000     0.000000
ipo               0.577601  0.422399     0.000000
operating         0.873589  0.093572     0.032839

=== RISK BY AGE GROUPS ===
risk_tier_label  high_risk  low_risk  medium_risk
age_group                                        
8-15y             0.919854  0.049031     0.031114
15+y              0.792937  0.176687     0.030376

Saved optimized targets to companies_optimized_targets.csv
Final dataset: 196,553 companies with 16 columns


In [5]:
missing_percentages = df.isnull().mean() * 100
missing_percentages = missing_percentages.sort_values(ascending=False)

print("Percentage of missing values per column:")
print(missing_percentages[missing_percentages > 0])

Percentage of missing values per column:
parent_id              100.000000
ROI                     99.630634
last_investment_at      98.685851
first_investment_at     98.685851
invested_companies      98.681780
investment_rounds       98.681780
closed_at               98.667026
short_description       96.371971
funding_total_usd       85.818583
last_funding_at         83.970227
first_funding_at        83.970227
funding_rounds          83.868473
state_code              74.102151
twitter_username        58.997828
tag_list                58.559778
lng                     57.338733
lat                     57.338733
city                    57.319400
country_code            55.233448
founded_at              53.586564
milestones              53.346426
last_milestone_at       53.346426
first_milestone_at      53.346426
description             53.168865
logo_height             43.979486
logo_url                43.979486
logo_width              43.979486
category_code           37.326828
domain 

In [6]:
admin_cols_to_drop = [
    'entity_type', 'entity_id', 'parent_id',
    'Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 1',
    'permalink'
]

existing_admin_cols = [col for col in admin_cols_to_drop if col in df.columns]
df = df.drop(columns=existing_admin_cols)
print(f"Dropped administrative columns: {existing_admin_cols}")

Dropped administrative columns: ['entity_type', 'entity_id', 'parent_id', 'Unnamed: 0.1', 'permalink']


In [7]:
# Now handle missing values - drop columns with missing percentage > 50% 
missing_percentages = df.isnull().mean() * 100
missing_percentages = missing_percentages.sort_values(ascending=False)

columns_to_drop = missing_percentages[missing_percentages > 50].index

critical_cols = [
    # Funding-related
    'funding_total_usd', 'funding_rounds', 'first_funding_at', 'last_funding_at',
    
    # Geographic data
    'country_code', 'state_code', 'city', 'region', 'lat', 'lng',
    
    # Company descriptive info (valuable for analysis)
    'category_code', 'description', 'overview', 'tag_list',
    
    # Investment/milestone data
    
    'investment_rounds', 'invested_companies', 'milestones',
    'first_milestone_at', 'last_milestone_at',
    
    # Company metadata
    'twitter_username', 'homepage_url', 'domain', 'closed_at'
]

columns_to_drop = columns_to_drop.difference(critical_cols)

df = df.drop(columns=columns_to_drop)
print(f"Dropped {len(columns_to_drop)} columns with >50% missing values.")
print(f"Preserved critical columns: {[col for col in critical_cols if col in df.columns]}")

Dropped 5 columns with >50% missing values.
Preserved critical columns: ['funding_total_usd', 'funding_rounds', 'first_funding_at', 'last_funding_at', 'country_code', 'state_code', 'city', 'region', 'lat', 'lng', 'category_code', 'description', 'overview', 'tag_list', 'investment_rounds', 'invested_companies', 'milestones', 'first_milestone_at', 'last_milestone_at', 'twitter_username', 'homepage_url', 'domain', 'closed_at']


In [8]:
# Check what columns still have missing values
remaining_missing = df.isnull().sum()
remaining_missing = remaining_missing[remaining_missing > 0].sort_values(ascending=False)
print(f"\nColumns with missing values:")
print(remaining_missing)

df.dropna(subset=['name'], inplace=True)

# Final check
final_missing = df.isnull().sum()
final_missing = final_missing[final_missing > 0].sort_values(ascending=False)
print(f"\nRemaining missing values (this is NORMAL and OK):")
print(final_missing)


Columns with missing values:
invested_companies    193962
investment_rounds     193962
closed_at             193933
funding_total_usd     168679
last_funding_at       165046
first_funding_at      165046
funding_rounds        164846
state_code            145650
twitter_username      115962
tag_list              115101
lat                   112701
lng                   112701
city                  112663
country_code          108563
first_milestone_at    104854
last_milestone_at     104854
milestones            104854
description           104505
logo_height            86443
logo_width             86443
logo_url               86443
category_code          73367
homepage_url           70008
domain                 70008
overview               69582
relationships          66886
created_by             41020
age_group               1574
normalized_name           26
name                      23
dtype: int64

Remaining missing values (this is NORMAL and OK):
invested_companies    193941
investm

In [9]:
# Handle other funding-related columns
funding_cols_to_fill = ['funding_rounds', 'first_funding_at', 'last_funding_at']
for col in funding_cols_to_fill:
    if col in df.columns:
        if col == 'funding_rounds':
            # Missing funding rounds = 0 rounds
            before_count = df[col].isnull().sum()
            df[col] = df[col].fillna(0)
            print(f"Imputed {before_count:,} missing {col} values with 0")
        else:
            # For date columns, leave as NaN (indicates no funding events)
            print(f"Left {df[col].isnull().sum():,} missing {col} values as NaN (no funding events)")


Imputed 164,825 missing funding_rounds values with 0
Left 165,025 missing first_funding_at values as NaN (no funding events)
Left 165,025 missing last_funding_at values as NaN (no funding events)


In [10]:
# Handle investment-related columns
investment_cols_to_fill = ['investment_rounds', 'invested_companies']
for col in investment_cols_to_fill:
    if col in df.columns:
        before_count = df[col].isnull().sum()
        df[col] = df[col].fillna(0)
        print(f"Imputed {before_count:,} missing {col} values with 0")

Imputed 193,941 missing investment_rounds values with 0
Imputed 193,941 missing invested_companies values with 0


In [11]:
# Handle milestone columns
milestone_cols = ['milestones']
for col in milestone_cols:
    if col in df.columns:
        before_count = df[col].isnull().sum()
        df[col] = df[col].fillna(0)
        print(f"Imputed {before_count:,} missing {col} values with 0")

Imputed 104,839 missing milestones values with 0


In [12]:
geographic_cols = ['country_code', 'state_code', 'region', 'city']
for col in geographic_cols:
    if col in df.columns:
        before_count = df[col].isnull().sum()
        df[col] = df[col].fillna('Unknown')
        print(f"Imputed {before_count:,} missing {col} values with 'Unknown'")

text_cols = ['category_code', 'description', 'overview', 'tag_list']
for col in text_cols:
    if col in df.columns:
        before_count = df[col].isnull().sum()
        if col == 'category_code':
            df[col] = df[col].fillna('other')
        else:
            df[col] = df[col].fillna('Unknown')
        print(f"Imputed {before_count:,} missing {col} values")

web_cols = ['twitter_username', 'homepage_url', 'domain']
for col in web_cols:
    if col in df.columns:
        before_count = df[col].isnull().sum()
        df[col] = df[col].fillna('None')
        print(f"Imputed {before_count:,} missing {col} values with 'None'")

relationship_cols = ['relationships', 'created_by']
for col in relationship_cols:
    if col in df.columns:
        before_count = df[col].isnull().sum()
        df[col] = df[col].fillna('Unknown')
        print(f"Imputed {before_count:,} missing {col} values with 'Unknown'")

numerical_cols = ['lat', 'lng', 'logo_height', 'logo_width']
for col in numerical_cols:
    if col in df.columns:
        before_count = df[col].isnull().sum()
        if col in ['lat', 'lng']:
            # For coordinates, keep as NaN
            print(f"Keeping {before_count:,} missing {col} values as NaN (no fake coordinates)")
        else:
            median_value = df[col].median()
            df[col] = df[col].fillna(median_value)
            print(f"Imputed {before_count:,} missing {col} values with median: {median_value}")

critical_missing_cols = ['name']
for col in critical_missing_cols:
    if col in df.columns:
        rows_before = len(df)
        df.dropna(subset=[col], inplace=True)
        rows_after = len(df)
        print(f"Dropped {rows_before - rows_after} rows with missing values in '{col}'.")

# Verify remaining missing values
missing_percentages_after = df.isnull().mean() * 100
missing_percentages_after = missing_percentages_after.sort_values(ascending=False)

print("\nPercentage of missing values per column after cleaning:")
print(missing_percentages_after[missing_percentages_after > 0])

Imputed 108,550 missing country_code values with 'Unknown'
Imputed 145,629 missing state_code values with 'Unknown'
Imputed 0 missing region values with 'Unknown'
Imputed 112,648 missing city values with 'Unknown'
Imputed 73,357 missing category_code values
Imputed 104,492 missing description values
Imputed 69,565 missing overview values
Imputed 115,087 missing tag_list values
Imputed 115,945 missing twitter_username values with 'None'
Imputed 69,997 missing homepage_url values with 'None'
Imputed 69,997 missing domain values with 'None'
Imputed 66,877 missing relationships values with 'Unknown'
Imputed 41,017 missing created_by values with 'Unknown'
Keeping 112,686 missing lat values as NaN (no fake coordinates)
Keeping 112,686 missing lng values as NaN (no fake coordinates)
Imputed 86,431 missing logo_height values with median: 105.0
Imputed 86,431 missing logo_width values with median: 267.0
Dropped 0 rows with missing values in 'name'.

Percentage of missing values per column after

In [13]:
# Identify categorical columns
categorical_cols = df.select_dtypes(include='object').columns
print("Categorical columns identified:")
print(categorical_cols)

# Inspect unique values for potential inconsistencies in a few key categorical columns
cols_to_inspect = ['category_code', 'country_code', 'state_code', 'region', 'status', 'age_group', 'risk_tier_label']

for col in cols_to_inspect:
    if col in df.columns:
        print(f"\nUnique values for '{col}':")
        # Display a limited number of unique values if there are many
        unique_values = df[col].unique()
        if len(unique_values) > 50:
            print(unique_values[:50])
            print(f"... and {len(unique_values) - 50} more.")
        else:
            print(unique_values)

Categorical columns identified:
Index(['id', 'name', 'normalized_name', 'category_code', 'status', 'closed_at',
       'domain', 'homepage_url', 'twitter_username', 'logo_url', 'description',
       'overview', 'tag_list', 'country_code', 'state_code', 'city', 'region',
       'first_funding_at', 'last_funding_at', 'first_milestone_at',
       'last_milestone_at', 'relationships', 'created_by', 'updated_at',
       'risk_tier_label'],
      dtype='object')

Unique values for 'category_code':
['web' 'games_video' 'network_hosting' 'advertising' 'cleantech' 'other'
 'enterprise' 'consulting' 'mobile' 'health' 'software' 'analytics'
 'finance' 'education' 'medical' 'manufacturing' 'biotech' 'ecommerce'
 'public_relations' 'hardware' 'search' 'news' 'government' 'security'
 'photo_video' 'travel' 'semiconductor' 'social' 'legal' 'transportation'
 'hospitality' 'sports' 'nonprofit' 'fashion' 'messaging' 'music'
 'automotive' 'design' 'real_estate' 'local' 'nanotech' 'pets']

Unique values f

In [14]:
# Standardize categorical columns: convert to lowercase and strip whitespace
cols_to_standardize = ['category_code', 'country_code', 'state_code', 'region', 'status', 'risk_tier_label']

for col in cols_to_standardize:
    if col in df.columns and df[col].dtype == 'object':
        df[col] = df[col].str.lower().str.strip()
        print(f"Standardized '{col}'.")

# Re-verify unique values after standardization
for col in cols_to_standardize:
    if col in df.columns:
        print(f"\nUnique values for '{col}' after standardization:")
        unique_values = df[col].unique()
        if len(unique_values) > 50:
            print(unique_values[:50])
            print(f"... and {len(unique_values) - 50} more.")
        else:
            print(unique_values)

# age_group is already a categorical type with defined labels, no string standardization needed.
print("\nUnique values for 'age_group':")
print(df['age_group'].unique())

Standardized 'category_code'.
Standardized 'country_code'.
Standardized 'state_code'.
Standardized 'region'.
Standardized 'status'.
Standardized 'risk_tier_label'.

Unique values for 'category_code' after standardization:
['web' 'games_video' 'network_hosting' 'advertising' 'cleantech' 'other'
 'enterprise' 'consulting' 'mobile' 'health' 'software' 'analytics'
 'finance' 'education' 'medical' 'manufacturing' 'biotech' 'ecommerce'
 'public_relations' 'hardware' 'search' 'news' 'government' 'security'
 'photo_video' 'travel' 'semiconductor' 'social' 'legal' 'transportation'
 'hospitality' 'sports' 'nonprofit' 'fashion' 'messaging' 'music'
 'automotive' 'design' 'real_estate' 'local' 'nanotech' 'pets']

Unique values for 'country_code' after standardization:
['usa' 'unknown' 'mar' 'ind' 'aus' 'fra' 'jpn' 'nld' 'egy' 'isr' 'gbr'
 'tha' 'can' 'aut' 'irl' 'swe' 'deu' 'bra' 'fin' 'rus' 'sgp' 'mex' 'chn'
 'esp' 'isl' 'kor' 'tur' 'dnk' 'arg' 'pak' 'hun' 'pol' 'grc' 'prt' 'blr'
 'css' 'mkd' 'che

In [15]:
# Check for duplicate rows
duplicate_rows = df.duplicated()

# Count the number of duplicate rows
num_duplicates = duplicate_rows.sum()

print(f"Number of duplicate rows found: {num_duplicates}")

# Remove duplicate rows if any exist
if num_duplicates > 0:
    rows_before_dropping = len(df)
    df.drop_duplicates(inplace=True)
    rows_after_dropping = len(df)
    print(f"Removed {rows_before_dropping - rows_after_dropping} duplicate rows.")
else:
    print("No duplicate rows to remove.")

Number of duplicate rows found: 0
No duplicate rows to remove.


# 1. Under-Capitalization Identification (CRITICAL MISSING)

### Step 1: Calculate 2% Funding Threshold

```
ALGORITHM: Identify Target Population
1. Load cleaned dataset from previous steps
2. Calculate funding distribution percentiles [1%, 2%, 5%, 10%]
3. Extract 2% percentile as under-capitalization threshold
4. Display threshold value for validation

EXPECTED OUTPUT:
- Funding percentiles table
- 2% threshold: "$X,XXX" (actual dollar amount)
- Dataset coverage validation
```

In [16]:
percentiles = df['funding_total_usd'].quantile([0.01, 0.02, 0.05, 0.10])

percentiles = percentiles.reset_index()
percentiles.columns = ['Percentile', 'Funding (USD)']

percentiles['Funding (USD)'] = percentiles['Funding (USD)'].apply(lambda x: f"${x:,.0f}")
percentiles['Percentile'] = (percentiles['Percentile']*100).astype(int).astype(str) + '%'

threshold_value = df['funding_total_usd'].quantile(0.02)

print(percentiles)
print("\n2% Threshold for funding_total_usd: $", threshold_value)

  Percentile Funding (USD)
0         1%       $11,700
1         2%       $19,616
2         5%       $40,000
3        10%      $100,000

2% Threshold for funding_total_usd: $ 19616.0


In [17]:
total_companies = len(df)
companies_with_funding = df['has_funding'].sum()
companies_without_funding = total_companies - companies_with_funding

print(f"Total companies in dataset: {total_companies:,}")
print(f"Companies with funding data: {companies_with_funding:,} ({companies_with_funding/total_companies:.1%})")
print(f"Companies without funding: {companies_without_funding:,} ({companies_without_funding/total_companies:.1%})")

under_capitalized = (df['funding_total_usd'] <= threshold_value) & (df['has_funding'] == 1)
under_cap_count = under_capitalized.sum()

print(f"\nUnder-capitalized companies (≤ ${threshold_value:,.0f}): {under_cap_count:,}")
print(f"Percentage of funded companies that are under-capitalized: {under_cap_count/companies_with_funding:.1%}")

Total companies in dataset: 196,530
Companies with funding data: 27,873 (14.2%)
Companies without funding: 168,657 (85.8%)

Under-capitalized companies (≤ $19,616): 559
Percentage of funded companies that are under-capitalized: 2.0%


## Step 2: Create Under-Capitalized Flag

```
ALGORITHM: Under-Capitalized Population Flag
1. Create boolean column 'under_capitalized' where:
   - funding_total_usd <= 2% threshold OR
   - funding_total_usd == 0 OR
   - funding_total_usd is null
2. Count total under-capitalized companies
3. Calculate percentage of total dataset
4. Validate against project requirement (~2% expectation)

EXPECTED OUTPUT:
- under_capitalized column added to dataframe
- Population count: "X,XXX companies (X.X%)"
- Validation: Should align with ~2% of VC funding recipients
```

In [18]:
df['under_capitalized'] = (
    (df['funding_total_usd'] <= threshold_value) |
    (df['funding_total_usd'] == 0) |
    (df['funding_total_usd'].isna())
)

total_companies = len(df)
under_cap_count = df['under_capitalized'].sum()
under_cap_percentage = (under_cap_count / total_companies) * 100

companies_with_funding = df['has_funding'].sum()
funded_under_cap = ((df['funding_total_usd'] <= threshold_value) & (df['has_funding'] == 1)).sum()
zero_funding = (df['funding_total_usd'] == 0).sum()
null_funding = df['funding_total_usd'].isna().sum()

print(f"Population count: {total_companies:,} companies ({under_cap_percentage:.1f}%)")

if companies_with_funding > 0:
    vc_under_cap_rate = (funded_under_cap / companies_with_funding) * 100
    print(f"Validation: {vc_under_cap_rate:.1f}% of VC-funded companies are under-capitalized")

# not 2% and instead 13% because the quantile measurement from pandas doesnt include the 
# null values for the threshold value

Population count: 196,530 companies (86.1%)
Validation: 2.0% of VC-funded companies are under-capitalized


## Step 3: Population Validation

```
ALGORITHM: Under-Cap Population Characteristics
1. Compare under-cap vs well-funded populations:
   - Average company age
   - Geographic distribution differences
   - Industry sector patterns
   - Success rate differentials
2. Validate population makes business sense
3. Document population characteristics for stakeholders

EXPECTED OUTPUT:
- Population comparison statistics
- Business logic validation confirmed
- Stakeholder-ready population summary
```


## Comparison by Company Age

In [19]:
age_comparison = df.groupby('under_capitalized')['company_age_years'].agg(['mean', 'median', 'std']).round(1)
age_comparison.index = ['Well-Funded', 'Under-Capitalized']
print(age_comparison)

# Age distribution by funding status
age_crosstab = pd.crosstab(df['age_group'], df['under_capitalized'], normalize='columns') * 100
age_crosstab.columns = ['Well-Funded (%)', 'Under-Capitalized (%)']
print("\nAge Distribution by Funding Status:")
print(age_crosstab.round(1))

                   mean  median  std
Well-Funded        17.5    16.0  5.7
Under-Capitalized  15.4    13.0  6.1

Age Distribution by Funding Status:
           Well-Funded (%)  Under-Capitalized (%)
age_group                                        
8-15y                 33.7                   62.6
15+y                  66.3                   37.4


## Geographic Distribution

In [20]:
# Top countries comparison
print("Top 10 Countries - Under-Capitalized vs Well-Funded:")
geo_comparison = pd.crosstab(df['country_code'], df['under_capitalized'], normalize='columns') * 100
geo_comparison.columns = ['Well-Funded (%)', 'Under-Capitalized (%)']
geo_comparison = geo_comparison.sort_values('Under-Capitalized (%)', ascending=False).head(10)
print(geo_comparison.round(1))

# Regional distribution
print("\nRegional Distribution:")
region_comparison = pd.crosstab(df['region'], df['under_capitalized'], normalize='columns') * 100
region_comparison.columns = ['Well-Funded (%)', 'Under-Capitalized (%)']
region_comparison = region_comparison.sort_values('Under-Capitalized (%)', ascending=False).head(8)
print(region_comparison.round(1))


Top 10 Countries - Under-Capitalized vs Well-Funded:
              Well-Funded (%)  Under-Capitalized (%)
country_code                                        
unknown                   5.4                   63.3
usa                      65.6                   19.9
gbr                       6.0                    3.4
ind                       1.4                    2.1
can                       3.1                    1.7
deu                       1.6                    0.9
aus                       0.7                    0.7
fra                       2.3                    0.6
irl                       0.7                    0.5
esp                       1.2                    0.5

Regional Distribution:
               Well-Funded (%)  Under-Capitalized (%)
region                                               
unknown                    6.6                   63.9
sf bay                    17.2                    3.2
new york                   6.3                    2.0
london           

In [21]:
print("Top 10 Industry Categories:")
industry_comparison = pd.crosstab(df['category_code'], df['under_capitalized'], normalize='columns') * 100
industry_comparison.columns = ['Well-Funded (%)', 'Under-Capitalized (%)']
industry_comparison = industry_comparison.sort_values('Under-Capitalized (%)', ascending=False).head(10)
print(industry_comparison.round(2))


Top 10 Industry Categories:
                  Well-Funded (%)  Under-Capitalized (%)
category_code                                           
other                        5.48                  50.51
software                    15.28                   8.12
web                          8.29                   7.60
ecommerce                    4.73                   4.59
games_video                  4.18                   3.77
mobile                       6.54                   3.00
advertising                  3.88                   2.98
consulting                   0.98                   2.80
enterprise                   5.18                   1.79
public_relations             1.29                   1.47


In [22]:
# Risk tier distribution
print("Risk Tier Distribution by Funding Status:")
risk_comparison = pd.crosstab(df['risk_tier_label'], df['under_capitalized'], normalize='columns') * 100
risk_comparison.columns = ['Well-Funded (%)', 'Under-Capitalized (%)']
print(risk_comparison.round(2))

# Company status distribution
print("\nCompany Status Distribution:")
status_comparison = pd.crosstab(df['status'], df['under_capitalized'], normalize='columns') * 100
status_comparison.columns = ['Well-Funded (%)', 'Under-Capitalized (%)']
print(status_comparison.round(2))

# Failure risk rates
print("\nFailure Risk Rates:")
failure_rates = df.groupby('under_capitalized')['failure_risk'].mean() * 100
failure_rates.index = ['Well-Funded', 'Under-Capitalized']
print(f"Well-Funded Companies: {failure_rates['Well-Funded']:.1f}% failure risk")
print(f"Under-Capitalized Companies: {failure_rates['Under-Capitalized']:.1f}% failure risk")


Risk Tier Distribution by Funding Status:
                 Well-Funded (%)  Under-Capitalized (%)
risk_tier_label                                        
high_risk                   6.04                  99.80
low_risk                   73.09                   0.01
medium_risk                20.87                   0.19

Company Status Distribution:
           Well-Funded (%)  Under-Capitalized (%)
status                                           
acquired              8.49                   4.18
closed                6.04                   0.55
ipo                   1.75                   0.39
operating            83.72                  94.88

Failure Risk Rates:
Well-Funded Companies: 6.0% failure risk
Under-Capitalized Companies: 99.8% failure risk


In [23]:
# Population sizes
well_funded_count = (~df['under_capitalized']).sum()
under_cap_count = df['under_capitalized'].sum()
total_count = len(df)

print(f"Total Companies: {total_count:,}")
print(f"Under-Capitalized: {under_cap_count:,} ({under_cap_count/total_count:.1%})")
print(f"Well-Funded: {well_funded_count:,} ({well_funded_count/total_count:.1%})")

# Key validation metrics
avg_age_diff = df[df['under_capitalized']]['company_age_years'].mean() - df[~df['under_capitalized']]['company_age_years'].mean()
failure_risk_diff = df[df['under_capitalized']]['failure_risk'].mean() - df[~df['under_capitalized']]['failure_risk'].mean()

print(f"\nKey Differentials:")
print(f"Age Difference: Under-cap companies are {avg_age_diff:.1f} years older on average")
print(f"Failure Risk Difference: Under-cap companies have {failure_risk_diff:.1%} higher failure risk")

Total Companies: 196,530
Under-Capitalized: 169,216 (86.1%)
Well-Funded: 27,314 (13.9%)

Key Differentials:
Age Difference: Under-cap companies are -2.1 years older on average
Failure Risk Difference: Under-cap companies have 93.8% higher failure risk


In [24]:
print("UNDER-CAPITALIZED COMPANY CHARACTERISTICS:")
print(f"• Population Size: {under_cap_count:,} companies ({under_cap_count/total_count:.1%} of dataset)")
print(f"• Average Age: {df[df['under_capitalized']]['company_age_years'].mean():.1f} years")
print(f"• Failure Risk: {df[df['under_capitalized']]['failure_risk'].mean():.1%}")
print(f"• Geographic Concentration: Top 3 countries represent {geo_comparison.head(3)['Under-Capitalized (%)'].sum():.1f}% of population")
print(f"• High-Risk Companies: {(df[df['under_capitalized']]['risk_tier'] == 2).sum():,} ({(df[df['under_capitalized']]['risk_tier'] == 2).mean():.1%})")

UNDER-CAPITALIZED COMPANY CHARACTERISTICS:
• Population Size: 169,216 companies (86.1% of dataset)
• Average Age: 15.4 years
• Failure Risk: 99.8%
• Geographic Concentration: Top 3 countries represent 86.6% of population
• High-Risk Companies: 168,881 (99.8%)


In [25]:
# Top characteristics of under-capitalized companies
top_country = geo_comparison.index[0]
top_industry = industry_comparison.index[0]
top_age_group = age_crosstab.idxmax(axis=0)['Under-Capitalized (%)']

print(f"\nTYPICAL UNDER-CAPITALIZED COMPANY PROFILE:")
print(f"• Most Common Country: {top_country.upper()}")
print(f"• Most Common Industry: {top_industry}")
print(f"• Most Common Age Group: {top_age_group}")
print(f"• Status: {status_comparison.idxmax(axis=0)['Under-Capitalized (%)']} companies most common")


TYPICAL UNDER-CAPITALIZED COMPANY PROFILE:
• Most Common Country: UNKNOWN
• Most Common Industry: other
• Most Common Age Group: 8-15y
• Status: operating companies most common


# 2. Geographic Standardization for Heatmaps

## Step 1: Geographic Data Validation

```
ALGORITHM: Geographic Coverage Assessment
1. Analyze country_code distribution (focus on 'usa')
2. Calculate US vs international company percentages
3. Validate US market focus for project requirements
4. Assess data completeness for heatmap requirements

EXPECTED OUTPUT:
- Country distribution summary
- US market percentage: "XX.X% US companies"
- International market assessment
- Heatmap readiness evaluation
```

In [26]:
usa = df['country_code'] == 'usa'
intl = (df['country_code'] != 'usa' ) & (df['country_code'] != 'unknown')

us_companies = len(df[usa])
total_companies = len(df)
us_percentage = (us_companies / total_companies) * 100

print("=== USA ===")
print(f"Total Companies: {total_companies:,}")
print(f"US Companies: {us_companies:,}")
print(f"US Market Percentage: {us_percentage:.1f}% US companies")

# International Market Assessment
print("\n=== INTERNATIONAL ===")
intl_companies = len(df[intl])
intl_percentage = (intl_companies / total_companies) * 100

print(f"International Companies: {intl_companies:,} ({intl_percentage:.1f}%)")
print(f"Unknown Location: {len(df[~(usa|intl)]):,} ({len(df[~(usa|intl)])/total_companies:.1%})")

# Top international markets
if intl_companies > 0:
    print("\nTop 5 International Markets:")
    intl_markets = df[intl]['country_code'].value_counts().head(5)
    for country, count in intl_markets.items():
        percentage = (count / intl_companies) * 100
        global_pct = (count / total_companies) * 100
        print(f"  {country.upper()}: {count:,} companies ({percentage:.1f}% of international, {global_pct:.1f}% of total)")

# US State data completeness
us_df = df[usa]
us_with_state = us_df['state_code'].notna() & (us_df['state_code'] != 'unknown')
us_state_coverage = (us_with_state.sum() / len(us_df)) * 100

print(f"US State Code Coverage: {us_state_coverage:.1f}% of US companies have valid state codes")

# Geographic coordinate coverage
us_with_coords = us_df['lat'].notna() & us_df['lng'].notna()
us_coord_coverage = (us_with_coords.sum() / len(us_df)) * 100

print(f"US Coordinate Coverage: {us_coord_coverage:.1f}% of US companies have lat/lng data")

# Data completeness summary for stakeholders
print(f"\n=== STAKEHOLDER SUMMARY ===")
print(f"• Dataset Focus: {us_percentage:.1f}% US-based companies")
print(f"• Geographic Coverage: {us_state_coverage:.1f}% state-level, {us_coord_coverage:.1f}% coordinate-level")
print(f"• International Presence: {intl_percentage:.1f}% from {len(df[intl]['country_code'].unique())} countries")
print(f"• Heatmap Capability: {'State-level ready' if us_state_coverage >= 80 else 'State-level limited'}")

=== USA ===
Total Companies: 196,530
US Companies: 51,635
US Market Percentage: 26.3% US companies

=== INTERNATIONAL ===
International Companies: 36,345 (18.5%)
Unknown Location: 108,550 (55.2%)

Top 5 International Markets:
  GBR: 7,372 companies (20.3% of international, 3.8% of total)
  IND: 3,924 companies (10.8% of international, 2.0% of total)
  CAN: 3,728 companies (10.3% of international, 1.9% of total)
  DEU: 1,918 companies (5.3% of international, 1.0% of total)
  FRA: 1,652 companies (4.5% of international, 0.8% of total)
US State Code Coverage: 98.1% of US companies have valid state codes
US Coordinate Coverage: 97.2% of US companies have lat/lng data

=== STAKEHOLDER SUMMARY ===
• Dataset Focus: 26.3% US-based companies
• Geographic Coverage: 98.1% state-level, 97.2% coordinate-level
• International Presence: 18.5% from 174 countries
• Heatmap Capability: State-level ready


## Step 2: US State Code Standardization

```
ALGORITHM: State Code Cleaning and Standardization
1. Filter to US companies (country_code == 'usa')
2. Identify non-standard state codes:
   - Full state names vs abbreviations
   - Inconsistent casing/formatting
   - Invalid or missing state codes
3. Create state name → abbreviation mapping dictionary
4. Apply standardization transformations
5. Validate all US state codes are 2-character format

EXPECTED OUTPUT:
- Standardized state_code column (all 2-char format)
- State standardization report
- Missing state data percentage
- Heatmap-ready geographic data
```

In [27]:
df_us = df[usa].copy()
print("US Companies without state codes:", df_us['state_code'].isna().sum())    # all companies in USA have state codes
long_codes = df_us['state_code'].apply(lambda x: x if len(x) > 2 else None).dropna()
print("State codes longer than 2 characters:", long_codes.unique().tolist())

# unknown -> uk, following standard 2 letter state code conventions ( idk if necessary but yeah )
df_us['state_code'] = df_us['state_code'].apply(lambda x: 'uk' if x == 'unknown' else x)

missing_states = df_us['state_code'].isna().sum()
print(f"US companies with missing state codes: {missing_states:.1f}%")

US Companies without state codes: 0
State codes longer than 2 characters: ['unknown']
US companies with missing state codes: 0.0%


In [28]:
states_map = {
    'wa': 'Washington', 'or': 'Oregon', 'ca': 'California', 'nv': 'Nevada', 'id': 'Idaho',
    'ut': 'Utah', 'az': 'Arizona', 'co': 'Colorado', 'nm': 'New Mexico', 'tx': 'Texas', 'wa': 'Washington',
    'mt': 'Montana', 'wy': 'Wyoming', 'nd': 'North Dakota', 'sd': 'South Dakota', 'ne': 'Nebraska',
    'ks': 'Kansas', 'ok': 'Oklahoma', 'mn': 'Minnesota', 'ia': 'Iowa', 'mo': 'Missouri', 'ar': 'Arkansas',
    'la': 'Louisiana', 'wi': 'Wisconsin', 'il': 'Illinois', 'ms': 'Mississippi', 'mi': 'Michigan', 'in': 'Indiana',
    'oh': 'Ohio', 'ky': 'Kentucky', 'tn': 'Tennessee', 'al': 'Alabama', 'fl': 'Florida', 'ga': 'Georgia', 'sc': 'South Carolina',
    'nc': 'North Carolina', 'va': 'Virginia', 'wv': 'West Virginia', 'pa': 'Pennsylvania', 'md': 'Maryland', 'de': 'Delaware', 
    'nj': 'New Jersey', 'ny': 'New York', 'ct': 'Connecticut', 'ri': 'Rhode Island', 'ma': 'Massachusetts', 'vt': 'Vermont',
    'nh': 'New Hampshire', 'me': 'Maine', 'ak': 'Alaska', 'hi': 'Hawaii'
}

df_us['state_name'] = df_us['state_code'].map(states_map)

## Step 3: Geographic Heatmap Data Preparation

```
ALGORITHM: Heatmap Data Structure Creation
1. Create geographic aggregation columns:
   - State-level startup counts
   - State-level success rates
   - State-level under-cap concentrations
2. Validate geographic coverage completeness
3. Prepare data structure for Month 3 dashboard
4. Document geographic data limitations

EXPECTED OUTPUT:
- Geographic aggregation features ready
- Heatmap data validation report
- Dashboard-ready geographic dataset
- Coverage limitation documentation
```

In [29]:
df.to_csv('../processed_data/companies_cleaned_data.csv', index=False)