In [1]:
import pandas as pd

def create_optimized_targets():
    """
    Create final optimized target variables with better balance
    """
    # Load the original data (not the one with targets)
    df = pd.read_csv('./data/companies.csv')

    # Basic preprocessing (from your original script)
    df['founded_at'] = pd.to_datetime(df['founded_at'], errors='coerce')
    df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce')

    current_year = pd.Timestamp.now().year
    df['founded_year'] = df['founded_at'].dt.year
    df['company_age_years'] = current_year - df['founded_year']

    # Handle missing founded years
    missing_founded = df['founded_year'].isna()
    df.loc[missing_founded, 'founded_year'] = df.loc[missing_founded, 'created_at'].dt.year
    df.loc[missing_founded, 'company_age_years'] = current_year - df.loc[missing_founded, 'founded_year']
    df['company_age_years'] = df['company_age_years'].clip(upper=50, lower=0)

    # Funding features
    df['funding_total_usd'] = pd.to_numeric(df['funding_total_usd'], errors='coerce')
    df['has_funding'] = (~df['funding_total_usd'].isna() & (df['funding_total_usd'] > 0)).astype(int)

    print("=== CREATING OPTIMIZED TARGETS ===")
    print(f"Total companies: {len(df):,}")

    # Initialize targets
    df['failure_risk'] = 0
    df['risk_tier'] = 1  # 0=Low, 1=Medium, 2=High

    # TIER 0: LOW RISK (Successful/Strong Companies)
    low_risk_conditions = [
        # Successful exits - ALWAYS low risk
        df['status'].isin(['ipo', 'acquired']),

        # Well-funded operating companies
        (df['status'] == 'operating') & (df['funding_total_usd'] > 500000),

        # Young companies with decent funding
        (df['company_age_years'] <= 3) & (df['funding_total_usd'] > 100000),

        # Companies with significant funding regardless of age
        (df['funding_total_usd'] > 2000000)
    ]

    low_risk_mask = pd.concat(low_risk_conditions, axis=1).any(axis=1)
    df.loc[low_risk_mask, 'risk_tier'] = 0
    df.loc[low_risk_mask, 'failure_risk'] = 0

    # TIER 2: HIGH RISK (Clear Failure Signals)
    high_risk_conditions = [
        # Explicitly closed
        df['status'] == 'closed',

        # Very old with no funding (true zombies)
        (df['company_age_years'] > 10) & (df['has_funding'] == 0),

        # Old with extremely low funding
        (df['company_age_years'] > 8) & (df['funding_total_usd'] < 10000),
    ]

    high_risk_mask = pd.concat(high_risk_conditions, axis=1).any(axis=1)
    df.loc[high_risk_mask, 'risk_tier'] = 2
    df.loc[high_risk_mask, 'failure_risk'] = 1

    # TIER 1: MEDIUM RISK (Everything else - the uncertain middle)
    # This is automatic based on the initialization

    # Map to readable labels
    risk_labels = {0: 'low_risk', 1: 'medium_risk', 2: 'high_risk'}
    df['risk_tier_label'] = df['risk_tier'].map(risk_labels)

    # Validation
    print("\n=== OPTIMIZED TARGET DISTRIBUTION ===")
    print(df['risk_tier_label'].value_counts().sort_index())
    print(f"High risk rate: {df['failure_risk'].mean():.1%}")

    # Validate key segments
    print("\n=== VALIDATION BY STATUS ===")
    status_risk = pd.crosstab(df['status'], df['risk_tier_label'], normalize='index')
    print(status_risk)

    # Analyze risk by age and funding
    print("\n=== RISK BY AGE GROUPS ===")
    df['age_group'] = pd.cut(df['company_age_years'],
                            bins=[0, 3, 7, 15, 50],
                            labels=['0-3y', '4-7y', '8-15y', '15+y'],
                            right=False)
    age_risk = pd.crosstab(df['age_group'], df['risk_tier_label'], normalize='index')
    print(age_risk)

    # Save optimized version
    output_columns = [
        'id', 'name', 'status', 'category_code', 'country_code', 'state_code', 'region',
        'founded_at', 'founded_year', 'company_age_years', 'age_group',
        'funding_total_usd', 'has_funding',
        'failure_risk', 'risk_tier', 'risk_tier_label'
    ]

    # Only include existing columns
    existing_columns = [col for col in output_columns if col in df.columns]
    df_output = df[existing_columns]

    df_output.to_csv('companies_optimized_targets.csv', index=False)
    print(f"\nSaved optimized targets to companies_optimized_targets.csv")
    print(f"Final dataset: {len(df_output):,} companies with {len(existing_columns)} columns")

    return df

def analyze_risk_characteristics(df):
    """
    Analyze what characterizes each risk tier
    """
    print("\n" + "="*50)
    print("RISK TIER CHARACTERISTICS")
    print("="*50)

    for tier in [0, 1, 2]:
        tier_data = df[df['risk_tier'] == tier]
        print(f"\n--- TIER {tier} ({tier_data['risk_tier_label'].iloc[0]}) ---")
        print(f"Count: {len(tier_data):,} companies")
        print(f"Avg age: {tier_data['company_age_years'].mean():.1f} years")
        print(f"Avg funding: ${tier_data['funding_total_usd'].mean():,.0f}")
        print(f"Has funding: {tier_data['has_funding'].mean():.1%}")
        print(f"Top statuses:")
        print(tier_data['status'].value_counts().head(3))

if __name__ == "__main__":
    df_optimized = create_optimized_targets()
    analyze_risk_characteristics(df_optimized)

=== CREATING OPTIMIZED TARGETS ===
Total companies: 196,553

=== OPTIMIZED TARGET DISTRIBUTION ===
risk_tier_label
high_risk      170553
low_risk        19976
medium_risk      6024
Name: count, dtype: int64
High risk rate: 86.8%

=== VALIDATION BY STATUS ===
risk_tier_label  high_risk  low_risk  medium_risk
status                                           
acquired          0.751756  0.248244     0.000000
closed            1.000000  0.000000     0.000000
ipo               0.577601  0.422399     0.000000
operating         0.873589  0.093572     0.032839

=== RISK BY AGE GROUPS ===
risk_tier_label  high_risk  low_risk  medium_risk
age_group                                        
8-15y             0.919854  0.049031     0.031114
15+y              0.792937  0.176687     0.030376

Saved optimized targets to companies_optimized_targets.csv
Final dataset: 196,553 companies with 16 columns

RISK TIER CHARACTERISTICS

--- TIER 0 (low_risk) ---
Count: 19,976 companies
Avg age: 18.2 years
Avg fu

# Task
Clean the dataset by handling missing values, standardizing categorical variables, and removing duplicates.

In [2]:
missing_percentages = df_optimized.isnull().mean() * 100
missing_percentages = missing_percentages.sort_values(ascending=False)

print("Percentage of missing values per column:")
print(missing_percentages[missing_percentages > 0])

Percentage of missing values per column:
parent_id              100.000000
ROI                     99.630634
last_investment_at      98.685851
first_investment_at     98.685851
invested_companies      98.681780
investment_rounds       98.681780
closed_at               98.667026
short_description       96.371971
funding_total_usd       85.818583
last_funding_at         83.970227
first_funding_at        83.970227
funding_rounds          83.868473
state_code              74.102151
twitter_username        58.997828
tag_list                58.559778
lng                     57.338733
lat                     57.338733
city                    57.319400
country_code            55.233448
founded_at              53.586564
milestones              53.346426
last_milestone_at       53.346426
first_milestone_at      53.346426
description             53.168865
logo_height             43.979486
logo_url                43.979486
logo_width              43.979486
category_code           37.326828
domain 

In [3]:
admin_cols_to_drop = [
    'entity_type', 'entity_id', 'parent_id',
    'Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 1',
    'permalink'
]

existing_admin_cols = [col for col in admin_cols_to_drop if col in df_optimized.columns]
df_optimized = df_optimized.drop(columns=existing_admin_cols)
print(f"Dropped administrative columns: {existing_admin_cols}")

Dropped administrative columns: ['entity_type', 'entity_id', 'parent_id', 'Unnamed: 0.1', 'permalink']


In [4]:
# Now handle missing values - drop columns with missing percentage > 50% 
missing_percentages = df_optimized.isnull().mean() * 100
missing_percentages = missing_percentages.sort_values(ascending=False)

columns_to_drop = missing_percentages[missing_percentages > 50].index

# Define critical columns to preserve even if they have high missing values
critical_cols = [
    # Funding-related
    'funding_total_usd', 'funding_rounds', 'first_funding_at', 'last_funding_at',
    
    # Geographic data
    'country_code', 'state_code', 'city', 'region', 'lat', 'lng',
    
    # Company descriptive info (valuable for analysis)
    'category_code', 'description', 'overview', 'tag_list',
    
    # Investment/milestone data
    'investment_rounds', 'invested_companies', 'milestones',
    'first_milestone_at', 'last_milestone_at',
    
    # Company metadata
    'twitter_username', 'homepage_url', 'domain', 'closed_at'
]

columns_to_drop = columns_to_drop.difference(critical_cols)

df_optimized = df_optimized.drop(columns=columns_to_drop)
print(f"Dropped {len(columns_to_drop)} columns with >50% missing values.")
print(f"Preserved critical columns: {[col for col in critical_cols if col in df_optimized.columns]}")



Dropped 5 columns with >50% missing values.
Preserved critical columns: ['funding_total_usd', 'funding_rounds', 'first_funding_at', 'last_funding_at', 'country_code', 'state_code', 'city', 'region', 'lat', 'lng', 'category_code', 'description', 'overview', 'tag_list', 'investment_rounds', 'invested_companies', 'milestones', 'first_milestone_at', 'last_milestone_at', 'twitter_username', 'homepage_url', 'domain', 'closed_at']


In [5]:
# FIXED: Handle missing values intelligently without dropping most of the data
print(f"Starting with {len(df_optimized):,} rows")

# Check what columns still have missing values
remaining_missing = df_optimized.isnull().sum()
remaining_missing = remaining_missing[remaining_missing > 0].sort_values(ascending=False)
print(f"\nColumns with missing values:")
print(remaining_missing)

# ONLY drop rows for truly critical missing data
# Most companies won't have closed_at (they're still operating!)
# Most companies won't have funding dates (they're bootstrapped!)
critical_only_cols = ['name']  # Only name is truly critical

for col in critical_only_cols:
    if col in df_optimized.columns:
        rows_before = len(df_optimized)
        df_optimized.dropna(subset=[col], inplace=True)
        rows_after = len(df_optimized)
        print(f"Dropped {rows_before - rows_after:,} rows with missing {col} (truly critical)")

print(f"\nAfter intelligent cleaning: {len(df_optimized):,} rows")
print(f"Data preservation rate: {len(df_optimized)/196000:.1%}")

# Final check
final_missing = df_optimized.isnull().sum()
final_missing = final_missing[final_missing > 0].sort_values(ascending=False)
print(f"\nRemaining missing values (this is NORMAL and OK):")
print(final_missing)

Starting with 196,553 rows

Columns with missing values:
invested_companies    193962
investment_rounds     193962
closed_at             193933
funding_total_usd     168679
last_funding_at       165046
first_funding_at      165046
funding_rounds        164846
state_code            145650
twitter_username      115962
tag_list              115101
lat                   112701
lng                   112701
city                  112663
country_code          108563
first_milestone_at    104854
last_milestone_at     104854
milestones            104854
description           104505
logo_height            86443
logo_width             86443
logo_url               86443
category_code          73367
homepage_url           70008
domain                 70008
overview               69582
relationships          66886
created_by             41020
age_group               1574
normalized_name           26
name                      23
dtype: int64
Dropped 23 rows with missing name (truly critical)

After in

In [6]:
# Missing funding likely means $0 funding
if 'funding_total_usd' in df_optimized.columns:
    funding_before = df_optimized['funding_total_usd'].isnull().sum()
    df_optimized['funding_total_usd'] = df_optimized['funding_total_usd'].fillna(0)
    print(f"Imputed {funding_before:,} missing funding_total_usd values with $0")



Imputed 168,657 missing funding_total_usd values with $0


In [7]:
# Handle other funding-related columns
funding_cols_to_fill = ['funding_rounds', 'first_funding_at', 'last_funding_at']
for col in funding_cols_to_fill:
    if col in df_optimized.columns:
        if col == 'funding_rounds':
            # Missing funding rounds = 0 rounds
            before_count = df_optimized[col].isnull().sum()
            df_optimized[col] = df_optimized[col].fillna(0)
            print(f"Imputed {before_count:,} missing {col} values with 0")
        else:
            # For date columns, leave as NaN (indicates no funding events)
            print(f"Left {df_optimized[col].isnull().sum():,} missing {col} values as NaN (no funding events)")


Imputed 164,825 missing funding_rounds values with 0
Left 165,025 missing first_funding_at values as NaN (no funding events)
Left 165,025 missing last_funding_at values as NaN (no funding events)


In [8]:
# Handle investment-related columns
investment_cols_to_fill = ['investment_rounds', 'invested_companies']
for col in investment_cols_to_fill:
    if col in df_optimized.columns:
        before_count = df_optimized[col].isnull().sum()
        df_optimized[col] = df_optimized[col].fillna(0)
        print(f"Imputed {before_count:,} missing {col} values with 0")

Imputed 193,941 missing investment_rounds values with 0
Imputed 193,941 missing invested_companies values with 0


In [9]:
# Handle milestone columns
milestone_cols = ['milestones']
for col in milestone_cols:
    if col in df_optimized.columns:
        before_count = df_optimized[col].isnull().sum()
        df_optimized[col] = df_optimized[col].fillna(0)
        print(f"Imputed {before_count:,} missing {col} values with 0")

Imputed 104,839 missing milestones values with 0


In [10]:


geographic_cols = ['country_code', 'state_code', 'region', 'city']
for col in geographic_cols:
    if col in df_optimized.columns:
        before_count = df_optimized[col].isnull().sum()
        df_optimized[col] = df_optimized[col].fillna('Unknown')
        print(f"Imputed {before_count:,} missing {col} values with 'Unknown'")

text_cols = ['category_code', 'description', 'overview', 'tag_list']
for col in text_cols:
    if col in df_optimized.columns:
        before_count = df_optimized[col].isnull().sum()
        if col == 'category_code':
            df_optimized[col] = df_optimized[col].fillna('other')
        else:
            df_optimized[col] = df_optimized[col].fillna('Unknown')
        print(f"Imputed {before_count:,} missing {col} values")

web_cols = ['twitter_username', 'homepage_url', 'domain']
for col in web_cols:
    if col in df_optimized.columns:
        before_count = df_optimized[col].isnull().sum()
        df_optimized[col] = df_optimized[col].fillna('None')
        print(f"Imputed {before_count:,} missing {col} values with 'None'")

relationship_cols = ['relationships', 'created_by']
for col in relationship_cols:
    if col in df_optimized.columns:
        before_count = df_optimized[col].isnull().sum()
        df_optimized[col] = df_optimized[col].fillna('Unknown')
        print(f"Imputed {before_count:,} missing {col} values with 'Unknown'")

numerical_cols = ['lat', 'lng', 'logo_height', 'logo_width']
for col in numerical_cols:
    if col in df_optimized.columns:
        before_count = df_optimized[col].isnull().sum()
        if col in ['lat', 'lng']:
            # For coordinates, keep as NaN
            print(f"Keeping {before_count:,} missing {col} values as NaN (no fake coordinates)")
        else:
            median_value = df_optimized[col].median()
            df_optimized[col] = df_optimized[col].fillna(median_value)
            print(f"Imputed {before_count:,} missing {col} values with median: {median_value}")

critical_missing_cols = ['name']
for col in critical_missing_cols:
    if col in df_optimized.columns:
        rows_before = len(df_optimized)
        df_optimized.dropna(subset=[col], inplace=True)
        rows_after = len(df_optimized)
        print(f"Dropped {rows_before - rows_after} rows with missing values in '{col}'.")


# Verify remaining missing values
missing_percentages_after = df_optimized.isnull().mean() * 100
missing_percentages_after = missing_percentages_after.sort_values(ascending=False)

print("\nPercentage of missing values per column after cleaning:")
print(missing_percentages_after[missing_percentages_after > 0])

Imputed 108,550 missing country_code values with 'Unknown'
Imputed 145,629 missing state_code values with 'Unknown'
Imputed 0 missing region values with 'Unknown'
Imputed 112,648 missing city values with 'Unknown'
Imputed 73,357 missing category_code values
Imputed 104,492 missing description values
Imputed 69,565 missing overview values
Imputed 115,087 missing tag_list values
Imputed 115,945 missing twitter_username values with 'None'
Imputed 69,997 missing homepage_url values with 'None'
Imputed 69,997 missing domain values with 'None'
Imputed 66,877 missing relationships values with 'Unknown'
Imputed 41,017 missing created_by values with 'Unknown'
Keeping 112,686 missing lat values as NaN (no fake coordinates)
Keeping 112,686 missing lng values as NaN (no fake coordinates)
Imputed 86,431 missing logo_height values with median: 105.0
Imputed 86,431 missing logo_width values with median: 267.0
Dropped 0 rows with missing values in 'name'.

Percentage of missing values per column after

In [11]:
# Identify categorical columns
categorical_cols = df_optimized.select_dtypes(include='object').columns
print("Categorical columns identified:")
print(categorical_cols)

# Inspect unique values for potential inconsistencies in a few key categorical columns
cols_to_inspect = ['category_code', 'country_code', 'state_code', 'region', 'status', 'age_group', 'risk_tier_label']

for col in cols_to_inspect:
    if col in df_optimized.columns:
        print(f"\nUnique values for '{col}':")
        # Display a limited number of unique values if there are many
        unique_values = df_optimized[col].unique()
        if len(unique_values) > 50:
            print(unique_values[:50])
            print(f"... and {len(unique_values) - 50} more.")
        else:
            print(unique_values)

Categorical columns identified:
Index(['id', 'name', 'normalized_name', 'category_code', 'status', 'closed_at',
       'domain', 'homepage_url', 'twitter_username', 'logo_url', 'description',
       'overview', 'tag_list', 'country_code', 'state_code', 'city', 'region',
       'first_funding_at', 'last_funding_at', 'first_milestone_at',
       'last_milestone_at', 'relationships', 'created_by', 'updated_at',
       'risk_tier_label'],
      dtype='object')

Unique values for 'category_code':
['web' 'games_video' 'network_hosting' 'advertising' 'cleantech' 'other'
 'enterprise' 'consulting' 'mobile' 'health' 'software' 'analytics'
 'finance' 'education' 'medical' 'manufacturing' 'biotech' 'ecommerce'
 'public_relations' 'hardware' 'search' 'news' 'government' 'security'
 'photo_video' 'travel' 'semiconductor' 'social' 'legal' 'transportation'
 'hospitality' 'sports' 'nonprofit' 'fashion' 'messaging' 'music'
 'automotive' 'design' 'real_estate' 'local' 'nanotech' 'pets']

Unique values f

In [12]:
# Standardize categorical columns: convert to lowercase and strip whitespace
cols_to_standardize = ['category_code', 'country_code', 'state_code', 'region', 'status', 'risk_tier_label']

for col in cols_to_standardize:
    if col in df_optimized.columns and df_optimized[col].dtype == 'object':
        df_optimized[col] = df_optimized[col].str.lower().str.strip()
        print(f"Standardized '{col}'.")

# Re-verify unique values after standardization
for col in cols_to_standardize:
    if col in df_optimized.columns:
        print(f"\nUnique values for '{col}' after standardization:")
        unique_values = df_optimized[col].unique()
        if len(unique_values) > 50:
            print(unique_values[:50])
            print(f"... and {len(unique_values) - 50} more.")
        else:
            print(unique_values)

# age_group is already a categorical type with defined labels, no string standardization needed.
print("\nUnique values for 'age_group':")
print(df_optimized['age_group'].unique())

Standardized 'category_code'.
Standardized 'country_code'.
Standardized 'state_code'.
Standardized 'region'.
Standardized 'status'.
Standardized 'risk_tier_label'.

Unique values for 'category_code' after standardization:
['web' 'games_video' 'network_hosting' 'advertising' 'cleantech' 'other'
 'enterprise' 'consulting' 'mobile' 'health' 'software' 'analytics'
 'finance' 'education' 'medical' 'manufacturing' 'biotech' 'ecommerce'
 'public_relations' 'hardware' 'search' 'news' 'government' 'security'
 'photo_video' 'travel' 'semiconductor' 'social' 'legal' 'transportation'
 'hospitality' 'sports' 'nonprofit' 'fashion' 'messaging' 'music'
 'automotive' 'design' 'real_estate' 'local' 'nanotech' 'pets']

Unique values for 'country_code' after standardization:
['usa' 'unknown' 'mar' 'ind' 'aus' 'fra' 'jpn' 'nld' 'egy' 'isr' 'gbr'
 'tha' 'can' 'aut' 'irl' 'swe' 'deu' 'bra' 'fin' 'rus' 'sgp' 'mex' 'chn'
 'esp' 'isl' 'kor' 'tur' 'dnk' 'arg' 'pak' 'hun' 'pol' 'grc' 'prt' 'blr'
 'css' 'mkd' 'che

In [13]:
# Check for duplicate rows
duplicate_rows = df_optimized.duplicated()

# Count the number of duplicate rows
num_duplicates = duplicate_rows.sum()

print(f"Number of duplicate rows found: {num_duplicates}")

# Remove duplicate rows if any exist
if num_duplicates > 0:
    rows_before_dropping = len(df_optimized)
    df_optimized.drop_duplicates(inplace=True)
    rows_after_dropping = len(df_optimized)
    print(f"Removed {rows_before_dropping - rows_after_dropping} duplicate rows.")
else:
    print("No duplicate rows to remove.")

Number of duplicate rows found: 0
No duplicate rows to remove.


In [14]:
df_optimized.to_csv('./companies_cleaned_targets.csv', index=False)