In [4]:
import pandas as pd

def create_optimized_targets():
    """
    Create final optimized target variables with better balance
    """
    # Load the original data (not the one with targets)
    df = pd.read_csv('../data/companies.csv')

    # Basic preprocessing (from your original script)
    df['founded_at'] = pd.to_datetime(df['founded_at'], errors='coerce')
    df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce')

    current_year = pd.Timestamp.now().year
    df['founded_year'] = df['founded_at'].dt.year
    df['company_age_years'] = current_year - df['founded_year']

    # Handle missing founded years
    missing_founded = df['founded_year'].isna()
    df.loc[missing_founded, 'founded_year'] = df.loc[missing_founded, 'created_at'].dt.year
    df.loc[missing_founded, 'company_age_years'] = current_year - df.loc[missing_founded, 'founded_year']
    df['company_age_years'] = df['company_age_years'].clip(upper=50, lower=0)

    # Funding features
    df['funding_total_usd'] = pd.to_numeric(df['funding_total_usd'], errors='coerce')
    df['has_funding'] = (~df['funding_total_usd'].isna() & (df['funding_total_usd'] > 0)).astype(int)

    print("=== CREATING OPTIMIZED TARGETS ===")
    print(f"Total companies: {len(df):,}")

    # Initialize targets
    df['failure_risk'] = 0
    df['risk_tier'] = 1  # 0=Low, 1=Medium, 2=High

    # TIER 0: LOW RISK (Successful/Strong Companies)
    low_risk_conditions = [
        # Successful exits - ALWAYS low risk
        df['status'].isin(['ipo', 'acquired']),

        # Well-funded operating companies
        (df['status'] == 'operating') & (df['funding_total_usd'] > 500000),

        # Young companies with decent funding
        (df['company_age_years'] <= 3) & (df['funding_total_usd'] > 100000),

        # Companies with significant funding regardless of age
        (df['funding_total_usd'] > 2000000)
    ]

    low_risk_mask = pd.concat(low_risk_conditions, axis=1).any(axis=1)
    df.loc[low_risk_mask, 'risk_tier'] = 0
    df.loc[low_risk_mask, 'failure_risk'] = 0

    # TIER 2: HIGH RISK (Clear Failure Signals)
    high_risk_conditions = [
        # Explicitly closed
        df['status'] == 'closed',

        # Very old with no funding (true zombies)
        (df['company_age_years'] > 10) & (df['has_funding'] == 0),

        # Old with extremely low funding
        (df['company_age_years'] > 8) & (df['funding_total_usd'] < 10000),
    ]

    high_risk_mask = pd.concat(high_risk_conditions, axis=1).any(axis=1)
    df.loc[high_risk_mask, 'risk_tier'] = 2
    df.loc[high_risk_mask, 'failure_risk'] = 1

    # TIER 1: MEDIUM RISK (Everything else - the uncertain middle)
    # This is automatic based on the initialization

    # Map to readable labels
    risk_labels = {0: 'low_risk', 1: 'medium_risk', 2: 'high_risk'}
    df['risk_tier_label'] = df['risk_tier'].map(risk_labels)

    # Validation
    print("\n=== OPTIMIZED TARGET DISTRIBUTION ===")
    print(df['risk_tier_label'].value_counts().sort_index())
    print(f"High risk rate: {df['failure_risk'].mean():.1%}")

    # Validate key segments
    print("\n=== VALIDATION BY STATUS ===")
    status_risk = pd.crosstab(df['status'], df['risk_tier_label'], normalize='index')
    print(status_risk)

    # Analyze risk by age and funding
    print("\n=== RISK BY AGE GROUPS ===")
    df['age_group'] = pd.cut(df['company_age_years'],
                            bins=[0, 3, 7, 15, 50],
                            labels=['0-3y', '4-7y', '8-15y', '15+y'],
                            right=False)
    age_risk = pd.crosstab(df['age_group'], df['risk_tier_label'], normalize='index')
    print(age_risk)

    # Save optimized version
    output_columns = [
        'id', 'name', 'status', 'category_code', 'country_code', 'state_code', 'region',
        'founded_at', 'founded_year', 'company_age_years', 'age_group',
        'funding_total_usd', 'has_funding',
        'failure_risk', 'risk_tier', 'risk_tier_label'
    ]

    # Only include existing columns
    existing_columns = [col for col in output_columns if col in df.columns]
    df_output = df[existing_columns]

    df_output.to_csv('../processed_data/companies_optimized_targets.csv', index=False)
    print(f"\nSaved optimized targets to processed_data/companies_optimized_targets.csv")
    print(f"Final dataset: {len(df_output):,} companies with {len(existing_columns)} columns")

    return df

def analyze_risk_characteristics(df):
    """
    Analyze what characterizes each risk tier
    """
    print("\n" + "="*50)
    print("RISK TIER CHARACTERISTICS")
    print("="*50)

    for tier in [0, 1, 2]:
        tier_data = df[df['risk_tier'] == tier]
        print(f"\n--- TIER {tier} ({tier_data['risk_tier_label'].iloc[0]}) ---")
        print(f"Count: {len(tier_data):,} companies")
        print(f"Avg age: {tier_data['company_age_years'].mean():.1f} years")
        print(f"Avg funding: ${tier_data['funding_total_usd'].mean():,.0f}")
        print(f"Has funding: {tier_data['has_funding'].mean():.1%}")
        print(f"Top statuses:")
        print(tier_data['status'].value_counts().head(3))

if __name__ == "__main__":
    df_optimized = create_optimized_targets()
    analyze_risk_characteristics(df_optimized)

=== CREATING OPTIMIZED TARGETS ===
Total companies: 196,553

=== OPTIMIZED TARGET DISTRIBUTION ===
risk_tier_label
high_risk      170553
low_risk        19976
medium_risk      6024
Name: count, dtype: int64
High risk rate: 86.8%

=== VALIDATION BY STATUS ===
risk_tier_label  high_risk  low_risk  medium_risk
status                                           
acquired          0.751756  0.248244     0.000000
closed            1.000000  0.000000     0.000000
ipo               0.577601  0.422399     0.000000
operating         0.873589  0.093572     0.032839

=== RISK BY AGE GROUPS ===
risk_tier_label  high_risk  low_risk  medium_risk
age_group                                        
8-15y             0.919854  0.049031     0.031114
15+y              0.792937  0.176687     0.030376

Saved optimized targets to processed_data/companies_optimized_targets.csv
Final dataset: 196,553 companies with 16 columns

RISK TIER CHARACTERISTICS

--- TIER 0 (low_risk) ---
Count: 19,976 companies
Avg age: 18

# Task
Clean the dataset by handling missing values, standardizing categorical variables, and removing duplicates.

In [5]:
missing_percentages = df_optimized.isnull().mean() * 100
missing_percentages = missing_percentages.sort_values(ascending=False)

print("Percentage of missing values per column:")
print(missing_percentages[missing_percentages > 0])

Percentage of missing values per column:
parent_id              100.000000
ROI                     99.630634
last_investment_at      98.685851
first_investment_at     98.685851
invested_companies      98.681780
investment_rounds       98.681780
closed_at               98.667026
short_description       96.371971
funding_total_usd       85.818583
last_funding_at         83.970227
first_funding_at        83.970227
funding_rounds          83.868473
state_code              74.102151
twitter_username        58.997828
tag_list                58.559778
lng                     57.338733
lat                     57.338733
city                    57.319400
country_code            55.233448
founded_at              53.586564
milestones              53.346426
last_milestone_at       53.346426
first_milestone_at      53.346426
description             53.168865
logo_height             43.979486
logo_url                43.979486
logo_width              43.979486
category_code           37.326828
domain 

In [6]:
# Drop columns with missing percentage > 50%
columns_to_drop = missing_percentages[missing_percentages > 50].index
df_optimized = df_optimized.drop(columns=columns_to_drop)
print(f"Dropped {len(columns_to_drop)} columns with >50% missing values.")

moderate_missing_cols_categorical = ['category_code', 'country_code', 'state_code', 'region', 'age_group', 'city', 'twitter_username', 'tag_list', 'description', 'relationships', 'created_by', 'overview', 'homepage_url', 'domain']
moderate_missing_cols_numerical = ['lat', 'lng', 'logo_height', 'logo_width']

for col in moderate_missing_cols_categorical:
    if col in df_optimized.columns:
        mode_value = df_optimized[col].mode()[0]
        df_optimized[col] = df_optimized[col].fillna(mode_value)
        print(f"Imputed missing values in '{col}' with mode.")

for col in moderate_missing_cols_numerical:
    if col in df_optimized.columns:
        median_value = df_optimized[col].median()
        df_optimized[col] = df_optimized[col].fillna(median_value)
        print(f"Imputed missing values in '{col}' with median.")

# Drop rows for low missing percentages (< 20%)
low_missing_cols = ['name', 'normalized_name']

for col in low_missing_cols:
    if col in df_optimized.columns:
        rows_before = len(df_optimized)
        df_optimized.dropna(subset=[col], inplace=True)
        rows_after = len(df_optimized)
        print(f"Dropped {rows_before - rows_after} rows with missing values in '{col}'.")


# Verify remaining missing values
missing_percentages_after = df_optimized.isnull().mean() * 100
missing_percentages_after = missing_percentages_after.sort_values(ascending=False)

print("\nPercentage of missing values per column after cleaning:")
print(missing_percentages_after[missing_percentages_after > 0])

Dropped 24 columns with >50% missing values.
Imputed missing values in 'category_code' with mode.
Imputed missing values in 'region' with mode.
Imputed missing values in 'age_group' with mode.
Imputed missing values in 'relationships' with mode.
Imputed missing values in 'created_by' with mode.
Imputed missing values in 'overview' with mode.
Imputed missing values in 'homepage_url' with mode.
Imputed missing values in 'domain' with mode.
Imputed missing values in 'logo_height' with median.
Imputed missing values in 'logo_width' with median.
Dropped 23 rows with missing values in 'name'.
Dropped 4 rows with missing values in 'normalized_name'.

Percentage of missing values per column after cleaning:
logo_url    43.977896
dtype: float64


In [7]:
# Handle remaining columns with minimal missing values by dropping rows
remaining_cols_with_missing = missing_percentages_after[missing_percentages_after > 0].index.tolist()

for col in remaining_cols_with_missing:
    if col in df_optimized.columns:
        rows_before = len(df_optimized)
        df_optimized.dropna(subset=[col], inplace=True)
        rows_after = len(df_optimized)
        print(f"Dropped {rows_before - rows_after} rows with missing values in '{col}'.")


# Verify that there are no remaining missing values
missing_percentages_final = df_optimized.isnull().mean() * 100

print("\nPercentage of missing values per column after final cleaning:")
print(missing_percentages_final[missing_percentages_final > 0])

Dropped 86428 rows with missing values in 'logo_url'.

Percentage of missing values per column after final cleaning:
Series([], dtype: float64)


In [8]:
# Identify categorical columns
categorical_cols = df_optimized.select_dtypes(include='object').columns
print("Categorical columns identified:")
print(categorical_cols)

# Inspect unique values for potential inconsistencies in a few key categorical columns
cols_to_inspect = ['category_code', 'country_code', 'state_code', 'region', 'status', 'age_group', 'risk_tier_label']

for col in cols_to_inspect:
    if col in df_optimized.columns:
        print(f"\nUnique values for '{col}':")
        # Display a limited number of unique values if there are many
        unique_values = df_optimized[col].unique()
        if len(unique_values) > 50:
            print(unique_values[:50])
            print(f"... and {len(unique_values) - 50} more.")
        else:
            print(unique_values)

Categorical columns identified:
Index(['id', 'entity_type', 'name', 'normalized_name', 'permalink',
       'category_code', 'status', 'domain', 'homepage_url', 'logo_url',
       'overview', 'region', 'created_by', 'updated_at', 'risk_tier_label'],
      dtype='object')

Unique values for 'category_code':
['web' 'games_video' 'advertising' 'software' 'enterprise' 'other'
 'consulting' 'mobile' 'health' 'cleantech' 'analytics' 'network_hosting'
 'finance' 'education' 'medical' 'manufacturing' 'ecommerce'
 'public_relations' 'hardware' 'search' 'news' 'security' 'biotech'
 'photo_video' 'travel' 'social' 'legal' 'transportation' 'hospitality'
 'sports' 'nonprofit' 'semiconductor' 'fashion' 'messaging' 'music'
 'automotive' 'design' 'real_estate' 'local' 'nanotech' 'pets'
 'government']

Unique values for 'region':
['Seattle' 'Los Angeles' 'SF Bay' 'unknown' 'Agadir' 'New York' 'Santa Fe'
 'San Diego' 'Austin' 'Abbotsford' 'New Delhi' 'Columbus'
 'New Jersey - Other' 'Chicago' 'West Bridg

In [9]:
# Standardize categorical columns: convert to lowercase and strip whitespace
cols_to_standardize = ['category_code', 'country_code', 'state_code', 'region', 'status', 'risk_tier_label']

for col in cols_to_standardize:
    if col in df_optimized.columns and df_optimized[col].dtype == 'object':
        df_optimized[col] = df_optimized[col].str.lower().str.strip()
        print(f"Standardized '{col}'.")

# Re-verify unique values after standardization
for col in cols_to_standardize:
    if col in df_optimized.columns:
        print(f"\nUnique values for '{col}' after standardization:")
        unique_values = df_optimized[col].unique()
        if len(unique_values) > 50:
            print(unique_values[:50])
            print(f"... and {len(unique_values) - 50} more.")
        else:
            print(unique_values)

# age_group is already a categorical type with defined labels, no string standardization needed.
print("\nUnique values for 'age_group':")
print(df_optimized['age_group'].unique())

Standardized 'category_code'.
Standardized 'region'.
Standardized 'status'.
Standardized 'risk_tier_label'.

Unique values for 'category_code' after standardization:
['web' 'games_video' 'advertising' 'software' 'enterprise' 'other'
 'consulting' 'mobile' 'health' 'cleantech' 'analytics' 'network_hosting'
 'finance' 'education' 'medical' 'manufacturing' 'ecommerce'
 'public_relations' 'hardware' 'search' 'news' 'security' 'biotech'
 'photo_video' 'travel' 'social' 'legal' 'transportation' 'hospitality'
 'sports' 'nonprofit' 'semiconductor' 'fashion' 'messaging' 'music'
 'automotive' 'design' 'real_estate' 'local' 'nanotech' 'pets'
 'government']

Unique values for 'region' after standardization:
['seattle' 'los angeles' 'sf bay' 'unknown' 'agadir' 'new york' 'santa fe'
 'san diego' 'austin' 'abbotsford' 'new delhi' 'columbus'
 'new jersey - other' 'chicago' 'west bridgewater' 'houston' 'charlotte'
 'paris' 'shinagawa-ku' 'amsterdam' 'wilton' 'philadelphia' 'boston'
 'bangalore' 'cairo'

In [10]:
# Check for duplicate rows
duplicate_rows = df_optimized.duplicated()

# Count the number of duplicate rows
num_duplicates = duplicate_rows.sum()

print(f"Number of duplicate rows found: {num_duplicates}")

# Remove duplicate rows if any exist
if num_duplicates > 0:
    rows_before_dropping = len(df_optimized)
    df_optimized.drop_duplicates(inplace=True)
    rows_after_dropping = len(df_optimized)
    print(f"Removed {rows_before_dropping - rows_after_dropping} duplicate rows.")
else:
    print("No duplicate rows to remove.")

Number of duplicate rows found: 0
No duplicate rows to remove.


In [11]:
df_optimized.to_csv('../processed_data/companies_cleaned_data.csv', index=False)