In [1]:
# =========================================
# Startup Risk Analyzer
# 06_startup_cleaning.ipynb
# Cell 1: Rename & Drop Irrelevant Columns
# =========================================

import pandas as pd

# Read the raw funding CSV, rename its headers to snake_case, and discard the
# columns flagged as high-missing / irrelevant -- expressed as a single chain.
# The rename keys must match the raw headers character for character (note the
# double space in "City  Location" and the "InvestmentnType" spelling).
startup_df = (
    pd.read_csv("/content/startup_funding.csv")
    .rename(
        columns={
            "Date dd/mm/yyyy": "date",
            "Startup Name": "startup",
            "Industry Vertical": "industry",
            "SubVertical": "sub_industry",
            "City  Location": "city",
            "InvestmentnType": "investment_type",
            "Amount in USD": "amount_usd",
        }
    )
    .drop(columns=["Sr No", "Remarks", "Investors Name"])
)

# Report the frame dimensions and the surviving column names.
print("Shape after column cleanup:", startup_df.shape)
print("\nRemaining columns:")
print(startup_df.columns.tolist())


Shape after column cleanup: (3044, 7)

Remaining columns:
['date', 'startup', 'industry', 'sub_industry', 'city', 'investment_type', 'amount_usd']


In [2]:
# =========================================
# Cell 2: Clean Funding Amount
# =========================================

import numpy as np

# Preview the first ten non-null raw funding entries before any parsing
raw_amount_preview = startup_df["amount_usd"].dropna().head(10)
print("Sample raw funding values:")
print(raw_amount_preview)

# Convert funding to numeric
def clean_funding(val):
    """Parse one raw funding entry into a float, or NaN when it is unusable.

    Args:
        val: A raw cell value -- typically a string with comma separators
            such as "20,00,00,000", but numbers and missing values are
            accepted too.

    Returns:
        float: The parsed amount. ``np.nan`` is returned for missing values,
        for the placeholders "undisclosed" / "unknown" / empty text
        (case-insensitive), for text that is not a valid number, and for
        text that parses to a non-finite value such as "inf".
    """
    if pd.isna(val):
        return np.nan

    # Commas are only digit-group separators here, so drop them all.
    text = str(val).replace(",", "").strip()
    if text.lower() in ("undisclosed", "unknown", ""):
        return np.nan

    try:
        amount = float(text)
    except ValueError:
        # Catch only the parse failure; a bare ``except`` would also swallow
        # KeyboardInterrupt and hide unrelated bugs.
        return np.nan

    # float() accepts "inf"/"infinity"/"nan"; none of those is a real amount,
    # and an infinite value would corrupt every downstream aggregate.
    return amount if np.isfinite(amount) else np.nan

# Parse every raw amount and store the result next to the original column
parsed_amounts = startup_df["amount_usd"].apply(clean_funding)
startup_df["amount_usd_clean"] = parsed_amounts

# Summarise the parsed column and count the entries that could not be parsed
cleaned_amounts = startup_df["amount_usd_clean"]
print("\nAfter cleaning:")
print(cleaned_amounts.describe())

print("\nMissing funding after cleaning:")
print(cleaned_amounts.isnull().sum())


Sample raw funding values:
0    20,00,00,000
1       80,48,394
2     1,83,58,860
3       30,00,000
4       18,00,000
5       90,00,000
6    15,00,00,000
7       60,00,000
8     7,00,00,000
9     5,00,00,000
Name: amount_usd, dtype: object

After cleaning:
count    2.065000e+03
mean     1.842990e+07
std      1.213734e+08
min      1.600000e+04
25%      4.700000e+05
50%      1.700000e+06
75%      8.000000e+06
max      3.900000e+09
Name: amount_usd_clean, dtype: float64

Missing funding after cleaning:
979


In [3]:
# =========================================
# Cell 3: Handle Missing & Normalize Categories
# =========================================

# The risk model needs a funding signal, so keep only rows whose amount parsed.
# A copy is taken so later column assignments do not touch startup_df.
rows_before = startup_df.shape[0]
startup_df_clean = startup_df.dropna(subset=["amount_usd_clean"]).copy()
rows_after = startup_df_clean.shape[0]

print("Rows before drop:", rows_before)
print("Rows after drop:", rows_after)

# Normalize text columns
def normalize_text(col):
    """Return a lower-cased, trimmed copy of a text column with "&" spelled out.

    Args:
        col: A pandas Series; every value is first converted to ``str``.

    Returns:
        A new Series in which each value is lower-cased, stripped of leading
        and trailing whitespace, and has every "&" replaced by "and".
    """
    as_text = col.astype(str)
    lowered = as_text.str.lower()
    trimmed = lowered.str.strip()
    return trimmed.str.replace("&", "and")

# Normalise the three categorical text columns in place on the cleaned frame
for col in ("industry", "city", "investment_type"):
    startup_df_clean[col] = normalize_text(startup_df_clean[col])

# Show the ten most frequent labels of each normalised column
for heading, column in (
    ("\nTop Industries (normalized):", "industry"),
    ("\nTop Cities (normalized):", "city"),
    ("\nInvestment Types (normalized):", "investment_type"),
):
    print(heading)
    print(startup_df_clean[column].value_counts().head(10))


Rows before drop: 3044
Rows after drop: 2065

Top Industries (normalized):
industry
consumer internet    590
technology           310
ecommerce            170
nan                  131
finance               57
healthcare            45
e-commerce            34
logistics             23
food and beverage     20
education             19
Name: count, dtype: int64

Top Cities (normalized):
city
bangalore    456
mumbai       401
new delhi    241
gurgaon      198
nan          135
bengaluru    126
chennai       75
hyderabad     72
pune          71
noida         55
Name: count, dtype: int64

Investment Types (normalized):
investment_type
private equity          1066
seed funding             716
seed/ angel funding       48
seed / angel funding      38
debt funding              24
series a                  22
seed\\nfunding            22
series b                  20
seed/angel funding        18
series c                  14
Name: count, dtype: int64


In [4]:
# =========================================
# Cell 4: Fix Category Variants & NaNs
# =========================================

# Handle NaN-like strings: the earlier str conversion turned missing values
# into literal text such as "nan", so fold those tokens into one "unknown" label.
for col in ["industry", "city", "investment_type"]:
    startup_df_clean[col] = startup_df_clean[col].replace(
        ["nan", "none", "null"], "unknown"
    )

# Merge city variants (alternate names of the same city)
city_map = {
    "bengaluru": "bangalore",
    "gurugram": "gurgaon"
}
startup_df_clean["city"] = startup_df_clean["city"].replace(city_map)

# Merge industry spelling variants: the normalised value counts list both
# "ecommerce" and "e-commerce", which would otherwise be treated as two
# different industries downstream.
industry_map = {
    "e-commerce": "ecommerce"
}
startup_df_clean["industry"] = startup_df_clean["industry"].replace(industry_map)

# Normalize investment types into broad categories
def map_investment_type(val):
    """Collapse a normalised investment-type label into a broad group.

    Args:
        val: A lower-cased investment-type string.

    Returns:
        str: The group of the first keyword found in ``val``, checked in the
        order listed below (so "seed" wins over any other keyword), or
        "other" when no keyword is present.
    """
    # Order matters: the first matching keyword decides the group.
    keyword_groups = (
        ("seed", "seed"),
        ("series a", "series_a"),
        ("series b", "series_b"),
        ("series c", "series_c"),
        ("private equity", "private_equity"),
        ("debt", "debt"),
    )
    for keyword, group in keyword_groups:
        if keyword in val:
            return group
    return "other"

# Derive the broad investment group for every row and store it as a new column
investment_groups = startup_df_clean["investment_type"].apply(map_investment_type)
startup_df_clean["investment_type_grouped"] = investment_groups

# Show the ten most frequent cities after the variant merge
print("Cities after merge:")
print(startup_df_clean["city"].value_counts().head(10))

# Show how many rows fall into each investment group
print("\nInvestment Type Groups:")
print(startup_df_clean["investment_type_grouped"].value_counts())


Cities after merge:
city
bangalore    582
mumbai       401
gurgaon      241
new delhi    241
unknown      135
chennai       75
hyderabad     72
pune          71
noida         55
ahmedabad     27
Name: count, dtype: int64

Investment Type Groups:
investment_type_grouped
private_equity    1070
seed               860
other               43
series_a            30
debt                27
series_b            21
series_c            14
Name: count, dtype: int64


In [5]:
# =========================================
# Cell 5: Save Cleaned Startup Dataset
# =========================================

import os

# Make sure the output directory exists (no error if it is already there)
cleaned_dir = "/content/data/cleaned"
os.makedirs(cleaned_dir, exist_ok=True)

# Write the cleaned frame to CSV without the index column
startup_df_clean.to_csv("/content/data/cleaned/startup_risk_cleaned.csv", index=False)

# Confirm the save location and the final frame dimensions
for message in (
    "✅ Startup Risk cleaned dataset saved successfully!",
    "Saved at: /content/data/cleaned/startup_risk_cleaned.csv",
):
    print(message)
print("\nFinal Shape:", startup_df_clean.shape)


✅ Startup Risk cleaned dataset saved successfully!
Saved at: /content/data/cleaned/startup_risk_cleaned.csv

Final Shape: (2065, 9)
