In [1]:
# =========================================
# Startup Risk Analyzer
# 06_startup_exploration.ipynb
# Cell 1: Load Dataset
# =========================================

import pandas as pd

# Load startup funding dataset
startup_df = pd.read_csv("/content/startup_funding.csv")

print("Dataset Shape:", startup_df.shape)
print("\nColumns:")
print(startup_df.columns.tolist())

print("\nSample Data:")
startup_df.head()


Dataset Shape: (3044, 10)

Columns:
['Sr No', 'Date dd/mm/yyyy', 'Startup Name', 'Industry Vertical', 'SubVertical', 'City  Location', 'Investors Name', 'InvestmentnType', 'Amount in USD', 'Remarks']

Sample Data:


Unnamed: 0,Sr No,Date dd/mm/yyyy,Startup Name,Industry Vertical,SubVertical,City Location,Investors Name,InvestmentnType,Amount in USD,Remarks
0,1,09/01/2020,BYJU’S,E-Tech,E-learning,Bengaluru,Tiger Global Management,Private Equity Round,200000000,
1,2,13/01/2020,Shuttl,Transportation,App based shuttle service,Gurgaon,Susquehanna Growth Equity,Series C,8048394,
2,3,09/01/2020,Mamaearth,E-commerce,Retailer of baby and toddler products,Bengaluru,Sequoia Capital India,Series B,18358860,
3,4,02/01/2020,https://www.wealthbucket.in/,FinTech,Online Investment,New Delhi,Vinod Khatumal,Pre-series A,3000000,
4,5,02/01/2020,Fashor,Fashion and Apparel,Embroiled Clothes For Women,Mumbai,Sprout Venture Partners,Seed Round,1800000,


In [2]:
# =========================================
# Cell 2: High-Level EDA
# =========================================

# Rename columns for easier handling (EDA only)
startup_df_eda = startup_df.rename(columns={
    "Industry Vertical": "industry",
    "City  Location": "city",
    "InvestmentnType": "investment_type",
    "Amount in USD": "amount_usd"
})

print("Top 10 Industries:")
print(startup_df_eda["industry"].value_counts().head(10))

print("\nTop 10 Cities:")
print(startup_df_eda["city"].value_counts().head(10))

print("\nInvestment Type Distribution:")
print(startup_df_eda["investment_type"].value_counts().head(10))

print("\nRaw Funding Amount Sample:")
print(startup_df_eda["amount_usd"].head(10))


Top 10 Industries:
industry
Consumer Internet    941
Technology           478
eCommerce            186
Healthcare            70
Finance               62
ECommerce             61
Logistics             32
E-Commerce            29
Education             24
Food & Beverage       23
Name: count, dtype: int64

Top 10 Cities:
city
Bangalore    700
Mumbai       567
New Delhi    421
Gurgaon      287
Bengaluru    141
Pune         105
Hyderabad     99
Chennai       97
Noida         92
Gurugram      50
Name: count, dtype: int64

Investment Type Distribution:
investment_type
Private Equity          1356
Seed Funding            1355
Seed/ Angel Funding       60
Seed / Angel Funding      47
Seed\\nFunding            30
Debt Funding              25
Series A                  24
Seed/Angel Funding        23
Series B                  20
Series C                  14
Name: count, dtype: int64

Raw Funding Amount Sample:
0    20,00,00,000
1       80,48,394
2     1,83,58,860
3       30,00,000
4       18,00,00

In [3]:
# =========================================
# Cell 3: Funding Distribution & Missing Values
# =========================================

# Check missing values
print("Missing Values per Column:")
print(startup_df.isnull().sum())

# Check how many funding values are missing or invalid
print("\nTotal rows:", len(startup_df))
print("Missing funding values:", startup_df["Amount in USD"].isnull().sum())

# Show unique funding samples (to inspect patterns)
print("\nSample funding values:")
print(startup_df["Amount in USD"].dropna().sample(10, random_state=42).values)


Missing Values per Column:
Sr No                   0
Date dd/mm/yyyy         0
Startup Name            0
Industry Vertical     171
SubVertical           936
City  Location        180
Investors Name         24
InvestmentnType         4
Amount in USD         960
Remarks              2625
dtype: int64

Total rows: 3044
Missing funding values: 960

Sample funding values:
['35,00,000' '22,00,000' '10,00,000' '60,00,000' '10,00,000' '4,00,00,000'
 '1,00,000' '1,00,000' '1,25,000' '5,37,000']
