In [1]:
import pandas as pd

# Read the raw startup-funding records into a DataFrame
df = pd.read_csv(filepath_or_buffer="startup_funding.csv", encoding="utf-8")

# Report the frame's dimensions and column labels (one per line),
# then preview the first five rows as the cell's rich output
print(df.shape, df.columns, sep="\n")
df.head(n=5)


(3044, 10)
Index(['Sr No', 'Date dd/mm/yyyy', 'Startup Name', 'Industry Vertical',
       'SubVertical', 'City  Location', 'Investors Name', 'InvestmentnType',
       'Amount in USD', 'Remarks'],
      dtype='object')


Unnamed: 0,Sr No,Date dd/mm/yyyy,Startup Name,Industry Vertical,SubVertical,City Location,Investors Name,InvestmentnType,Amount in USD,Remarks
0,1,09/01/2020,BYJU’S,E-Tech,E-learning,Bengaluru,Tiger Global Management,Private Equity Round,200000000,
1,2,13/01/2020,Shuttl,Transportation,App based shuttle service,Gurgaon,Susquehanna Growth Equity,Series C,8048394,
2,3,09/01/2020,Mamaearth,E-commerce,Retailer of baby and toddler products,Bengaluru,Sequoia Capital India,Series B,18358860,
3,4,02/01/2020,https://www.wealthbucket.in/,FinTech,Online Investment,New Delhi,Vinod Khatumal,Pre-series A,3000000,
4,5,02/01/2020,Fashor,Fashion and Apparel,Embroiled Clothes For Women,Mumbai,Sprout Venture Partners,Seed Round,1800000,


In [5]:

# Normalise the header labels: trim surrounding whitespace, swap every space
# for an underscore, then lowercase. Each space is replaced individually, so
# a double space in a raw header becomes a double underscore
# (e.g. 'city__location').
df.columns = (
    df.columns
    .str.strip()
    .str.replace(" ", "_")
    .str.lower()
)

# Confirm the renamed headers
print(df.columns)


Index(['sr_no', 'date_dd/mm/yyyy', 'startup_name', 'industry_vertical',
       'subvertical', 'city__location', 'investors_name', 'investmentntype',
       'amount_in_usd', 'remarks'],
      dtype='object')


In [6]:
# Clean the funding data and derive helper columns for the dashboard.

# Drop rows with missing startup name or funding amount.
# .copy() gives an independent frame, so the column assignments below never
# write into a view of the original (avoids SettingWithCopyWarning).
df = df.dropna(subset=['startup_name', 'amount_in_usd']).copy()

# Convert amount to numeric: remove thousands separators, then coerce.
# astype(str) keeps the .str accessor usable even when the column is already
# numeric (e.g. the CSV had no commas). Values that still are not numbers
# become NaN via errors='coerce'; those rows are kept, not dropped.
df['amount_in_usd'] = pd.to_numeric(
    df['amount_in_usd'].astype(str).str.replace(",", "", regex=False),
    errors='coerce',
)

# Parse the date. The column holds day-first dates (dd/mm/yyyy), so the format
# is given explicitly; without it "09/01/2020" would be read as 1 September
# instead of 9 January. Strings that do not match the format become NaT.
df['date_dd/mm/yyyy'] = pd.to_datetime(
    df['date_dd/mm/yyyy'], format='%d/%m/%Y', errors='coerce'
)
df['year'] = df['date_dd/mm/yyyy'].dt.year
df['month'] = df['date_dd/mm/yyyy'].dt.month_name()

# Standardize city and industry (trim whitespace, Title Case)
df['city__location'] = df['city__location'].str.strip().str.title()
df['industry_vertical'] = df['industry_vertical'].str.strip().str.title()

# Categorize funding size. Intervals are right-inclusive:
#   Small  : (0, 1M]
#   Medium : (1M, 10M]
#   Large  : (10M, inf)
# The top edge is open-ended so rounds above 100M USD (e.g. a 200M round)
# are labelled 'Large' instead of falling outside every bin and becoming NaN.
df['funding_category'] = pd.cut(df['amount_in_usd'],
                                bins=[0, 1_000_000, 10_000_000, float('inf')],
                                labels=['Small', 'Medium', 'Large'])

# Extract top investor: the first name in the comma-separated investor list
df['top_investor'] = df['investors_name'].str.split(',').str[0].str.strip()


In [7]:
# Write the cleaned frame to disk, leaving out the pandas row index,
# then print a confirmation message
df.to_csv(path_or_buf="startup_funding_cleaned.csv", index=False)
print("✅ Cleaned CSV saved for Tableau")


✅ Cleaned CSV saved for Tableau
