# Import necessary libraries

In [180]:
import pandas as pd
import re

# Load the dataset

In [181]:
raw_data = pd.read_csv('startup_funding.csv')

In [182]:
data = raw_data.copy()

In [183]:
data.head()

Unnamed: 0,Sr No,Date dd/mm/yyyy,Startup Name,Industry Vertical,SubVertical,City Location,Investors Name,InvestmentnType,Amount in USD,Remarks
0,1,09/01/2020,BYJU’S,E-Tech,E-learning,Bengaluru,Tiger Global Management,Private Equity Round,200000000,
1,2,13/01/2020,Shuttl,Transportation,App based shuttle service,Gurgaon,Susquehanna Growth Equity,Series C,8048394,
2,3,09/01/2020,Mamaearth,E-commerce,Retailer of baby and toddler products,Bengaluru,Sequoia Capital India,Series B,18358860,
3,4,02/01/2020,https://www.wealthbucket.in/,FinTech,Online Investment,New Delhi,Vinod Khatumal,Pre-series A,3000000,
4,5,02/01/2020,Fashor,Fashion and Apparel,Embroiled Clothes For Women,Mumbai,Sprout Venture Partners,Seed Round,1800000,


In [184]:
print(data.isnull().sum())

Sr No                   0
Date dd/mm/yyyy         0
Startup Name            0
Industry Vertical     171
SubVertical           936
City  Location        180
Investors Name         24
InvestmentnType         4
Amount in USD         960
Remarks              2625
dtype: int64


# Rename Columns

In [185]:
data.rename(columns={
    'Sr No': 'S.No',
    'Date dd/mm/yyyy': 'Incorporation Date',
    'SubVertical': 'Sub-Vertical',
    'City  Location': 'Location',
    'InvestmentnType': 'Investment Type',
    'Amount in USD': 'Total Funding (USD)'
}, inplace=True)

# Handle missing values

In [186]:
data = data.fillna({
    'Startup Name': 'Unknown',
    'Industry Vertical': 'Unknown',
    'Sub-Vertical': 'Not Specified',
    'Location': 'Unknown',
    'Investment Type': 'Unknown',
    'Amount in USD': 'Not Specified',
    'Remarks': 'Not Specified'
})

In [187]:
data.head()

Unnamed: 0,S.No,Incorporation Date,Startup Name,Industry Vertical,Sub-Vertical,Location,Investors Name,Investment Type,Total Funding (USD),Remarks
0,1,09/01/2020,BYJU’S,E-Tech,E-learning,Bengaluru,Tiger Global Management,Private Equity Round,200000000,Not Specified
1,2,13/01/2020,Shuttl,Transportation,App based shuttle service,Gurgaon,Susquehanna Growth Equity,Series C,8048394,Not Specified
2,3,09/01/2020,Mamaearth,E-commerce,Retailer of baby and toddler products,Bengaluru,Sequoia Capital India,Series B,18358860,Not Specified
3,4,02/01/2020,https://www.wealthbucket.in/,FinTech,Online Investment,New Delhi,Vinod Khatumal,Pre-series A,3000000,Not Specified
4,5,02/01/2020,Fashor,Fashion and Apparel,Embroiled Clothes For Women,Mumbai,Sprout Venture Partners,Seed Round,1800000,Not Specified


# General text cleaning in Dataset

In [188]:
def clean_text(text):
    if isinstance(text, str):
        text = text.encode('ascii', 'ignore').decode('ascii')
        text = re.sub(r'\\x[a-fA-F0-9]{2}', '', text)
        text = re.sub(r'[\n\t\\]', ' ', text)
        return text.strip()
    return text

### Clean Startup Name Column

In [189]:
def clean_startup_name(name):
    if isinstance(name, str):
        name = name.encode('ascii', 'ignore').decode('ascii')
        name = re.sub(r'\\x[a-fA-F0-9]{2}', '', name)
        name = re.sub(r'[\n\t\\]', ' ', name)
        name = re.sub(r'https?://(www\.)?', '', name, flags=re.IGNORECASE)
        name = re.sub(r'\.com|\.in', '', name, flags=re.IGNORECASE)
        name = name.rstrip('/')
        return name.strip()
    return name

data['Startup Name'] = data['Startup Name'].apply(clean_startup_name)

### Clean Industry Vertical Column

In [190]:
data['Industry Vertical'] = data['Industry Vertical'].apply(clean_text)

### Clean Sub-Vertical Column

In [191]:
data['Sub-Vertical'] = data['Sub-Vertical'].apply(clean_text)

### Clean Incorporation Date Column

In [192]:
def clean_dates(date):
    try:
        parsed_date = pd.to_datetime(date, errors='raise', dayfirst=True)
        return parsed_date.strftime('%d-%m-%Y')
    except Exception:
        return 'Unknown'

data['Incorporation Date'] = data['Incorporation Date'].apply(clean_dates)

### Clean and modify Location Column

In [193]:
data['Location'] = data['Location'].apply(clean_text)

city_mapping = {
    'Bangalore': 'Bengaluru',
    'Bengaluru': 'Bengaluru',
    'New Delhi': 'Delhi',
    'Delhi': 'Delhi',
    'Gurgaon': 'Gurugram',
    'Gurugram': 'Gurugram',
    'Ahemdabad': 'Ahmedabad',
    'Ahemadabad': 'Ahmedabad',
    'Bombay': 'Mumbai',
    'Calcutta': 'Kolkata',
}

data['Location'] = data['Location'].replace(city_mapping)

indian_locations_only = [
    'Bengaluru', 'Mumbai', 'Ahmedabad', 'Chennai', 'Pune', 'Kolkata', 'Surat', 'Hyderabad', 'Jaipur',
    'Indore', 'Delhi', 'Nagpur', 'Vadodara', 'Gurugram', 'Noida', 'Amritsar', 'Coimbatore', 'Bhopal', 
    'Goa', 'Lucknow', 'Kanpur', 'Trivandrum', 'Siliguri', 'Varanasi', 'Jodhpur', 'Gaya', 'Udupi', 'Hubli', 
    'Kochi', 'Thiruvananthapuram', 'Udaipur', 'Rourkela', 'Bhubaneswar', 'Chandigarh', 'Agra', 'Belgaum', 
    'Karur', 'Kozhikode', 'Gwalior', 'Faridabad', 'Panaji', 'Mysore'
]

data = data[data['Location'].isin(indian_locations_only)]

### Clean Investors Name Column

In [194]:
data['Investors Name'] = data['Investors Name'].apply(clean_text)

### Clean Investment Type Column

In [195]:
data['Investment Type'] = data['Investment Type'].apply(clean_text)

### Clean Total Funding (USD) Column

In [196]:
def clean_funding(funding):
    if isinstance(funding, str):
        funding = funding.replace('+', '').replace(',', '')
        if funding.isdigit():
            return int(funding)
        return 'Not Specified'
    return funding

data.loc['Total Funding (USD)'] = data['Total Funding (USD)'].apply(clean_funding)

### Drop Remarks Column

In [197]:
data = data.drop(columns=['Remarks'])

In [198]:
data.head()

Unnamed: 0,S.No,Incorporation Date,Startup Name,Industry Vertical,Sub-Vertical,Location,Investors Name,Investment Type,Total Funding (USD)
0,1.0,09-01-2020,BYJUS,E-Tech,E-learning,Bengaluru,Tiger Global Management,Private Equity Round,200000000
1,2.0,13-01-2020,Shuttl,Transportation,App based shuttle service,Gurugram,Susquehanna Growth Equity,Series C,8048394
2,3.0,09-01-2020,Mamaearth,E-commerce,Retailer of baby and toddler products,Bengaluru,Sequoia Capital India,Series B,18358860
3,4.0,02-01-2020,wealthbucket,FinTech,Online Investment,Delhi,Vinod Khatumal,Pre-series A,3000000
4,5.0,02-01-2020,Fashor,Fashion and Apparel,Embroiled Clothes For Women,Mumbai,Sprout Venture Partners,Seed Round,1800000
