In [1]:
## import the necessary libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import pyodbc
from dotenv import dotenv_values
import os


#data visualization 
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [2]:
## Data environment access
# Load environment variables from .env file into a dictionary
environment_variables=dotenv_values('.env')


# Get the values for the credentials you set in the '.env' 
database=environment_variables.get("DATABASE")
server=environment_variables.get("SERVER")
username=environment_variables.get("USERNAME")
password=environment_variables.get("PASSWORD")


connection_string= f"DRIVER={{SQL Server}};SERVER={server};DATABASE={database};UID={username};PWD={password}"

In [3]:
# connect your data using pyodbc
connection=pyodbc.connect(connection_string)

### DATA UNDERSTANDING 
1. load in all the various dataset required for the analyse. Dataset is in various source platforms
2. check info for more information on each of the dataset and data type 
3. We will be concatenating the datasets and create one dataset source
4. check info with the new dataset 
5. assess the mean, std, percentile, count with describe
6. check for null values
7. Do an exploratory data analysis

### DEFINITION OF DATASET VARIABLE
| NAME              | DEFINITION
|:---:              |:---
| Company brand     | company's name
| Founded           | represents the year a company was started
| Headquaters       | the location of the business
| Sector            | the industry under which the companies are in 
| What it does      | overview of what the company does
| Founders          | the CEO or the person who started the company
| Investor          | a person who is investing into the company
| Amount            | the amount of funds received by the company 
| Stage             | the stage at which the company is in.

#### DATA LOADING AND OVERVIEW

In [4]:
query = "SELECT * FROM dbo.LP1_startup_funding2020"

data = pd.read_sql(query, connection)
data.head()

Unnamed: 0,Company_Brand,Founded,HeadQuarter,Sector,What_it_does,Founders,Investor,Amount,Stage,column10
0,Aqgromalin,2019.0,Chennai,AgriTech,Cultivating Ideas for Profit,"Prasanna Manogaran, Bharani C L",Angel investors,200000.0,,
1,Krayonnz,2019.0,Bangalore,EdTech,An academy-guardian-scholar centric ecosystem ...,"Saurabh Dixit, Gurudutt Upadhyay",GSF Accelerator,100000.0,Pre-seed,
2,PadCare Labs,2018.0,Pune,Hygiene management,Converting bio-hazardous waste to harmless waste,Ajinkya Dhariya,Venture Center,,Pre-seed,
3,NCOME,2020.0,New Delhi,Escrow,Escrow-as-a-service platform,Ritesh Tiwari,"Venture Catalysts, PointOne Capital",400000.0,,
4,Gramophone,2016.0,Indore,AgriTech,Gramophone is an AgTech platform enabling acce...,"Ashish Rajan Singh, Harshit Gupta, Nishant Mah...","Siana Capital Management, Info Edge",340000.0,,


In [5]:
#check for info 
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1055 entries, 0 to 1054
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Company_Brand  1055 non-null   object 
 1   Founded        842 non-null    float64
 2   HeadQuarter    961 non-null    object 
 3   Sector         1042 non-null   object 
 4   What_it_does   1055 non-null   object 
 5   Founders       1043 non-null   object 
 6   Investor       1017 non-null   object 
 7   Amount         801 non-null    float64
 8   Stage          591 non-null    object 
 9   column10       2 non-null      object 
dtypes: float64(2), object(8)
memory usage: 82.5+ KB


In [6]:
data['year'] = 2020
data.head()

Unnamed: 0,Company_Brand,Founded,HeadQuarter,Sector,What_it_does,Founders,Investor,Amount,Stage,column10,year
0,Aqgromalin,2019.0,Chennai,AgriTech,Cultivating Ideas for Profit,"Prasanna Manogaran, Bharani C L",Angel investors,200000.0,,,2020
1,Krayonnz,2019.0,Bangalore,EdTech,An academy-guardian-scholar centric ecosystem ...,"Saurabh Dixit, Gurudutt Upadhyay",GSF Accelerator,100000.0,Pre-seed,,2020
2,PadCare Labs,2018.0,Pune,Hygiene management,Converting bio-hazardous waste to harmless waste,Ajinkya Dhariya,Venture Center,,Pre-seed,,2020
3,NCOME,2020.0,New Delhi,Escrow,Escrow-as-a-service platform,Ritesh Tiwari,"Venture Catalysts, PointOne Capital",400000.0,,,2020
4,Gramophone,2016.0,Indore,AgriTech,Gramophone is an AgTech platform enabling acce...,"Ashish Rajan Singh, Harshit Gupta, Nishant Mah...","Siana Capital Management, Info Edge",340000.0,,,2020


In [7]:
query = "SELECT * FROM dbo.LP1_startup_funding2021"

data1 = pd.read_sql(query, connection)
data1.head()

Unnamed: 0,Company_Brand,Founded,HeadQuarter,Sector,What_it_does,Founders,Investor,Amount,Stage
0,Unbox Robotics,2019.0,Bangalore,AI startup,Unbox Robotics builds on-demand AI-driven ware...,"Pramod Ghadge, Shahid Memon","BEENEXT, Entrepreneur First","$1,200,000",Pre-series A
1,upGrad,2015.0,Mumbai,EdTech,UpGrad is an online higher education platform.,"Mayank Kumar, Phalgun Kompalli, Ravijot Chugh,...","Unilazer Ventures, IIFL Asset Management","$120,000,000",
2,Lead School,2012.0,Mumbai,EdTech,LEAD School offers technology based school tra...,"Smita Deorah, Sumeet Mehta","GSV Ventures, Westbridge Capital","$30,000,000",Series D
3,Bizongo,2015.0,Mumbai,B2B E-commerce,Bizongo is a business-to-business online marke...,"Aniket Deb, Ankit Tomar, Sachin Agrawal","CDC Group, IDG Capital","$51,000,000",Series C
4,FypMoney,2021.0,Gurugram,FinTech,"FypMoney is Digital NEO Bank for Teenagers, em...",Kapil Banwari,"Liberatha Kallat, Mukesh Yadav, Dinesh Nagpal","$2,000,000",Seed


In [8]:
#check for info on data 1
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1209 entries, 0 to 1208
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Company_Brand  1209 non-null   object 
 1   Founded        1208 non-null   float64
 2   HeadQuarter    1208 non-null   object 
 3   Sector         1209 non-null   object 
 4   What_it_does   1209 non-null   object 
 5   Founders       1205 non-null   object 
 6   Investor       1147 non-null   object 
 7   Amount         1206 non-null   object 
 8   Stage          781 non-null    object 
dtypes: float64(1), object(8)
memory usage: 85.1+ KB


In [9]:
data1['year'] = 2021
data1.head()

Unnamed: 0,Company_Brand,Founded,HeadQuarter,Sector,What_it_does,Founders,Investor,Amount,Stage,year
0,Unbox Robotics,2019.0,Bangalore,AI startup,Unbox Robotics builds on-demand AI-driven ware...,"Pramod Ghadge, Shahid Memon","BEENEXT, Entrepreneur First","$1,200,000",Pre-series A,2021
1,upGrad,2015.0,Mumbai,EdTech,UpGrad is an online higher education platform.,"Mayank Kumar, Phalgun Kompalli, Ravijot Chugh,...","Unilazer Ventures, IIFL Asset Management","$120,000,000",,2021
2,Lead School,2012.0,Mumbai,EdTech,LEAD School offers technology based school tra...,"Smita Deorah, Sumeet Mehta","GSV Ventures, Westbridge Capital","$30,000,000",Series D,2021
3,Bizongo,2015.0,Mumbai,B2B E-commerce,Bizongo is a business-to-business online marke...,"Aniket Deb, Ankit Tomar, Sachin Agrawal","CDC Group, IDG Capital","$51,000,000",Series C,2021
4,FypMoney,2021.0,Gurugram,FinTech,"FypMoney is Digital NEO Bank for Teenagers, em...",Kapil Banwari,"Liberatha Kallat, Mukesh Yadav, Dinesh Nagpal","$2,000,000",Seed,2021


In [10]:
data2 = pd.read_csv(r"C:\Users\lenovo\Downloads\startup_funding2019.csv")
data2.head()

Unnamed: 0,Company/Brand,Founded,HeadQuarter,Sector,What it does,Founders,Investor,Amount($),Stage
0,Bombay Shaving,,,Ecommerce,Provides a range of male grooming products,Shantanu Deshpande,Sixth Sense Ventures,"$6,300,000",
1,Ruangguru,2014.0,Mumbai,Edtech,A learning platform that provides topic-based ...,"Adamas Belva Syah Devara, Iman Usman.",General Atlantic,"$150,000,000",Series C
2,Eduisfun,,Mumbai,Edtech,It aims to make learning fun via games.,Jatin Solanki,"Deepak Parekh, Amitabh Bachchan, Piyush Pandey","$28,000,000",Fresh funding
3,HomeLane,2014.0,Chennai,Interior design,Provides interior designing solutions,"Srikanth Iyer, Rama Harinath","Evolvence India Fund (EIF), Pidilite Group, FJ...","$30,000,000",Series D
4,Nu Genes,2004.0,Telangana,AgriTech,"It is a seed company engaged in production, pr...",Narayana Reddy Punyala,Innovation in Food and Agriculture (IFA),"$6,000,000",


In [11]:
# check for info data 2
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89 entries, 0 to 88
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Company/Brand  89 non-null     object 
 1   Founded        60 non-null     float64
 2   HeadQuarter    70 non-null     object 
 3   Sector         84 non-null     object 
 4   What it does   89 non-null     object 
 5   Founders       86 non-null     object 
 6   Investor       89 non-null     object 
 7   Amount($)      89 non-null     object 
 8   Stage          43 non-null     object 
dtypes: float64(1), object(8)
memory usage: 6.4+ KB


In [12]:
data2['year'] = 2019
data2.head()

Unnamed: 0,Company/Brand,Founded,HeadQuarter,Sector,What it does,Founders,Investor,Amount($),Stage,year
0,Bombay Shaving,,,Ecommerce,Provides a range of male grooming products,Shantanu Deshpande,Sixth Sense Ventures,"$6,300,000",,2019
1,Ruangguru,2014.0,Mumbai,Edtech,A learning platform that provides topic-based ...,"Adamas Belva Syah Devara, Iman Usman.",General Atlantic,"$150,000,000",Series C,2019
2,Eduisfun,,Mumbai,Edtech,It aims to make learning fun via games.,Jatin Solanki,"Deepak Parekh, Amitabh Bachchan, Piyush Pandey","$28,000,000",Fresh funding,2019
3,HomeLane,2014.0,Chennai,Interior design,Provides interior designing solutions,"Srikanth Iyer, Rama Harinath","Evolvence India Fund (EIF), Pidilite Group, FJ...","$30,000,000",Series D,2019
4,Nu Genes,2004.0,Telangana,AgriTech,"It is a seed company engaged in production, pr...",Narayana Reddy Punyala,Innovation in Food and Agriculture (IFA),"$6,000,000",,2019


In [13]:
data3= pd.read_csv(r"C:\Users\lenovo\Downloads\startup_funding2018.csv")
data3.head()

Unnamed: 0,Company Name,Industry,Round/Series,Amount,Location,About Company
0,TheCollegeFever,"Brand Marketing, Event Promotion, Marketing, S...",Seed,250000,"Bangalore, Karnataka, India","TheCollegeFever is a hub for fun, fiesta and f..."
1,Happy Cow Dairy,"Agriculture, Farming",Seed,"₹40,000,000","Mumbai, Maharashtra, India",A startup which aggregates milk from dairy far...
2,MyLoanCare,"Credit, Financial Services, Lending, Marketplace",Series A,"₹65,000,000","Gurgaon, Haryana, India",Leading Online Loans Marketplace in India
3,PayMe India,"Financial Services, FinTech",Angel,2000000,"Noida, Uttar Pradesh, India",PayMe India is an innovative FinTech organizat...
4,Eunimart,"E-Commerce Platforms, Retail, SaaS",Seed,—,"Hyderabad, Andhra Pradesh, India",Eunimart is a one stop solution for merchants ...


In [14]:
#check for info on data 3
data3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 526 entries, 0 to 525
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Company Name   526 non-null    object
 1   Industry       526 non-null    object
 2   Round/Series   526 non-null    object
 3   Amount         526 non-null    object
 4   Location       526 non-null    object
 5   About Company  526 non-null    object
dtypes: object(6)
memory usage: 24.8+ KB


In [15]:
data3['year'] = 2018
data3.head()

Unnamed: 0,Company Name,Industry,Round/Series,Amount,Location,About Company,year
0,TheCollegeFever,"Brand Marketing, Event Promotion, Marketing, S...",Seed,250000,"Bangalore, Karnataka, India","TheCollegeFever is a hub for fun, fiesta and f...",2018
1,Happy Cow Dairy,"Agriculture, Farming",Seed,"₹40,000,000","Mumbai, Maharashtra, India",A startup which aggregates milk from dairy far...,2018
2,MyLoanCare,"Credit, Financial Services, Lending, Marketplace",Series A,"₹65,000,000","Gurgaon, Haryana, India",Leading Online Loans Marketplace in India,2018
3,PayMe India,"Financial Services, FinTech",Angel,2000000,"Noida, Uttar Pradesh, India",PayMe India is an innovative FinTech organizat...,2018
4,Eunimart,"E-Commerce Platforms, Retail, SaaS",Seed,—,"Hyderabad, Andhra Pradesh, India",Eunimart is a one stop solution for merchants ...,2018


In [16]:

# Replace '₹', commas, '—', and "''" in 'Amount' column
data3['Amount'] = data3['Amount'].str.replace(',', '').str.replace('—', '').str.replace("''",'').replace('', np.nan)

# Conditionally apply multiplication only where '₹' is present
mask = data3['Amount'].str.contains('₹', na=False)
data3.loc[mask, 'Amount'] = data3.loc[mask, 'Amount'].str.replace('₹', '').astype(float) * 0.0146

#### CONCATENATE THE DATA INTO ONE DATAFRAME
 The column names of the datasets above are different but with the same data.
 Create a function that will rename the columns and restructure the data.

In [17]:
def concat_dataframes(data,data1, data2, data3):
    """
    Concatenates four dataframes along the row axis.

    Parameters:
    data (pandas.DataFrame): The first dataframe to concatenate.
    data1 (pandas.DataFrame): The second dataframe to concatenate.
    data2 (pandas.DataFrame): The third dataframe to concatenate.
    data3 (pandas.DataFrame): The fourth dataframe to concatenate.

    Returns:
    pandas.DataFrame: The concatenated dataframe.
    """
    # Rename columns to make them consistent across dataframes
    df1 = data.rename(columns={'Company_Brand': 'company_brand', 'Founded': 'founded','HeadQuarter': 'headquarter','Sector': 'sector','What_it_does': 'about_company','Founders':'founders','Investor':'investor','Amount':'amount','Stage':'series', 'year' : 'year'})
    df2 = data1.rename(columns={'Company_Brand': 'company_brand', 'Founded': 'founded','HeadQuarter': 'headquarter','Sector': 'sector','What_it_does': 'about_company','Founders':'founders','Investor':'investor','Amount':'amount','Stage':'series','year' : 'year'})
    df3 = data2.rename(columns={'Company/Brand': 'company_brand', 'Founded': 'founded','HeadQuarter': 'headquarter','Sector': 'sector','What it does': 'about_company','Founders':'founders','Investor':'investor','Amount($)':'amount','Stage':'series','year' : 'year'})
    df4 = data3.rename(columns={'Company Name': 'company_brand','Location': 'headquarter','Industry': 'sector','About Company': 'about_company','Amount':'amount','Round/Series':'series','year' : 'year'})

    # Concatenate dataframes along the row axis
    result = pd.concat([df1, df2, df3,df4], axis=0)

    return result


In [18]:
dataset= concat_dataframes(data,data1,data2,data3)
dataset.head()

Unnamed: 0,company_brand,founded,headquarter,sector,about_company,founders,investor,amount,series,column10,year
0,Aqgromalin,2019.0,Chennai,AgriTech,Cultivating Ideas for Profit,"Prasanna Manogaran, Bharani C L",Angel investors,200000.0,,,2020
1,Krayonnz,2019.0,Bangalore,EdTech,An academy-guardian-scholar centric ecosystem ...,"Saurabh Dixit, Gurudutt Upadhyay",GSF Accelerator,100000.0,Pre-seed,,2020
2,PadCare Labs,2018.0,Pune,Hygiene management,Converting bio-hazardous waste to harmless waste,Ajinkya Dhariya,Venture Center,,Pre-seed,,2020
3,NCOME,2020.0,New Delhi,Escrow,Escrow-as-a-service platform,Ritesh Tiwari,"Venture Catalysts, PointOne Capital",400000.0,,,2020
4,Gramophone,2016.0,Indore,AgriTech,Gramophone is an AgTech platform enabling acce...,"Ashish Rajan Singh, Harshit Gupta, Nishant Mah...","Siana Capital Management, Info Edge",340000.0,,,2020


In [19]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2879 entries, 0 to 525
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   company_brand  2879 non-null   object 
 1   founded        2110 non-null   float64
 2   headquarter    2765 non-null   object 
 3   sector         2861 non-null   object 
 4   about_company  2879 non-null   object 
 5   founders       2334 non-null   object 
 6   investor       2253 non-null   object 
 7   amount         2474 non-null   object 
 8   series         1941 non-null   object 
 9   column10       2 non-null      object 
 10  year           2879 non-null   int64  
dtypes: float64(1), int64(1), object(9)
memory usage: 269.9+ KB


In [20]:
dataset.shape

(2879, 11)

In [21]:
dataset.describe()

Unnamed: 0,founded,year
count,2110.0,2879.0
mean,2016.079621,2020.023619
std,4.368006,1.086974
min,1963.0,2018.0
25%,2015.0,2020.0
50%,2017.0,2020.0
75%,2019.0,2021.0
max,2021.0,2021.0


In [22]:
dataset.describe(include='all')

Unnamed: 0,company_brand,founded,headquarter,sector,about_company,founders,investor,amount,series,column10,year
count,2879,2110.0,2765,2861,2879,2334,2253,2474,1941,2,2879.0
unique,2214,,172,873,2691,1980,1777,771,75,2,
top,BharatPe,,Bangalore,FinTech,Provides online learning classes,Byju Raveendran,Inflection Point Ventures,$Undisclosed,Seed,Pre-Seed,
freq,10,,764,173,5,7,36,73,606,1,
mean,,2016.079621,,,,,,,,,2020.023619
std,,4.368006,,,,,,,,,1.086974
min,,1963.0,,,,,,,,,2018.0
25%,,2015.0,,,,,,,,,2020.0
50%,,2017.0,,,,,,,,,2020.0
75%,,2019.0,,,,,,,,,2021.0


In [23]:
dataset.isna().sum()

company_brand       0
founded           769
headquarter       114
sector             18
about_company       0
founders          545
investor          626
amount            405
series            938
column10         2877
year                0
dtype: int64

In [24]:
dataset.duplicated().sum()

23

In [25]:
dataset.nunique()

company_brand    2214
founded            34
headquarter       172
sector            873
about_company    2691
founders         1980
investor         1777
amount            771
series             75
column10            2
year                4
dtype: int64

OBSERVATION
###  Issues with the data
1. There is a discrepancy in the naming conventions between the columns in the 2018 and 2019 datasets compared to the 2020 and 2021 datasets.

2. The 2018 dataset exhibits some missing columns, contributing to an incomplete representation of the data.

3. Conversely, the 2020 dataset contains an additional column that appears to be extraneous and does not serve a meaningful purpose in our analysis.

### Course of Action:
##### 1. Missing Column Engineering for 2018:
---> We will address the absence of certain columns in the 2018 dataset by employing data engineering techniques to create and populate the missing columns, ensuring a comprehensive and consistent dataset.

##### 2. Column Name Standardization:
---> To establish uniformity and coherence across all datasets, we will embark on a column renaming process for the 2018 and 2019 datasets. This action aims to align the naming conventions with those observed in the 2020 and 2021 datasets, facilitating seamless data integration and analysis. 

#### 3. Extraneous Column Removal in 2020:
---> The redundant column identified in the 2020 dataset will be removed, streamlining the dataset and eliminating unnecessary elements that do not contribute to the overall analysis objectives.

These actions collectively enhance the integrity, consistency, and completeness of the dataset, paving the way for a more robust and coherent analytical process. ical process


BUSINESS QUESTIONS
1. Which industries/sectors have the largest funding?
2. Who are the top ten competitors with the largest funding?
3. Who are the top investors?(frequency)
4. Which year was funds allocated the most?
5. Which is the largest headquaters for startup business in India?
6. What is the average funds granted to startups?

BUSINESS QUESTIONS 
1. How does funding vary across different industry sectors in India?
2. How does funding vary with the loaction of the start-ups
3. What is the relationship between the amount of funding and the stage of the company?
4. How have funding trends evolved between 2018 and 2021?
5. What are the most attractive sectors for investors?
6. Does the location of the company influence its sector?




#### DATA PREPARATION AND CLEANING 

In [26]:
pd.set_option("display.max_rows", None)
pd.set_option('display.max_colwidth', None)


In [27]:
dataset= dataset.reset_index(drop=True)

In [28]:
dataset.tail()

Unnamed: 0,company_brand,founded,headquarter,sector,about_company,founders,investor,amount,series,column10,year
2874,Udaan,,"Bangalore, Karnataka, India","B2B, Business Development, Internet, Marketplace","Udaan is a B2B trade platform, designed specifically for small and medium businesses in India.",,,225000000.0,Series C,,2018
2875,Happyeasygo Group,,"Haryana, Haryana, India","Tourism, Travel",HappyEasyGo is an online travel domain.,,,,Series A,,2018
2876,Mombay,,"Mumbai, Maharashtra, India","Food and Beverage, Food Delivery, Internet",Mombay is a unique opportunity for housewives to start household food business and avail everyone with their homemade healthy dishes.,,,7500.0,Seed,,2018
2877,Droni Tech,,"Mumbai, Maharashtra, India",Information Technology,Droni Tech manufacture UAVs and develop software to service a range of industry requirements.,,,511000.0,Seed,,2018
2878,Netmeds,,"Chennai, Tamil Nadu, India","Biotechnology, Health Care, Pharmaceutical",Welcome to India's most convenient pharmacy!,,,35000000.0,Series C,,2018


In [29]:
#Drop the extreneous column 10
dataset.drop('column10', axis=1, inplace= True)

In [70]:
#create a list of the wrong entries
## Pharmaceuticals - index 1297, 1311 (this is duplicated) start from end to forward
## Online Media - index 2155 (this is duplicated) start from end to forward
## Computer Games - index 1153  ( this is duplicated ) start from end to forward , replace with none 
## Information Technology & Services - index 2231 
## Food & Beverages index 1296,1310     (this is duplicated) change sector and  HQ

In [71]:
def swap_row_cells(df, row_index, col1_index, col2_index):
    """
    Swaps the values in the given columns and row of a data array.

    Parameters:
    df (list): A 2D list representing the data array.
    row_index (int): The index of the first column.
    col1_index (int): The index of the second column.
    col2_index (int): The index of the row where the values are swapped.

    Returns:
    list: The modified data array with swapped values.
    """
    df.loc[row_index, [col1_index,col2_index]] = df.loc[row_index,[col2_index,col1_index]].values
    
    return dataset.head()
    

In [72]:
# correcting all the cells with wrong entries
#### row index 2155 
swap_row_cells(dataset, 2155, 'investor', 'amount')
swap_row_cells(dataset, 2155, 'founders', 'investor')
swap_row_cells(dataset, 2155, 'about_company', 'founders')
swap_row_cells(dataset, 2155, 'sector', 'about_company')
swap_row_cells(dataset, 2155, 'headquarter', 'sector')

#### row index 1312
swap_row_cells(dataset, 1312, 'amount', 'series')
swap_row_cells(dataset, 1312, 'investor', 'amount')
swap_row_cells(dataset, 1312,'founders', 'investor')
swap_row_cells(dataset, 1312,'about_company', 'founders')
swap_row_cells(dataset, 1312,'sector', 'about_company')

#### row index 1297
swap_row_cells(dataset,1297,'amount', 'series' )
swap_row_cells(dataset,1297,'investor', 'amount' )
swap_row_cells(dataset, 1297, 'founders', 'investor')
swap_row_cells(dataset, 1297, 'about_company', 'founders')
swap_row_cells(dataset, 1297, 'sector', 'about_company')
swap_row_cells(dataset,1297, 'headquarter', 'sector')

### row index 1311
swap_row_cells(dataset, 1311, 'amount', 'series')
swap_row_cells(dataset, 1311, 'investor', 'amount')
swap_row_cells(dataset, 1311, 'founders', 'investor')
swap_row_cells(dataset, 1311, 'about_company', 'founders')
swap_row_cells(dataset, 1311, 'sector', 'about_company')
swap_row_cells(dataset, 1311, 'headquarter', 'sector')

#### row index 1296
swap_row_cells(dataset, 1296, 'headquarter', 'sector')

#### row index 2231
swap_row_cells(dataset, 2231, 'headquarter', 'sector')

#### row index 1310
swap_row_cells(dataset, 1310, 'headquarter', 'sector')

#### row index 1593
swap_row_cells(dataset, 1593, 'amount', 'series')

#### row index 1600
swap_row_cells(dataset, 1600, 'amount', 'series')
swap_row_cells(dataset, 1600, 'investor', 'amount')

### row index 1606
swap_row_cells(dataset, 1606, 'amount', 'series')

#### row index 1732
swap_row_cells(dataset, 1732, 'amount', 'series')

#### row index 2203
swap_row_cells(dataset, 2203, 'amount', 'series')
swap_row_cells(dataset,2203, 'investor', 'amount')

#### row index 1153
swap_row_cells(dataset, 1153, 'amount', 'series')

#### row index 1166
swap_row_cells(dataset, 1166, 'amount', 'series')

### row index 1729
swap_row_cells(dataset, 1729, 'amount', 'series')



# replace computer games with none 
dataset['headquarter'] = dataset['headquarter'].replace('Computer Games', None)


DATA CLEANING ON EACH COLUMN 

1. COMPANY_BRAND 

In [74]:
#check for missing values
print(f" There are {dataset['company_brand'].isnull().sum()} missing values")


 There are 0 missing values


In [75]:
dataset['company_brand'].unique()

array(['Aqgromalin', 'Krayonnz', 'PadCare Labs', ..., 'Mombay',
       'Droni Tech', 'Netmeds'], dtype=object)

In [76]:
# check for duplicates and unique company names
dataset['company_brand'].unique
print(f" There are {dataset['company_brand'].duplicated().sum()} duplicates")

 There are 665 duplicates


Since there are companies with unique names but dealing in different sectors we wont alter the duplicates.

The column company brand is cleaning

2. FOUNDED

In [77]:
# there are missing years in the dataset 
print(f" There are {dataset['founded'].isna().sum()} missing values")

# check the data type
print(f"The column is a {dataset['founded'].dtype} data type")

 There are 769 missing values
The column is a float64 data type


COURSE OF ACTION

---> We will be cleaning the missing values by fill them using interpolate method which is suitable for time series data.

---> We will also be converting the data type from float to datetime for purpose of our analysis.

In [78]:
dataset["founded"].interpolate(method='linear',inplace= True)

print(f"There are {dataset['founded'].isna().sum()} missing values")

There are 0 missing values


In [79]:
# Convert to datetime
dataset['founded'] = pd.to_datetime(dataset['founded'], format='%Y')

# Convert to period
dataset['founded'] = dataset['founded'].dt.to_period('Y')


3. HEADQUARTER


In [80]:
dataset['headquarter'].unique()

array(['Chennai', 'Bangalore', 'Pune', 'New Delhi', 'Indore', 'Hyderabad',
       'Gurgaon', 'Belgaum', 'Noida', 'Mumbai', 'Andheri', 'Jaipur',
       'Ahmedabad', 'Kolkata', 'Tirunelveli, Tamilnadu', 'Thane', None,
       'Singapore', 'Gurugram', 'Gujarat', 'Haryana', 'Kerala', 'Jodhpur',
       'Jaipur, Rajastan', 'Delhi', 'Frisco, Texas, United States',
       'California', 'Dhingsara, Haryana', 'New York, United States',
       'Patna', 'San Francisco, California, United States',
       'San Francisco, United States', 'San Ramon, California',
       'Paris, Ile-de-France, France', 'Plano, Texas, United States',
       'Sydney', 'San Francisco Bay Area, Silicon Valley, West Coast',
       'Bangaldesh', 'London, England, United Kingdom',
       'Sydney, New South Wales, Australia', 'Milano, Lombardia, Italy',
       'Palmwoods, Queensland, Australia', 'France',
       'San Francisco Bay Area, West Coast, Western US',
       'Trivandrum, Kerala, India', 'Cochin', 'Samastipur, Bihar',


In [81]:
print(f"there are {dataset['headquarter'].isnull().sum()} missing values")


there are 119 missing values


Observation

---> There are companies which have unidentified locations and have been placed as none.

----> There are wrong entries in the column

---> Clean the locations with only the city for accurate analysis.

Course of action

----> Since there are companies where their location is unidentified as none we will fill the missing values with none.

----> split the long locations with one location.

----> assess the wrong entries and place a function that returns the orginal data into the column.


In [82]:
def clean_replace(dataframe,column_name):
    dataframe[column_name] = dataframe[column_name].replace('-', np.nan)
    #clean the nan values with none
    dataframe[column_name] = dataframe[column_name].fillna(value='None')

    return dataset.head()

In [83]:
clean_replace(dataset,"headquarter")

Unnamed: 0,company_brand,founded,headquarter,sector,about_company,founders,investor,amount,series,year
0,Aqgromalin,2019,Chennai,AgriTech,Cultivating Ideas for Profit,"Prasanna Manogaran, Bharani C L",Angel investors,200000.0,,2020
1,Krayonnz,2019,Bangalore,EdTech,An academy-guardian-scholar centric ecosystem which provides state of the art technological solutions.,"Saurabh Dixit, Gurudutt Upadhyay",GSF Accelerator,100000.0,Pre-seed,2020
2,PadCare Labs,2018,Pune,Hygiene management,Converting bio-hazardous waste to harmless waste,Ajinkya Dhariya,Venture Center,,Pre-seed,2020
3,NCOME,2020,New Delhi,Escrow,Escrow-as-a-service platform,Ritesh Tiwari,"Venture Catalysts, PointOne Capital",400000.0,,2020
4,Gramophone,2016,Indore,AgriTech,Gramophone is an AgTech platform enabling access to agri inputs and powering efficient farm management.,"Ashish Rajan Singh, Harshit Gupta, Nishant Mahatre, Tauseef Khan","Siana Capital Management, Info Edge",340000.0,,2020


In [84]:
print(f"There are {dataset['headquarter'].isnull().sum()} missing values")

There are 0 missing values


In [85]:
dataset['headquarter']= dataset['headquarter'].str.split(',').str[0].str.replace("'","", regex=True)
dataset['headquarter'] = dataset['headquarter'].str.strip('\t#REF!')

In [86]:
dataset['headquarter']=dataset['headquarter'].replace({'Bengaluru': 'Bangalore', 'Banglore': 'Bangalore', 'Gurugram': 'Gurgaon', 'Hyderebad': 'Hyderabad', 
                                      'New Delhi': 'Delhi', 'Ahmadabad': 'Ahmedabad', 'Ernakulam': 'Cochin', 'Telugana': 'Telangana',
                                      'Rajastan': 'Rajasthan', 'San Franciscao': 'San Francisco', 'Samsitpur': 'Samastipur', 'Santra': 'Samtra',
                                      'Rajsamand': 'Rajasthan', 'Kerala': 'Kochi','The Nilgiris': 'Nilgiris', 'Gurugram': 'Gurgaon', 
                                      'California': 'San Francisco', 'San Francisco Bay Area': 'San Francisco', 'Hyderebad': 'Hyderabad' ,'Small Towns': 'Small Towns','Orissia': 'Odisha', 
                                      'Santra': 'Samtra', 'Vadodara': 'Vadodara', 'Bangaldesh': 'Bangladesh',}) 


4. SECTOR 


In [87]:
dataset["sector"].unique()

array(['AgriTech', 'EdTech', 'Hygiene management', 'Escrow',
       'Networking platform', 'FinTech', 'Crowdsourcing',
       'Food & Bevarages', 'HealthTech', 'Fashion startup',
       'Food Industry', 'Food Delivery', 'Virtual auditing startup',
       'E-commerce', 'Gaming', 'Work fulfillment', 'AI startup',
       'Telecommunication', 'Logistics', 'Tech Startup', 'Sports',
       'Retail', 'Medtech', 'Tyre management', 'Cloud company',
       'Software company', 'Venture capitalist', 'Renewable player',
       'IoT startup', 'SaaS startup', 'Aero company', 'Marketing company',
       'Retail startup', 'Co-working Startup', 'Finance company',
       'Tech company', 'Solar Monitoring Company',
       'Video sharing platform', 'Gaming startup',
       'Video streaming platform', 'Consumer appliances',
       'Blockchain startup', 'Conversational AI platform', 'Real Estate',
       'SaaS platform', 'AI platform', 'Fusion beverages', 'HR Tech',
       'Job portal', 'Hospitality', 'Digit

In [88]:
print(f" There are {dataset['sector'].isnull().sum()} missing values")

 There are 19 missing values


Observation 

----> There are unidentified sectors/industries which have none

----> There are unidentified sectors that need to be replaced with actual industry names

Course of action 

----> We will be replacing the missing values with none to show some companies have unidentified industry they are in.

----> Replace the unidentified names with industry names

In [89]:
clean_replace(dataset, 'sector')
print(f" There are {dataset['sector'].isna().sum()} missing values")

 There are 0 missing values


In [90]:
dataset['sector']= dataset['sector'].str.split(',').str[0].str.replace("'","", regex=True)

In [91]:
# there are certain sectors which are not defined well so we need to replace them with actual sectors
# create a list :
## Android - 
## Helathcare
## crowd funding
## heath care
## wellness
## basketball 
## catering 
## cooking


dataset['sector'].replace('Android','Edtech', inplace=True)
dataset['sector'].replace('Crowdfunding', 'Crowdsourcing', inplace=True)
dataset['sector'].replace('Wellness', 'HealthCare',inplace=True)
dataset['sector'].replace('Basketball', ' Apps', inplace=True)
dataset['sector'].replace('Catering', 'Food Delivery', inplace=True)
dataset['sector'].replace('Cooking', 'Food Delivery',inplace=True)
dataset['sector'] = dataset['sector'].replace({'Edtech': 'EdTech', 
                                               'Fintech': 'FinTech', 
                                               'Agriculture': 'AgriTech', 
                                               'Food & Beverages': 'Food and Beverages',
                                               'Financial Services': 'FinTech',
                                                'Healthcare': 'HealthTech', 'HealthTech': 'HealthTech', 'Medical': 'HealthTech','Medtech': 'HealthTech', 'Pharmaceutical': 'HealthTech', 
                                                'Health Insurance': 'HealthTech','Biotechnology': 'HealthTech', 'Health Diagnostics': 'HealthTech', 
                                                'Hospital': 'HealthTech','Hospital & Health Care': 'HealthTech', 'Wellness': 'HealthTech', 
                                                'Dental': 'HealthTech','Alternative Medicine': 'HealthTech', 'Nutrition': 'HealthTech', 
                                                'Fitness': 'HealthTech','Mental Health': 'HealthTech', 'Healthcare/Edtech': 'HealthTech',
                                                'Life sciences': 'HealthTech', 'Biotech': 'Healthcare', 'Nutrition Tech': 'HealthTech','E-mobility': 'HealthTech', 
                                                'Med Tech': 'HealthTech', 'FemTech': 'HealthTech','Cannabis startup': 'HealthTech', 'Pharmacy': 'HealthTech', 'Medical Device': 'HealthTech',
                                                'BioTechnology': 'HealthTech', 'Fertility tech': 'HealthTech', 'Ayurveda tech': 'HealthTech','E-tail': 'Healthcare', 'E store': 'E-Commerce', 'E-store': 'Healthcare', 
                                                'Telemedicine': 'Healthcare','HealthCare': 'HealthTech',
                                                'AI startup': 'AI', 
                                                'Information Services': 'InfoTech','Healthtech': 'HealthTech',
                                                'Finance': 'FinTech', 'Health Care': 'HealthTech',
                                                'Logistics & Supply Chain': 'Logistics', 
                                                'Food Industry': 'FoodTech', 'Foodtech': 'FoodTech', 
                                                'SaaS startup': 'SaaS', 'Health': 'HealthTech', 
                                                'Ecommerce': 'E-Commerce','Tech Startup': 'Tech', 
                                                'Mobility': 'Transportation', 'SaaS': 'Tech', 'Artificial Intelligence': 'AI',
                                                'Food and Beverage': 'Food and Beverages', 
                                                'Information Technology': 'InfoTech','Internet': 'Tech', 'Apps': 'Tech', 'Computer Software': 'Tech', 
                                                'E-commerce': 'E-Commerce','Agritech': 'AgriTech', 'Hospital & Health Care': 'HealthTech', 'Food': 'Foodtech', 
                                                'Cosmetics': 'Consumer Goods','Tech company': 'Tech', 
                                                'Automobile': 'Automotive', 'Apparel & Fashion': 'Fashion', 'Education': 'EdTech',
                                                'Social Media': 'Media', 'Digital Media': 'Media', 'IT': 'InfoTech', 'IoT': 'AI','Software': 'Tech', 
                                                'Industrial Automation': 'AI', 'Technology': 'Tech','Information Technology & Services': 'InfoTech & Services', None: 'Unknown'})


5. ABOUT_COMPANY

In [92]:
dataset["about_company"].unique()

array(['Cultivating Ideas for Profit',
       'An academy-guardian-scholar centric ecosystem which provides state of the art technological solutions.',
       'Converting bio-hazardous waste to harmless waste', ...,
       'Mombay is a unique opportunity for housewives to start household food business and avail everyone with their homemade healthy dishes.',
       'Droni Tech manufacture UAVs and develop software to service a range of industry requirements.',
       "Welcome to India's most convenient pharmacy!"], dtype=object)

In [93]:
print(f"There are {dataset['about_company'].isnull().sum()} missing values")

There are 0 missing values


6. FOUNDERS

In [94]:
dataset['founders'].unique()

array(['Prasanna Manogaran, Bharani C L',
       'Saurabh Dixit, Gurudutt Upadhyay', 'Ajinkya Dhariya', ...,
       'Pavan Kushwaha', 'Jeevan Chowdary M, Harshit Harchani',
       'Niraj Singh, Ramanshu Mahaur, Ganesh Pawar, Mohit Gupta'],
      dtype=object)

In [95]:
dataset['founders'].isnull().sum()

545

In [96]:
clean_replace(dataset, 'founders')
print(f" There are {dataset['founders'].isna().sum()} missing values")

 There are 0 missing values


In [97]:
# strip off characters 
dataset['founders'] = dataset['founders'].str.strip('\t#REF!')

7. INVESTORS

In [98]:
dataset['investor'].unique()

array(['Angel investors', 'GSF Accelerator', 'Venture Center', ...,
       'Norwest Venture Partners, General Catalyst, Fundamentum, Accel Partners',
       'TPG, Norwest Venture Partners, Evolvence India', nan],
      dtype=object)

In [99]:
print(f" There are {dataset['investor'].isnull().sum()} missing values")

 There are 628 missing values


Observation 

---> There are unidentified investors and there are some with not applicable(nan).
---> There are missing values in the column

Course of action 

----> We will be replacing the missing values with none 

----> Change the nan and replace with none. 

In [100]:
clean_replace(dataset, 'investor')

print(f" There are {dataset['investor'].isna().sum()} missing values")

 There are 0 missing values


In [101]:
dataset.loc[:, 'investor'] = dataset.loc[:, 'investor'].replace('nan', 'None', regex=True)
dataset.loc[:, 'investor'] = dataset.loc[:, 'investor'].replace('http://100x.vc/', 'None', regex=True)
dataset['investor']= dataset['investor'].str.split(',').str[0].str.replace("'","", regex=True)
dataset['investor'] = dataset['investor'].str.strip('\t#REF!')

8. AMOUNT

In [102]:
dataset['amount'].unique()

array([200000.0, 100000.0, nan, 400000.0, 340000.0, 600000.0, 45000000.0,
       1000000.0, 2000000.0, 1200000.0, 660000000.0, 120000.0, 7500000.0,
       5000000.0, 500000.0, 3000000.0, 10000000.0, 145000000.0,
       100000000.0, 21000000.0, 4000000.0, 20000000.0, 560000.0, 275000.0,
       4500000.0, 15000000.0, 390000000.0, 7000000.0, 5100000.0,
       700000000.0, 2300000.0, 700000.0, 19000000.0, 9000000.0,
       40000000.0, 750000.0, 1500000.0, 7800000.0, 50000000.0, 80000000.0,
       30000000.0, 1700000.0, 2500000.0, 40000.0, 33000000.0, 35000000.0,
       300000.0, 25000000.0, 3500000.0, 200000000.0, 6000000.0, 1300000.0,
       4100000.0, 575000.0, 800000.0, 28000000.0, 18000000.0, 3200000.0,
       900000.0, 250000.0, 4700000.0, 75000000.0, 8000000.0, 121000000.0,
       55000000.0, 3300000.0, 11000000.0, 16000000.0, 5400000.0,
       150000000.0, 4200000.0, 22000000.0, 52000000.0, 1100000.0,
       118000000.0, 1600000.0, 18500000.0, 70000000000.0, 800000000.0,
       4000

In [103]:
print(f"There are {dataset['amount'].isna().sum()} missing values")

There are 403 missing values


Observation

----> The amount column has rupies and dollar currency.
----> There is a shuffle of row cells between the stage and amount
----> There are missing values in the column

Course of action

---> We will be changing the currency of the amount to dollar
---> We will also fill the missing values with 0
---> We will be changing the currency to dollar 


In [105]:

# Remove dollar sign
dataset['amount'] = dataset['amount'].replace('\$', '', regex=True)

# Remove commas
dataset['amount'] = dataset['amount'].str.replace(',', '')

# Remove all other irrelevant characters, words and symbols
dataset['amount'] = dataset['amount'].replace(["Upsparks", 'undisclosed', 'Undisclosed',
                                                      ',' ], '')

# strip off characters 
dataset['amount'] = dataset['amount'].str.strip('\t#REF!')

# Convert the 'amount($)' column to numeric
dataset['amount'] = pd.to_numeric(dataset['amount'])

9. SERIES


Startups start with pre-seed, progress through seed, Series A, Series B, etc., securing resources for development and strategies. Additional rounds like Series C or D may follow. External funding at each stage fuels growth toward the venture's full potential.

**Pre-Seed Funding**  
Entrepreneurial idea in early development; small funds needed; limited informal channels for raising funds.

**Seed Funding**  
First official equity funding; investors provide funds for equity ownership.

**Series A Financing**  
First venture capital round; developed product, consistent revenue, long-term profit plan.

**Series B Financing**  
For established startups; substantial user base and revenue; funding for expansion.

**Series C and Beyond**  
Optional rounds for final push before IPO or unmet objectives; Series C is the third venture capital round.

**Initial Public Offering (IPO)**  
Process of offering corporate shares to the public; used for funding or divestment.

link: https://www.startupindia.gov.in/content/sih/en/funding.html

In [106]:
# Cleaning stage column
dataset['series'].unique()
dataset['series']=dataset['series'].replace('https://docs.google.com/spreadsheets/d/1x9ziNeaz6auNChIHnMI8U6kS7knTr3byy_YBGfQaoUA/edit#gid=1861303593',np.NaN)

In [107]:
# Standardize funding stages in the 'stage' column
dataset['series'] = dataset['series'].replace(['Series A', 'Seies A', 'Series A-1', 'Series A2', 'Series A+', 'Series A+'], 'Series A')
dataset['series'] = dataset['series'].replace(['Pre-seed', 'Pre-seed Round', 'Pre seed Round', 'Pre seed round'], 'Pre-Seed Stage')
dataset['series'] = dataset['series'].replace(['Pre series A', 'Pre-series A', 'Pre Series A', 'Pre series A1', 'Pre-series A1', 'Pre- series A'], 'Pre series A')
dataset['series'] = dataset['series'].replace(['Series B', 'Series B+', 'Series B2', 'Series B3'], 'Series B')
dataset['series'] = dataset['series'].replace(['Series C', 'Series C', 'Series C, D','Series C', 'Private Equity','PE', 'Post-IPO Equity','Series D', 'Series E', 'Series F', 'Series G', 'Series H', 'Series I','Series D1','Series F2', 'Series F1'], 'Series C and Beyond')
dataset['series'] = dataset['series'].replace(['Venture - Series Unknown', None,'Grant','Debt','Debt Financing','Post-IPO Debt','Non-equity Assistance','Bridge','Bridge Round','Fresh funding','Funding Round','Mid series','Edge','ah! Ventures','ITO Angel Network LetsVenture','JITO Angel Network LetsVenture'], 'unknown')
dataset['series'] = dataset['series'].replace(['Corporate Round','Undisclosed','Secondary Market','Pre-series','Post series A','Pre-series B','Pre-Series B','Pre series B','Pre-series C','Pre series C'], 'Other Stages')
dataset['series'] = dataset['series'].replace(['Seed','Seed funding','Pre-Seed','Angel', 'Angel Round','Seed fund', 'Seed round', 'Seed A','Seed Funding', 'Seed Round & Series A', 'Series E2', 'Seed Round','Seed Investment','Seed+','Early seed'],'Seed Stage')

In [108]:
dataset['series'].value_counts()

series
unknown                            1051
Seed Stage                          748
Series A                            309
Pre series A                        292
Series C and Beyond                 234
Series B                            138
Pre-Seed Stage                       65
Other Stages                         38
Upsparks                              2
ITO Angel Network, LetsVenture        1
JITO Angel Network, LetsVenture       1
Name: count, dtype: int64

In [109]:
dataset['series'].isna().sum()

0

In [110]:
# strip off characters 
dataset['series'] = dataset['series'].str.strip('\t#REF!')

10. YEAR 

In [111]:
dataset['year'].dtype

dtype('int64')

In [112]:
# # Convert the data_year column to date
dataset['year']=pd.to_datetime(dataset['year'], format='%Y')
dataset['year']=dataset['year'].dt.to_period('y')
# # df['founded']=pd.to_datetime(df['founded']).dt.year

General cleaning 

In [113]:
dataset[dataset.duplicated()]

Unnamed: 0,company_brand,founded,headquarter,sector,about_company,founders,investor,amount,series,year
145,Krimanshi,2015,Jodhpur,Biotechnology company,Krimanshi aims to increase rural income by improving the productivity of Indian cattle with better quality feed.,Nikhil Bohra,ajasthan Venture Capital Fund,,Seed Stage,2020
205,Nykaa,2012,Mumbai,Consumer Goods,Nykaa is an online marketplace for different beauty and wellness products.,alguni Nayar,Alia Bhatt,,unknown,2020
362,Byju’s,2011,Bangalore,EdTech,An Indian educational technology and online tutoring firm,Byju Raveendran,Owl Ventures,,unknown,2020
367,Zomato,2008,Haryana,Food devlivery,Get online food delivery from restaurants near you,"Deepinder Goyal, Pankaj Chaddah",MacRitchie Investments,,unknown,2020
813,Nykaa,2012,Mumbai,E-Commerce,Deals in cosmetic and wellness products,alguni Nayar,Steadview capital,,unknown,2020
963,Vogo,2016,Bangalore,Automotive,A scooter-sharing platform allowing users to rent a two-wheeler from specific designated pick up points,"Anand Ayyadurai, Padmanabhan Balakrishnan, Sanchit Mittal",Lightstone Aspada,,Series C and Beyond,2020
1009,Bounce,2014,Bangalore,Automotive and Rentals,Offers a variety of bikes and scooters that can be rented on a subscription basis,"Vivekananda Hallekere, Anil Giri Raju,Arun Agni",Accel Partners,,Series C and Beyond,2020
1162,Curefoods,2020,Bangalore,Food and Beverages,Healthy & nutritious foods and cold pressed juices produced in Edinburgh. Currently distributing wholesale within the Edinburgh region.,Ankit Nagori,Iron Pillar,13000000.0,unknown,2021
1164,Bewakoof,2012,Mumbai,Fashion,"Bewakoof is a lifestyle fashion brand that makes creative, distinctive fashion for the trendy, contemporary Indian.",Prabhkiran Singh,InvestCorp,8000000.0,unknown,2021
1166,FanPlay,2020,,Computer Games,A real money game app specializing in trivia games,YC W21,Pritesh Kumar,1200000.0,Upsparks,2021


In [114]:
dataset.drop_duplicates(keep='first', inplace= True)
dataset.head()

Unnamed: 0,company_brand,founded,headquarter,sector,about_company,founders,investor,amount,series,year
0,Aqgromalin,2019,Chennai,AgriTech,Cultivating Ideas for Profit,"Prasanna Manogaran, Bharani C L",Angel investors,,unknown,2020
1,Krayonnz,2019,Bangalore,EdTech,An academy-guardian-scholar centric ecosystem which provides state of the art technological solutions.,"Saurabh Dixit, Gurudutt Upadhyay",GSF Accelerator,,Pre-Seed Stage,2020
2,PadCare Labs,2018,Pune,Hygiene management,Converting bio-hazardous waste to harmless waste,Ajinkya Dhariya,Venture Center,,Pre-Seed Stage,2020
3,NCOME,2020,Delhi,Escrow,Escrow-as-a-service platform,itesh Tiwari,Venture Catalysts,,unknown,2020
4,Gramophone,2016,Indore,AgriTech,Gramophone is an AgTech platform enabling access to agri inputs and powering efficient farm management.,"Ashish Rajan Singh, Harshit Gupta, Nishant Mahatre, Tauseef Khan",Siana Capital Management,,unknown,2020


In [116]:
print(f" There are {dataset.duplicated().sum()} duplicates")

 There are 0 duplicates
