In [2]:
#Importing all the necessary packages
import pyodbc #just installed with pip
import os
from dotenv import dotenv_values #import the dotenv_values function from the dotenv package
import pandas as pd
import warnings


warnings. filterwarnings('ignore')

In [3]:
# Load the key/value pairs from the local '.env' file into a dictionary
environment_variables = dotenv_values('.env')

# Read the credentials that were set in the '.env' file
database = environment_variables.get("DB_NAME")
server = environment_variables.get("DB_SERVER")
username = environment_variables.get("SQL_DB_LOGIN")
password = environment_variables.get("DB_PASS")

# Fail fast with a readable message: a missing or empty key would otherwise be
# formatted into the connection string as the literal text "None" and only
# surface later as an opaque driver/login error.
missing_keys = [
    key
    for key, value in {
        "DB_NAME": database,
        "DB_SERVER": server,
        "SQL_DB_LOGIN": username,
        "DB_PASS": password,
    }.items()
    if not value
]
if missing_keys:
    raise ValueError(
        f"Missing database credentials in .env: {', '.join(missing_keys)}"
    )

# ODBC connection string in the canonical 'KEY=value;' form. No padding is left
# around the ';' delimiters, so the result does not depend on the driver
# trimming whitespace from keywords or values.
# NOTE(review): a password containing ';' or '}' would need ODBC brace-escaping.
connection_string = (
    f"DRIVER={{SQL Server}};"
    f"SERVER={server};"
    f"DATABASE={database};"
    f"UID={username};"
    f"PWD={password}"
)

In [4]:
# Open the ODBC connection by handing the assembled connection string to
# pyodbc's connect function.
connection = pyodbc.connect(connection_string)

# The SQL queries that retrieve the data are in the cells below.


In [20]:
# Table 1: pull every row and column of dbo.LP1_startup_funding2020 into a
# DataFrame over the open connection, then preview the first two rows.
query1 = "SELECT * FROM dbo.LP1_startup_funding2020"
df_2020 = pd.read_sql(sql=query1, con=connection)
df_2020.head(n=2)

Unnamed: 0,Company_Brand,Founded,HeadQuarter,Sector,What_it_does,Founders,Investor,Amount,Stage,column10
0,Aqgromalin,2019.0,Chennai,AgriTech,Cultivating Ideas for Profit,"Prasanna Manogaran, Bharani C L",Angel investors,200000.0,,
1,Krayonnz,2019.0,Bangalore,EdTech,An academy-guardian-scholar centric ecosystem ...,"Saurabh Dixit, Gurudutt Upadhyay",GSF Accelerator,100000.0,Pre-seed,


In [21]:
# Table 2: pull every row and column of dbo.LP1_startup_funding2021 into a
# DataFrame over the open connection, then preview the first two rows.
query2 = "SELECT * FROM dbo.LP1_startup_funding2021"
df_2021 = pd.read_sql(sql=query2, con=connection)
df_2021.head(n=2)

Unnamed: 0,Company_Brand,Founded,HeadQuarter,Sector,What_it_does,Founders,Investor,Amount,Stage
0,Unbox Robotics,2019.0,Bangalore,AI startup,Unbox Robotics builds on-demand AI-driven ware...,"Pramod Ghadge, Shahid Memon","BEENEXT, Entrepreneur First","$1,200,000",Pre-series A
1,upGrad,2015.0,Mumbai,EdTech,UpGrad is an online higher education platform.,"Mayank Kumar, Phalgun Kompalli, Ravijot Chugh,...","Unilazer Ventures, IIFL Asset Management","$120,000,000",


In [7]:
# Normalise the 2020 headers: lowercase, with spaces replaced by underscores
df_2020 = df_2020.rename(
    columns={header: header.lower().replace(' ', '_') for header in df_2020.columns}
)
df_2020.head(2)

Unnamed: 0,company_brand,founded,headquarter,sector,what_it_does,founders,investor,amount,stage,column10
0,Aqgromalin,2019.0,Chennai,AgriTech,Cultivating Ideas for Profit,"Prasanna Manogaran, Bharani C L",Angel investors,200000.0,,
1,Krayonnz,2019.0,Bangalore,EdTech,An academy-guardian-scholar centric ecosystem ...,"Saurabh Dixit, Gurudutt Upadhyay",GSF Accelerator,100000.0,Pre-seed,


In [8]:
# High-level overview of the 2020 data: column dtypes, non-null counts and memory usage
df_2020.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1055 entries, 0 to 1054
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   company_brand  1055 non-null   object 
 1   founded        842 non-null    float64
 2   headquarter    961 non-null    object 
 3   sector         1042 non-null   object 
 4   what_it_does   1055 non-null   object 
 5   founders       1043 non-null   object 
 6   investor       1017 non-null   object 
 7   amount         801 non-null    float64
 8   stage          591 non-null    object 
 9   column10       2 non-null      object 
dtypes: float64(2), object(8)
memory usage: 82.6+ KB


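Observations: the 2020 dataset has 1,055 rows and 10 columns — two numeric (`founded`, `amount`) and eight string (object) columns. Every column except `company_brand` and `what_it_does` contains nulls, and `column10` is almost entirely empty (only 2 non-null values).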

In [9]:
# Normalise the 2021 headers: lowercase, with spaces replaced by underscores
df_2021 = df_2021.rename(
    columns={header: header.lower().replace(' ', '_') for header in df_2021.columns}
)
df_2021.head(2)

Unnamed: 0,company_brand,founded,headquarter,sector,what_it_does,founders,investor,amount,stage
0,Unbox Robotics,2019.0,Bangalore,AI startup,Unbox Robotics builds on-demand AI-driven ware...,"Pramod Ghadge, Shahid Memon","BEENEXT, Entrepreneur First","$1,200,000",Pre-series A
1,upGrad,2015.0,Mumbai,EdTech,UpGrad is an online higher education platform.,"Mayank Kumar, Phalgun Kompalli, Ravijot Chugh,...","Unilazer Ventures, IIFL Asset Management","$120,000,000",


In [10]:
# High-level overview of the 2021 data: column dtypes, non-null counts and memory usage
df_2021.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1209 entries, 0 to 1208
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   company_brand  1209 non-null   object 
 1   founded        1208 non-null   float64
 2   headquarter    1208 non-null   object 
 3   sector         1209 non-null   object 
 4   what_it_does   1209 non-null   object 
 5   founders       1205 non-null   object 
 6   investor       1147 non-null   object 
 7   amount         1206 non-null   object 
 8   stage          781 non-null    object 
dtypes: float64(1), object(8)
memory usage: 85.1+ KB


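Observations: the 2021 dataset has 1,209 rows and 9 columns. Only `founded` is numeric; the other eight columns are strings (object), including `amount`, which holds formatted text such as "$1,200,000" rather than floats as in the 2020 data. Six of the nine columns contain nulls, but only `investor` and `stage` are missing more than a handful of values.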

In [11]:
# Third dataset: load the 2019 records from the local CSV file and normalise
# the headers (lowercase, spaces replaced by underscores) in a single chain.
df_2019 = (
    pd.read_csv("startup_funding2019.csv")
    .rename(columns=lambda header: header.lower().replace(' ', '_'))
)
df_2019.head(2)

Unnamed: 0,company/brand,founded,headquarter,sector,what_it_does,founders,investor,amount($),stage
0,Bombay Shaving,,,Ecommerce,Provides a range of male grooming products,Shantanu Deshpande,Sixth Sense Ventures,"$6,300,000",
1,Ruangguru,2014.0,Mumbai,Edtech,A learning platform that provides topic-based ...,"Adamas Belva Syah Devara, Iman Usman.",General Atlantic,"$150,000,000",Series C


In [12]:
# Strip the symbols from the two 2019 headers that still contain them, so the
# column names line up with the other yearly frames when concatenating.
df_2019 = df_2019.rename(
    {
        "company/brand": "company_brand",
        "amount($)": "amount",
    },
    axis="columns",
)

In [16]:
# High-level overview of the 2019 data: column dtypes, non-null counts and memory usage
df_2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89 entries, 0 to 88
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   company_brand  89 non-null     object 
 1   founded        60 non-null     float64
 2   headquarter    70 non-null     object 
 3   sector         84 non-null     object 
 4   what_it_does   89 non-null     object 
 5   founders       86 non-null     object 
 6   investor       89 non-null     object 
 7   amount         89 non-null     object 
 8   stage          43 non-null     object 
dtypes: float64(1), object(8)
memory usage: 6.4+ KB


In [14]:
# Fourth dataset: load the 2018 records from the local CSV file and normalise
# the headers (lowercase, spaces replaced by underscores) in a single chain.
df_2018 = (
    pd.read_csv("startup_funding2018.csv")
    .rename(columns=lambda header: header.lower().replace(' ', '_'))
)
df_2018.head()

Unnamed: 0,company_name,industry,round/series,amount,location,about_company
0,TheCollegeFever,"Brand Marketing, Event Promotion, Marketing, S...",Seed,250000,"Bangalore, Karnataka, India","TheCollegeFever is a hub for fun, fiesta and f..."
1,Happy Cow Dairy,"Agriculture, Farming",Seed,"₹40,000,000","Mumbai, Maharashtra, India",A startup which aggregates milk from dairy far...
2,MyLoanCare,"Credit, Financial Services, Lending, Marketplace",Series A,"₹65,000,000","Gurgaon, Haryana, India",Leading Online Loans Marketplace in India
3,PayMe India,"Financial Services, FinTech",Angel,2000000,"Noida, Uttar Pradesh, India",PayMe India is an innovative FinTech organizat...
4,Eunimart,"E-Commerce Platforms, Retail, SaaS",Seed,—,"Hyderabad, Andhra Pradesh, India",Eunimart is a one stop solution for merchants ...


In [15]:
# Map the 2018 headers onto the column names used by the 2019-2021 frames so
# the shared fields end up in the same columns when concatenating.
df_2018 = df_2018.rename(
    {
        "company_name": "company_brand",
        "industry": "sector",
        "round/series": "stage",
        "location": "headquarter",
        "about_company": "what_it_does",
    },
    axis="columns",
)

In [16]:
# High-level overview of the 2018 data: column dtypes, non-null counts and memory usage
df_2018.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 526 entries, 0 to 525
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   company_brand  526 non-null    object
 1   sector         526 non-null    object
 2   stage          526 non-null    object
 3   amount         526 non-null    object
 4   headquarter    526 non-null    object
 5   what_it_does   526 non-null    object
dtypes: object(6)
memory usage: 24.8+ KB


In [18]:
# Stack the four yearly frames (2018-2021) into one DataFrame with a fresh
# 0..n-1 index; columns that a given year lacks are left empty (NaN).
# NOTE(review): nothing here records which year each row came from — confirm a
# year column is added elsewhere before doing any per-year analysis.
df = pd.concat(
    objs=[df_2018, df_2019, df_2020, df_2021],
    axis=0,
    ignore_index=True,
)
df.head()
# Optional export, currently disabled:
#df.to_csv("startup_2019_2020_2021.csv")

Unnamed: 0,company_brand,sector,stage,amount,headquarter,what_it_does,founded,founders,investor,column10
0,TheCollegeFever,"Brand Marketing, Event Promotion, Marketing, S...",Seed,250000,"Bangalore, Karnataka, India","TheCollegeFever is a hub for fun, fiesta and f...",,,,
1,Happy Cow Dairy,"Agriculture, Farming",Seed,"₹40,000,000","Mumbai, Maharashtra, India",A startup which aggregates milk from dairy far...,,,,
2,MyLoanCare,"Credit, Financial Services, Lending, Marketplace",Series A,"₹65,000,000","Gurgaon, Haryana, India",Leading Online Loans Marketplace in India,,,,
3,PayMe India,"Financial Services, FinTech",Angel,2000000,"Noida, Uttar Pradesh, India",PayMe India is an innovative FinTech organizat...,,,,
4,Eunimart,"E-Commerce Platforms, Retail, SaaS",Seed,—,"Hyderabad, Andhra Pradesh, India",Eunimart is a one stop solution for merchants ...,,,,


In [19]:
#checking shape (rows, columns) of 2018 dataframe
df_2018.shape

(526, 6)

In [18]:
#checking shape (rows, columns) of 2019 dataframe
df_2019.shape

(89, 9)

In [19]:
#checking shape (rows, columns) of 2020 dataframe
df_2020.shape

(1055, 10)

In [20]:
#checking shape (rows, columns) of 2021 dataframe
df_2021.shape

(1209, 9)