## Libraries

In [2]:
import pandas as pd
import numpy as np 
import plotly.express as px
import matplotlib.pyplot as plt



In [3]:
uc = pd.read_csv("/content/Unicorn_Companies.csv")

In [4]:
uc.head()

Unnamed: 0,Company,Valuation,Date Joined,Industry,City,Country,Continent,Year Founded,Funding,Select Investors
0,Bytedance,$180B,2017-04-07,Artificial intelligence,Beijing,China,Asia,2012,$8B,"Sequoia Capital China, SIG Asia Investments, S..."
1,SpaceX,$100B,2012-12-01,Other,Hawthorne,United States,North America,2002,$7B,"Founders Fund, Draper Fisher Jurvetson, Rothen..."
2,SHEIN,$100B,2018-07-03,E-commerce & direct-to-consumer,Shenzhen,China,Asia,2008,$2B,"Tiger Global Management, Sequoia Capital China..."
3,Stripe,$95B,2014-01-23,Fintech,San Francisco,United States,North America,2010,$2B,"Khosla Ventures, LowercaseCapital, capitalG"
4,Klarna,$46B,2011-12-12,Fintech,Stockholm,Sweden,Europe,2005,$4B,"Institutional Venture Partners, Sequoia Capita..."


In [5]:
uc[uc.Company == 'Workhuman']

Unnamed: 0,Company,Valuation,Date Joined,Industry,City,Country,Continent,Year Founded,Funding,Select Investors
792,Workhuman,$1B,2020-06-23,Internet software & services,Dublin,Ireland,Europe,1999,$9M,ICG


In [6]:
len(uc.Funding)

1074

In [7]:
#for _ in len(uc.Funding):
#x = pd.DataFrame()
#if (uc.Funding.astype("str").str.get(4) == 'M').any():
 #     x['Funding'] = uc.Funding.str.extract('(\d+)')
  #    x['Funding'] = pd.to_numeric(x.Funding) 
   #   uc.Funding / 1000
    #  print(uc.Funding)
#uc['Funding'] = uc.Funding.str.extract('(\d+)')

In [8]:
dd = pd.read_csv("/content/Data_Dictionary.csv") # description for every column in (uc) dataframe

In [9]:
dd.head()

Unnamed: 0,Field,Description
0,Company,Company name
1,Valuation,Company valuation in billions (B) of dollars
2,Date Joined,The date in which the company reached $1 billi...
3,Industry,Company industry
4,City,City the company was founded in


## Let's start cleaning some data 🧹

In [10]:
uc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1074 entries, 0 to 1073
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Company           1074 non-null   object
 1   Valuation         1074 non-null   object
 2   Date Joined       1074 non-null   object
 3   Industry          1074 non-null   object
 4   City              1058 non-null   object
 5   Country           1074 non-null   object
 6   Continent         1074 non-null   object
 7   Year Founded      1074 non-null   int64 
 8   Funding           1074 non-null   object
 9   Select Investors  1073 non-null   object
dtypes: int64(1), object(9)
memory usage: 84.0+ KB


In [11]:
uc.columns

Index(['Company', 'Valuation', 'Date Joined', 'Industry', 'City', 'Country',
       'Continent', 'Year Founded', 'Funding', 'Select Investors'],
      dtype='object')

> We need to wrangle both "Valuation" and "Funding" columns to integers 

In [12]:
uc['Valuation'] = uc['Valuation'].astype("string")    

uc['Funding'] = uc['Funding'].astype("string") 

uc['Valuation'] = uc.Valuation.str.extract('(\d+)')

uc['Funding'] = uc.Funding.str.extract('(\d+)')

In [13]:
uc['Valuation'] = uc.Valuation.astype("float")

uc['Funding'] = pd.to_numeric(uc.Funding)


In [14]:
uc['Year Founded'] = pd.to_datetime(uc['Year Founded'])
uc['Date Joined'] = pd.to_datetime(uc['Date Joined'])

In [15]:
uc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1074 entries, 0 to 1073
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Company           1074 non-null   object        
 1   Valuation         1074 non-null   float64       
 2   Date Joined       1074 non-null   datetime64[ns]
 3   Industry          1074 non-null   object        
 4   City              1058 non-null   object        
 5   Country           1074 non-null   object        
 6   Continent         1074 non-null   object        
 7   Year Founded      1074 non-null   datetime64[ns]
 8   Funding           1062 non-null   float64       
 9   Select Investors  1073 non-null   object        
dtypes: datetime64[ns](2), float64(2), object(6)
memory usage: 84.0+ KB


In [16]:
uc.describe()

Unnamed: 0,Valuation,Funding
count,1074.0,1062.0
mean,3.455307,338.091337
std,8.547022,237.333149
min,1.0,0.0
25%,1.0,166.0
50%,2.0,300.0
75%,3.0,491.5
max,180.0,999.0


In [17]:
uc['Industry'].value_counts()

Fintech                                224
Internet software & services           205
E-commerce & direct-to-consumer        111
Health                                  74
Artificial intelligence                 73
Other                                   58
Supply chain, logistics, & delivery     57
Cybersecurity                           50
Data management & analytics             41
Mobile & telecommunications             38
Hardware                                34
Auto & transportation                   31
Edtech                                  28
Consumer & retail                       25
Travel                                  14
Artificial Intelligence                 11
Name: Industry, dtype: int64

In [18]:
uc['Industry'] = uc.Industry.astype("str").str.title()

In [19]:
px.bar(uc['Industry'].value_counts())


In [20]:
uc.head()

Unnamed: 0,Company,Valuation,Date Joined,Industry,City,Country,Continent,Year Founded,Funding,Select Investors
0,Bytedance,180.0,2017-04-07,Artificial Intelligence,Beijing,China,Asia,1970-01-01 00:00:00.000002012,8.0,"Sequoia Capital China, SIG Asia Investments, S..."
1,SpaceX,100.0,2012-12-01,Other,Hawthorne,United States,North America,1970-01-01 00:00:00.000002002,7.0,"Founders Fund, Draper Fisher Jurvetson, Rothen..."
2,SHEIN,100.0,2018-07-03,E-Commerce & Direct-To-Consumer,Shenzhen,China,Asia,1970-01-01 00:00:00.000002008,2.0,"Tiger Global Management, Sequoia Capital China..."
3,Stripe,95.0,2014-01-23,Fintech,San Francisco,United States,North America,1970-01-01 00:00:00.000002010,2.0,"Khosla Ventures, LowercaseCapital, capitalG"
4,Klarna,46.0,2011-12-12,Fintech,Stockholm,Sweden,Europe,1970-01-01 00:00:00.000002005,4.0,"Institutional Venture Partners, Sequoia Capita..."


In [21]:
px.bar(uc.Country.value_counts())

In [22]:
uc.head()

Unnamed: 0,Company,Valuation,Date Joined,Industry,City,Country,Continent,Year Founded,Funding,Select Investors
0,Bytedance,180.0,2017-04-07,Artificial Intelligence,Beijing,China,Asia,1970-01-01 00:00:00.000002012,8.0,"Sequoia Capital China, SIG Asia Investments, S..."
1,SpaceX,100.0,2012-12-01,Other,Hawthorne,United States,North America,1970-01-01 00:00:00.000002002,7.0,"Founders Fund, Draper Fisher Jurvetson, Rothen..."
2,SHEIN,100.0,2018-07-03,E-Commerce & Direct-To-Consumer,Shenzhen,China,Asia,1970-01-01 00:00:00.000002008,2.0,"Tiger Global Management, Sequoia Capital China..."
3,Stripe,95.0,2014-01-23,Fintech,San Francisco,United States,North America,1970-01-01 00:00:00.000002010,2.0,"Khosla Ventures, LowercaseCapital, capitalG"
4,Klarna,46.0,2011-12-12,Fintech,Stockholm,Sweden,Europe,1970-01-01 00:00:00.000002005,4.0,"Institutional Venture Partners, Sequoia Capita..."


In [23]:
uc.columns = [columns.replace(" ", "_") for columns in uc.columns]

x = (uc['Date_Joined'] <= "2012-12-31") & (uc['Valuation'] >= 1.0) 

len(uc[x])

7

In [24]:
uc.head()

Unnamed: 0,Company,Valuation,Date_Joined,Industry,City,Country,Continent,Year_Founded,Funding,Select_Investors
0,Bytedance,180.0,2017-04-07,Artificial Intelligence,Beijing,China,Asia,1970-01-01 00:00:00.000002012,8.0,"Sequoia Capital China, SIG Asia Investments, S..."
1,SpaceX,100.0,2012-12-01,Other,Hawthorne,United States,North America,1970-01-01 00:00:00.000002002,7.0,"Founders Fund, Draper Fisher Jurvetson, Rothen..."
2,SHEIN,100.0,2018-07-03,E-Commerce & Direct-To-Consumer,Shenzhen,China,Asia,1970-01-01 00:00:00.000002008,2.0,"Tiger Global Management, Sequoia Capital China..."
3,Stripe,95.0,2014-01-23,Fintech,San Francisco,United States,North America,1970-01-01 00:00:00.000002010,2.0,"Khosla Ventures, LowercaseCapital, capitalG"
4,Klarna,46.0,2011-12-12,Fintech,Stockholm,Sweden,Europe,1970-01-01 00:00:00.000002005,4.0,"Institutional Venture Partners, Sequoia Capita..."


> We can see that before the beginning of 2013 there was only 7 unicorn companies 📅

In [25]:

len(uc.query("Date_Joined > '2012-12-31' and Valuation >=1.0"))

1067

And 1067 unicorn companies after 2012 

In [29]:
def norm(x):

  if x > 9:
    x = x / 1000 

  return x 

In [32]:
uc['Funding'] = uc['Funding'].apply(norm)


In [35]:
group = uc.groupby("Funding")['Valuation'].mean()

In [36]:
px.scatter(group)

In [39]:
px.line(group)