In [16]:
#Importing Libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [17]:
# Location of the raw unicorn list on disk -- adjust this path for your machine
unicorns_csv_path = '/Users/eleni_icon/Unicorns/2 Data/Raw data/List of Unicorns in the World.csv'

# Read the raw CSV into a DataFrame
unicorns_data = pd.read_csv(filepath_or_buffer=unicorns_csv_path)

In [18]:
# Inspect the freshly loaded frame: dtype of every column and the number of
# null entries per column
initial_data_types = unicorns_data.dtypes
missing_values = unicorns_data.isna().sum()

(initial_data_types, missing_values)

(Unnamed: 0         int64
 Company           object
 Valuation ($B)    object
 Date Joined       object
 Country           object
 City              object
 Industry          object
 dtype: object,
 Unnamed: 0        0
 Company           0
 Valuation ($B)    0
 Date Joined       0
 Country           0
 City              0
 Industry          0
 dtype: int64)

In [19]:
# Remove the redundant 'Unnamed: 0' column.
# A non-inplace drop with errors='ignore' keeps this cell idempotent: running it
# a second time (when the column is already gone) no longer raises KeyError.
unicorns_data = unicorns_data.drop(columns=['Unnamed: 0'], errors='ignore')

In [20]:
# Convert 'Valuation ($B)' to numeric: strip '$', ',' and 'B', then cast to float.
# The pattern is written as a raw string so that '\$' reaches the regex engine
# as an escaped dollar sign instead of being an invalid Python string escape
# (DeprecationWarning since Python 3.6, SyntaxWarning on Python 3.12+).
unicorns_data['Valuation ($B)'] = (
    unicorns_data['Valuation ($B)']
    .replace(r'[\$,B]', '', regex=True)
    .astype(float)
)

In [21]:
# Parse the 'Date Joined' column into pandas datetime values
date_joined_raw = unicorns_data['Date Joined']
unicorns_data['Date Joined'] = pd.to_datetime(date_joined_raw)

In [22]:
# Confirm the cleaning steps: preview the first rows alongside the updated dtypes
cleaned_data_types = unicorns_data.dtypes
preview_rows = unicorns_data.head()

(preview_rows, cleaned_data_types)

(     Company  Valuation ($B) Date Joined        Country            City  \
 0  ByteDance           225.0  2017-04-07          China         Beijing   
 1     SpaceX           150.0  2012-12-01  United States       Hawthorne   
 2     OpenAI            80.0  2019-07-22  United States   San Francisco   
 3      SHEIN            66.0  2018-07-03      Singapore  Singapore City   
 4     Stripe            65.0  2014-01-23  United States   San Francisco   
 
                 Industry  
 0  Media & Entertainment  
 1            Industrials  
 2        Enterprise Tech  
 3      Consumer & Retail  
 4     Financial Services  ,
 Company                   object
 Valuation ($B)           float64
 Date Joined       datetime64[ns]
 Country                   object
 City                      object
 Industry                  object
 dtype: object)

# Descriptive Statistics

In [23]:
# Summary statistics (count, mean, std, min, quartiles, max) for 'Valuation ($B)'
valuation_column = unicorns_data['Valuation ($B)']
valuation_stats = valuation_column.describe()

In [24]:
# First and last dates on which a company in the dataset joined the unicorn list
date_joined = unicorns_data['Date Joined']
earliest_date, most_recent_date = date_joined.min(), date_joined.max()

In [25]:
# Tally how many unicorns fall under each country and under each industry
unicorns_per_country, unicorns_per_industry = (
    unicorns_data[column].value_counts() for column in ('Country', 'Industry')
)

In [26]:
# Derive the calendar year in which each company joined, then count new unicorns
# per year in chronological order
year_joined = unicorns_data['Date Joined'].dt.year
unicorns_data['Year Joined'] = year_joined
unicorns_per_year = unicorns_data['Year Joined'].value_counts().sort_index()

# Display every summary computed in the descriptive-statistics cells
(
    valuation_stats,
    earliest_date,
    most_recent_date,
    unicorns_per_country,
    unicorns_per_industry,
    unicorns_per_year,
)

(count    1233.000000
 mean        3.124501
 std         8.871706
 min         1.000000
 25%         1.080000
 50%         1.500000
 75%         3.000000
 max       225.000000
 Name: Valuation ($B), dtype: float64,
 Timestamp('2007-07-02 00:00:00'),
 Timestamp('2024-03-26 00:00:00'),
 Country
 United States           656
 China                   168
 India                    71
 United Kingdom           54
 Germany                  31
 France                   26
 Israel                   25
 Canada                   21
 Brazil                   17
 Singapore                17
 South Korea              15
 Australia                 9
 Netherlands               9
 Mexico                    8
 Hong Kong                 7
 Ireland                   7
 Japan                     7
 Indonesia                 7
 Switzerland               6
 Sweden                    6
 Spain                     5
 United Arab Emirates      5
 Finland                   4
 Norway                    4
 Italy    

Data Profile Summary
Descriptive Statistics for Valuation:

Count: 1233 companies

Mean Valuation: $3.12 billion

Median Valuation: $1.5 billion

Minimum Valuation: $1 billion

Maximum Valuation: $225 billion

Standard Deviation: $8.87 billion

Timeline Analysis:

Earliest Unicorn: Joined on July 2, 2007
    
Most Recent Unicorn: Joined on March 26, 2024

Frequency of Companies Achieving Unicorn Status per Year:

The number of unicorns peaked in 2021 with 503 new companies achieving this status.
After the 2021 peak, the number of new unicorns decreased noticeably in the following years.

Distribution by Geography and Sector:

Countries with Most Unicorns: United States (656), China (168), India (71)
    
Industries with Most Unicorns: Enterprise Tech (388), Financial Services (220), Consumer & Retail (213)

In [27]:
# Write the cleaned dataset to the prepared-data folder, omitting the row index
cleaned_csv_path = '/Users/eleni_icon/Unicorns/2 Data/Prepared data/Cleaned_Unicorns_Data.csv'
unicorns_data.to_csv(cleaned_csv_path, index=False)