In [1]:
# Notebook setup: analysis/plotting libraries plus display configuration.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode
# Use matplotlib's built-in dark theme for any matplotlib figures.
plt.style.use('dark_background')
import warnings
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
# Silence every Python warning for the rest of the session. This also hides
# deprecation and pandas warnings raised by later cells.
warnings.filterwarnings('ignore')
import plotly.io as pio
# Make 'colab' the default plotly renderer used by fig.show().
# NOTE(review): presumably this notebook is run in Google Colab - confirm;
# other front ends may need a different renderer.
pio.renderers.default = 'colab'



In [2]:
# Load the billionaire dataset from a CSV file (path is relative).
data = pd.read_csv(filepath_or_buffer='Billionaire.csv')

In [3]:
# Preview the first five rows to check the columns and value formats.
data.head(n=5)

Unnamed: 0,Name,NetWorth,Country,Source,Rank,Age,Industry
0,Jeff Bezos,$177 B,United States,Amazon,1,57.0,Technology
1,Elon Musk,$151 B,United States,"Tesla, SpaceX",2,49.0,Automotive
2,Bernard Arnault & family,$150 B,France,LVMH,3,72.0,Fashion & Retail
3,Bill Gates,$124 B,United States,Microsoft,4,65.0,Technology
4,Mark Zuckerberg,$97 B,United States,Facebook,5,36.0,Technology


# Data cleaning

In [4]:
# Print column dtypes and non-null counts to spot missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2755 entries, 0 to 2754
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      2755 non-null   object 
 1   NetWorth  2755 non-null   object 
 2   Country   2755 non-null   object 
 3   Source    2755 non-null   object 
 4   Rank      2755 non-null   int64  
 5   Age       2676 non-null   float64
 6   Industry  2755 non-null   object 
dtypes: float64(1), int64(1), object(5)
memory usage: 150.8+ KB


In [5]:
# Count missing values per column.
data.isna().sum()

Unnamed: 0,0
Name,0
NetWorth,0
Country,0
Source,0
Rank,0
Age,79
Industry,0


In [6]:
# Drop every row that has a missing value in any column. The explicit
# .copy() makes `data` an independent frame, so the column assignments in
# later cells are unambiguous and do not rely on pandas' chained-assignment
# (SettingWithCopy) warning being silenced.
data = data.dropna().copy()

In [7]:
# Number of rows that exactly duplicate an earlier row.
data.duplicated(keep='first').sum()

0

In [8]:
# Remove the "$" prefix, the "B" suffix and any whitespace so that values
# such as "$177 B" become "177". Casting to str first keeps this cell
# re-runnable: once NetWorth has been converted to float, calling the
# `.str` accessor directly would raise an AttributeError.
data['NetWorth'] = (
    data['NetWorth']
    .astype(str)
    .str.replace(r'[$B\s]', '', regex=True)
)

In [9]:
# Convert the cleaned net-worth strings to floats.
data = data.assign(NetWorth=lambda frame: frame['NetWorth'].astype(float))

# Exploratory data analysis

In [10]:
# Summary statistics for the numeric columns (quartiles spelled out).
data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,NetWorth,Rank,Age
count,2676.0,2676.0,2676.0
mean,4.796525,1343.791106,63.113602
std,9.743198,773.724884,13.445153
min,1.0,1.0,18.0
25%,1.5,680.0,54.0
50%,2.3,1362.0,63.0
75%,4.2,2035.0,73.0
max,177.0,2674.0,99.0


In [11]:
# Drop the Rank column. Reassigning instead of using inplace=True, together
# with errors='ignore', keeps the cell safe to re-run: a second execution
# is a no-op instead of a KeyError. The redundant `axis` argument is
# dropped because `columns=` already selects the axis.
data = data.drop(columns=['Rank'], errors='ignore')

In [12]:
# Summary statistics again, now that Rank has been removed.
data.describe(percentiles=[0.25, 0.50, 0.75])

Unnamed: 0,NetWorth,Age
count,2676.0,2676.0
mean,4.796525,63.113602
std,9.743198,13.445153
min,1.0,18.0
25%,1.5,54.0
50%,2.3,63.0
75%,4.2,73.0
max,177.0,99.0


- Maximum net worth is $177B, the minimum is $1B, and the average is about $4.8B.
- Maximum age is 99 years and the minimum is 18. The average age is about 63 years.

# Overall analysis

In [13]:
# Ten wealthiest people, richest first.
top_ten = data.sort_values('NetWorth', ascending=False).head(10)

# Bar height and bar color both encode net worth.
fig = px.bar(
    top_ten,
    x='Name',
    y='NetWorth',
    color='NetWorth',
    opacity=0.8,
    template='plotly_dark',
    title='Top 10 billionaires and their networth',
)
fig.show()

In [14]:
# Ten wealthiest people, richest first.
richest = data.sort_values(by='NetWorth', ascending=False).iloc[:10]

# Marker size encodes net worth; marker color encodes the person's country.
fig = px.scatter(
    richest,
    x='Name',
    y='NetWorth',
    size='NetWorth',
    color='Country',
    opacity=0.85,
    template='plotly_dark',
    title='Countries of the top ten billionaires',
)
fig.show()

In [15]:
# Same top-ten selection as the previous plots, colored by industry instead.
scatter_options = dict(
    x='Name',
    y='NetWorth',
    size='NetWorth',
    color='Industry',
    opacity=0.85,
    template='plotly_dark',
    title='Industries of top ten billionaires',
)
fig = px.scatter(
    data.sort_values(by='NetWorth', ascending=False).head(10),
    **scatter_options,
)
fig.show()

# Analysis according to Age

In [16]:
# Histogram of ages; coloring by Age gives every distinct age its own trace.
fig = px.histogram(
    data,
    x='Age',
    color='Age',
    opacity=0.9,
    template='plotly_dark',
    title='age distribution of the billionaires',
)
fig.show()

In [17]:
# Ages of the ten wealthiest people (ordered by net worth, not by age).
wealthiest = data.sort_values(by='NetWorth', ascending=False).head(10)
fig = px.bar(
    wealthiest,
    x='Name',
    y='Age',
    color='Age',
    opacity=0.8,
    template='plotly_dark',
    title='Top 10 billionaires and their age',
)
fig.show()

# Analysis according to country

In [18]:
# Count billionaires per country and keep the ten largest counts.
# rename_axis + reset_index(name=...) yields the columns ['Country', 'count']
# regardless of pandas version, so the axis and colorbar labels describe
# what they actually show (previously the counts sat in a column named
# 'Country' and the country names in a column named 'index').
d1 = (
    data['Country']
    .value_counts()
    .rename_axis('Country')
    .reset_index(name='count')
    .head(10)
)
fig = px.bar(
    d1,
    x='Country',
    y='count',
    color='count',  # continuous color scale driven by the count, as before
    opacity=0.8,
    template='plotly_dark',
    title='Top ten countries with the most billionaires',
)
fig.show()



In [19]:
# The 30 countries at the bottom of the billionaire-count ranking.
d1 = (
    data['Country']
    .value_counts()
    .reset_index()
    .tail(30)
    .set_axis(['Country', 'count'], axis='columns')
)

# One bar per country, colored by country name.
fig = px.bar(
    d1,
    x='Country',
    y='count',
    color='Country',
    opacity=0.8,
    template='plotly_dark',
    title='Countries with the least no of billionaires',
)
fig.show()

# Analysis according to source

In [20]:
# Count billionaires per wealth source and keep the 20 most common sources.
# The columns are named ['Source', 'count'] explicitly so the plot labels
# are accurate (previously the counts were stored in a column named
# 'Source' and the source names in 'index').
D1 = (
    data['Source']
    .value_counts()
    .rename_axis('Source')
    .reset_index(name='count')
    .head(20)
)
fig = px.bar(
    D1,
    x='Source',
    y='count',
    color='count',  # continuous color scale driven by the count, as before
    opacity=0.8,
    template='plotly_dark',
    title='Top 20 Sources with the most billionaires in the world',
)
# Rotate the tick labels so the source names do not overlap.
fig.update_xaxes(tickangle=90)

fig.show()



# USA analysis

In [21]:
# Restrict to billionaires whose country is the United States.
dataus = data[data['Country'] == 'United States']

# Ten most common wealth sources among them. Columns are named
# ['Source', 'count'] explicitly so the axis and colorbar labels match the
# data (previously the counts lived in a column named 'Source').
d1 = (
    dataus['Source']
    .value_counts()
    .rename_axis('Source')
    .reset_index(name='count')
    .head(10)
)

fig = px.bar(
    d1,
    x='Source',
    y='count',
    color='count',  # continuous color scale driven by the count, as before
    opacity=0.8,
    template='plotly_dark',
    title='Top 10 Sources with the most billionaires in USA',
)
fig.show()



In [22]:
# Rows for the United States only.
is_us = data['Country'] == 'United States'
dataus = data.loc[is_us]

# Ten largest US fortunes, richest first.
d2 = dataus.sort_values('NetWorth', ascending=False).head(10)
fig = px.bar(
    d2,
    x='Name',
    y='NetWorth',
    color='NetWorth',
    opacity=0.8,
    template='plotly_dark',
    title='Top ten billionaires in the United States',
)
fig.show()

In [23]:
# Restrict to billionaires whose country is the United States.
dataus = data[data['Country'] == 'United States']

# Ten most common industries among them. Columns are named
# ['Industry', 'count'] explicitly so the pie's labels and hover text match
# the data (previously the counts lived in a column named 'Industry' and
# the industry names in 'index').
d1 = (
    dataus['Industry']
    .value_counts()
    .rename_axis('Industry')
    .reset_index(name='count')
    .head(10)
)
fig = px.pie(
    d1,
    names='Industry',
    values='count',
    opacity=0.8,
    template='plotly_dark',
    title='Top 10 Industries with the most billionaires in USA',
)
fig.show()


# CHINA analysis

In [24]:
# Rows for China only.
is_china = data['Country'] == 'China'
datachina = data.loc[is_china]

# Ten largest Chinese fortunes, richest first.
d2 = datachina.sort_values('NetWorth', ascending=False).head(10)
fig = px.bar(
    d2,
    x='Name',
    y='NetWorth',
    color='NetWorth',
    opacity=0.8,
    template='plotly_dark',
    title='Top ten billionaires in China',
)
fig.show()

In [25]:
# Restrict to billionaires whose country is China.
datachina = data[data['Country'] == 'China']

# Ten most common wealth sources among them. Columns are named
# ['Source', 'count'] explicitly so the axis and colorbar labels match the
# data (previously the counts lived in a column named 'Source').
d1 = (
    datachina['Source']
    .value_counts()
    .rename_axis('Source')
    .reset_index(name='count')
    .head(10)
)
fig = px.bar(
    d1,
    x='Source',
    y='count',
    color='count',  # continuous color scale driven by the count, as before
    opacity=0.8,
    template='plotly_dark',
    title='Top 10 Sources with the most billionaires in China',
)
fig.show()



In [26]:
# Restrict to billionaires whose country is China.
datachina = data[data['Country'] == 'China']

# Ten most common industries among them. Columns are named
# ['Industry', 'count'] explicitly so the pie's labels and hover text match
# the data (previously the counts lived in a column named 'Industry' and
# the industry names in 'index').
d1 = (
    datachina['Industry']
    .value_counts()
    .rename_axis('Industry')
    .reset_index(name='count')
    .head(10)
)
fig = px.pie(
    d1,
    names='Industry',
    values='count',
    opacity=0.8,
    template='plotly_dark',
    title='Top 10 Industries with the most billionaires in China',
)
fig.show()


## Analysis according to location

In [27]:
# Total net worth per country, with Country kept as a regular column.
d3 = data.groupby("Country", as_index=False)['NetWorth'].sum()

# World map shaded by each country's summed net worth; countries are
# matched to map regions by name.
fig = px.choropleth(
    d3,
    locations='Country',
    locationmode='country names',
    color='NetWorth',
    template='plotly_dark',
    title='This map shows the total networth of billionaires in various countries',
)
fig.show()

In [28]:
# Sum net worth per country, rank the totals from largest to smallest and
# keep the first 20 countries.
networth_by_country = data.groupby("Country")['NetWorth'].sum()
d4 = (
    networth_by_country
    .sort_values(ascending=False)
    .reset_index()
    .iloc[:20]
)

# World map showing only those 20 countries, shaded by total net worth.
fig = px.choropleth(
    d4,
    locations='Country',
    locationmode='country names',
    color='NetWorth',
    template='plotly_dark',
    title='This map shows top 20 countries by the total networth of billionaires',
)
fig.show()

- Jeff Bezos and Elon Musk, both from the United States, are the richest people in the world; each has a net worth of more than $150 billion.
- Real Estate and Pharmaceuticals are the sources with the most billionaires in the world.
- Finance & investment and Technology are the industries with the most billionaires in the world.
- The countries with the most billionaires are the United States and China; together, over 66 percent of billionaires are from these two countries.
- Hedge funds and Real Estate are the sources with the most billionaires in the United States.