In [3]:
import pandas as pd
import numpy as np

Import dataset

In [4]:
#Avoid Unnamed columns
columns = ["Rank", "Name", "Total Net Worth", "$ Last Change", "$ YTD Change", "Country", "Industry"]
#Specify file path
file_path = "500_richest_people_2021.csv"
#read df, specify separator
richest_people = pd.read_csv(file_path, sep=';', usecols=columns)
#show dataframe
richest_people

Unnamed: 0,Rank,Name,Total Net Worth,$ Last Change,$ YTD Change,Country,Industry
0,1.0,Jeff Bezos,$188B,+$1.68B,-$2.31B,United States,Technology
1,2.0,Elon Musk,$170B,-$2.89B,+$773M,United States,Technology
2,3.0,Bernard Arnault,$155B,+$892M,+$40.9B,France,Consumer
3,4.0,Bill Gates,$144B,-$1.32B,+$12.2B,United States,Technology
4,5.0,Mark Zuckerberg,$114B,+$203M,+$10.9B,United States,Technology
...,...,...,...,...,...,...,...
498,500.0,Odd Reitan,$5.72B,-$19.9M,+$669M,Norway,Food & Beverage
499,,,,,,,
500,,,,,,,
501,,,,,,,


Clean Dataset

In [5]:
# Eliminate duplicate values
richest_people = richest_people.drop_duplicates()

#remove rows with missing values 
richest_people.dropna(inplace=True)

#remove unnecessary rows
richest_people = richest_people.drop(columns=['$ YTD Change', '$ Last Change'])

# Replace spaces with underscores in the column names
richest_people.columns = richest_people.columns.str.replace(' ', '_')

#check datatypes
print(richest_people.dtypes)






Rank               float64
Name                object
Total_Net_Worth     object
Country             object
Industry            object
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  richest_people.dropna(inplace=True)


In [6]:
# Remove non-numeric characters from the 'Total_Net_Worth' column
richest_people['Total_Net_Worth'] = richest_people['Total_Net_Worth'].str.replace(r'[^\d.]', '', regex=True)


In [7]:
# Convert the 'Total_Net_Worth' column to float, converting non-numeric values to NaN
richest_people['Total_Net_Worth'] = pd.to_numeric(richest_people['Total_Net_Worth'], errors='coerce')

# Fill NaN values with 0
richest_people['Total_Net_Worth'] = richest_people['Total_Net_Worth'].fillna(0)

# Round the values to 2 decimal places
richest_people['Total_Net_Worth'] = richest_people['Total_Net_Worth'].round(2)

# Rename the 'Total_Net_Worth' column to 'Total_Networth_Billions'
richest_people = richest_people.rename(columns={'Total_Net_Worth': 'Total_Networth_Billions'})

# Print the DataFrame to verify the changes
print(richest_people)

#Print datatypes to verify changes
print(richest_people.dtypes)

      Rank                    Name  Total_Networth_Billions        Country  \
0      1.0              Jeff Bezos                   188.00  United States   
1      2.0               Elon Musk                   170.00  United States   
2      3.0         Bernard Arnault                   155.00         France   
3      4.0              Bill Gates                   144.00  United States   
4      5.0         Mark Zuckerberg                   114.00  United States   
..     ...                     ...                      ...            ...   
494  496.0             Lino Saputo                     5.75         Canada   
495  497.0        Prajogo Pangestu                     5.74      Indonesia   
496  498.0  Charles Dolan & Family                     5.74  United States   
497  499.0            Vladimir Kim                     5.72     Kazakhstan   
498  500.0              Odd Reitan                     5.72         Norway   

            Industry  
0         Technology  
1         Technol

In [11]:
# Save the DataFrame as a CSV file
richest_people.to_csv('500_richest_people_2021_clean.csv', index=False)

In [8]:
richest_people

Unnamed: 0,Rank,Name,Total_Networth_Billions,Country,Industry
0,1.0,Jeff Bezos,188.00,United States,Technology
1,2.0,Elon Musk,170.00,United States,Technology
2,3.0,Bernard Arnault,155.00,France,Consumer
3,4.0,Bill Gates,144.00,United States,Technology
4,5.0,Mark Zuckerberg,114.00,United States,Technology
...,...,...,...,...,...
494,496.0,Lino Saputo,5.75,Canada,Food & Beverage
495,497.0,Prajogo Pangestu,5.74,Indonesia,Energy
496,498.0,Charles Dolan & Family,5.74,United States,Media & Telecom
497,499.0,Vladimir Kim,5.72,Kazakhstan,Commodities


In [11]:
# Create the country_money DataFrame
country_money = richest_people.groupby('Country')['Total_Networth_Billions'].sum().reset_index()
country_money.columns = ['Country', 'Total_Networth_Billions']

# Sort the DataFrame in descending order by 'Total_Networth_Billions'
country_money = country_money.sort_values(by='Total_Networth_Billions', ascending=False)

# Reset the index and add a column numbered from 1
country_money.reset_index(drop=True, inplace=True)
country_money.index += 1

# Show the country_money DataFrame
print(country_money)

                 Country  Total_Networth_Billions
1          United States                  3221.41
2                  China                  1194.78
3                 France                   468.86
4                 Russia                   373.37
5                Germany                   371.34
6                  India                   338.89
7              Hong Kong                   275.11
8         United Kingdom                   178.64
9                  Japan                   132.17
10             Australia                   118.97
11                Sweden                   117.34
12             Singapore                   115.02
13                 Italy                   105.15
14                Canada                   104.44
15                Mexico                    98.29
16                Brazil                    80.00
17                Spain                     79.30
18           Switzerland                    73.82
19                 Korea                    63.28
