# Web scraping data about countries

This is my first data analysis project, including scraping two websites using BeatufilSoup and Numpy with Pandas for data cleaning. The collected data is merged into a single dataframe and used for visualization in Power BI, available on GitHub.


The code below works as of 08/14/2023.<br>

Scraped websites:<br>
https://en.wikipedia.org/wiki/List_of_countries_by_stock_market_capitalization <br>
https://www.worldometers.info/world-population/population-by-country/

In [1]:
from bs4 import BeautifulSoup as bs
import requests
import re
import pandas as pd
import numpy as np

In [2]:
pd.options.display.max_rows = 999

## First table

In [3]:
url1 = 'https://en.wikipedia.org/wiki/List_of_countries_by_stock_market_capitalization'

page1 = requests.get(url1)
soup1 = bs(page1.text, 'html')

In [4]:
table1 = soup1.find_all('table')[0]

In [5]:
print(table1)

<table class="wikitable sortable static-row-numbers">
<tbody><tr>
<th>Country
</th>
<th>Total market cap<br/>(in <b>mil.</b> US$)<sup class="reference" id="cite_ref-2"><a href="#cite_note-2">[2]</a></sup>
</th>
<th>Total market cap<br/>(% of <a href="/wiki/Gross_domestic_product" title="Gross domestic product">GDP</a>)<sup class="reference" id="cite_ref-3"><a href="#cite_note-3">[3]</a></sup>
</th>
<th>Number of domestic<br/>companies listed<sup class="reference" id="cite_ref-4"><a href="#cite_note-4">[4]</a></sup>
</th>
<th>Year
</th></tr>
<tr>
<td><span class="flagicon"><span class="mw-image-border" typeof="mw:File"><span><img alt="" class="mw-file-element" data-file-height="650" data-file-width="1235" decoding="async" height="12" src="//upload.wikimedia.org/wikipedia/en/thumb/a/a4/Flag_of_the_United_States.svg/23px-Flag_of_the_United_States.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/a/a4/Flag_of_the_United_States.svg/35px-Flag_of_the_United_States.svg.png 1.5x, //upl

In [6]:
titles = table1.find_all('th')

In [7]:
titles

[<th>Country
 </th>,
 <th>Total market cap<br/>(in <b>mil.</b> US$)<sup class="reference" id="cite_ref-2"><a href="#cite_note-2">[2]</a></sup>
 </th>,
 <th>Total market cap<br/>(% of <a href="/wiki/Gross_domestic_product" title="Gross domestic product">GDP</a>)<sup class="reference" id="cite_ref-3"><a href="#cite_note-3">[3]</a></sup>
 </th>,
 <th>Number of domestic<br/>companies listed<sup class="reference" id="cite_ref-4"><a href="#cite_note-4">[4]</a></sup>
 </th>,
 <th>Year
 </th>]

In [8]:
titles_names = [title.text.strip() for title in titles]

In [9]:
titles_names

['Country',
 'Total market cap(in mil. US$)[2]',
 'Total market cap(% of GDP)[3]',
 'Number of domesticcompanies listed[4]',
 'Year']

In [10]:
#removing [digit] and adding spaces
clean_titles = []
for phrase in titles_names:
    clean_title = re.sub(r'\[\d*\]', '', phrase)
    clean_title = re.sub(r'\(', ' (', clean_title)
    clean_title = re.sub(r'cc', 'c c', clean_title)
    clean_titles.append(clean_title)
print(clean_titles)

['Country', 'Total market cap (in mil. US$)', 'Total market cap (% of GDP)', 'Number of domestic companies listed', 'Year']


In [11]:
df1 = pd.DataFrame(columns = clean_titles)

In [12]:
#names of columns
df1

Unnamed: 0,Country,Total market cap (in mil. US$),Total market cap (% of GDP),Number of domestic companies listed,Year


In [13]:
column_values1 =  table1.find_all('tr')

In [14]:
#adding rows to a dataframe
for row in column_values1[1:]:
    row_data = row.find_all('td')
    row_values = [data.text.strip() for data in row_data]
    length = len(df1)
    df1.loc[length] = row_values

In [15]:
df1

Unnamed: 0,Country,Total market cap (in mil. US$),Total market cap (% of GDP),Number of domestic companies listed,Year
0,United States,44719661,194.5,4266,2020
1,China,13214311,83.0,4154,2020
2,Japan,6718220,122.2,3754,2020
3,Hong Kong,6130420,1768.8,2353,2020
4,India,"3,612,985[5]",103.0,5270,2023
5,France,2823000,84.9,457,2022
6,United Kingdom,2821000,100.0,1858,2022
7,Canada,2641455,160.7,3922,2020
8,Saudi Arabia,2429102,347.0,207,2020
9,Germany,2284109,60.0,438,2020


In [16]:
#cleaning data
cols_to_check = df1.columns[1:]

df1[cols_to_check] = df1[cols_to_check].replace({',' : ''}, regex=True)
df1[cols_to_check] = df1[cols_to_check].replace({'\.\.\.' : '0'}, regex=True)
df1['Total market cap (in mil. US$)'] = df1['Total market cap (in mil. US$)'].replace({'\[\d*\]' : '', }, regex=True)

#first dataframe ready
df1.head()

Unnamed: 0,Country,Total market cap (in mil. US$),Total market cap (% of GDP),Number of domestic companies listed,Year
0,United States,44719661,194.5,4266,2020
1,China,13214311,83.0,4154,2020
2,Japan,6718220,122.2,3754,2020
3,Hong Kong,6130420,1768.8,2353,2020
4,India,3612985,103.0,5270,2023


## Second table

In [17]:
url2 = 'https://www.worldometers.info/world-population/population-by-country/'

page2 = requests.get(url2)
soup2 = bs(page2.text, 'html')

In [18]:
table2 = soup2.find('table')
table2

<table cellspacing="0" class="table table-striped table-bordered" id="example2" width="100%"> <thead> <tr> <th>#</th> <th>Country (or dependency)</th> <th>Population<br/> (2023)</th> <th>Yearly<br/> Change</th> <th>Net<br/> Change</th> <th>Density<br/> (P/Km²)</th> <th>Land Area<br/> (Km²)</th> <th>Migrants<br/> (net)</th> <th>Fert.<br/> Rate</th> <th>Med.<br/> Age</th> <th>Urban<br/> Pop %</th> <th>World<br/> Share</th> </tr> </thead> <tbody> <tr> <td>1</td> <td style="font-weight: bold; font-size:15px; text-align:left"><a href="/world-population/india-population/">India</a></td> <td style="font-weight: bold;">1,428,627,663</td> <td>0.81 %</td> <td>11,454,490</td> <td>481</td> <td>2,973,190</td> <td>-486,136</td> <td>2.0</td> <td>28</td> <td>36 %</td> <td>17.76 %</td> </tr> <tr> <td>2</td> <td style="font-weight: bold; font-size:15px; text-align:left"><a href="/world-population/china-population/">China</a></td> <td style="font-weight: bold;">1,425,671,352</td> <td>-0.02 %</td> <td>-21

In [19]:
table2_columns = table2.find_all('th')[1:]
table2_columns

[<th>Country (or dependency)</th>,
 <th>Population<br/> (2023)</th>,
 <th>Yearly<br/> Change</th>,
 <th>Net<br/> Change</th>,
 <th>Density<br/> (P/Km²)</th>,
 <th>Land Area<br/> (Km²)</th>,
 <th>Migrants<br/> (net)</th>,
 <th>Fert.<br/> Rate</th>,
 <th>Med.<br/> Age</th>,
 <th>Urban<br/> Pop %</th>,
 <th>World<br/> Share</th>]

In [20]:
table2_columns_names = [col.text.strip() for col in table2_columns]
table2_columns_names

['Country (or dependency)',
 'Population (2023)',
 'Yearly Change',
 'Net Change',
 'Density (P/Km²)',
 'Land Area (Km²)',
 'Migrants (net)',
 'Fert. Rate',
 'Med. Age',
 'Urban Pop %',
 'World Share']

In [21]:
df2 = pd.DataFrame(columns = table2_columns_names)
df2

Unnamed: 0,Country (or dependency),Population (2023),Yearly Change,Net Change,Density (P/Km²),Land Area (Km²),Migrants (net),Fert. Rate,Med. Age,Urban Pop %,World Share


In [22]:
column2_values =  table2.find_all('tr')[1:]
column2_values

[<tr> <td>1</td> <td style="font-weight: bold; font-size:15px; text-align:left"><a href="/world-population/india-population/">India</a></td> <td style="font-weight: bold;">1,428,627,663</td> <td>0.81 %</td> <td>11,454,490</td> <td>481</td> <td>2,973,190</td> <td>-486,136</td> <td>2.0</td> <td>28</td> <td>36 %</td> <td>17.76 %</td> </tr>,
 <tr> <td>2</td> <td style="font-weight: bold; font-size:15px; text-align:left"><a href="/world-population/china-population/">China</a></td> <td style="font-weight: bold;">1,425,671,352</td> <td>-0.02 %</td> <td>-215,985</td> <td>152</td> <td>9,388,211</td> <td>-310,220</td> <td>1.2</td> <td>39</td> <td>65 %</td> <td>17.72 %</td> </tr>,
 <tr> <td>3</td> <td style="font-weight: bold; font-size:15px; text-align:left"><a href="/world-population/us-population/">United States</a></td> <td style="font-weight: bold;">339,996,563</td> <td>0.50 %</td> <td>1,706,706</td> <td>37</td> <td>9,147,420</td> <td>999,700</td> <td>1.7</td> <td>38</td> <td>83 %</td> <td>4

In [23]:
#adding rows to a dataframe
for row in column2_values:
    row_data = row.find_all('td')[1:]
    row_values = [data.text.strip() for data in row_data]
    length = len(df2)
    df2.loc[length] = row_values
df2

Unnamed: 0,Country (or dependency),Population (2023),Yearly Change,Net Change,Density (P/Km²),Land Area (Km²),Migrants (net),Fert. Rate,Med. Age,Urban Pop %,World Share
0,India,1428627663,0.81 %,11454490,481,2973190,-486136,2.0,28.0,36 %,17.76 %
1,China,1425671352,-0.02 %,-215985,152,9388211,-310220,1.2,39.0,65 %,17.72 %
2,United States,339996563,0.50 %,1706706,37,9147420,999700,1.7,38.0,83 %,4.23 %
3,Indonesia,277534122,0.74 %,2032783,153,1811570,-49997,2.1,30.0,59 %,3.45 %
4,Pakistan,240485658,1.98 %,4660796,312,770880,-165988,3.3,21.0,35 %,2.99 %
5,Nigeria,223804632,2.41 %,5263420,246,910770,-59996,5.1,17.0,54 %,2.78 %
6,Brazil,216422446,0.52 %,1108948,26,8358140,6000,1.6,34.0,88 %,2.69 %
7,Bangladesh,172954319,1.03 %,1767947,1329,130170,-309977,1.9,27.0,41 %,2.15 %
8,Russia,144444359,-0.19 %,-268955,9,16376870,-136414,1.5,39.0,75 %,1.80 %
9,Mexico,128455567,0.75 %,951442,66,1943950,-50239,1.8,30.0,88 %,1.60 %


In [24]:
#removing % from values in dataframe
percent_columns2 = ['Yearly Change', 'Urban Pop %', 'World Share']
for col in percent_columns2:
    df2[col] = df2[col].str.strip('%')

df2

Unnamed: 0,Country (or dependency),Population (2023),Yearly Change,Net Change,Density (P/Km²),Land Area (Km²),Migrants (net),Fert. Rate,Med. Age,Urban Pop %,World Share
0,India,1428627663,0.81,11454490,481,2973190,-486136,2.0,28.0,36,17.76
1,China,1425671352,-0.02,-215985,152,9388211,-310220,1.2,39.0,65,17.72
2,United States,339996563,0.5,1706706,37,9147420,999700,1.7,38.0,83,4.23
3,Indonesia,277534122,0.74,2032783,153,1811570,-49997,2.1,30.0,59,3.45
4,Pakistan,240485658,1.98,4660796,312,770880,-165988,3.3,21.0,35,2.99
5,Nigeria,223804632,2.41,5263420,246,910770,-59996,5.1,17.0,54,2.78
6,Brazil,216422446,0.52,1108948,26,8358140,6000,1.6,34.0,88,2.69
7,Bangladesh,172954319,1.03,1767947,1329,130170,-309977,1.9,27.0,41,2.15
8,Russia,144444359,-0.19,-268955,9,16376870,-136414,1.5,39.0,75,1.8
9,Mexico,128455567,0.75,951442,66,1943950,-50239,1.8,30.0,88,1.6


In [25]:
#cleaning data
cols2 = df2.columns

df2[cols2] = df2[cols2].replace({',' : ''}, regex=True)
df2[cols2] = df2[cols2].replace({'N.A.' : '0'}, regex=True)
df2[cols2] = df2[cols2].replace({'' : '0'}, regex=True)

In [26]:
df2

Unnamed: 0,Country (or dependency),Population (2023),Yearly Change,Net Change,Density (P/Km²),Land Area (Km²),Migrants (net),Fert. Rate,Med. Age,Urban Pop %,World Share
0,India,1428627663,0.81,11454490,481,2973190,-486136,2.0,28,36,17.76
1,China,1425671352,-0.02,-215985,152,9388211,-310220,1.2,39,65,17.72
2,United States,339996563,0.5,1706706,37,9147420,999700,1.7,38,83,4.23
3,Indonesia,277534122,0.74,2032783,153,1811570,-49997,2.1,30,59,3.45
4,Pakistan,240485658,1.98,4660796,312,770880,-165988,3.3,21,35,2.99
5,Nigeria,223804632,2.41,5263420,246,910770,-59996,5.1,17,54,2.78
6,Brazil,216422446,0.52,1108948,26,8358140,6000,1.6,34,88,2.69
7,Bangladesh,172954319,1.03,1767947,1329,130170,-309977,1.9,27,41,2.15
8,Russia,144444359,-0.19,-268955,9,16376870,-136414,1.5,39,75,1.8
9,Mexico,128455567,0.75,951442,66,1943950,-50239,1.8,30,88,1.6


In [27]:
#merging dataframes by countries
df3 = df1.merge(df2, left_on ='Country', right_on = 'Country (or dependency)')

In [28]:
df3

Unnamed: 0,Country,Total market cap (in mil. US$),Total market cap (% of GDP),Number of domestic companies listed,Year,Country (or dependency),Population (2023),Yearly Change,Net Change,Density (P/Km²),Land Area (Km²),Migrants (net),Fert. Rate,Med. Age,Urban Pop %,World Share
0,United States,44719661,194.5,4266,2020,United States,339996563,0.5,1706706,37,9147420,999700,1.7,38,83,4.23
1,China,13214311,83.0,4154,2020,China,1425671352,-0.02,-215985,152,9388211,-310220,1.2,39,65,17.72
2,Japan,6718220,122.2,3754,2020,Japan,123294513,-0.53,-657179,338,364555,99994,1.3,49,94,1.53
3,Hong Kong,6130420,1768.8,2353,2020,Hong Kong,7491609,0.04,2744,7135,1050,19999,0.8,46,0,0.09
4,India,3612985,103.0,5270,2023,India,1428627663,0.81,11454490,481,2973190,-486136,2.0,28,36,17.76
5,France,2823000,84.9,457,2022,France,64756584,0.2,129956,118,547557,67761,1.8,42,84,0.8
6,United Kingdom,2821000,100.0,1858,2022,United Kingdom,67736802,0.34,227866,280,241930,165790,1.6,40,85,0.84
7,Canada,2641455,160.7,3922,2020,Canada,38781291,0.85,326964,4,9093510,249746,1.5,41,81,0.48
8,Saudi Arabia,2429102,347.0,207,2020,Saudi Arabia,36947025,1.48,538205,17,2149690,28998,2.4,31,83,0.46
9,Germany,2284109,60.0,438,2020,Germany,83294633,-0.09,-75210,239,348560,155751,1.5,45,77,1.04


In [29]:
#countries that are in df1, but not in df2 
df1 [~df1['Country'].isin(df2['Country (or dependency)'])]

Unnamed: 0,Country,Total market cap (in mil. US$),Total market cap (% of GDP),Number of domestic companies listed,Year
55,Czech Republic,26614,10.9,20,2020
70,Ivory Coast,7331,12.0,46,2020
82,Palestine,3447,22.2,46,2020


In [30]:
#checking if above countries are in df2 
df_missing_countries = df1 [~df1['Country'].isin(df2['Country (or dependency)'])]
missing_countries = df_missing_countries['Country'].tolist()
df2_countries = df2['Country (or dependency)'].tolist()

for index, b in enumerate(df2_countries):
    for a in missing_countries:
        if a in b:
            print(list((index, b)))

[88, 'Czech Republic (Czechia)']
[120, 'State of Palestine']


In [31]:
#replacing name of the country
df2.replace('Czech Republic (Czechia)', 'Czech Republic', inplace = True)
df2

Unnamed: 0,Country (or dependency),Population (2023),Yearly Change,Net Change,Density (P/Km²),Land Area (Km²),Migrants (net),Fert. Rate,Med. Age,Urban Pop %,World Share
0,India,1428627663,0.81,11454490,481,2973190,-486136,2.0,28,36,17.76
1,China,1425671352,-0.02,-215985,152,9388211,-310220,1.2,39,65,17.72
2,United States,339996563,0.5,1706706,37,9147420,999700,1.7,38,83,4.23
3,Indonesia,277534122,0.74,2032783,153,1811570,-49997,2.1,30,59,3.45
4,Pakistan,240485658,1.98,4660796,312,770880,-165988,3.3,21,35,2.99
5,Nigeria,223804632,2.41,5263420,246,910770,-59996,5.1,17,54,2.78
6,Brazil,216422446,0.52,1108948,26,8358140,6000,1.6,34,88,2.69
7,Bangladesh,172954319,1.03,1767947,1329,130170,-309977,1.9,27,41,2.15
8,Russia,144444359,-0.19,-268955,9,16376870,-136414,1.5,39,75,1.8
9,Mexico,128455567,0.75,951442,66,1943950,-50239,1.8,30,88,1.6


In [32]:
#another way of replacing name of the country
df2['Country (or dependency)'].iloc[120] = 'Palestine'
df2

Unnamed: 0,Country (or dependency),Population (2023),Yearly Change,Net Change,Density (P/Km²),Land Area (Km²),Migrants (net),Fert. Rate,Med. Age,Urban Pop %,World Share
0,India,1428627663,0.81,11454490,481,2973190,-486136,2.0,28,36,17.76
1,China,1425671352,-0.02,-215985,152,9388211,-310220,1.2,39,65,17.72
2,United States,339996563,0.5,1706706,37,9147420,999700,1.7,38,83,4.23
3,Indonesia,277534122,0.74,2032783,153,1811570,-49997,2.1,30,59,3.45
4,Pakistan,240485658,1.98,4660796,312,770880,-165988,3.3,21,35,2.99
5,Nigeria,223804632,2.41,5263420,246,910770,-59996,5.1,17,54,2.78
6,Brazil,216422446,0.52,1108948,26,8358140,6000,1.6,34,88,2.69
7,Bangladesh,172954319,1.03,1767947,1329,130170,-309977,1.9,27,41,2.15
8,Russia,144444359,-0.19,-268955,9,16376870,-136414,1.5,39,75,1.8
9,Mexico,128455567,0.75,951442,66,1943950,-50239,1.8,30,88,1.6


In [33]:
#dataframe with Czech Republic and Palestine
df3 = df1.merge(df2, left_on ='Country', right_on = 'Country (or dependency)')

In [34]:
df3

Unnamed: 0,Country,Total market cap (in mil. US$),Total market cap (% of GDP),Number of domestic companies listed,Year,Country (or dependency),Population (2023),Yearly Change,Net Change,Density (P/Km²),Land Area (Km²),Migrants (net),Fert. Rate,Med. Age,Urban Pop %,World Share
0,United States,44719661,194.5,4266,2020,United States,339996563,0.5,1706706,37,9147420,999700,1.7,38,83,4.23
1,China,13214311,83.0,4154,2020,China,1425671352,-0.02,-215985,152,9388211,-310220,1.2,39,65,17.72
2,Japan,6718220,122.2,3754,2020,Japan,123294513,-0.53,-657179,338,364555,99994,1.3,49,94,1.53
3,Hong Kong,6130420,1768.8,2353,2020,Hong Kong,7491609,0.04,2744,7135,1050,19999,0.8,46,0,0.09
4,India,3612985,103.0,5270,2023,India,1428627663,0.81,11454490,481,2973190,-486136,2.0,28,36,17.76
5,France,2823000,84.9,457,2022,France,64756584,0.2,129956,118,547557,67761,1.8,42,84,0.8
6,United Kingdom,2821000,100.0,1858,2022,United Kingdom,67736802,0.34,227866,280,241930,165790,1.6,40,85,0.84
7,Canada,2641455,160.7,3922,2020,Canada,38781291,0.85,326964,4,9093510,249746,1.5,41,81,0.48
8,Saudi Arabia,2429102,347.0,207,2020,Saudi Arabia,36947025,1.48,538205,17,2149690,28998,2.4,31,83,0.46
9,Germany,2284109,60.0,438,2020,Germany,83294633,-0.09,-75210,239,348560,155751,1.5,45,77,1.04


In [35]:
df3.loc[df3['Country'] == 'Palestine']

Unnamed: 0,Country,Total market cap (in mil. US$),Total market cap (% of GDP),Number of domestic companies listed,Year,Country (or dependency),Population (2023),Yearly Change,Net Change,Density (P/Km²),Land Area (Km²),Migrants (net),Fert. Rate,Med. Age,Urban Pop %,World Share
81,Palestine,3447,22.2,46,2020,Palestine,5371230,2.31,121158,892,6020,-5000,3.4,20,83,0.07


In [36]:
df4 = df3[['Country','Total market cap (in mil. US$)','Number of domestic companies listed', 'Population (2023)', 'Land Area (Km²)']]


In [37]:
#converting into numerical values
ignore = ['Country']
df4 = (df4.set_index(ignore, append=True)
        .astype(int)
        .reset_index(ignore))
df4.dtypes

Country                                object
Total market cap (in mil. US$)          int32
Number of domestic companies listed     int32
Population (2023)                       int32
Land Area (Km²)                         int32
dtype: object

#### Adding new columns

In [38]:
df4['Capitalization per capita in USD'] = df4['Total market cap (in mil. US$)'] / df4['Population (2023)']
df4['Capitalization per capita in USD'] = df4['Capitalization per capita in USD'] * 1e6
df4['Capitalization per capita in USD'] = df4['Capitalization per capita in USD'].astype('int64')

In [39]:
df4['Capitalization per area (km²) in USD'] = (df4['Total market cap (in mil. US$)'].div(df4['Land Area (Km²)'])) * 1e6
df4['Capitalization per area (km²) in USD'] = df4['Capitalization per area (km²) in USD'].astype(int)

In [40]:
df4['Capitalization per area (km²) in USD'] = df4['Capitalization per area (km²) in USD'].astype('i')

In [41]:
df4['number of people per domestic company'] = (df4['Population (2023)'].div(df4['Number of domestic companies listed']).replace(np.inf, 0))
df4['number of people per domestic company'] = df4['number of people per domestic company'].astype('int64')

In [42]:
df4.sort_values(by = ['Number of domestic companies listed'], ascending=False).reset_index()

Unnamed: 0,index,Country,Total market cap (in mil. US$),Number of domestic companies listed,Population (2023),Land Area (Km²),Capitalization per capita in USD,Capitalization per area (km²) in USD,number of people per domestic company
0,4,India,3612985,5270,1428627663,2973190,2528,1215188,271086
1,0,United States,44719661,4266,339996563,9147420,131529,4888773,79699
2,1,China,13214311,4154,1425671352,9388211,9268,1407543,343204
3,7,Canada,2641455,3922,38781291,9093510,68111,290476,9888
4,2,Japan,6718220,3754,123294513,364555,54489,18428549,32843
5,19,Spain,759175,2711,47519628,498800,15976,1522002,17528
6,3,Hong Kong,6130420,2353,7491609,1050,818304,-2147483648,3183
7,10,South Korea,2176190,2318,51784059,97230,42024,22381878,22339
8,13,Australia,1720556,1902,26439111,7682300,65076,223963,13900
9,6,United Kingdom,2821000,1858,67736802,241930,41646,11660397,36456


In [43]:
df4.loc[df4['Country'] == 'Poland' ]

Unnamed: 0,Country,Total market cap (in mil. US$),Number of domestic companies listed,Population (2023),Land Area (Km²),Capitalization per capita in USD,Capitalization per area (km²) in USD,number of people per domestic company
36,Poland,177508,782,41026067,306230,4326,579655,52463


In [44]:
df4

Unnamed: 0,Country,Total market cap (in mil. US$),Number of domestic companies listed,Population (2023),Land Area (Km²),Capitalization per capita in USD,Capitalization per area (km²) in USD,number of people per domestic company
0,United States,44719661,4266,339996563,9147420,131529,4888773,79699
1,China,13214311,4154,1425671352,9388211,9268,1407543,343204
2,Japan,6718220,3754,123294513,364555,54489,18428549,32843
3,Hong Kong,6130420,2353,7491609,1050,818304,-2147483648,3183
4,India,3612985,5270,1428627663,2973190,2528,1215188,271086
5,France,2823000,457,64756584,547557,43594,5155627,141699
6,United Kingdom,2821000,1858,67736802,241930,41646,11660397,36456
7,Canada,2641455,3922,38781291,9093510,68111,290476,9888
8,Saudi Arabia,2429102,207,36947025,2149690,65745,1129977,178488
9,Germany,2284109,438,83294633,348560,27422,6552986,190170


In [45]:
df4.to_csv(r'D:\Data sets\web scrapping\final_dataframe1.csv', index = False )