In [1]:
# import libraries
import requests
from bs4 import BeautifulSoup

url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue'

r = requests.get(url)

if r.status_code == 200:
    html_doc = r.text
    
# get BeautifulSoup object
soup = BeautifulSoup(html_doc)

# find the table elements
tables = soup.find_all("table")

# the table we want is the first one
list_of_companies = tables[0]

# all of the rows of the table
rows = list_of_companies.find_all("tr")

# storage for the extracted data
output = []

# specify column names
column_names = ["Name", "Industry", "Revenue", "Profit",
                "Employees", "Headquarters", "Ref"]

# create a for loop statement
for company in rows:
    company_data = company.find_all("td")
    if company_data:
        # extract the text within each element
        company_text = [td.text for td in company_data]
        output.append(dict(zip(column_names, company_text)))
        
# import pandas
import pandas as pd

# create a DataFrame
data = pd.DataFrame(output)

# subset data set to only relevant columns
data_companies = data[['Name', 'Revenue', 'Headquarters']]

# view DataFrame
data_companies

Unnamed: 0,Name,Revenue,Headquarters
0,Walmart,"$559,151",United States
1,State Grid,"$386,617",China\n
2,Amazon,"$386,064",United States\n
3,China National Petroleum\n,"$283,958",China
4,Sinopec Group,"$283,728",China
5,Apple,"$274,515",United States
6,CVS Health,"$268,706",United States
7,UnitedHealth,"$257,141",United States\n
8,Toyota\n,"$256,722",Japan
9,Volkswagen\n,"$253,965",Germany


In [2]:
# import JSON library
import json

# export the output as a JSON file
output_json = json.dumps(output)

# view the output
print(output_json)

[{"Name": "Walmart", "Industry": "Retail", "Revenue": " $559,151", "Profit": "$13,510", "Employees": "2,300,000", "Headquarters": " United States", "Ref": "[4]\n"}, {"Name": "State Grid", "Industry": "Electricity", "Revenue": " $386,617", "Profit": "$5,580", "Employees": "896,360\n", "Headquarters": " China\n", "Ref": "[5]\n"}, {"Name": "Amazon", "Industry": "Retail, Information Technology", "Revenue": " $386,064", "Profit": "$21,331", "Employees": "1,608,000\n", "Headquarters": " United States\n", "Ref": "[6]\n"}, {"Name": "China National Petroleum\n", "Industry": "Oil and gas\n", "Revenue": " $283,958", "Profit": "$4,575", "Employees": "1,242,245", "Headquarters": " China", "Ref": "[7]\n"}, {"Name": "Sinopec Group", "Industry": "Oil and gas", "Revenue": " $283,728", "Profit": "$6,205", "Employees": "553,833", "Headquarters": " China", "Ref": "[8]\n"}, {"Name": "Apple", "Industry": "Electronics", "Revenue": " $274,515", "Profit": "$57,411", "Employees": "147,000", "Headquarters": " Un

In [3]:
# read json using pandas, output to .csv
pd.read_json(output_json).to_csv("list_of_companies.csv", index=False)

# save the json file to .json
with open("list_of_companies.json", "w") as f:
    json.dump(output, f)

In [4]:
# import and read CSV file
data_csv = pd.read_csv("list_of_companies.csv")

# view the data
print(data_csv.head())

# import and read JSON file
data_json = pd.read_json("list_of_companies.json")

# view the data
data_json.head()

                         Name                        Industry    Revenue  \
0                     Walmart                          Retail   $559,151   
1                  State Grid                     Electricity   $386,617   
2                      Amazon  Retail, Information Technology   $386,064   
3  China National Petroleum\n                   Oil and gas\n   $283,958   
4               Sinopec Group                     Oil and gas   $283,728   

    Profit    Employees      Headquarters    Ref  
0  $13,510    2,300,000     United States  [4]\n  
1   $5,580    896,360\n           China\n  [5]\n  
2  $21,331  1,608,000\n   United States\n  [6]\n  
3   $4,575    1,242,245             China  [7]\n  
4   $6,205      553,833             China  [8]\n  


Unnamed: 0,Name,Industry,Revenue,Profit,Employees,Headquarters,Ref
0,Walmart,Retail,"$559,151","$13,510",2300000,United States,[4]\n
1,State Grid,Electricity,"$386,617","$5,580","896,360\n",China\n,[5]\n
2,Amazon,"Retail, Information Technology","$386,064","$21,331","1,608,000\n",United States\n,[6]\n
3,China National Petroleum\n,Oil and gas\n,"$283,958","$4,575",1242245,China,[7]\n
4,Sinopec Group,Oil and gas,"$283,728","$6,205",553833,China,[8]\n
