In [1]:
# import the required libraries/dependencies

from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
# Send a GET request to the URL of the webpage to fetch the HTML data.

url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_in_the_United_States_by_revenue'
# The timeout (seconds) stops this cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Raise requests.HTTPError on a 4xx/5xx reply so an error page is never parsed as the article.
response.raise_for_status()

In [3]:
# Parse the downloaded HTML with BeautifulSoup (lxml parser) so its elements can be searched

soup = BeautifulSoup(markup=response.text, features='lxml')

In [4]:
# Proceed with the data extraction (specify the data we want from the webpage).
# find() returns only the first <table> element in the document, or None when there is none.

Table = soup.find('table')
if Table is None:
    # Fail here with a clear message instead of an AttributeError in a later cell.
    raise ValueError('No <table> element found in the downloaded page')

In [5]:
# Collect every header cell (<th>) of the first table

headers = Table.find_all(name='th')

print(headers)

[<th>Rank
</th>, <th>Name
</th>, <th>Industry
</th>, <th>Revenue <br/>(USD millions)
</th>, <th>Revenue growth
</th>, <th>Employees
</th>, <th>Headquarters
</th>]


In [6]:
# Build the column names from the stripped text of each header cell

col = [header_cell.get_text().strip() for header_cell in headers]
print(col)

['Rank', 'Name', 'Industry', 'Revenue (USD millions)', 'Revenue growth', 'Employees', 'Headquarters']


In [7]:
# Collect every row (<tr>) of the table; despite its name, column_data holds rows

column_data = Table.find_all(name='tr')

In [8]:
# Turn every data row of the table into a list of cleaned cell strings.
data_list = []

for row in column_data[1:]:                                   # skip the first row (the header row) via slicing
  row_data=row.find_all('td')                                 # all <td> data cells in this row
  if not row_data:                                            # rows without <td> cells (e.g. extra header rows) carry no data
    continue
  individual_data=[data.text.strip() for data in row_data]    # extract and clean the text of each cell
  data_list.append(individual_data)                           # collect the cleaned row

In [9]:
# Load the scraped rows into a pandas DataFrame, labelling the columns with the header names

df = pd.DataFrame(data=data_list, columns=col)

In [12]:
# Preview the first five rows of the scraped table
df.head(n=5)

Unnamed: 0,Rank,Name,Industry,Revenue (USD millions),Revenue growth,Employees,Headquarters
0,1,Walmart,Retail,648125,6.0%,2100000,"Bentonville, Arkansas"
1,2,Amazon,Retail and cloud computing,574785,11.9%,1525000,"Seattle, Washington"
2,3,Apple,Electronics industry,383482,-2.8%,161000,"Cupertino, California"
3,4,UnitedHealth Group,Healthcare,371622,14.6%,440000,"Minnetonka, Minnesota"
4,5,Berkshire Hathaway,Conglomerate,364482,20.7%,396500,"Omaha, Nebraska"


In [13]:
# DataFrame.to_csv returns None when it is given a path, so keep the path itself in
# csv_file rather than the call's return value.
csv_file = 'largest_companies_by_finance'
df.to_csv(csv_file, index=False)

In [17]:
# Read the file back through the same relative path it was written to; the hard-coded
# '/content/...' prefix only resolved when the working directory happened to be /content.
analyze = pd.read_csv('largest_companies_by_finance')

In [18]:
# Preview the first five rows of the reloaded data
analyze.head(n=5)

Unnamed: 0,Rank,Name,Industry,Revenue (USD millions),Revenue growth,Employees,Headquarters
0,1,Walmart,Retail,648125,6.0%,2100000,"Bentonville, Arkansas"
1,2,Amazon,Retail and cloud computing,574785,11.9%,1525000,"Seattle, Washington"
2,3,Apple,Electronics industry,383482,-2.8%,161000,"Cupertino, California"
3,4,UnitedHealth Group,Healthcare,371622,14.6%,440000,"Minnetonka, Minnesota"
4,5,Berkshire Hathaway,Conglomerate,364482,20.7%,396500,"Omaha, Nebraska"


In [20]:
# Show each column's dtype and non-null count for the reloaded data
analyze.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Rank                    100 non-null    int64 
 1   Name                    100 non-null    object
 2   Industry                100 non-null    object
 3   Revenue (USD millions)  100 non-null    object
 4   Revenue growth          100 non-null    object
 5   Employees               100 non-null    object
 6   Headquarters            100 non-null    object
dtypes: int64(1), object(6)
memory usage: 5.6+ KB


In [19]:
# Summary statistics; describe() covers only numeric columns by default
analyze.describe()

Unnamed: 0,Rank
count,100.0
mean,50.5
std,29.011492
min,1.0
25%,25.75
50%,50.5
75%,75.25
max,100.0
