# Data Extraction

In [170]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

## Earnings

In [171]:
# Generate a list of URLs for the first 10 pages of the most profitable companies

urls = []
base = 'https://companiesmarketcap.com/most-profitable-companies/page/'
for pg in range(1, 11):
  url = f'{base}{pg}'
  urls.append(url)


In [172]:
all_df = []
for url in urls:
  r = requests.get(url)
  soup = BeautifulSoup(r.text, 'lxml')
  table = soup.find('tbody')

  names = soup.find_all('div', {'class':'company-name'})
  names = [name.text.strip() for name in names]

  trends =[]
  earnings = soup.find_all('td', {'class':'rh-sm'})
  for earn in earnings:
      if earn.find('span', {'class':'percentage-red'}):
        trend = 'down'
      else:
        trend = 'up'
      trends.append(trend)

  day_ct = [earning.text for earning in earnings]

  tds = soup.find_all('td', {'class':'td-right'})
  earnings = []
  prices = []
  for i in range(0, len(tds), 3):
        earnings.append(tds[i+1].text.strip())
        prices.append(tds[i+2].text.strip())

  countries = soup.find_all('span',{'class':'responsive-hidden'})
  countries = countries[1:]
  countries = [country.text.strip() for country in countries]

  data = {'Company':names, 'Earnings':earnings, 'Share Price':prices, 'Pct_change':day_ct, 'Trend':trends, 'Country':countries }
  df = pd.DataFrame(data)

  # Data cleaning
  # Function to convert values to billions
  def convert_to_billion(value):
      if 'T' in value:
          return float(value.replace('T', '')) * 1000
      elif 'B' in value:
          return float(value.replace('B', ''))
      else:
          return float(value)

  # Apply the conversion function and strip '$' and 'T'
  df['Earnings'] = df['Earnings'].str.replace(r'[$,]', '', regex=True).apply(convert_to_billion)

  # Rename the column
  df = df.rename(columns={'Earnings': 'Earnings($B)'})
  df['Share Price'] = df['Share Price'].str.replace(r'[$, ,]', '', regex=True).astype(float)
  df['Pct_change'] = df['Pct_change'].str.replace(r'%', '', regex=True).astype(float)
  all_df.append(df)

# Concatenate all DataFrames into one
final_df = pd.concat(all_df, ignore_index=True)

# Drop duplicates 
final_df.drop_duplicates(subset='Company', keep='first', inplace=True)

final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Company       1000 non-null   object 
 1   Earnings($B)  1000 non-null   float64
 2   Share Price   1000 non-null   float64
 3   Pct_change    1000 non-null   float64
 4   Trend         1000 non-null   object 
 5   Country       1000 non-null   object 
dtypes: float64(3), object(3)
memory usage: 47.0+ KB


In [173]:
final_df.to_csv('Mostprofit.csv', index=False)

In [174]:
earnings_df = pd.read_csv('Mostprofit.csv')

# Sort alphabetically and reset index
earnings_df = earnings_df.sort_values(by='Company')
earnings_df = earnings_df.reset_index(drop=True)

earnings_df.head(10)

Unnamed: 0,Company,Earnings($B),Share Price,Pct_change,Trend,Country
0,3i Group,5.66,39.71,0.16,down,UK
1,7-Eleven,1.89,11.27,1.57,down,Japan
2,ABB,4.84,57.59,0.23,down,Switzerland
3,ABN AMRO,4.05,17.2,0.69,down,Netherlands
4,AES,1.46,17.0,10.01,down,USA
5,AIA,4.56,6.87,3.25,down,Hong Kong
6,AIB Group (Allied Irish Banks),3.08,5.73,0.96,up,Ireland
7,ANA Holdings,1.57,19.03,1.05,down,Japan
8,ANZ Bank,6.48,20.1,0.17,down,Australia
9,ASML,9.12,1064.0,1.99,down,Netherlands


## Market cap

In [175]:
# Generate urls for companies with the highest market caps

base = 'https://companiesmarketcap.com/page/'
urls = []
for pg in range(1,11):
  url = f'{base}{pg}'
  urls.append(url)

In [176]:
all_df = []
for url in urls:
  r = requests.get(url)
  soup = BeautifulSoup(r.text, 'lxml')
  table = soup.find('tbody')

  names = soup.find_all('div', {'class':'company-name'})
  names = [name.text.strip() for name in names]

  trends =[]
  earnings = soup.find_all('td', {'class':'rh-sm'})
  for earn in earnings:
      if earn.find('span', {'class':'percentage-red'}):
        trend = 'down'
      else:
        trend = 'up'
      trends.append(trend)

  day_ct = [earning.text for earning in earnings]

  tds = soup.find_all('td', {'class':'td-right'})
  earnings = []
  prices = []
  for i in range(0, len(tds), 3):
        earnings.append(tds[i+1].text.strip())
        prices.append(tds[i+2].text.strip())

  countries = soup.find_all('span',{'class':'responsive-hidden'})
  countries = countries[1:]
  countries = [country.text.strip() for country in countries]

  data = {'Company':names, 'Market cap':earnings, 'Share Price':prices, 'Pct_change':day_ct, 'Trend':trends, 'Country':countries }
  df = pd.DataFrame(data)

  # Data cleaning
  # Function to convert values to billions
  def convert_to_billion(value):
      if 'T' in value:
          return float(value.replace('T', '')) * 1000
      elif 'B' in value:
          return float(value.replace('B', ''))
      else:
          return float(value)

  # Apply the conversion function and strip '$' and 'T'
  df['Market cap'] = df['Market cap'].str.replace(r'[$,]', '', regex=True).apply(convert_to_billion)

  # Rename the column
  df = df.rename(columns={'Market cap': 'Market cap($B)'})
  df['Share Price'] = df['Share Price'].str.replace(r'[$, ,]', '', regex=True).astype(float)
  df['Pct_change'] = df['Pct_change'].str.replace(r'%', '', regex=True).astype(float)
  all_df.append(df)

# Concatenate all DataFrames into one
final_df = pd.concat(all_df, ignore_index=True)

final_df.drop_duplicates(subset='Company', keep='first', inplace=True)
final_df.info()

# Save to csv
final_df.to_csv('Marketcap.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Company         1000 non-null   object 
 1   Market cap($B)  1000 non-null   float64
 2   Share Price     1000 non-null   float64
 3   Pct_change      1000 non-null   float64
 4   Trend           1000 non-null   object 
 5   Country         1000 non-null   object 
dtypes: float64(3), object(3)
memory usage: 47.0+ KB


In [177]:
market_df = pd.read_csv('Marketcap.csv')

# Sort alphabetically and reset index
market_df = market_df.sort_values(by='Company')
market_df = market_df.reset_index(drop=True)
market_df.head(10)

Unnamed: 0,Company,Market cap($B),Share Price,Pct_change,Trend,Country
0,3M,56.84,102.72,1.27,down,USA
1,3i Group,38.29,39.71,0.16,down,UK
2,7-Eleven,29.24,11.27,1.57,down,Japan
3,ABB,106.43,57.59,0.23,down,Switzerland
4,ABB India,20.88,98.54,0.23,down,India
5,ACWA POWER Company,75.64,103.27,3.53,up,S. Arabia
6,ADNOC Drilling Company,19.25,1.2,0.45,up,UAE
7,ADNOC Gas,66.86,0.87,0.0,up,UAE
8,AIA,76.28,6.87,3.25,down,Hong Kong
9,AMD,290.66,179.83,0.98,down,USA


## Revenue

In [178]:
base = 'https://companiesmarketcap.com/largest-companies-by-revenue/page/'
urls = []
for pg in range(1,11):
  url = f'{base}{pg}'
  urls.append(url)

In [179]:
all_df = []
for url in urls:
  r = requests.get(url)
  soup = BeautifulSoup(r.text, 'lxml')
  table = soup.find('tbody')

  names = soup.find_all('div', {'class':'company-name'})
  names = [name.text for name in names]

  trends =[]
  earnings = soup.find_all('td', {'class':'rh-sm'})
  for earn in earnings:
      if earn.find('span', {'class':'percentage-red'}):
        trend = 'down'
      else:
        trend = 'up'
      trends.append(trend)

  day_ct = [earning.text for earning in earnings]

  tds = soup.find_all('td', {'class':'td-right'})
  earnings = []
  prices = []
  for i in range(0, len(tds), 3):
        earnings.append(tds[i+1].text.strip())
        prices.append(tds[i+2].text.strip())

  countries = soup.find_all('span',{'class':'responsive-hidden'})
  countries = countries[1:]
  countries = [country.text for country in countries]

  data = {'Company':names, 'Revenue':earnings, 'Share Price':prices, 'Pct_change':day_ct, 'Trend':trends, 'Country':countries }
  df = pd.DataFrame(data)

  # Data cleaning
  # Function to convert values to billions
  def convert_to_billion(value):
      if 'T' in value:
          return float(value.replace('T', '')) * 1000
      elif 'B' in value:
          return float(value.replace('B', ''))
      else:
          return float(value)

  # Apply the conversion function and strip '$' and 'T'
  df['Revenue'] = df['Revenue'].str.replace(r'[$,]', '', regex=True).apply(convert_to_billion)

  # Rename the column
  df = df.rename(columns={'Revenue': 'Revenue($B)'})
  df['Share Price'] = df['Share Price'].str.replace(r'[$, ,]', '', regex=True).astype(float)
  df['Pct_change'] = df['Pct_change'].str.replace(r'%', '', regex=True).astype(float)
  all_df.append(df)

# Concatenate all DataFrames into one
final_df = pd.concat(all_df, ignore_index=True)

final_df.drop_duplicates(subset='Company', keep='first', inplace=True)
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Company      1000 non-null   object 
 1   Revenue($B)  1000 non-null   float64
 2   Share Price  1000 non-null   float64
 3   Pct_change   1000 non-null   float64
 4   Trend        1000 non-null   object 
 5   Country      1000 non-null   object 
dtypes: float64(3), object(3)
memory usage: 47.0+ KB


In [180]:
final_df.to_csv('Revenue.csv', index=False)

In [181]:
revenue_df = pd.read_csv('Revenue.csv')

# Sort alphabetically and reset index
revenue_df = revenue_df.sort_values(by='Company')
revenue_df = revenue_df.reset_index(drop=True)
revenue_df.head(10)

Unnamed: 0,Company,Revenue($B),Share Price,Pct_change,Trend,Country
0,3M,32.65,102.72,1.27,down,USA
1,7-Eleven,78.6,11.27,1.57,down,Japan
2,A2A,18.24,2.01,1.18,down,Italy
3,ABB,32.24,57.59,0.23,down,Switzerland
4,AECOM,15.34,87.26,0.11,up,USA
5,AEGON\n,12.01,6.32,1.4,down,Netherlands
6,AES,12.51,17.0,10.01,down,USA
7,AGC,13.85,33.23,1.2,down,Japan
8,AGCO,14.0,98.78,0.74,up,USA
9,AIA,20.69,6.87,3.25,down,Hong Kong


## Employee size

In [182]:
base = 'https://companiesmarketcap.com/largest-companies-by-number-of-employees/page/'
urls = []
for pg in range(1,11):
  url = f'{base}{pg}'
  urls.append(url)

In [183]:
all_df = []
for url in urls:
  r = requests.get(url)
  soup = BeautifulSoup(r.text, 'lxml')
  table = soup.find('tbody')

  names = soup.find_all('div', {'class':'company-name'})
  names = [name.text.strip() for name in names]

  trends =[]
  earnings = soup.find_all('td', {'class':'rh-sm'})
  for earn in earnings:
      if earn.find('span', {'class':'percentage-red'}):
        trend = 'down'
      else:
        trend = 'up'
      trends.append(trend)

  day_ct = [earning.text for earning in earnings]

  tds = soup.find_all('td', {'class':'td-right'})
  employees = []
  prices = []
  for i in range(0, len(tds), 3):
        employees.append(tds[i+1].text.strip())
        prices.append(tds[i+2].text.strip())

  countries = soup.find_all('span',{'class':'responsive-hidden'})
  countries = countries[1:]
  countries = [country.text.strip() for country in countries]

  data = {'Company':names, 'Employees':employees, 'Share Price':prices, 'Pct_change':day_ct, 'Trend':trends, 'Country':countries }
  df = pd.DataFrame(data)

  # Data cleaning
  df['Employees'] = df['Employees'].str.replace(r'[,]', '', regex=True).astype(int)
  df['Share Price'] = df['Share Price'].str.replace(r'[$, ,]', '', regex=True).astype(float)
  df['Pct_change'] = df['Pct_change'].str.replace(r'%', '', regex=True).astype(float)
  all_df.append(df)

# Concatenate all DataFrames into one
final_df = pd.concat(all_df, ignore_index=True)

final_df.drop_duplicates(subset='Company', keep='first', inplace=True)
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 996 entries, 0 to 999
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Company      996 non-null    object 
 1   Employees    996 non-null    int64  
 2   Share Price  996 non-null    float64
 3   Pct_change   996 non-null    float64
 4   Trend        996 non-null    object 
 5   Country      996 non-null    object 
dtypes: float64(2), int64(1), object(3)
memory usage: 54.5+ KB


In [184]:
# Save dataframe to csv
final_df.to_csv('Employees.csv', index=False)

In [185]:
employee_df = pd.read_csv('Employees.csv')

# Sort alphabetically and reset index
employee_df = employee_df.sort_values(by='Company')
employee_df = employee_df.reset_index(drop=True)
employee_df.head(10)

Unnamed: 0,Company,Employees,Share Price,Pct_change,Trend,Country
0,3M,85000,102.72,1.27,down,USA
1,7-Eleven,84154,11.27,1.57,down,Japan
2,ABB,108700,57.59,0.23,down,Switzerland
3,ABM Industries,100000,51.33,0.1,up,USA
4,AECOM,52000,87.26,0.11,up,USA
5,AGC,56724,33.23,1.2,down,Japan
6,AGCO,27900,98.78,0.74,up,USA
7,AIER Eye Hospital,32326,1.45,0.57,up,China
8,ALSOK,39039,5.98,0.41,down,Japan
9,ALTEN,57000,116.32,2.2,down,France


# Exploration

### Merge dataframes

In [152]:
# Select only the relevant columns from each DataFrame
earns = earnings_df[['Company', 'Earnings($B)', 'Country', 'Trend']]
cap = market_df[['Company', 'Market cap($B)']]
revenue = revenue_df[['Company', 'Revenue($B)']]
employee = employee_df[['Company', 'Employees']]

# Merge the DataFrames on the 'Company' column
merged_df = pd.merge(earns, cap, on='Company')
merged_df = pd.merge(merged_df, revenue, on='Company')
merged_df = pd.merge(merged_df, employee, on='Company')

# Reorder columns
cols = merged_df.columns.tolist()
cols.remove('Trend')
cols.remove('Country')  
cols.append('Country')
cols.append('Trend')
merged_df = merged_df[cols] 

merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 334 entries, 0 to 333
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Company         334 non-null    object 
 1   Earnings($B)    334 non-null    float64
 2   Market cap($B)  334 non-null    float64
 3   Revenue($B)     334 non-null    float64
 4   Employees       334 non-null    int64  
 5   Country         334 non-null    object 
 6   Trend           334 non-null    object 
dtypes: float64(3), int64(1), object(3)
memory usage: 18.4+ KB


In [153]:
merged_df.head(10)

Unnamed: 0,Company,Earnings($B),Market cap($B),Revenue($B),Employees,Country,Trend
0,7-Eleven,1.89,29.24,78.6,84154,Japan,down
1,ABB,4.84,106.35,32.24,108700,Switzerland,down
2,ANZ Bank,6.48,60.32,13.05,40342,Australia,down
3,ASML,9.12,428.63,28.26,40940,Netherlands,down
4,AT&T,18.95,133.22,122.31,148290,USA,down
5,AXA,7.36,78.15,134.68,94705,France,down
6,AbbVie,7.53,296.71,54.4,50000,USA,down
7,Abbott Laboratories,6.53,179.11,40.32,114000,USA,down
8,Accenture,9.37,199.1,64.47,750000,Ireland,up
9,Adobe,6.57,250.83,20.42,29945,USA,up


In [189]:
merged_df.to_csv('Allcompanies07-16', index=False)

### Explore top companies

In [154]:
metrics = cols[1:-2]

print(f'Sample metrics include:')
for i, metric in enumerate(metrics, start=1):
    print(f"{i}. {metric}")

Sample metrics include:
1. Earnings($B)
2. Market cap($B)
3. Revenue($B)
4. Employees


In [155]:
# Sort and display dataframe
sort_key = input('Enter a metric to sort by:')
choice = metrics[int(sort_key)-1]
sort_df = merged_df.sort_values(by=choice, ascending=False)
sort_df = sort_df.reset_index(drop=True)
print(f'\nDisplaying top companies by {choice}:\n')
sort_df.head(20)

ValueError: invalid literal for int() with base 10: ''

### Explore trends

In [None]:
trend = input('Select a trend; up or down:')

filtered_df = merged_df[merged_df['Trend'] == trend]
df_size = len(filtered_df)

if trend == 'up':
  if df_size > 0:
    print(f"There are {len(filtered_df)} Companies with increasing trends today:")
    print(filtered_df)
  else:
    print("No companies found with increasing trends.")
elif trend =='down':
  if df_size > 0:
    print(f"There are {len(filtered_df)} Companies with decreasing trends today:")
    print(filtered_df)
  else:
    print("No companies found with decreasing trends.")
else:
  print("Invalid input. Please try again.") 


There are 142 Companies with increasing trends today:
               Company  Earnings($B)  Market cap($B)  Revenue($B)  Employees  \
8            Accenture          9.37          199.10        64.47     750000   
9                Adobe          6.57          250.83        20.42      29945   
11      Ahold Delhaize          2.44           29.35        96.06     400000   
16            Allstate          1.91           44.05        58.56      53000   
17   Alphabet (Google)         96.14         2314.00       318.14     180895   
..                 ...           ...             ...          ...        ...   
327   Waste Management          3.70           86.63        20.69      48000   
328        Wells Fargo         21.37          196.43        82.73     224824   
329   Woolworths Group          1.53           28.47        42.56     200364   
331         Yili Group          2.30           22.37        17.45      67199   
332       Zijin Mining          5.11           68.24        40.85 