# **Data Extraction**

In [21]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import sqlite3
import random

In [22]:
# List of user agents
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    # Add more user agents as needed
]

# Select a random user agent
user_agent = random.choice(user_agents)

# Define headers with the selected user agent
headers = {
    'User-Agent': user_agent,
    'Accept-Language': 'en-US,en;q=0.5',
    'Connection': 'keep-alive',}
    # Add more headers as needed

## Assets

In [4]:
url = 'https://companiesmarketcap.com/assets-by-market-cap/'
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text, 'lxml')
table = soup.find('table', {'class':'default-table table marketcap-table dataTable'})


entries = table.find_all('tr')
entries = entries[1:]

names = []
trends =[]
pct_change = []
m_cap = []
prices = []
links = []
asset_desc = []

for entry in entries:
    name = entry.find('div', {'class':'company-name'})
    names.append(name.text.strip())

    earn = entry.find('td', {'class':'rh-sm'})
    trend = 'down' if earn.find('span', {'class': 'percentage-red'}) else 'up'
    trends.append(trend)
    
    day_ct = earn.text
    pct_change.append(day_ct)

    fintext = entry.find_all('td', {'class':'td-right'})
    cap = fintext[1].text.strip()
    m_cap.append(cap)
    
    price = fintext[2].text.strip()
    prices.append(price)
    

    url = entry.find('a')
    if url:
        link = url['href']
        full_link = f'https://companiesmarketcap.com{link}'
    else:
        full_link = None

    links.append(full_link)

for link in links:
    if link:
        r = requests.get(link)
        soup = BeautifulSoup(r.text, 'lxml')
        
        div = soup.find('div', {'class': 'col-sm-9'})
        if not div:
            div = soup.find('div', {'class': 'col-lg-4 company-description'})
        
        text = div.text.strip() if div else None
        asset_desc.append(text)
    else:
        asset_desc.append(None)      
        
# Save to dataframe        
asset_df = pd.DataFrame({'Asset': names, 'Market Cap': m_cap, 'Share Price': prices,
                   'Day Change': pct_change, 'Trend': trends,  'Description': asset_desc
                   })


# Data cleaning

# Function to convert values to billions
def convert_to_billion(value):
    if 'T' in value:
        return float(value.replace('T', '')) * 1000
    elif 'B' in value:
        return float(value.replace('B', ''))
    else:
        return float(value)

# Apply the conversion function and strip '$' and 'T'
asset_df['Market Cap'] = asset_df['Market Cap'].str.replace(r'[$,]', '', regex=True).apply(convert_to_billion)

# Rename the column
asset_df = asset_df.rename(columns={'Market Cap': 'Market Cap($B)'})
asset_df['Share Price'] = asset_df['Share Price'].str.replace(r'[$, ,]', '', regex=True).astype(float)
asset_df['As Of'] = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M')
asset_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Asset           100 non-null    object 
 1   Market Cap($B)  100 non-null    float64
 2   Share Price     100 non-null    float64
 3   Day Change      100 non-null    object 
 4   Trend           100 non-null    object 
 5   Description     63 non-null     object 
 6   As Of           100 non-null    object 
dtypes: float64(2), object(5)
memory usage: 5.6+ KB


In [5]:
# Save to SQLite Database
conn = sqlite3.connect('data/Marketcap.db')
cursor = conn.cursor()

# Update table
asset_df.to_sql('Assets', conn, if_exists='append', index=False)

conn.commit()

print('Data successfully added to database table')

# Close the connection
conn.close()

Data successfully added to database table


## Earnings

In [6]:
# Generate a list of URLs for the first 10 pages of the most profitable companies

urls = []
base = 'https://companiesmarketcap.com/most-profitable-companies/page/'
for pg in range(1, 11):
  url = f'{base}{pg}'
  urls.append(url)


In [7]:
all_df = []
for url in urls:
  r = requests.get(url, headers=headers)
  soup = BeautifulSoup(r.text, 'lxml')
  table = soup.find('tbody')

  names = soup.find_all('div', {'class':'company-name'})
  names = [name.text.strip() for name in names]

  trends =[]
  earnings = soup.find_all('td', {'class':'rh-sm'})
  for earn in earnings:
      if earn.find('span', {'class':'percentage-red'}):
        trend = 'down'
      else:
        trend = 'up'
      trends.append(trend)

  day_ct = [earning.text for earning in earnings]

  tds = soup.find_all('td', {'class':'td-right'})
  earnings = []
  prices = []
  for i in range(0, len(tds), 3):
        earnings.append(tds[i+1].text.strip())
        prices.append(tds[i+2].text.strip())

  countries = soup.find_all('span',{'class':'responsive-hidden'})
  countries = countries[1:]
  countries = [country.text.strip() for country in countries]

  data = {'Company':names, 'Earnings':earnings, 'Share Price':prices, 'Pct_change':day_ct, 'Trend':trends, 'Country':countries }
  df = pd.DataFrame(data)

  # Data cleaning
  # Function to convert values to billions
  def convert_to_billion(value):
      if 'T' in value:
          return float(value.replace('T', '')) * 1000
      elif 'B' in value:
          return float(value.replace('B', ''))
      else:
          return float(value)

  # Apply the conversion function and strip '$' and 'T'
  df['Earnings'] = df['Earnings'].str.replace(r'[$,]', '', regex=True).apply(convert_to_billion)

  # Rename the column
  df = df.rename(columns={'Earnings': 'Earnings($B)'})
  df['Share Price'] = df['Share Price'].str.replace(r'[$, ,]', '', regex=True).astype(float)
  df['Pct_change'] = df['Pct_change'].str.replace(r'%', '', regex=True).astype(float)
  all_df.append(df)

# Concatenate all DataFrames into one
earnings_df = pd.concat(all_df, ignore_index=True)

# Drop duplicates 
earnings_df.drop_duplicates(subset='Company', keep='first', inplace=True)
earnings_df['As Of'] = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M')

# Sort alphabetically and reset index
earnings_df = earnings_df.sort_values(by='Company')
earnings_df = earnings_df.reset_index(drop=True)
earnings_df['As Of'] = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M')

earnings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Company       1000 non-null   object 
 1   Earnings($B)  1000 non-null   float64
 2   Share Price   1000 non-null   float64
 3   Pct_change    1000 non-null   float64
 4   Trend         1000 non-null   object 
 5   Country       1000 non-null   object 
 6   As Of         1000 non-null   object 
dtypes: float64(3), object(4)
memory usage: 54.8+ KB


In [8]:
earnings_df.head()

Unnamed: 0,Company,Earnings($B),Share Price,Pct_change,Trend,Country,As Of
0,3i Group,5.66,39.34,0.1,up,UK,2024-07-21 19:41
1,7-Eleven,1.89,11.44,0.5,down,Japan,2024-07-21 19:41
2,ABB,4.84,54.37,0.02,down,Switzerland,2024-07-21 19:41
3,ABN AMRO,4.05,17.69,1.1,down,Netherlands,2024-07-21 19:41
4,AES,1.46,17.1,0.12,up,USA,2024-07-21 19:41


In [9]:
# Save to SQLite Database
conn = sqlite3.connect('data/Marketcap.db')
cursor = conn.cursor()

# Update table
earnings_df.to_sql('Earnings', conn, if_exists='append', index=False)

conn.commit()

print('Data successfully added to database table')

# Close the connection
conn.close()

Data successfully added to database table


## Market cap

In [10]:
# Generate urls for companies with the highest market caps

base = 'https://companiesmarketcap.com/page/'
urls = []
for pg in range(1,11):
  url = f'{base}{pg}'
  urls.append(url)

In [11]:
all_df = []
for url in urls:
  r = requests.get(url, headers=headers)
  soup = BeautifulSoup(r.text, 'lxml')
  table = soup.find('tbody')

  names = soup.find_all('div', {'class':'company-name'})
  names = [name.text.strip() for name in names]

  trends =[]
  earnings = soup.find_all('td', {'class':'rh-sm'})
  for earn in earnings:
      if earn.find('span', {'class':'percentage-red'}):
        trend = 'down'
      else:
        trend = 'up'
      trends.append(trend)

  day_ct = [earning.text for earning in earnings]

  tds = soup.find_all('td', {'class':'td-right'})
  earnings = []
  prices = []
  for i in range(0, len(tds), 3):
        earnings.append(tds[i+1].text.strip())
        prices.append(tds[i+2].text.strip())

  countries = soup.find_all('span',{'class':'responsive-hidden'})
  countries = countries[1:]
  countries = [country.text.strip() for country in countries]

  data = {'Company':names, 'Market cap':earnings, 'Share Price':prices, 'Pct_change':day_ct, 'Trend':trends, 'Country':countries }
  df = pd.DataFrame(data)

  # Data cleaning
  # Function to convert values to billions
  def convert_to_billion(value):
      if 'T' in value:
          return float(value.replace('T', '')) * 1000
      elif 'B' in value:
          return float(value.replace('B', ''))
      else:
          return float(value)

  # Apply the conversion function and strip '$' and 'T'
  df['Market cap'] = df['Market cap'].str.replace(r'[$,]', '', regex=True).apply(convert_to_billion)

  # Rename the column
  df = df.rename(columns={'Market cap': 'Market cap($B)'})
  df['Share Price'] = df['Share Price'].str.replace(r'[$, ,]', '', regex=True).astype(float)
  df['Pct_change'] = df['Pct_change'].str.replace(r'%', '', regex=True).astype(float)
  all_df.append(df)

# Concatenate all DataFrames into one
market_df = pd.concat(all_df, ignore_index=True)
market_df.drop_duplicates(subset='Company', keep='first', inplace=True)

# Sort alphabetically and reset index
market_df = market_df.sort_values(by='Company')
market_df = market_df.reset_index(drop=True)
market_df['As Of'] = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M')

market_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Company         1000 non-null   object 
 1   Market cap($B)  1000 non-null   float64
 2   Share Price     1000 non-null   float64
 3   Pct_change      1000 non-null   float64
 4   Trend           1000 non-null   object 
 5   Country         1000 non-null   object 
 6   As Of           1000 non-null   object 
dtypes: float64(3), object(4)
memory usage: 54.8+ KB


In [12]:
market_df.head()

Unnamed: 0,Company,Market cap($B),Share Price,Pct_change,Trend,Country,As Of
0,3M,57.5,103.92,0.1,down,USA,2024-07-21 19:41
1,3i Group,29.38,39.34,0.1,up,UK,2024-07-21 19:41
2,7-Eleven,29.67,11.44,0.5,down,Japan,2024-07-21 19:41
3,ABB,100.48,54.37,0.02,down,Switzerland,2024-07-21 19:41
4,ABB India,19.29,91.04,3.45,down,India,2024-07-21 19:41


In [13]:
# Save to SQLite Database
conn = sqlite3.connect('data/Marketcap.db')
cursor = conn.cursor()

# Update table
market_df.to_sql('Marketcap', conn, if_exists='append', index=False)

conn.commit()

print('Data successfully added to database table')

# Close the connection
conn.close()

Data successfully added to database table


## Revenue

In [14]:
base = 'https://companiesmarketcap.com/largest-companies-by-revenue/page/'
urls = []
for pg in range(1,11):
  url = f'{base}{pg}'
  urls.append(url)

In [15]:
all_df = []
for url in urls:
  r = requests.get(url, headers=headers)
  soup = BeautifulSoup(r.text, 'lxml')
  table = soup.find('tbody')

  names = soup.find_all('div', {'class':'company-name'})
  names = [name.text for name in names]

  trends =[]
  earnings = soup.find_all('td', {'class':'rh-sm'})
  for earn in earnings:
      if earn.find('span', {'class':'percentage-red'}):
        trend = 'down'
      else:
        trend = 'up'
      trends.append(trend)

  day_ct = [earning.text for earning in earnings]

  tds = soup.find_all('td', {'class':'td-right'})
  earnings = []
  prices = []
  for i in range(0, len(tds), 3):
        earnings.append(tds[i+1].text.strip())
        prices.append(tds[i+2].text.strip())

  countries = soup.find_all('span',{'class':'responsive-hidden'})
  countries = countries[1:]
  countries = [country.text for country in countries]

  data = {'Company':names, 'Revenue':earnings, 'Share Price':prices, 'Pct_change':day_ct, 'Trend':trends, 'Country':countries }
  df = pd.DataFrame(data)

  # Data cleaning
  # Function to convert values to billions
  def convert_to_billion(value):
      if 'T' in value:
          return float(value.replace('T', '')) * 1000
      elif 'B' in value:
          return float(value.replace('B', ''))
      else:
          return float(value)

  # Apply the conversion function and strip '$' and 'T'
  df['Revenue'] = df['Revenue'].str.replace(r'[$,]', '', regex=True).apply(convert_to_billion)

  # Rename the column
  df = df.rename(columns={'Revenue': 'Revenue($B)'})
  df['Share Price'] = df['Share Price'].str.replace(r'[$, ,]', '', regex=True).astype(float)
  df['Pct_change'] = df['Pct_change'].str.replace(r'%', '', regex=True).astype(float)
  all_df.append(df)

# Concatenate all DataFrames into one
revenue_df = pd.concat(all_df, ignore_index=True)

revenue_df.drop_duplicates(subset='Company', keep='first', inplace=True)

# Sort alphabetically and reset index
revenue_df = revenue_df.sort_values(by='Company')
revenue_df = revenue_df.reset_index(drop=True)
revenue_df['As Of'] = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M')

revenue_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Company      1000 non-null   object 
 1   Revenue($B)  1000 non-null   float64
 2   Share Price  1000 non-null   float64
 3   Pct_change   1000 non-null   float64
 4   Trend        1000 non-null   object 
 5   Country      1000 non-null   object 
 6   As Of        1000 non-null   object 
dtypes: float64(3), object(4)
memory usage: 54.8+ KB


In [16]:
revenue_df.head(10)

Unnamed: 0,Company,Revenue($B),Share Price,Pct_change,Trend,Country,As Of
0,3M,32.65,103.92,0.1,down,USA,2024-07-21 19:42
1,7-Eleven,78.6,11.44,0.5,down,Japan,2024-07-21 19:42
2,A2A,18.24,2.08,1.85,down,Italy,2024-07-21 19:42
3,ABB,32.24,54.37,0.02,down,Switzerland,2024-07-21 19:42
4,AECOM,15.34,89.16,0.47,down,USA,2024-07-21 19:42
5,AEGON\n,12.01,6.2,0.64,down,Netherlands,2024-07-21 19:42
6,AES,12.51,17.1,0.12,up,USA,2024-07-21 19:42
7,AGC,13.85,33.21,1.49,down,Japan,2024-07-21 19:42
8,AGCO,14.0,102.03,0.21,down,USA,2024-07-21 19:42
9,AIA,20.69,6.93,0.73,down,Hong Kong,2024-07-21 19:42


In [17]:
# Save to SQLite Database
conn = sqlite3.connect('data/Marketcap.db')
cursor = conn.cursor()

# Update table
revenue_df.to_sql('Revenue', conn, if_exists='append', index=False)

conn.commit()

print('Data successfully added to database table')

# Close the connection
conn.close()

Data successfully added to database table


## Employee size

In [23]:
base = 'https://companiesmarketcap.com/largest-companies-by-number-of-employees/page/'
urls = []
for pg in range(1,11):
  url = f'{base}{pg}'
  urls.append(url)

In [24]:
all_df = []
for url in urls:
  r = requests.get(url, headers=headers)
  soup = BeautifulSoup(r.text, 'lxml')
  table = soup.find('tbody')

  names = soup.find_all('div', {'class':'company-name'})
  names = [name.text.strip() for name in names]

  trends =[]
  earnings = soup.find_all('td', {'class':'rh-sm'})
  for earn in earnings:
      if earn.find('span', {'class':'percentage-red'}):
        trend = 'down'
      else:
        trend = 'up'
      trends.append(trend)

  day_ct = [earning.text for earning in earnings]

  tds = soup.find_all('td', {'class':'td-right'})
  employees = []
  prices = []
  for i in range(0, len(tds), 3):
        employees.append(tds[i+1].text.strip())
        prices.append(tds[i+2].text.strip())

  countries = soup.find_all('span',{'class':'responsive-hidden'})
  countries = countries[1:]
  countries = [country.text.strip() for country in countries]

  data = {'Company':names, 'Employees':employees, 'Share Price':prices, 'Pct_change':day_ct, 'Trend':trends, 'Country':countries }
  df = pd.DataFrame(data)

  # Data cleaning
  df['Employees'] = df['Employees'].str.replace(r'[,]', '', regex=True).astype(int)
  df['Share Price'] = df['Share Price'].str.replace(r'[$, ,]', '', regex=True).astype(float)
  df['Pct_change'] = df['Pct_change'].str.replace(r'%', '', regex=True).astype(float)
  all_df.append(df)

# Concatenate all DataFrames into one
employee_df = pd.concat(all_df, ignore_index=True)

employee_df.drop_duplicates(subset='Company', keep='first', inplace=True)

# Sort alphabetically and reset index
employee_df = employee_df.sort_values(by='Company')
employee_df = employee_df.reset_index(drop=True)
employee_df['As Of'] = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M')

employee_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Company      1000 non-null   object 
 1   Employees    1000 non-null   int64  
 2   Share Price  1000 non-null   float64
 3   Pct_change   1000 non-null   float64
 4   Trend        1000 non-null   object 
 5   Country      1000 non-null   object 
 6   As Of        1000 non-null   object 
dtypes: float64(2), int64(1), object(4)
memory usage: 54.8+ KB


In [25]:
employee_df.head(10)

Unnamed: 0,Company,Employees,Share Price,Pct_change,Trend,Country,As Of
0,3M,85000,103.92,0.1,down,USA,2024-07-21 19:47
1,7-Eleven,84154,11.44,0.5,down,Japan,2024-07-21 19:47
2,ABB,108700,54.37,0.02,down,Switzerland,2024-07-21 19:47
3,ABM Industries,100000,53.59,0.32,down,USA,2024-07-21 19:47
4,AECOM,52000,89.16,0.47,down,USA,2024-07-21 19:47
5,AGC,56724,33.21,1.49,down,Japan,2024-07-21 19:47
6,AGCO,27900,102.03,0.21,down,USA,2024-07-21 19:47
7,AIER Eye Hospital,32326,1.49,0.55,down,China,2024-07-21 19:47
8,ALSOK,39039,6.09,0.72,up,Japan,2024-07-21 19:47
9,ALTEN,57000,111.69,5.96,down,France,2024-07-21 19:47


In [26]:
# Save to SQLite Database
conn = sqlite3.connect('data/Marketcap.db')
cursor = conn.cursor()

# Update table
employee_df.to_sql('Employees', conn, if_exists='append', index=False)

conn.commit()

print('Data successfully added to database table')

# Close the connection
conn.close()

Data successfully added to database table


# **Transformation**

In [34]:
# Select only the relevant columns from each DataFrame
earns = earnings_df[['Company', 'Earnings($B)', 'Share Price', 'Country', 'Trend']]
cap = market_df[['Company', 'Market cap($B)', 'Share Price']]
revenue = revenue_df[['Company', 'Revenue($B)', 'Share Price']]
employee = employee_df[['Company', 'Employees', 'Share Price']]

# Merge the DataFrames on the 'Company' column
merged_df = pd.merge(earns, cap, on='Company', suffixes=('_earn', '_cap'))
merged_df = pd.merge(merged_df, revenue, on='Company', suffixes=('', '_rev'))
merged_df = pd.merge(merged_df, employee, on='Company', suffixes=('', '_emp'))

# Reorder columns
cols = merged_df.columns.tolist()
cols.remove('Trend')
cols.remove('Country')  
cols.append('Country')
cols.append('Trend')
merged_df = merged_df[cols] 
merged_df['As Of'] = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M')

merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 336 entries, 0 to 335
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Company           336 non-null    object 
 1   Earnings($B)      336 non-null    float64
 2   Share Price_earn  336 non-null    float64
 3   Market cap($B)    336 non-null    float64
 4   Share Price_cap   336 non-null    float64
 5   Revenue($B)       336 non-null    float64
 6   Share Price       336 non-null    float64
 7   Employees         336 non-null    int64  
 8   Share Price_emp   336 non-null    float64
 9   Country           336 non-null    object 
 10  Trend             336 non-null    object 
 11  As Of             336 non-null    object 
dtypes: float64(7), int64(1), object(4)
memory usage: 31.6+ KB


In [35]:
merged_df.head(10)

Unnamed: 0,Company,Earnings($B),Share Price_earn,Market cap($B),Share Price_cap,Revenue($B),Share Price,Employees,Share Price_emp,Country,Trend,As Of
0,7-Eleven,1.89,11.44,29.67,11.44,78.6,11.44,84154,11.44,Japan,down,2024-07-21 19:52
1,ABB,4.84,54.37,100.48,54.37,32.24,54.37,108700,54.37,Switzerland,down,2024-07-21 19:52
2,ANZ Bank,6.48,19.86,59.62,19.86,13.05,19.86,40342,19.86,Australia,down,2024-07-21 19:52
3,ASML,9.12,895.37,360.64,895.37,28.26,895.37,40940,895.37,Netherlands,down,2024-07-21 19:52
4,AT&T,18.95,19.12,137.09,19.12,122.31,19.12,148290,19.12,USA,down,2024-07-21 19:52
5,AXA,7.36,34.77,71.93,34.77,134.68,34.77,94705,34.77,France,down,2024-07-21 19:52
6,AbbVie,7.53,172.32,304.29,172.32,54.4,172.32,50000,172.32,USA,up,2024-07-21 19:52
7,Abbott Laboratories,6.53,102.03,177.49,102.03,40.32,102.03,114000,102.03,USA,up,2024-07-21 19:52
8,Accenture,9.37,329.19,206.19,329.19,64.47,329.19,750000,329.19,Ireland,up,2024-07-21 19:52
9,Adobe,6.57,551.0,244.31,551.0,20.42,551.0,29945,551.0,USA,down,2024-07-21 19:52


In [36]:
# Save to SQLite Database
conn = sqlite3.connect('data/Marketcap.db')
cursor = conn.cursor()

# Update table
merged_df.to_sql('CompanyMerge', conn, if_exists='replace', index=False)

conn.commit()

print('Data successfully added to database table')

# Close the connection
conn.close()

Data successfully added to database table


# **Insight Extraction**

### Explore top companies

In [None]:
metrics = cols[1:-2]

print(f'Sample metrics include:')
for i, metric in enumerate(metrics, start=1):
    print(f"{i}. {metric}")

Sample metrics include:
1. Earnings($B)
2. Market cap($B)
3. Revenue($B)
4. Employees


In [None]:
# Sort and display dataframe
sort_key = input('Enter a metric to sort by:')
choice = metrics[int(sort_key)-1]
sort_df = merged_df.sort_values(by=choice, ascending=False)
sort_df = sort_df.reset_index(drop=True)
print(f'\nDisplaying top companies by {choice}:\n')
sort_df.head(20)


Displaying top companies by Revenue($B):



Unnamed: 0,Company,Earnings($B),Market cap($B),Revenue($B),Employees,Country,Trend,As Of
0,Walmart,28.92,569.44,657.33,2100000,USA,down,2024-07-19 20:26
1,Amazon,49.42,1901.0,590.74,1525000,USA,down,2024-07-19 20:26
2,Saudi Aramco,230.48,1836.0,489.43,70000,S. Arabia,up,2024-07-19 20:26
3,Sinopec,15.55,102.24,473.53,374791,China,down,2024-07-19 20:26
4,PetroChina,36.82,241.77,430.65,398440,China,down,2024-07-19 20:26
5,Apple,120.0,3437.0,381.62,150000,USA,down,2024-07-19 20:26
6,UnitedHealth,21.78,520.62,379.48,440000,USA,up,2024-07-19 20:26
7,CVS Health,12.65,74.74,360.93,219000,USA,down,2024-07-19 20:26
8,Volkswagen,23.71,59.7,348.13,650951,Germany,down,2024-07-19 20:26
9,Exxon Mobil,48.34,524.58,331.46,62000,USA,down,2024-07-19 20:26


### Explore trends

In [32]:
trend = input('Select a trend; up or down:')

filtered_df = merged_df[merged_df['Trend'] == trend]
df_size = len(filtered_df)

if trend == 'up':
  if df_size > 0:
    print(f"There are {len(filtered_df)} Companies with increasing trends today:")
    print(filtered_df['Company'].head(30))
  else:
    print("No companies found with increasing trends.")
elif trend =='down':
  if df_size > 0:
    print(f"There are {len(filtered_df)} Companies with decreasing trends today:")
    filtered_df.head(30)
  else:
    print("No companies found with decreasing trends.")
else:
  print("Invalid input. Please try again.") 


There are 93 Companies with increasing trends today:
6                              AbbVie
7                 Abbott Laboratories
8                           Accenture
13                             Airbus
17                  Alphabet (Google)
19                              Ambev
20                      America Movil
26                              Apple
30                        AstraZeneca
31                           AutoZone
39                                BYD
42    Banco Bilbao Vizcaya Argentaria
44                      Bank of China
46                            Banorte
47                           Barclays
49                           Best Buy
50                  Boston Scientific
51           British American Tobacco
53                               CATL
57              CK Hutchison Holdings
60                               CRRC
61                                CSL
63                          CaixaBank
64                              Canon
70                            Cente