In [1]:
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

In [2]:
def scrape_year(year, latest_year=2025, render_wait=3):
    """Scrape one year of the Fortune Global 500 ranking into a DataFrame.

    The ranking page is loaded in a Selenium-driven Chrome instance, the
    rendered HTML is parsed with BeautifulSoup, and the stripped text of every
    ``<td data-cy="list-cell">`` element is grouped into rows of ten cells.

    Parameters
    ----------
    year : int
        Ranking year to scrape. It selects the URL and is stored in the
        ``Year`` column of the result.
    latest_year : int, default 2025
        Year that is fetched from the un-suffixed ranking URL; every other
        year is fetched from ``.../global500/{year}/``.
    render_wait : float, default 3
        Seconds to sleep after ``driver.get`` returns, before the page source
        is read.

    Returns
    -------
    pandas.DataFrame
        One row per complete group of ten cells, with the ten scraped columns
        (all kept as the text extracted from the page) plus ``Year``. The
        frame is empty when no matching cells are found.
    """
    # One name per cell of a table row.
    # NOTE(review): assumes every row renders exactly these ten cells in this
    # order — confirm against the live page.
    # NOTE(review): values shown under 'Profit_Margin' in the notebook output
    # (e.g. 94.7%, 599.9%) look like a year-over-year profit change rather
    # than a margin — verify against the page's column header before relying
    # on the name.
    columns = [
        'Rank', 'Company', 'Revenue', 'Growth', 'Profit', 'Profit_Margin',
        'Assets', 'Employees', 'Change', 'Years'
    ]
    n_cols = len(columns)

    # The latest ranking is fetched from the bare URL; other years use a suffix.
    if year == latest_year:
        url = "https://fortune.com/ranking/global500/"
    else:
        url = f"https://fortune.com/ranking/global500/{year}/"

    print(f"Scraping year: {year}  →  {url}")

    chrome_options = Options()
    chrome_options.page_load_strategy = 'normal'
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.set_page_load_timeout(60)
        driver.get(url)
        # Fixed pause before reading the DOM, as in the original flow.
        time.sleep(render_wait)
        html = driver.page_source
    finally:
        # Always release the browser, even if loading timed out or failed;
        # otherwise each failed year leaves a Chrome process behind.
        driver.quit()

    soup = BeautifulSoup(html, "html.parser")
    cells = soup.find_all('td', attrs={'data-cy': 'list-cell'})
    data = [c.get_text(strip=True) for c in cells]

    if not data:
        print(f"Warning: no table cells found for year {year}; returning an empty frame")

    # A trailing partial row is dropped; say so instead of hiding it.
    leftover = len(data) % n_cols
    if leftover:
        print(
            f"Warning: {len(data)} cells for year {year} is not a multiple of "
            f"{n_cols}; dropping the last {leftover} cell(s)"
        )

    rows = [data[i:i + n_cols] for i in range(0, len(data) - leftover, n_cols)]

    df = pd.DataFrame(rows, columns=columns)
    df["Year"] = year
    return df


In [9]:
# Collect one DataFrame per ranking year, newest first.
all_years = []

for year in range(2025, 2020, -1):
    try:
        df_year = scrape_year(year)
    except Exception as err:
        # Report the failure and carry on with the remaining years.
        print(f"Failed for year {year}: {err}")
    else:
        all_years.append(df_year)


Scraping year: 2025  →  https://fortune.com/ranking/global500/
Scraping year: 2024  →  https://fortune.com/ranking/global500/2024/
Scraping year: 2023  →  https://fortune.com/ranking/global500/2023/
Scraping year: 2022  →  https://fortune.com/ranking/global500/2022/
Scraping year: 2021  →  https://fortune.com/ranking/global500/2021/


In [11]:
# Stack the per-year frames into one table and persist it as CSV.
if not all_years:
    # pd.concat raises a generic "No objects to concatenate" on an empty list;
    # fail with an actionable message before any file is written.
    raise ValueError(
        "No year was scraped successfully; nothing to concatenate or save."
    )

df = pd.concat(all_years, ignore_index=True)  # fresh 0..n-1 index across years
df.to_csv("2021_2025_full_data.csv", index=False)
print("Scraping complete!")

Scraping complete!


In [13]:
# Show the combined DataFrame as this cell's output.
df

Unnamed: 0,Rank,Company,Revenue,Growth,Profit,Profit_Margin,Assets,Employees,Change,Years,Year
0,1,Walmart,"$680,985",5.1%,"$19,436",25.3%,"$260,823",2100000,-,31,2025
1,2,Amazon,"$637,959",11%,"$59,248",94.7%,"$624,894",1556000,-,17,2025
2,3,State Grid,"$548,414.4",0.5%,"$10,044.9",9.1%,"$797,694",1354310,-,25,2025
3,4,Saudi Aramco,"$480,193.5",-3%,"$104,982.3",-13%,"$645,097.2",75118,-,7,2025
4,5,China National Petroleum,"$412,645.3",-2.2%,"$22,424",5.3%,"$607,615.1",985155,1,25,2025
...,...,...,...,...,...,...,...,...,...,...,...
2495,496,Truist Financial,"$24,427",66.6%,"$4,482",39%,"$509,228",53638,-,1,2021
2496,497,China Reinsurance (Group),"$24,376",18.1%,$827.6,-5.5%,"$69,513.7",63914,-,1,2021
2497,498,Commonwealth Bank of Australia,"$24,362",-18.7%,"$6,457.1",5.4%,"$698,585.9",43585,-82,17,2021
2498,499,Flextronics International,"$24,124",-0.4%,$613,599.9%,"$15,836",167201,-,20,2021
