In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

NO_IMAGE = 'https://upload.wikimedia.org/wikipedia/commons/thumb/0/0a/No-image-available.png/480px-No-image-available.png'

# URL of the Wikipedia page
url = "https://en.wikipedia.org/wiki/List_of_stadiums_by_capacity"

# Send a GET request to fetch the page content
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the page content
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find all tables on the page
    tables = soup.find_all('table', class_='wikitable')
    
    all_data = []
    
    # Loop through each table
    for table in tables:
        # Find all rows in the table
        rows = table.find_all('tr')
        
        # Loop through each row
        for row in rows:
            # Find all cells in the row
            cells = row.find_all(['th', 'td'])
            row_data = []
            
            for idx, cell in enumerate(cells):
                cell_text = cell.get_text(strip=True)
                
                # Check for images in the cell except for the 'Country' column
                if idx != 3:  # Assuming the 'Country' column is the 4th column (index 3)
                    img = cell.find('img')
                    if img:
                        img_url = img['src']
                        # Convert relative URL to absolute URL
                        if img_url.startswith("//"):
                            img_url = "https:" + img_url
                        cell_text += f"{img_url}"
                
                row_data.append(cell_text)
            
            all_data.append(row_data)
    
    # Convert the data to a pandas DataFrame
    df = pd.DataFrame(all_data)
    
    # Define column names
    df.columns = ['Stadium', 'Capacity', 'City_State', 'Country', 'Region', 'Tenants', 'Sports', 'Image']

    # Clean the dataset
    def clean_text(text):
        text = re.sub(r'better\xa0source\xa0needed', '', text)
        # Remove references [1], [2], etc.
        text = re.sub(r'\[\d+\]', '', text)
        # Remove any special characters and extra spaces
        text = re.sub(r'[^\w\s,-]', '', text)
        # Strip leading and trailing spaces
        text = text.strip()
        # Capitalize only the first letter of each word
        text = text.title()
        return text

    columns_clean = ['Stadium', 'Capacity', 'City_State', 'Country', 'Region', 'Tenants', 'Sports']
    df[columns_clean] = df[columns_clean].applymap(lambda x: clean_text(x) if isinstance(x, str) else x)
    
    # Remove rows where 'Capacity' column contains only the word 'Capacity' (header rows)
    df = df[df['Capacity'].str.lower() != 'capacity']
    
    # Further clean the 'Capacity' column to remove any non-numeric characters and convert to int
    df['Capacity'] = df['Capacity'].str.replace(',', '').str.replace('.', '').astype(int)
    
    # Handle missing images
    df['Image'] = df['Image'].apply(lambda x: NO_IMAGE if x is None else x)

    # Display the cleaned DataFrame
#     print(df)
else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")


  df['Capacity'] = df['Capacity'].str.replace(',', '').str.replace('.', '').astype(int)


In [2]:
df

Unnamed: 0,Stadium,Capacity,City_State,Country,Region,Tenants,Sports,Image
1,Narendra Modi Stadium,132000,"Ahmedabad,Gujarat",India,South Asia,"India National Cricket Team,India Womens Natio...",Cricket,https://upload.wikimedia.org/wikipedia/commons...
2,Rungrado 1St Of May Stadium,114000,Pyongyang,North Korea,East Asia,North Korea National Football Team,"Football,Athletics,Mass Games",https://upload.wikimedia.org/wikipedia/commons...
3,Michigan Stadium,107601,"Ann Arbor, Michigan",United States,North America,Michigan Wolverines Football,American Football,https://upload.wikimedia.org/wikipedia/commons...
4,Beaver Stadium,106572,"State College, Pennsylvania",United States,North America,Penn State Nittany Lions Football,American Football,https://upload.wikimedia.org/wikipedia/commons...
5,Ohio Stadium,102780,"Columbus, Ohio",United States,North America,Ohio State Buckeyes Football,American Football,https://upload.wikimedia.org/wikipedia/commons...
...,...,...,...,...,...,...,...,...
538,Hauptstadion,40000,Aachen,Germany,Europe,Aachen-Laurensberger Rennvereinde,Equestrianandshow Jumping,https://upload.wikimedia.org/wikipedia/commons...
539,Xining Stadium,40000,Xining,China,East Asia,Local Football Teams,Association Football,https://upload.wikimedia.org/wikipedia/commons...
540,Shaoxing China Textile City Sports Center,40000,Shaoxing,China,East Asia,,Athletics,https://upload.wikimedia.org/wikipedia/commons...
541,Anqing Sports Centre Stadium,40000,Anqing,China,East Asia,,Athletics,https://upload.wikimedia.org/wikipedia/commons...
