In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import warnings
warnings.filterwarnings("ignore")


NO_IMAGE = 'https://upload.wikimedia.org/wikipedia/commons/thumb/0/0a/No-image-available.png/480px-No-image-available.png'

# URL of the Wikipedia page
url = "https://en.wikipedia.org/wiki/List_of_stadiums_by_capacity"

# Send a GET request to fetch the page content
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the page content
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find all tables on the page
    tables = soup.find_all('table', class_='wikitable')
    
    all_data = []
    
    # Loop through each table
    for table in tables:
        # Find all rows in the table
        rows = table.find_all('tr')
        
        # Loop through each row
        for row in rows:
            # Find all cells in the row
            cells = row.find_all(['th', 'td'])
            row_data = []
            
            for idx, cell in enumerate(cells):
                cell_text = cell.get_text(strip=True)
                
                # Check for images in the cell except for the 'Country' column
                if idx != 3:  # Assuming the 'Country' column is the 4th column (index 3)
                    img = cell.find('img')
                    if img:
                        img_url = img['src']
                        # Convert relative URL to absolute URL
                        if img_url.startswith("//"):
                            img_url = "https:" + img_url
                        cell_text += f"{img_url}"
                
                row_data.append(cell_text)
            
            all_data.append(row_data)
    
    # Convert the data to a pandas DataFrame
    df = pd.DataFrame(all_data)
    
    # Define column names
    df.columns = ['Stadium', 'Capacity', 'City_State', 'Country', 'Region', 'Tenants', 'Sports', 'Image']

    # Clean the dataset
    def clean_text(text):
        text = re.sub(r'better\xa0source\xa0needed', '', text)
        # Remove references [1], [2], etc.
        text = re.sub(r'\[\d+\]', '', text)
        # Remove any special characters and extra spaces
        text = re.sub(r'[^\w\s,-]', '', text)
        # Strip leading and trailing spaces
        text = text.strip()
        # Capitalize only the first letter of each word
        text = text.title()
        return text

    columns_clean = ['Stadium', 'Capacity', 'City_State', 'Country', 'Region', 'Tenants', 'Sports']
    df[columns_clean] = df[columns_clean].applymap(lambda x: clean_text(x) if isinstance(x, str) else x)
    
    # Remove rows where 'Capacity' column contains only the word 'Capacity' (header rows)
    df = df[df['Capacity'].str.lower() != 'capacity']
    
    # Further clean the 'Capacity' column to remove any non-numeric characters and convert to int
    df['Capacity'] = df['Capacity'].str.replace(',', '').str.replace('.', '').astype(int)
    
    # Handle missing images
    df['Image'] = df['Image'].apply(lambda x: NO_IMAGE if x is None else x)

    # Display the cleaned DataFrame
#     print(df)
else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")
    
    
from geopy import Nominatim

def get_lat_long(country, location):
    # Define the logic for getting latitude and longitude here
    # This is just a placeholder; you'll need to replace it with the actual implementation
    geolocator = Nominatim(user_agent="stadium_geopy")
    location = geolocator.geocode(f"{location}, {country}")
    print('--',location)
    return (location.latitude, location.longitude) if location else (None, None)


# Function to update locations in the DataFrame
def update_locations(df):
    # Create an empty list to store the locations
    locations = []

    # Iterate through the DataFrame rows
    for index, row in df.iterrows():
        # Get the initial location using the country and stadium
        location = get_lat_long(row['Country'], row['Stadium'])
        locations.append(location)

    # Add the locations to the DataFrame
    df['Location'] = locations

    # Handle duplicates
    duplicates = df[df.duplicated(['Location'])]
    duplicate_locations = []

    for index, row in duplicates.iterrows():
        # Update the location for duplicates using the country and city/state
        location = get_lat_long(row['Country'], row['City_State'])
        duplicate_locations.append(location)

    # Add the updated locations to the duplicates DataFrame
    duplicates['Location'] = duplicate_locations

    # Update the original DataFrame with the corrected locations for duplicates
    df.update(duplicates)

    return df

# Assuming df is your DataFrame, call the function to update locations
df = update_locations(df)



-- Narendra Modi Stadium, Motera Stadium Exit Road, Sabarmati, Asarva Taluka, Ahmedabad District, Gujarat, 380005, India
-- 릉라도 5.1경기장, 청류교, 경상동, 중구역, 평양시, 조선민주주의인민공화국
-- Michigan Stadium, 1201, South Main Street, Ann Arbor, Washtenaw County, Michigan, 48104, United States
-- Beaver Stadium, 775, Curtin Road, College Township, Centre County, Pennsylvania, 16802, United States
-- Ohio Stadium, 411, Woody Hayes Drive, University District District 2, Columbus, Sharon, Franklin County, Ohio, 43210, United States
-- Kyle Field, Joe Routt Boulevard, College Station, Brazos County, Texas, 77843, United States
-- Tiger Stadium, South Stadium Road, Baton Rouge, East Baton Rouge Parish, Louisiana, 70803, United States
-- Neyland Stadium, 1300, Phillip Fulmer Way, University of Tennessee, Knoxville, Knox County, East Tennessee, Tennessee, 37916, United States
-- None
-- None
-- Melbourne Cricket Ground, 120, Brunton Avenue, East Melbourne, Melbourne, City of Melbourne, Victoria, 3004, Australia
-

-- US Bank Stadium Station, Minneapolis, Hennepin County, Minnesota, United States
-- None
-- استاد البيت, شارع 391, الخور والذخيرة, قطر
-- Stadio Olimpico, Viale dei Gladiatori, Municipio Roma XV, Roma, Roma Capitale, Lazio, 00135, Italia
-- Cívitas Metropolitano, Avenida 26 de Abril, Rosas, San Blas - Canillejas, Madrid, Comunidad de Madrid, 28022, España
-- SoFi Stadium, 1001, Champion Drive, Hollywood Park, Inglewood, California, 90301, United States
-- Husky Stadium, 3800, Montlake Boulevard Northeast, Central Campus, University District, Seattle, King County, Washington, 98105, United States
-- Національний спортивний комплекс «Олімпійський», 55, Велика Васильківська вулиця, Клов, Печерський район, Київ, 03150, Україна
-- Arena BRB Mané Garrrincha, SRPN Trecho 1, Setor de Administração Municipal, Brasília, Plano Piloto, Região Geográfica Imediata do Distrito Federal, Região Integrada de Desenvolvimento do Distrito Federal e Entorno, Região Geográfica Intermediária do Distrito Fed

-- Emirates Stadium, 75, Drayton Park, Taverner Square, Finsbury Park, London Borough of Islington, London, Greater London, England, N5 1BU, United Kingdom
-- Memorial Stadium, 1402, South First Street, Champaign, Champaign County, Illinois, 61820, United States
-- MHPArena, 30, Schwieberdinger Straße, Schafhof Garten, Ludwigsburg-West, Ludwigsburg, Landkreis Ludwigsburg, Baden-Württemberg, 71636, Deutschland
-- None
-- Arena do Grêmio, 110, Avenida Padre Leopoldo Brentano, Farrapos, Porto Alegre, Região Geográfica Imediata de Porto Alegre, Região Metropolitana de Porto Alegre, Região Geográfica Intermediária de Porto Alegre, Rio Grande do Sul, Região Sul, 90250-590, Brasil
-- Mississippi Veterans Memorial Stadium, North Stadium Drive, Fondren, Woodland Hills, Jackson, Hinds County, Mississippi, 39296, United States
-- None
-- Celtic Park, Upper Plaza, Newbank, Glasgow, Glasgow City, Alba / Scotland, G40 3RE, United Kingdom
-- Estadio de la UNSA, Ciudad de las Corrientes, Urbanización 

-- Autzen Stadium, 2700, Leo Harris Parkway, Eugene, Lane County, Oregon, 97401, United States
-- 부산 아시아드 주경기장, 체육공원로, 거제2동, 연제구, 부산광역시, 47527, 대한민국
-- Folsom Field, 2085, Colorado Avenue, Boulder, Boulder County, Colorado, 80309, United States
-- Estadio Único Diego Armando Maradona, Avenida 25, Tolosa, Partido de La Plata, Buenos Aires, 1906, Argentina
-- Mountain America Stadium, 500, East Veterans Way, Tempe, Maricopa County, Arizona, 85287, United States
-- Adelaide Oval, War Memorial Drive, North Adelaide, Adelaide, Adelaide City Council, South Australia, 5006, Australia
-- Estadio BBVA, Avenida Pablo Livas, La Quinta, Guadalupe, Nuevo León, 67176, México
-- Estadio Cuscatlán, Calle Antigua a Huizúcar, Jardines de Montserrat, Reparto los Heroes, Distrito Municipal 4, Municipio de San Salvador, San Salvador, Departamento de San Salvador, 1101, El Salvador
-- Marvel Stadium, 740, Bourke Street, Yarra’s Edge, Docklands, Melbourne, City of Melbourne, Victoria, 3008, Australia
-- Está

-- キューアンドエースタジアムみやぎ, 利府岩切停車場線, 神谷沢, 宮城野区, 利府町, 宮城郡, 宮城県, 981-0122, 日本
-- Globe Life Park in Arlington, 1000, Ballpark Way, Arlington, Tarrant County, Texas, 76011, United States
-- Estádio Beira Rio, Santana de Cataguases, Região Geográfica Imediata de Cataguases, Região Geográfica Intermediária de Juiz de Fora, Minas Gerais, Região Sudeste, Brasil
-- Chase Field, East Jefferson Street, Phoenix, Maricopa County, Arizona, 85004, United States
-- Heinz-von-Heiden-Arena, 3, Robert-Enke-Straße, Calenberger Neustadt, Mitte, Hannover, Region Hannover, Niedersachsen, 30169, Deutschland
-- None
-- SVNS International Cricket Stadium, NH53, Sector 2, Atal Nagar-Nava Raipur, Arang Tahsil, Raipur District, Chhattisgarh, 492101, India
-- Oriole Park at Camden Yards, Eutaw Street, Baltimore, Maryland, 21201, United States
-- ستاد خليفة الدولي, Khalifa Stadium Parking, بعيا, محيرجة, الريان, قطر
-- Jinnah Stadium, Sarwar Road, زون ۳, وفاقی دارالحکومت اسلام آباد, 44790, پاکستان
-- Parc des Princes, 24,

GeocoderUnavailable: HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Max retries exceeded with url: /search?q=Stade+Mohammed+V%2C+Morocco&format=json&limit=1 (Caused by ReadTimeoutError("HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Read timed out. (read timeout=1)"))

In [None]:
df.to_parquet("stadiums_data.parquet")

In [2]:
import boto3
from botocore.exceptions import NoCredentialsError, PartialCredentialsError

s3 = boto3.client('s3')

bucket_name = 'stadiums-bucket'
file_name = 'stadiums_data.csv'
object_name = 'stadiums_data.csv'

try:
    s3.upload_file(file_name, bucket_name, object_name)
    print("Upload Successful")
except NoCredentialsError:
    print("Credentials not available")
except PartialCredentialsError:
    print("Incomplete credentials provided")
except Exception as e:
    print(f"Upload failed: {e}")

Upload Successful


In [3]:
df

Unnamed: 0,Stadium,Capacity,City_State,Country,Region,Tenants,Sports,Image,Location
1,Narendra Modi Stadium,132000.0,"Ahmedabad,Gujarat",India,South Asia,"India National Cricket Team,India Womens Natio...",Cricket,https://upload.wikimedia.org/wikipedia/commons...,"(23.0917717, 72.59733447335586)"
2,Rungrado 1St Of May Stadium,114000.0,Pyongyang,North Korea,East Asia,North Korea National Football Team,"Football,Athletics,Mass Games",https://upload.wikimedia.org/wikipedia/commons...,"(39.0496732, 125.77532151186813)"
3,Michigan Stadium,107601.0,"Ann Arbor, Michigan",United States,North America,Michigan Wolverines Football,American Football,https://upload.wikimedia.org/wikipedia/commons...,"(42.2658652, -83.74868376764053)"
4,Beaver Stadium,106572.0,"State College, Pennsylvania",United States,North America,Penn State Nittany Lions Football,American Football,https://upload.wikimedia.org/wikipedia/commons...,"(40.812074249999995, -77.85494312546311)"
5,Ohio Stadium,102780.0,"Columbus, Ohio",United States,North America,Ohio State Buckeyes Football,American Football,https://upload.wikimedia.org/wikipedia/commons...,"(40.00164575, -83.01973744224524)"
...,...,...,...,...,...,...,...,...,...
538,Hauptstadion,40000.0,Aachen,Germany,Europe,Aachen-Laurensberger Rennvereinde,Equestrianandshow Jumping,https://upload.wikimedia.org/wikipedia/commons...,"(50.7955364, 6.094262132366655)"
539,Xining Stadium,40000.0,Xining,China,East Asia,Local Football Teams,Association Football,https://upload.wikimedia.org/wikipedia/commons...,"(36.6503285, 101.69619418946522)"
540,Shaoxing China Textile City Sports Center,40000.0,Shaoxing,China,East Asia,,Athletics,https://upload.wikimedia.org/wikipedia/commons...,"(29.9992425, 120.576854)"
541,Anqing Sports Centre Stadium,40000.0,Anqing,China,East Asia,,Athletics,https://upload.wikimedia.org/wikipedia/commons...,"(30.54320765, 117.11052556724154)"
