# Web scraping WNBA game data (first part)

In [2]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import re
from datetime import datetime
from selenium.common.exceptions import WebDriverException

## Retrieve all game IDs from WNBA website (2010-2022):

NBA data from our original dataset goes from the 2003-04 season to the 2022-23 season. On the WNBA website, data is only available from 2010 onward.

We won't scrape data from 2023 since the NBA dataset also doesn't contain that data.

In [None]:
# Collect every WNBA game ID listed on the season schedule pages (2010-2022).
# Create a new instance of the Chrome driver
driver = webdriver.Chrome()

# List to collect all gameIDs:
gameIDs = []

try:
    # Iterate through years from 2010 to 2022
    for year in range(2010, 2023):

        # Construct the URL for the specific year
        url = f"https://www.wnba.com/schedule?season={year}&month=all"

        # Navigate to the URL
        driver.get(url)

        # Wait time so that page has time to fully load
        time.sleep(5)

        # Wait until the schedule container is present; the wait returns the
        # element itself, so there is no need to look it up a second time
        games_section = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "Schedule_gameSectionContainer__CED5h"))
        )

        # Each game link carries the gameID in its "href" attribute
        a_elements = games_section.find_elements(By.TAG_NAME, "a")

        for a_element in a_elements:
            href = a_element.get_attribute("href")

            # Keep only links of the form /game/<id>/<AAA>-vs-<BBB>, i.e. games between
            # WNBA teams with three-letter codes (excludes games against foreign national teams)
            if href and re.search(r"/game/\d+/[A-Z]{3}-vs-[A-Z]{3}$", href):
                gameID = href.split("/")[-2]
                gameIDs.append(gameID)
finally:
    # Always close the browser, even if a page fails to load, so Chrome
    # processes are not leaked between runs
    driver.quit()

Put gameIDs in CSV so that we don't have to run the code above several times:

In [None]:
# Persist the scraped IDs as a single-column CSV (no index column) so the
# schedule scrape does not have to be repeated.
dfGameIDs = pd.DataFrame(gameIDs, columns=["WNBA GameID"])
dfGameIDs.to_csv("WNBAGameIDs.csv", index=False)

## Get game data for each gameID

In [None]:
# Reload the previously saved game IDs (single column "WNBA GameID").
WNBA = pd.read_csv(filepath_or_buffer="WNBAGameIDs.csv")

In [None]:
# Scrape home team, date, season labels and team codes for every game ID.
# Games whose page cannot be loaded or parsed are skipped (and reported), so
# they can be detected and re-scraped afterwards.

# Create an instance of the Chrome driver
driver = webdriver.Chrome()

# Create empty lists to store values (one entry per successfully scraped game)
team_names = []
game_dates = []
seasons_1 = [] # first format of season identifier (e.g. "2016"), similar to the NBA dataset
seasons_2 = [] # second format of season identifier (e.g. "2015-16"), similar to the NBA dataset
away_teams = []
home_teams = []
current_urls = []

base_url = 'https://www.wnba.com/game'

try:
    # Iterate through every game ID collected from the schedule pages
    for game_id in WNBA["WNBA GameID"]:
        url = f'{base_url}/{game_id}'
        record = None          # filled only when the whole page was parsed
        needs_restart = False  # set when the site/browser misbehaved

        try:
            driver.get(url)

            # Wait for 5 seconds for the page to fully load
            time.sleep(5)

            # Handle HTTP 502 responses
            if "502 Bad Gateway" in driver.title:
                print(f"502 Bad Gateway error for GameID: {game_id}")
                needs_restart = True
            else:
                # The final URL ends in "<AWAY>-vs-<HOME>"; it is also kept so the
                # game ID can be extracted from it later
                current_url = driver.current_url

                # Find elements with required data and retrieve data from them
                home_team_element = driver.find_element(By.CLASS_NAME, "_GameDetailsHeader--team__home_166ax_290")
                TEAM_NAME = home_team_element.text.replace('\n', ' ')

                game_date_element = driver.find_element(By.CLASS_NAME, "_GameStatusExpanded__date_iiqdo_20")
                GAME_DATE = datetime.strptime(game_date_element.text, "%A, %b %d, %Y").strftime("%m-%d-%Y")

                # "MM-DD-YYYY" -> "YYYY", and "<YYYY-1>-<YY>" to mirror the NBA season label
                SEASON_1 = GAME_DATE[6:]
                SEASON_2 = f"{int(SEASON_1) - 1}-{SEASON_1[-2:]}"

                teams = current_url.split("/")[-1].split("-vs-")
                AWAY_TEAM = teams[0]
                HOME_TEAM = teams[1]

                record = (TEAM_NAME, GAME_DATE, SEASON_1, SEASON_2, AWAY_TEAM, HOME_TEAM, current_url)

        # Handle other possible problems
        except WebDriverException as e:
            print(f"Error occurred for GameID {game_id}: {str(e)}")
            needs_restart = True
        except (ValueError, IndexError) as e:
            # Unexpected date text or URL shape: skip this game without aborting the run
            print(f"Could not parse page for GameID {game_id}: {e}")

        if needs_restart:
            # Wait for 10 minutes before proceeding to the next ID
            time.sleep(600)
            # Close the old browser before starting a new one so Chrome
            # processes do not accumulate; the old session may already be dead
            try:
                driver.quit()
            except WebDriverException:
                pass
            driver = webdriver.Chrome()

        if record is None:
            continue

        # Append all fields together so every list keeps the same length
        team_names.append(record[0])
        game_dates.append(record[1])
        seasons_1.append(record[2])
        seasons_2.append(record[3])
        away_teams.append(record[4])
        home_teams.append(record[5])
        current_urls.append(record[6])
finally:
    driver.quit()

Create dataframe with all data, in the same format as the NBA dataset:

In [None]:
# Assemble the scraped lists into one table. GAME_ID temporarily holds the
# full game URL; the ID itself is extracted from it in a later cell.
data = dict(
    TEAM_NAME=team_names,
    GAME_DATE=game_dates,
    SEASON_1=seasons_1,
    SEASON_2=seasons_2,
    AWAY_TEAM=away_teams,
    HOME_TEAM=home_teams,
    GAME_ID=current_urls,
)

df = pd.DataFrame(data=data)

## Data cleaning and adding data for the skipped gameIDs

Check for duplicate rows:

In [None]:
# Show rows that are exact repeats of an earlier row.
df.loc[df.duplicated()]

Remove duplicate rows:

In [None]:
# Keep only the first occurrence of each fully identical row.
df = df.drop_duplicates()

Correct GAME_ID column, extracting the IDs from the hrefs:

In [None]:
# Replace each stored URL by the first run of digits it contains (the game ID).
df["GAME_ID"] = df["GAME_ID"].str.extract(r'(\d+)', expand=False)

We only have spatial data available for shots from 2016 forward. Since we aim to have a dataset with the same data as the NBA dataset, we will filter the WNBA dataset so that it only includes data from 2016 forward:

In [None]:
# Find the index of the row where WNBA GameID is equal to 1011600001
# (presumably the first game listed for 2016 -- confirm against the schedule)
index_of_first_game_2016 = WNBA.index[WNBA['WNBA GameID'] == 1011600001][0]

# Create a new DataFrame with rows from the found index onward.
# .copy() makes it independent of WNBA, so later cells can modify it without
# pandas raising SettingWithCopyWarning
WNBA_2016_forward = WNBA.loc[index_of_first_game_2016:].copy()

Check for duplicate rows:

In [None]:
# Show game IDs that repeat an earlier row.
WNBA_2016_forward.loc[WNBA_2016_forward.duplicated()]

Remove duplicate rows:

In [None]:
# Rebind to a de-duplicated frame instead of mutating in place:
# WNBA_2016_forward was created by slicing WNBA, and in-place edits on such a
# slice trigger pandas' SettingWithCopyWarning.
WNBA_2016_forward = WNBA_2016_forward.drop_duplicates()

When we ran the WebDriver code above to retrieve game data for each GameID, some IDs were skipped whenever we got a 502 response or a WebDriverException. Therefore, we now check which GAME_IDs are missing data because of those errors, and retrieve the missing data:

In [None]:
# Convert values in 'WNBA GameID' column to string type so they can be compared
# with the string IDs in df['GAME_ID']. astype on the whole frame returns a
# new object, avoiding a column assignment on a frame derived from a slice
# (which triggers SettingWithCopyWarning).
WNBA_2016_forward = WNBA_2016_forward.astype({'WNBA GameID': str})

In [None]:
# IDs present in the schedule list but absent from the scraped table:
# select the 'WNBA GameID' values whose ID does not appear in df['GAME_ID'].
missing_data = WNBA_2016_forward.loc[
    ~WNBA_2016_forward['WNBA GameID'].isin(df['GAME_ID']),
    'WNBA GameID',
].tolist()

Run the code again for the missing data:

In [None]:
# Second pass: scrape the games that were skipped in the first run.
# Games whose page cannot be loaded or parsed are skipped again (and reported).

# Create an instance of the Chrome driver
driver = webdriver.Chrome()

# Create empty lists to store values (one entry per successfully scraped game)
team_names_missing_data = []
game_dates_missing_data = []
seasons_1_missing_data = []
seasons_2_missing_data = []
away_teams_missing_data = []
home_teams_missing_data = []
current_urls_missing_data = []

base_url = 'https://www.wnba.com/game'

try:
    # Iterate through the game IDs that have no scraped data yet
    for game_id in list(missing_data):
        url = f'{base_url}/{game_id}'
        record = None          # filled only when the whole page was parsed
        needs_restart = False  # set when the site/browser misbehaved

        try:
            driver.get(url)

            # Wait for 5 seconds for the page to load
            time.sleep(5)

            # Handle HTTP 502 responses
            if "502 Bad Gateway" in driver.title:
                print(f"502 Bad Gateway error for GameID: {game_id}")
                needs_restart = True
            else:
                # The final URL ends in "<AWAY>-vs-<HOME>"; it is also kept so the
                # game ID can be extracted from it later
                current_url = driver.current_url

                # Find elements with required data and retrieve data from them
                home_team_element = driver.find_element(By.CLASS_NAME, "_GameDetailsHeader--team__home_166ax_290")
                TEAM_NAME = home_team_element.text.replace('\n', ' ')

                game_date_element = driver.find_element(By.CLASS_NAME, "_GameStatusExpanded__date_iiqdo_20")
                GAME_DATE = datetime.strptime(game_date_element.text, "%A, %b %d, %Y").strftime("%m-%d-%Y")

                # "MM-DD-YYYY" -> "YYYY", and "<YYYY-1>-<YY>" to mirror the NBA season label
                SEASON_1 = GAME_DATE[6:]
                SEASON_2 = f"{int(SEASON_1) - 1}-{SEASON_1[-2:]}"

                teams = current_url.split("/")[-1].split("-vs-")
                AWAY_TEAM = teams[0]
                HOME_TEAM = teams[1]

                record = (TEAM_NAME, GAME_DATE, SEASON_1, SEASON_2, AWAY_TEAM, HOME_TEAM, current_url)

        # Handle other exceptions
        except WebDriverException as e:
            print(f"Error occurred for GameID {game_id}: {str(e)}")
            needs_restart = True
        except (ValueError, IndexError) as e:
            # Unexpected date text or URL shape: skip this game without aborting the run
            print(f"Could not parse page for GameID {game_id}: {e}")

        if needs_restart:
            # Wait for 10 minutes before proceeding to the next ID
            time.sleep(600)
            # Close the old browser before starting a new one so Chrome
            # processes do not accumulate; the old session may already be dead
            try:
                driver.quit()
            except WebDriverException:
                pass
            driver = webdriver.Chrome()

        if record is None:
            continue

        # Append all fields together so every list keeps the same length
        team_names_missing_data.append(record[0])
        game_dates_missing_data.append(record[1])
        seasons_1_missing_data.append(record[2])
        seasons_2_missing_data.append(record[3])
        away_teams_missing_data.append(record[4])
        home_teams_missing_data.append(record[5])
        current_urls_missing_data.append(record[6])

        print(record[6])
finally:
    driver.quit()

In [None]:
# Assemble the second-pass lists into a table with the same columns as df.
# The dict gets its own name so it does not overwrite `missing_data` (the list
# of game IDs): otherwise re-running the scraping cell above would iterate over
# column names instead of game IDs.
missing_data_columns = {
    "TEAM_NAME": team_names_missing_data,
    "GAME_DATE": game_dates_missing_data,
    "SEASON_1": seasons_1_missing_data,
    "SEASON_2": seasons_2_missing_data,
    "AWAY_TEAM": away_teams_missing_data,
    "HOME_TEAM": home_teams_missing_data,
    "GAME_ID": current_urls_missing_data
}

df_missing_data = pd.DataFrame(missing_data_columns)

Correct GAME_ID column, following same logic as before (extract the gameID from the href we collected):

In [None]:
# Replace each stored URL by the first run of digits it contains (the game ID).
df_missing_data["GAME_ID"] = df_missing_data["GAME_ID"].str.extract(r'(\d+)', expand=False)

Create new df with all games:

In [None]:
# Stack first-pass and second-pass results; ignore_index gives the combined
# frame a fresh 0..n-1 index instead of repeating labels from both inputs.
all_games_df = pd.concat([df, df_missing_data], ignore_index=True)

In [None]:
# Renumber rows from 0, discarding the old index labels.
WNBA_2016_forward = WNBA_2016_forward.reset_index(drop=True)

Create final dataframe:

In [None]:
# Convert values in "WNBA GameID" column of WNBA_2016_forward to string format.
# astype on the whole frame returns a new object, avoiding a column assignment
# on a frame derived from a slice (SettingWithCopyWarning).
WNBA_2016_forward = WNBA_2016_forward.astype({'WNBA GameID': str})

# Convert values in "GAME_ID" column of all_games_df to string format
all_games_df = all_games_df.astype({'GAME_ID': str})

# Left merge: keep every listed game ID, attaching scraped details where
# available (games that were never scraped end up with missing values)
final_df = pd.merge(WNBA_2016_forward, all_games_df, how='left', left_on='WNBA GameID', right_on='GAME_ID')

Now, since we merged the two dataframes, we will have two columns with the GameID. Therefore, we will drop one:

In [None]:
# Drop the merged-in GAME_ID column; 'WNBA GameID' carries the same ID.
final_df = final_df.drop(columns='GAME_ID')

In [None]:
# NOTE: rename returns a new frame that is only displayed here; final_df itself
# (and the CSV written from it) keeps the 'WNBA GameID' column name, which the
# later regular-season merge uses as its left key.
final_df.rename({"WNBA GameID": "GAME_ID"}, axis="columns")

Get final data into CSV:

In [None]:
# Write the merged table; the row index is saved as the first CSV column.
final_df.to_csv("WNBA_2016_2023_Shots.csv", index=True)

## Identify games that belong to the regular season

Later on, we concluded that our NBA dataset only contains regular season data. In order to follow the same structure for the WNBA dataset so that we can compare the two leagues, we will filter the data we have so that we only have regular season data. 

The first step is to retrieve from each year's game schedule web page which part of the season each gameID belongs to: 

In [32]:
# For each schedule page (2016-2022), record every game tile's ID together
# with the first line of its details text (the part of the season).
# Create a new instance of the Chrome driver
driver = webdriver.Chrome()

# Parallel lists: gameIDs[i] belongs to part_of_season[i]
gameIDs = []
part_of_season = []

try:
    # Iterate through years from 2016 to 2022
    for year in range(2016, 2023):

        # Construct the URL for the specific year
        url = f"https://www.wnba.com/schedule?season={year}&month=all&team=all"

        # Navigate to the URL
        driver.get(url)

        # Wait so that page can fully load
        time.sleep(5)

        # Find elements with the data we want to retrieve
        games = driver.find_element(By.CLASS_NAME, "Schedule_gameSectionContainer__CED5h")

        a_elements = driver.find_elements(By.CLASS_NAME, "_GameTile__game-info_s6mxa_54")

        season_elements = games.find_elements(By.CLASS_NAME, "_GameTile__details_s6mxa_76")

        # The two lists are paired by position; zip would silently drop the
        # surplus if their lengths differ, so report a mismatch
        if len(a_elements) != len(season_elements):
            print(f"Warning: {year} has {len(a_elements)} game links but {len(season_elements)} detail blocks")

        for a_element, season_element in zip(a_elements, season_elements):

            # First line of the tile's details text; empty string when the tile has no text
            details_lines = season_element.text.splitlines()
            season_info = details_lines[0] if details_lines else ""
            part_of_season.append(season_info)

            href = a_element.get_attribute("href")

            # Extract from the href attribute of each element the corresponding gameID
            if href and re.search(r"/game/\d+/[A-Z]{3}-vs-[A-Z]{3}$", href):
                gameID = href.split("/")[-2]
                gameIDs.append(gameID)

            else:
                # Placeholder keeps both lists the same length
                gameIDs.append("Missing")
finally:
    # Always close the browser so Chrome processes are not leaked
    driver.quit()

In [33]:
# Pair each game ID with the part of the season scraped for it.
df_regular_season_games = pd.DataFrame(dict(GameID=gameIDs, Season=part_of_season))

Filter dataframe to include only regular season data:

In [34]:
# Keep only the rows labelled 'Regular Season', then display the result
df_regular_season_games = df_regular_season_games.loc[
    df_regular_season_games['Season'].eq('Regular Season')
]
df_regular_season_games

Unnamed: 0,GameID,Season
15,1021600001,Regular Season
16,1021600002,Regular Season
17,1021600003,Regular Season
18,1021600004,Regular Season
19,1021600005,Regular Season
...,...,...
1513,1022200212,Regular Season
1514,1022200213,Regular Season
1515,1022200215,Regular Season
1516,1022200214,Regular Season


Create CSV with IDs from regular season:

In [35]:
# Write the regular-season IDs; the row index is saved as the first CSV column.
df_regular_season_games.to_csv("regular_season_gameIDs_WNBA.csv", index=True)

## Filter data so that it matches the NBA dataset

We will do the following filtering actions:

 - Remove games from 2023 (2023-24 season), since the NBA dataset only contains games up to the 2022-23 season.
 - Remove games that don't belong to the regular season, since the NBA dataset only contains regular season shots.

Get data from previous CSVs:

In [42]:
# Reload both tables; the first CSV column is the saved row index.
WNBA_2016_2022_shots = pd.read_csv(filepath_or_buffer="WNBA_2016_2023_Shots.csv", index_col=0)
regular_season_gameIDs = pd.read_csv(filepath_or_buffer="regular_season_gameIDs_WNBA.csv", index_col=0)

Remove games from 2023:

In [43]:
# Keep every row whose SEASON_1 is not 2023.
WNBA_2016_2022_shots = WNBA_2016_2022_shots.loc[WNBA_2016_2022_shots["SEASON_1"].ne(2023)]

Remove games that do not belong to the regular season:

In [45]:
# Attach the "Season" label to each game by matching 'WNBA GameID' against
# 'GameID' (left merge keeps every game), then drop the redundant key column.
WNBA_2016_2022_shots = WNBA_2016_2022_shots.merge(
    regular_season_gameIDs,
    how='left',
    left_on='WNBA GameID',
    right_on='GameID',
).drop(columns=['GameID'])

In [46]:
# Keep only games labelled "Regular Season".
WNBA_2016_2022_shots = WNBA_2016_2022_shots.loc[WNBA_2016_2022_shots["Season"].eq("Regular Season")]

Final details:

In [48]:
# Final tidy-up: drop the helper "Season" column, renumber rows from 0 and
# rename the ID column to GAME_ID. The result is assigned back, because
# rename returns a new frame; without the assignment the renamed column was
# only displayed and never reached the saved CSV.
WNBA_2016_2022_shots = (
    WNBA_2016_2022_shots
    .drop(columns=['Season'])
    .reset_index(drop=True)
    .rename(columns={"WNBA GameID": "GAME_ID"})
)
WNBA_2016_2022_shots

Unnamed: 0,GAME_ID,TEAM_NAME,GAME_DATE,SEASON_1,SEASON_2,AWAY_TEAM,HOME_TEAM
0,1021600001,Indiana Fever,05-14-2016,2016,2015-16,DAL,IND
1,1021600002,Washington Mystics,05-15-2016,2016,2015-16,NYL,WAS
2,1021600003,Minnesota Lynx,05-15-2016,2016,2015-16,PHO,MIN
3,1021600004,Chicago Sky,05-15-2016,2016,2015-16,CON,CHI
4,1021600005,Las Vegas Aces,05-15-2016,2016,2015-16,ATL,SAN
...,...,...,...,...,...,...,...
1231,1022200212,New York Liberty,08-14-2022,2022,2021-22,ATL,NYL
1232,1022200213,Washington Mystics,08-14-2022,2022,2021-22,IND,WAS
1233,1022200215,Las Vegas Aces,08-14-2022,2022,2021-22,SEA,LVA
1234,1022200214,Phoenix Mercury,08-14-2022,2022,2021-22,CHI,PHO


In [41]:
# Write the filtered table; the row index is saved as the first CSV column.
WNBA_2016_2022_shots.to_csv("WNBA_2016_2022_shots.csv", index=True)