In [1]:
import re
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import json

In [2]:
with open('players_database.json', 'r') as f:
    players = json.load(f)

print(f"Trovati {len(players)} link")

Trovati 591 link


In [3]:
players = pd.DataFrame(players)
players[['Cognome']]

Unnamed: 0,Cognome
0,Prince
1,James
2,Davis
3,Reaves
4,Russell
...,...
586,Clarke
587,Simpson
588,Williams
589,Allen


In [4]:
# Ordina i cognomi in base alla lunghezza, dal più lungo al più corto
players = players[['Cognome']].assign(lunghezza=players['Cognome'].apply(len)).sort_values(by='lunghezza', ascending=False)

# Rimuovi la colonna 'lunghezza' se non ti serve più
players = players.drop(columns='lunghezza')

players



Unnamed: 0,Cognome
287,Gilgeous-Alexander
521,Toscano-Anderson
171,Alexander-Walker
441,Freeman-Liberty
546,Washington Jr.
...,...
321,Len
315,Fox
40,Lee
39,Bol


In [5]:
# URL della pagina
url = 'https://www.nba.com/game/hou-vs-orl-0022300066/play-by-play?period=All'

# Configura Selenium
options = Options()
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")

In [6]:
# Avvia il WebDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)

driver.get(url)

# Attendi qualche secondo per il caricamento della pagina
time.sleep(1)

# Ottieni l'HTML della pagina
html_content = driver.page_source
soup = BeautifulSoup(html_content, 'html.parser')

In [7]:
# Find all play event articles
plays = soup.find_all('article', class_='GamePlayByPlayRow_article__asoO2')
len(plays)

478

In [8]:
play_data = []

for i in range(len(plays)):
    play = plays[i]
    print(play)

    # Extract gameID
    gameID = url[25:46]

    # Extract clock time
    time_element = play.find('span', class_='GamePlayByPlayRow_clockElement__LfzHV')
    time = time_element.text if time_element else None

    # Extract team event
    article_element = play.find('div', class_='GamePlayByPlayRow_row__2iX_w')
    is_home_team = True if article_element.get('data-is-home-team') == 'true' else False

    # Extract description text
    desc_element = play.find('span', class_='GamePlayByPlayRow_desc__XLzrU')
    description = desc_element.text if desc_element else None

    # Extract score if available
    score_element = play.find('span', class_='GamePlayByPlayRow_scoring__Ax2hd')

    # Initialize data dictionary
    data = {
        'gameID': gameID,
        'time': time,
        'is_home_team': is_home_team,
        'action_type': None,
        'player': None,
        'made': None,
        'shot_type': None,
        'player_points': None,
        'assist_player': None,
        'assist_count': None,
        'score': None,
        'in' : None,
        'out': None
    }

    if description:
        desc_lower = description.lower()

        if 'jump ball' in desc_lower:
            data['action_type'] = 'Jump Ball'
        elif 'block' in desc_lower:
            data['action_type'] = 'Block'
        elif 'steal' in desc_lower:
            data['action_type'] = 'Steal'
        elif 'free throw' in desc_lower:
            data['action_type'] = 'Free Throw'
        elif 'turnover' in desc_lower:
            data['action_type'] = 'Turnover'
        elif 'foul' in desc_lower:
            data['action_type'] = 'Foul'

        elif 'sub' in desc_lower:
            data['action_type'] = 'SUB'
            pattern = r"SUB: ([A-Za-z\s\W]+) FOR ([A-Za-z\s\W]+)"
            match = re.search(pattern, description)
            data['in'] = match.group(1).strip()
            data['out'] = match.group(2).strip()

        elif 'timeout' in desc_lower:
            data['action_type'] = 'Timeout'
        elif 'rebound' in desc_lower:
            data['action_type'] = 'Rebound'
            data['time'] = play_data[-1]['time']
        elif 'violation' in desc_lower:
            data['action_type'] = 'Violation'
        elif 'instant replay' in desc_lower:
            data['action_type'] = 'Instant replay'

        elif score_element:
            data['action_type'] = 'Shot'
            data['made'] = 1

            ast_pattern = r"\(([A-Za-z\s\W]+)\s(\d+)\sAST\)$"
            ast_match = re.search(ast_pattern, description)

            if ast_match:
                data['assist_player'] = ast_match.group(1).strip()
                data['assist_count'] = int(ast_match.group(2))
                description = re.sub(ast_pattern, '', description)

            player_found = False
            for index in players.index:
                p = players.loc[index, 'Cognome']
                position = description.find(p)

                if position >= 0:
                    end = position + len(p)
                    data['player'] = description[:end]
                    description = description[end:]
                    shot_pattern = r"\s?(\d+')?\s?(\d+PT[a-zA-Z]?)?\s?([A-Za-z\s\W]+)\s(\((\d+)\sPT[a-zA-Z]?\))(\s+)?$"
                    shot_match = re.search(shot_pattern, description)
                    if shot_match:
                        data['shot_type'] = shot_match.group(3).strip()
                        data['player_points'] = int(shot_match.group(5))
                    data['score'] = score_element
                    break

        elif 'miss' in desc_lower:
            data['action_type'] = 'Shot'
            data['made'] = 0
            description = description[4:]

            player_found = False
            for index in players.index:
                p = players.loc[index, 'Cognome']
                position = description.find(p)

                if position >= 0:
                    end = position + len(p)
                    data['player'] = description[:end]
                    description = description[end:]
                    shot_pattern = r"\s?(\d+')?\s?(\d+PT[a-zA-Z]?)?s?([A-Za-z\s\W]+)"
                    shot_match = re.search(shot_pattern, description)
                    if shot_match:
                        data['shot_type'] = shot_match.group(3).strip()
                    break

    play_data.append(data)


<article class="GamePlayByPlayRow_article__asoO2" data-is-home-team="true"><div class="GamePlayByPlayRow_row__2iX_w" data-is-home-team="true"><p class="GamePlayByPlayRow_clock__o_PxT" data-is-home-team="true"><span class="GamePlayByPlayRow_clockElement__LfzHV">12:00</span></p><div class="GamePlayByPlayRow_descBlock__By8pv" data-content="4" data-id="nba:games:game-details-play-by-play:play" data-is-home-team="true" data-pos="2/486" data-premium="false" data-section="Play-By-Play" data-text="Jump Ball Carter Jr. vs. Sengun: Tip to Banchero" data-track="video" href="/video/4" title="Watch Video"><a class="StatEventLink_sel__pAwmA GamePlayByPlayRow_statEvent__Ru8Pr" data-id="nba:games:game-details-box-score:video-box-score" data-pos="" data-premium="false" data-track="video" href="/stats/events/?CFID=&amp;CFPARAMS=&amp;GameEventID=4&amp;GameID=0022300066&amp;Season=2023-24&amp;flag=1&amp;title=Jump%20Ball%20Carter%20Jr.%20vs.%20Sengun:%20Tip%20to%20Banchero"><span class="GamePlayByPlayRow_

In [9]:
df = pd.DataFrame(play_data)
#pd.set_option('display.max_rows', None)
df

Unnamed: 0,gameID,time,is_home_team,action_type,player,made,shot_type,player_points,assist_player,assist_count,score,in,out
0,hou-vs-orl-0022300066,12:00,True,Jump Ball,,,,,,,,,
1,hou-vs-orl-0022300066,11:45,True,Shot,Fultz,0.0,Turnaround Jump Shot,,,,,,
2,hou-vs-orl-0022300066,11:45,False,Rebound,,,,,,,,,
3,hou-vs-orl-0022300066,11:22,False,Turnover,,,,,,,,,
4,hou-vs-orl-0022300066,11:09,True,Shot,Suggs,0.0,Driving Floating Jump Shot,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
473,hou-vs-orl-0022300066,00:43,True,Shot,Okeke,0.0,Running Pull-Up Jump Shot,,,,,,
474,hou-vs-orl-0022300066,00:43,False,Rebound,,,,,,,,,
475,hou-vs-orl-0022300066,00:32,False,Shot,Williams,0.0,Jump Shot,,,,,,
476,hou-vs-orl-0022300066,00:32,True,Rebound,,,,,,,,,


In [10]:
# 1. Convertire il tempo in secondi dall'inizio del quarto
def time_to_seconds(time_str):
    minutes, seconds = map(int, time_str.split(':'))
    return minutes * 60 + seconds

df['seconds_from_start'] = df['time'].apply(time_to_seconds)
df

Unnamed: 0,gameID,time,is_home_team,action_type,player,made,shot_type,player_points,assist_player,assist_count,score,in,out,seconds_from_start
0,hou-vs-orl-0022300066,12:00,True,Jump Ball,,,,,,,,,,720
1,hou-vs-orl-0022300066,11:45,True,Shot,Fultz,0.0,Turnaround Jump Shot,,,,,,,705
2,hou-vs-orl-0022300066,11:45,False,Rebound,,,,,,,,,,705
3,hou-vs-orl-0022300066,11:22,False,Turnover,,,,,,,,,,682
4,hou-vs-orl-0022300066,11:09,True,Shot,Suggs,0.0,Driving Floating Jump Shot,,,,,,,669
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
473,hou-vs-orl-0022300066,00:43,True,Shot,Okeke,0.0,Running Pull-Up Jump Shot,,,,,,,43
474,hou-vs-orl-0022300066,00:43,False,Rebound,,,,,,,,,,43
475,hou-vs-orl-0022300066,00:32,False,Shot,Williams,0.0,Jump Shot,,,,,,,32
476,hou-vs-orl-0022300066,00:32,True,Rebound,,,,,,,,,,32


In [11]:
df['shot_clock'] = None

df.loc[0, 'shot_clock'] = 24

for i in range(1, len(df)):
    
    action = df.loc[i, 'action_type']

    if df.loc[i, 'seconds_from_start'] > df.loc[i-1, 'seconds_from_start']:
        df.loc[i, 'shot_clock'] = 24 - (720 - df.loc[i, 'seconds_from_start'])
        print(df.loc[i-1, 'seconds_from_start'], ' ', df.loc[i, 'seconds_from_start'])

    elif action == 'Jump Ball':
        df.loc[i, 'shot_clock'] = 24

    elif action in ['Block', 'Turnover', 'SUB', 'Timeout', 'Violation', 'Instant replay']:
        delta_time = df.loc[i-1, 'seconds_from_start'] - df.loc[i, 'seconds_from_start']
        df.loc[i, 'shot_clock'] = df.loc[i-1, 'shot_clock'] - delta_time

    elif action == 'Steal':
        df.loc[i, 'shot_clock'] = 24

    elif action == 'Free Throw':
        df.loc[i, 'shot_clock'] = df.loc[i-1, 'shot_clock']

    elif action == 'Foul':
        if df.loc[i-1, 'action_type'] == 'Rebound' and df.loc[i-1, 'is_home_team'] == df.loc[i-2, 'is_home_team']:
            df.loc[i, 'shot_clock'] = 14
        else:
            df.loc[i, 'shot_clock'] = 24

    elif action == 'Rebound':
        if df.loc[i-1, 'is_home_team'] == df.loc[i, 'is_home_team']:
            df.loc[i, 'shot_clock'] = 14
        else:
            df.loc[i, 'shot_clock'] = 24

    elif action == 'Shot':
        delta_time = df.loc[i-1, 'seconds_from_start'] - df.loc[i, 'seconds_from_start']
        if df.loc[i-1, 'is_home_team'] == df.loc[i, 'is_home_team']:
            df.loc[i, 'shot_clock'] = df.loc[i-1, 'shot_clock'] - delta_time
        else:
            df.loc[i, 'shot_clock'] = 24 - delta_time

0   697
0   697
0   720


In [13]:
pd.set_option('display.max_rows', None)
df

Unnamed: 0,gameID,time,is_home_team,action_type,player,made,shot_type,player_points,assist_player,assist_count,score,in,out,seconds_from_start,shot_clock
0,hou-vs-orl-0022300066,12:00,True,Jump Ball,,,,,,,,,,720,24
1,hou-vs-orl-0022300066,11:45,True,Shot,Fultz,0.0,Turnaround Jump Shot,,,,,,,705,9
2,hou-vs-orl-0022300066,11:45,False,Rebound,,,,,,,,,,705,24
3,hou-vs-orl-0022300066,11:22,False,Turnover,,,,,,,,,,682,1
4,hou-vs-orl-0022300066,11:09,True,Shot,Suggs,0.0,Driving Floating Jump Shot,,,,,,,669,11
5,hou-vs-orl-0022300066,11:09,False,Rebound,,,,,,,,,,669,24
6,hou-vs-orl-0022300066,10:55,False,Shot,Smith Jr.,0.0,Pullup Jump Shot,,,,,,,655,10
7,hou-vs-orl-0022300066,10:55,True,Rebound,,,,,,,,,,655,24
8,hou-vs-orl-0022300066,10:40,True,Shot,Suggs,1.0,Jump Shot,3.0,Carter Jr.,1.0,[0 - 3],,,640,9
9,hou-vs-orl-0022300066,10:30,True,Foul,,,,,,,,,,630,24
