In [1]:
import json
import time

import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

from pydantic import BaseModel
from typing import List, Optional

from selenium import webdriver

from supabase import create_client, Client

In [2]:
class MatchEvent(BaseModel):
    id: int
    event_id: int
    minute: int
    second: Optional[float] = None
    team_id: int
    player_id: int
    x: float
    y: float
    end_x: Optional[float] = None
    end_y: Optional[float] = None
    qualifiers: List[dict]
    is_touch: bool
    blocked_x: Optional[float]
    blocked_y: Optional[float]
    goal_mouth_z: Optional[float]
    goal_mouth_y: Optional[float]
    is_shot: bool
    card_type: bool
    is_goal: bool
    type_display_name: str
    outcome_type_display_name: str
    period_display_name: str

In [3]:
def insert_match_events(df, supabase):
    events = [
        MatchEvent(**x).dict()
        for x in df.to_dict(orient='records')
    ]
    
    execution = supabase.table('match_events').upsert(events).execute()

In [4]:
class Player(BaseModel):
    player_Id: int
    shirt_no: int
    name: str
    age: int
    position: str
    team_id: int

In [5]:
def insert_players(team_info, supabase):
    players = []
    for team in team_info:
        for player in team['players']:
            players.append({
                'player_id': player['playerId'],
                'team_id': team['team_id'],
                'shirt_no': player['shirtNo'],
                'name': player['name'],
                'position': player['position'],
                'age': player['age'],
            })
            
    execution = supabase.table('players').upsert(players).execute()

In [7]:
supabase_password = 'CQh3HvCoCodnyjfR!'
project_url = 'https://wglpmtlyurrdgofntmag.supabase.co'
api_key = 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6IndnbHBtdGx5dXJyZGdvZm50bWFnIiwicm9sZSI6ImFub24iLCJpYXQiOjE3MDU0NDQ0NTgsImV4cCI6MjAyMTAyMDQ1OH0.gEk9CN0_rlgaHZ0ApGP72aiYujgvuuMF4VLkreyDYR0'
supabase = create_client(project_url, api_key)

In [8]:
def scrape_match_events(whoscored_url, driver):
    
    driver.get(whoscored_url)
    
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    
    element = soup.select_one('script:-soup-contains("matchCentreData")')
    
    matchdict = json.loads(element.text.split("matchCentreData: ")[1].split(',\n')[0])
    
    match_events = matchdict['events']
    
    df = pd.DataFrame(match_events)
    
    df.dropna(subset='playerId', inplace=True)
    
    df = df.where(pd.notnull(df), None)
    
    df = df.rename(
    {
        'eventId': 'event_id',
        'expandedMinute': 'expanded_minute',
        'outcomeType': 'outcome_type',
        'isTouch': 'is_touch',
        'playerId': 'player_id',
        'teamId': 'team_id',
        'endX': 'end_x',
        'endY': 'end_y',
        'blockedX': 'blocked_x',
        'blockedY': 'blocked_y',
        'goalMouthZ': 'goal_mouth_z',
        'goalMouthY': 'goal_mouth_y',
        'isShot': 'is_shot',
        'cardType': 'card_type',
        'isGoal': 'is_goal'
    },
        axis=1
    )
    
    df['period_display_name'] = df['period'].apply(lambda x: x['displayName'])
    df['type_display_name'] = df['type'].apply(lambda x: x['displayName'])
    df['outcome_type_display_name'] = df['outcome_type'].apply(lambda x: x['displayName'])
    
    df.drop(columns=["period", "type", "outcome_type"], inplace=True)
    
    if 'is_goal' not in df.columns:
        df['is_goal'] = False
        
    if 'is_card' not in df.columns:
        df['is_card'] = False
        df['card_type'] = False
        
    df = df[~(df['type_display_name'] == "OffsideGiven")]
    
    df = df[[
        'id', 'event_id', 'minute', 'second', 'team_id', 'player_id', 'x', 'y', 'end_x', 'end_y',
        'qualifiers', 'is_touch', 'blocked_x', 'blocked_y', 'goal_mouth_z', 'goal_mouth_y', 'is_shot',
        'card_type', 'is_goal', 'type_display_name', 'outcome_type_display_name',
        'period_display_name'
    ]]
    
    df[['id', 'event_id', 'minute', 'team_id', 'player_id']] = df[['id', 'event_id', 'minute', 'team_id', 'player_id']].astype(np.int64)
    df[['second', 'x', 'y', 'end_x', 'end_y']] = df[['second', 'x', 'y', 'end_x', 'end_y']].astype(float)
    df[['is_shot', 'is_goal', 'card_type']] = df[['is_shot', 'is_goal', 'card_type']].astype(bool)
    
    df['is_goal'] = df['is_goal'].fillna(False)
    df['is_shot'] = df['is_shot'].fillna(False)
    
    for column in df.columns:
        if df[column].dtype == np.float64 or df[column].dtype == np.float32:
            df[column] = np.where(
                np.isnan(df[column]),
                None,
                df[column]
            )
            
            
    insert_match_events(df, supabase)
    
    
    team_info = []
    team_info.append({
        'team_id': matchdict['home']['teamId'],
        'name': matchdict['home']['name'],
        'country_name': matchdict['home']['countryName'],
        'manager_name': matchdict['home']['managerName'],
        'players': matchdict['home']['players'],
    })

    team_info.append({
        'team_id': matchdict['away']['teamId'],
        'name': matchdict['away']['name'],
        'country_name': matchdict['away']['countryName'],
        'manager_name': matchdict['away']['managerName'],
        'players': matchdict['away']['players'],
    })
    
    insert_players(team_info, supabase)
    
    return print('Success')

In [37]:
driver = webdriver.Chrome()
url = 'https://www.whoscored.com/Regions/247/Tournaments/36/Seasons/8213/Stages/18657/Show/International-FIFA-World-Cup-2022'
driver.get(url)

click_limit = 15
click_count = 0

scraped_urls = set()

while click_count < click_limit:
    time.sleep(2)
    try:
        button = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.ui-icon-triangle-1-w')))
        button.click()
    except Exception as e:
        print("Error occurred while waiting for button:", e)
        break
    
    time.sleep(3)
    click_count += 1

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    temp_urls = soup.select('a[href*="/Live/"]')
    all_urls = [
        'https://whoscored.com' + x.attrs['href']
        for x in temp_urls
    ]

    for url in all_urls:
        if url not in scraped_urls:
            print(url)
            scrape_match_events(
                whoscored_url=url,
                driver=driver
            )
            scraped_urls.add(url)

            time.sleep(2)

https://whoscored.com/Matches/1697297/Live/International-FIFA-World-Cup-2022-Japan-Croatia


2024-04-06 18:32:14,652:INFO - HTTP Request: POST https://wglpmtlyurrdgofntmag.supabase.co/rest/v1/match_events "HTTP/1.1 201 Created"
2024-04-06 18:32:14,881:INFO - HTTP Request: POST https://wglpmtlyurrdgofntmag.supabase.co/rest/v1/players "HTTP/1.1 201 Created"


Success
https://whoscored.com/Matches/1697399/Live/International-FIFA-World-Cup-2022-Brazil-South-Korea


2024-04-06 18:32:21,312:INFO - HTTP Request: POST https://wglpmtlyurrdgofntmag.supabase.co/rest/v1/match_events "HTTP/1.1 201 Created"
2024-04-06 18:32:21,510:INFO - HTTP Request: POST https://wglpmtlyurrdgofntmag.supabase.co/rest/v1/players "HTTP/1.1 201 Created"


Success
https://whoscored.com/Matches/1697298/Live/International-FIFA-World-Cup-2022-Morocco-Spain


2024-04-06 18:32:26,660:INFO - HTTP Request: POST https://wglpmtlyurrdgofntmag.supabase.co/rest/v1/match_events "HTTP/1.1 201 Created"
2024-04-06 18:32:26,885:INFO - HTTP Request: POST https://wglpmtlyurrdgofntmag.supabase.co/rest/v1/players "HTTP/1.1 201 Created"


Success
https://whoscored.com/Matches/1697398/Live/International-FIFA-World-Cup-2022-Portugal-Switzerland


2024-04-06 18:33:56,609:INFO - HTTP Request: POST https://wglpmtlyurrdgofntmag.supabase.co/rest/v1/match_events "HTTP/1.1 201 Created"
2024-04-06 18:33:56,829:INFO - HTTP Request: POST https://wglpmtlyurrdgofntmag.supabase.co/rest/v1/players "HTTP/1.1 201 Created"


Success
https://whoscored.com/Matches/1697767/Live/International-FIFA-World-Cup-2022-Croatia-Brazil


2024-04-06 18:35:27,738:INFO - HTTP Request: POST https://wglpmtlyurrdgofntmag.supabase.co/rest/v1/match_events "HTTP/1.1 201 Created"
2024-04-06 18:35:28,087:INFO - HTTP Request: POST https://wglpmtlyurrdgofntmag.supabase.co/rest/v1/players "HTTP/1.1 201 Created"


Success
https://whoscored.com/Matches/1697443/Live/International-FIFA-World-Cup-2022-Netherlands-Argentina


2024-04-06 18:35:35,441:INFO - HTTP Request: POST https://wglpmtlyurrdgofntmag.supabase.co/rest/v1/match_events "HTTP/1.1 201 Created"
2024-04-06 18:35:35,814:INFO - HTTP Request: POST https://wglpmtlyurrdgofntmag.supabase.co/rest/v1/players "HTTP/1.1 201 Created"


Success
https://whoscored.com/Matches/1698025/Live/International-FIFA-World-Cup-2022-Morocco-Portugal


2024-04-06 18:35:46,554:INFO - HTTP Request: POST https://wglpmtlyurrdgofntmag.supabase.co/rest/v1/match_events "HTTP/1.1 201 Created"
2024-04-06 18:35:46,769:INFO - HTTP Request: POST https://wglpmtlyurrdgofntmag.supabase.co/rest/v1/players "HTTP/1.1 200 OK"


Success
https://whoscored.com/Matches/1697726/Live/International-FIFA-World-Cup-2022-England-France


2024-04-06 18:35:57,960:INFO - HTTP Request: POST https://wglpmtlyurrdgofntmag.supabase.co/rest/v1/match_events "HTTP/1.1 201 Created"
2024-04-06 18:35:58,386:INFO - HTTP Request: POST https://wglpmtlyurrdgofntmag.supabase.co/rest/v1/players "HTTP/1.1 201 Created"


Success
Error occurred while waiting for button: Message: 

