In [19]:
import json 
import time 
import sys
import re
import numpy as np
import pandas as pd 
from bs4 import BeautifulSoup
from pydantic import BaseModel
from typing import List, Optional 
from selenium import webdriver
from supabase import create_client, Client

from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.common.by import By

In [9]:
def prepend_base_url(match_urls, base_url='https://www.whoscored.com'):
    """Turn site-relative match paths into absolute URLs.

    Args:
        match_urls: Iterable of dicts, each holding a site-relative path under
            the ``'url'`` key (the shape returned by ``getMatchUrls``).
        base_url: Scheme and host to prefix. A trailing ``/`` is tolerated.

    Returns:
        list[str]: One absolute URL per input dict, in input order.

    Raises:
        KeyError: If an entry has no ``'url'`` key.
    """
    host = base_url.rstrip('/')
    # Join with exactly one slash, whether or not the path (or the base)
    # already carries one, so 'Matches/1' and '/Matches/1' give the same URL.
    return [host + '/' + match['url'].lstrip('/') for match in match_urls]

In [10]:
def getLeagueUrls(minimize_window=True):
    """Scrape the tournament links listed on the page at ``main_url``.

    Opens a Chrome session, loads the module-level ``main_url`` and reads every
    ``li > a`` link inside the element with id ``popular-tournaments-list``.

    Args:
        minimize_window: Minimise the browser window before loading the page.

    Returns:
        dict: Link text -> link ``href``, in page order. The entry whose URL
        contains ``'Russia'`` is keyed as ``'Russian Premier League'`` instead
        of its on-page text.

    Raises:
        NoSuchElementException: If the tournaments list is not on the page.
    """
    driver = webdriver.Chrome()
    try:
        if minimize_window:
            driver.minimize_window()

        driver.get(main_url)

        tournaments = driver.find_element(By.ID, 'popular-tournaments-list')
        anchors = tournaments.find_elements(By.XPATH, './li/a')
        league_names = [anchor.text for anchor in anchors]
        league_urls = [anchor.get_attribute('href') for anchor in anchors]

        # NOTE(review): the rename presumably avoids a key clash with another
        # league that has the same on-page label -- confirm against the site.
        # It is skipped (instead of crashing) when no Russian link is listed.
        russian_links = [link for link in league_urls if link and 'Russia' in link]
        if russian_links:
            league_names[league_urls.index(russian_links[-1])] = 'Russian Premier League'

        return dict(zip(league_names, league_urls))
    finally:
        # quit() ends the whole WebDriver session, not just the window, and
        # runs even when scraping fails so no browser is left behind.
        driver.quit()

In [11]:
def _stage_is_wanted(competition, stage_text):
    """Return True when the stage labelled ``stage_text`` should be scraped."""
    if competition in ('Champions League', 'Europa League'):
        return 'Group Stages' in stage_text or 'Final Stage' in stage_text
    if competition == 'Major League Soccer':
        return 'Grp. ' not in stage_text
    return True


def _scrape_current_view(driver):
    """Scroll the page down and collect match links via ``getUrlData``."""
    # NOTE(review): the scroll presumably brings the fixtures widget into view
    # before it is clicked -- confirm.
    driver.execute_script("window.scrollTo(0, 400)")
    return getUrlData(driver)


def _dedupe_match_dicts(match_dicts):
    """Drop empty and repeated dicts, keeping first-seen order."""
    seen = set()
    unique = []
    for match in match_dicts:
        key = tuple(sorted(match.items()))
        if match and key not in seen:
            seen.add(key)
            unique.append(dict(match))
    return unique


def getMatchUrls(comp_urls, competition, season, maximize_window=True):
    """Collect the match links of one competition season.

    Opens a Chrome session on ``comp_urls[competition]``, selects ``season`` in
    the ``#seasons`` drop-down and then walks the ``#stages`` drop-down,
    scraping each wanted stage with ``getUrlData``. When the page has no
    ``#stages`` element the currently displayed view is scraped instead.

    Args:
        comp_urls: Mapping of competition name -> competition page URL.
        competition: Key into ``comp_urls``. ``'Champions League'`` and
            ``'Europa League'`` only scrape stages whose label contains
            ``'Group Stages'`` or ``'Final Stage'``; ``'Major League Soccer'``
            skips stages whose label contains ``'Grp. '``.
        season: Season label exactly as shown in the season drop-down.
        maximize_window: Maximise the browser window before loading the page.

    Returns:
        list[dict]: Unique ``{'url': ...}`` dicts in first-seen order.

    Raises:
        KeyError: If ``competition`` is not in ``comp_urls``.
        NoSuchElementException: If the page has no ``#seasons`` element.
        ValueError: If ``season`` is not offered; the available seasons are
            printed first.
    """
    driver = webdriver.Chrome()
    try:
        if maximize_window:
            driver.maximize_window()

        driver.get(comp_urls[competition])
        time.sleep(5)

        seasons_select = driver.find_element(By.XPATH, '//*[@id="seasons"]')
        season_options = seasons_select.find_elements(By.XPATH, './option')
        season_names = [option.text for option in season_options]

        if season not in season_names:
            print('Seasons available: {}'.format(season_names))
            raise ValueError('Season Not Found.')

        season_options[season_names.index(season)].click()
        time.sleep(5)

        try:
            all_urls = []
            stages_select = driver.find_element(By.XPATH, '//*[@id="stages"]')
            # Count real <option> children; counting innerHTML lines can
            # over-count and trigger a spurious NoSuchElementException.
            n_stages = len(stages_select.find_elements(By.XPATH, './option'))

            for position in range(1, n_stages + 1):
                # Look the option up again on every pass: elements found
                # before a click may be stale after the page updates.
                option_xpath = '//*[@id="stages"]/option[' + str(position) + ']'
                stage_text = driver.find_element(By.XPATH, option_xpath).text
                if not _stage_is_wanted(competition, stage_text):
                    continue

                driver.find_element(By.XPATH, option_xpath).click()
                time.sleep(5)
                all_urls += _scrape_current_view(driver)

        except NoSuchElementException:
            # No stage selector (or a lookup failed part-way): fall back to
            # scraping only what is currently displayed.
            all_urls = _scrape_current_view(driver)

        return _dedupe_match_dicts(all_urls)
    finally:
        # Always end the WebDriver session, including on errors.
        driver.quit()

In [16]:
def _links_from_rows(rows):
    """Return a ``{'url': href}`` dict for every row holding a result link."""
    links = []
    for row in rows:
        soup = BeautifulSoup(row.get_attribute('innerHTML'), features='lxml')
        link_tag = soup.find("a", {"class": "result-1 rc"})
        href = link_tag.get("href") if link_tag is not None else None
        if href:
            links.append({'url': href})
    return links


def getUrlData(driver):
    """Walk backwards through the date controller and collect match links.

    Starting from the view currently loaded in ``driver``, reads every
    ``divtable-row`` element, then clicks the first link of
    ``#date-controller`` to step back, repeating until no earlier data exists.

    Args:
        driver: Selenium WebDriver already showing a fixtures page.

    Returns:
        list[dict]: ``{'url': href}`` for every result link found, in the
        order visited. Rows without a result link are skipped.
    """
    prev_xpath = '//*[@id="date-controller"]/a[1]'
    matches_ls = []

    while True:
        rows = driver.find_elements(By.CLASS_NAME, 'divtable-row')
        if not rows:
            prev_classes = (driver.find_element(By.XPATH, prev_xpath).get_attribute('class') or '').split()
            if 'is-disabled' in prev_classes:
                break
            # Otherwise fall through to the single "previous" click below.
            # Clicking here as well would step back twice and skip a page.
        matches_ls += _links_from_rows(rows)

        driver.find_element(By.XPATH, prev_xpath).click()
        time.sleep(2)

        if driver.find_element(By.XPATH, prev_xpath).get_attribute('title') == 'No data for previous week':
            # Earliest page reached: read it, then stop.
            matches_ls += _links_from_rows(driver.find_elements(By.CLASS_NAME, 'divtable-row'))
            break

    return matches_ls

In [13]:
# Page that getLeagueUrls() loads to read the 'popular-tournaments-list' element.
main_url = 'https://www.whoscored.com/'

In [14]:
# Launches a Chrome session and builds the league name -> URL mapping.
league_urls = getLeagueUrls()
# Bare expression so the notebook displays the mapping.
league_urls

{'Premier League': 'https://www.whoscored.com/Regions/252/Tournaments/2/England-Premier-League',
 'Serie A': 'https://www.whoscored.com/Regions/108/Tournaments/5/Italy-Serie-A',
 'LaLiga': 'https://www.whoscored.com/Regions/206/Tournaments/4/Spain-LaLiga',
 'Bundesliga': 'https://www.whoscored.com/Regions/81/Tournaments/3/Germany-Bundesliga',
 'Ligue 1': 'https://www.whoscored.com/Regions/74/Tournaments/22/France-Ligue-1',
 'Liga Portugal': 'https://www.whoscored.com/Regions/177/Tournaments/21/Portugal-Liga-Portugal',
 'Eredivisie': 'https://www.whoscored.com/Regions/155/Tournaments/13/Netherlands-Eredivisie',
 'Russian Premier League': 'https://www.whoscored.com/Regions/182/Tournaments/77/Russia-Premier-League',
 'Brasileirão': 'https://www.whoscored.com/Regions/31/Tournaments/95/Brazil-Brasileir%C3%A3o',
 'Major League Soccer': 'https://www.whoscored.com/Regions/233/Tournaments/85/USA-Major-League-Soccer',
 'Super Lig': 'https://www.whoscored.com/Regions/225/Tournaments/17/Turkey-Sup

In [20]:
# `season` is compared against the text of the options in the page's season selector.
match_urls = getMatchUrls(comp_urls=league_urls, competition='LaLiga', season='2023/2024')


In [21]:
# Prefix each scraped match path with the site host to get absolute URLs.
formatted_urls = prepend_base_url(match_urls)
# Bare expression so the notebook displays the list.
formatted_urls

['https://www.whoscored.com/Matches/1734648/Live/Spain-LaLiga-2023-2024-Sevilla-Girona',
 'https://www.whoscored.com/Matches/1734649/Live/Spain-LaLiga-2023-2024-Valencia-Osasuna',
 'https://www.whoscored.com/Matches/1734920/Live/Spain-LaLiga-2023-2024-Celta-Vigo-Real-Sociedad',
 'https://www.whoscored.com/Matches/1734858/Live/Spain-LaLiga-2023-2024-Atletico-Madrid-Las-Palmas',
 'https://www.whoscored.com/Matches/1734707/Live/Spain-LaLiga-2023-2024-Osasuna-Sevilla',
 'https://www.whoscored.com/Matches/1734904/Live/Spain-LaLiga-2023-2024-Real-Sociedad-Villarreal',
 'https://www.whoscored.com/Matches/1734689/Live/Spain-LaLiga-2023-2024-Valencia-Atletico-Madrid',
 'https://www.whoscored.com/Matches/1734682/Live/Spain-LaLiga-2023-2024-Mallorca-Cadiz',
 'https://www.whoscored.com/Matches/1734917/Live/Spain-LaLiga-2023-2024-Sevilla-Deportivo-Alaves',
 'https://www.whoscored.com/Matches/1734932/Live/Spain-LaLiga-2023-2024-Rayo-Vallecano-Cadiz',
 'https://www.whoscored.com/Matches/1734786/Live/