In [1]:
import pandas as pd
from utils import *
import json
import os
from rapidfuzz import process
from functools import lru_cache

In [2]:
# Directory under the current working directory that holds the cached
# competition data; created on demand so later file writes cannot fail on a
# missing folder.
folder_path = os.path.join(os.getcwd(), "Competitions")
os.makedirs(folder_path, exist_ok=True)

# File-based caching: JSON file storing the league links.
# Built with os.path.join so the separator is right on every OS -- a literal
# backslash only works on Windows, and '\l' is an invalid escape sequence.
CACHE_FILE = os.path.join(folder_path, 'league_links.json')

In [3]:
def load_cache(file_path=CACHE_FILE):
    """Load cached league URLs from a JSON file.

    Args:
        file_path: Path of the JSON cache file. Defaults to ``CACHE_FILE``.

    Returns:
        dict: The cached content, or an empty dict when the file does not
        exist, cannot be decoded as JSON, or does not hold a JSON object.
    """
    if not os.path.exists(file_path):
        return {}

    try:
        with open(file_path, 'r', encoding='utf-8') as cache_file:
            cached = json.load(cache_file)
    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
        # A truncated or corrupt cache must not block every later lookup;
        # report it and behave as if nothing was cached.
        print(f"Ignoring unreadable cache file {file_path}: {exc}")
        return {}

    # Callers treat the result as a mapping, so reject any other JSON value.
    return cached if isinstance(cached, dict) else {}

In [4]:
def save_cache(league_dict, file_path=CACHE_FILE):
    """Write the league dictionary to disk as JSON so it can be reused later.

    Args:
        league_dict: JSON-serialisable object to store (this notebook passes
            the scraped league dictionary).
        file_path: Destination file; overwritten if it already exists.
            Defaults to ``CACHE_FILE``.
    """
    with open(file_path, mode='w') as cache_fh:
        json.dump(league_dict, fp=cache_fh)

In [5]:
# Function to scrape league links from FBref's main competitions page
def _extract_league_links(soup, table_id):
    """Collect the league links found in one table of the parsed page.

    Args:
        soup: Parsed page object exposing ``find``.
        table_id: HTML ``id`` attribute of the table to read.

    Returns:
        dict | None: League name -> absolute URL for every ``<th>`` cell in
        the table body that contains a link with an ``href``; ``None`` when
        the table (or its ``<tbody>``) is not present.
    """
    table = soup.find('table', {'id': table_id})
    body = table.find('tbody') if table is not None else None
    if body is None:
        return None

    links = {}
    for header_cell in body.find_all('th'):
        anchor = header_cell.find('a')  # Link to the league, if the cell has one
        if anchor and anchor.get('href'):
            links[anchor.text.strip()] = 'https://fbref.com' + anchor['href']
    return links


def scrape_league_links_from_fbref():
    """Scrape league names and URLs from FBref's competitions page.

    Reads the two tables with ids ``comps_1_fa_club_league_senior`` and
    ``comps_2_fa_club_league_senior`` (reported as the top and second tier).

    Returns:
        dict: League name -> absolute FBref URL. An empty dict is returned,
        after printing the reason, when the request fails, the status code is
        not 200, or either table is missing.
    """
    url = "https://fbref.com/en/comps/"  # FBref competitions page
    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to retrieve FBref page: {exc}")
        return {}

    if response.status_code != 200:
        print(f"Failed to retrieve FBref page. Status code: {response.status_code}")
        return {}

    soup = BeautifulSoup(response.text, 'html.parser')

    league_dict = {}
    tier_tables = (
        ("top", 'comps_1_fa_club_league_senior'),
        ("second", 'comps_2_fa_club_league_senior'),
    )
    for tier_label, table_id in tier_tables:
        tier_links = _extract_league_links(soup, table_id)
        if tier_links is None:
            print(f"Could not find the {tier_label} tier leagues table on the page.")
            # All-or-nothing: a partial result would look like a complete one
            # to whoever stores or reuses it.
            return {}
        league_dict.update(tier_links)

    return league_dict

In [6]:
# Function to get league links, using caching to avoid redundant scraping
@lru_cache(maxsize=1)  # The function takes no arguments, so there is one result to memoise
def _load_or_scrape_league_links():
    """Return the league dictionary from the file cache, scraping if needed.

    Raises:
        LookupError: When neither the file cache nor a fresh scrape yields any
            league. ``lru_cache`` does not memoise exceptions, so a failed
            attempt is retried on the next call instead of being remembered.
    """
    league_dict = load_cache()  # First try to load from the file cache
    if not league_dict:  # Nothing cached yet: scrape the league URLs
        league_dict = scrape_league_links_from_fbref()
        if not league_dict:
            raise LookupError("no league links available")
        save_cache(league_dict)  # Persist only non-empty results
    return league_dict


def get_league_links():
    """Return the league URLs, loading them from cache or scraping them.

    The first successful result is kept in memory for the rest of the session
    and written to the file cache. A failed lookup is neither memoised nor
    written to disk, so a later call tries again.

    Returns:
        dict: The league dictionary, or an empty dict when no links could be
        obtained. Successful calls return the same dict object each time, so
        callers should not mutate it.
    """
    try:
        return _load_or_scrape_league_links()
    except LookupError:
        return {}


# Keep the lru_cache management hooks available on the public function.
get_league_links.cache_clear = _load_or_scrape_league_links.cache_clear
get_league_links.cache_info = _load_or_scrape_league_links.cache_info

In [7]:
# Fuzzy matching to get the closest league name
def get_closest_league(input_league, threshold=80):
    """Find the known league whose name best matches ``input_league``.

    Args:
        input_league: League name as typed by the user (may be misspelled).
        threshold: Score the best match must strictly exceed to be accepted.
            Defaults to 80.

    Returns:
        tuple: ``(league_name, league_url)`` for the best match, or
        ``(None, None)`` when no league is known or the best score does not
        exceed ``threshold``.
    """
    league_dict = get_league_links()  # Either from cache or by scraping
    if not league_dict:
        return None, None

    closest_match = process.extractOne(input_league, list(league_dict))  # Fuzzy match
    if closest_match is None:
        return None, None

    # extractOne yields the matched choice first and its score second.
    matched_name, score = closest_match[0], closest_match[1]
    if score > threshold:
        return matched_name, league_dict[matched_name]
    return None, None

In [8]:
# Demo: resolve a (deliberately misspelled) league name to its FBref URL.
# The guard keeps the lookup from running when this code is imported.
if __name__ == '__main__':
    user_input = 'Sere B'  # Typo on purpose, to exercise the fuzzy matching
    closest_league, league_url = get_closest_league(user_input)

    # Report the match, or say that nothing was close enough.
    print(
        f"Closest match: {closest_league}, URL: {league_url}"
        if closest_league
        else "No close match found."
    )
        

Closest match: Serie B, URL: https://fbref.com/en/comps/18/history/Serie-B-Seasons
