In [5]:
import csv
import time # Used for pauses between retries
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

# --- Configuration ---
# How many attempts a single page gets before it is recorded as failed.
MAX_RETRIES = 3
# Number of player rows a full listing page is expected to contain; also
# used as the step between consecutive ?offset= values.
EXPECTED_ENTRIES_PER_PAGE = 60
# Number of listing pages (offset iterations) to visit in total.
NUM_PAGES_TO_SCRAPE = 5

# --- WebDriver Setup ---
# Relies on a chromedriver binary being discoverable (e.g. via PATH).
driver = webdriver.Chrome()

# --- URLs ---
# Site root, prepended to any site-relative player link.
base_link_prefix = "https://sofifa.com"
# Player listing endpoint; the ?offset= query is appended per page.
base_players_url = "https://sofifa.com/players"

# --- Result accumulators ---
# Full hyperlinks already recorded, used to skip duplicate players.
seen_hyperlinks = set()
# One [full_hyperlink, player_name] row per unique player collected.
all_player_data = []
# Listing-page URLs that never produced enough entries, kept for review.
failed_urls = []

print("Starting to scrape player hyperlinks and names from Sofifa.com using offset pagination...")

# --- Function to extract player name from URL slug ---
def extract_player_name_from_href(href_path):
    """
    Extract a display name from a Sofifa player link.

    Works for both the site-relative form
    '/player/259516/joao-lucas-de-souza-cardoso/250040/' and the absolute
    form 'https://sofifa.com/player/259516/joao-lucas-de-souza-cardoso/250040/'
    (the form Selenium returns for an element's ``href``). Either one yields
    'Joao Lucas De Souza Cardoso'.

    Args:
        href_path: Player link, relative or absolute.

    Returns:
        The slug with hyphens turned into spaces and title-cased, or "N/A"
        when the link is empty or has no slug in the expected position.
    """
    # Local import keeps this helper usable without touching the file's
    # top-level import block.
    from urllib.parse import urlsplit

    if not href_path:
        return "N/A"

    try:
        # Parsing out the path removes the scheme and host, so the slug is
        # always the 4th path part (index 3) after '', 'player', 'ID':
        # ['', 'player', '259516', 'joao-lucas-de-souza-cardoso', '250040', '']
        slug = urlsplit(href_path).path.split('/')[3]
    except (IndexError, ValueError):
        # IndexError: path is too short; ValueError: urlsplit rejected the URL.
        return "N/A"

    if not slug:
        return "N/A"

    # replace() is a no-op for single-word slugs, so no separate branch is needed.
    return slug.replace('-', ' ').title()

# --- Main Scraping Loop with Retries ---
# Each listing page gets up to MAX_RETRIES attempts. An attempt succeeds when
# it yields at least EXPECTED_ENTRIES_PER_PAGE player links; the final page is
# accepted with any count. Pages that never succeed are recorded in
# failed_urls. Results are written to CSV in the `finally` clause, so partial
# data is saved even if the loop is interrupted.
try:
    for page_index in range(NUM_PAGES_TO_SCRAPE):
        current_offset = page_index * EXPECTED_ENTRIES_PER_PAGE
        current_page_url = f"{base_players_url}?offset={current_offset}"
        # The last page might legitimately have fewer entries, so it is
        # never retried just for being short.
        is_last_expected_page = (page_index == NUM_PAGES_TO_SCRAPE - 1)

        print(f"\n--- Scraping Page (Offset: {current_offset}, Attempting URL: {current_page_url}) ---")

        page_scraped_successfully = False

        for attempt in range(1, MAX_RETRIES + 1):
            if attempt > 1:
                print(f"Retrying page (Offset: {current_offset}). Attempt {attempt}/{MAX_RETRIES}...")
                time.sleep(3)  # Wait a bit before retrying

            try:
                driver.get(current_page_url)

                # Block until a table body exists; raises TimeoutException
                # after 20 seconds otherwise.
                WebDriverWait(driver, 20).until(
                    EC.presence_of_element_located((By.TAG_NAME, "tbody"))
                )

                # Player links are the <a> tags inside <tbody> that carry a
                # data-tippy-content attribute.
                player_elements = driver.find_elements(By.CSS_SELECTOR, "tbody a[data-tippy-content]")

                if not player_elements:
                    print(f"No player links found on page (Offset: {current_offset}) within tables on attempt {attempt}.")

                # Rows gathered by this attempt only; they are merged into the
                # global results solely when the attempt is accepted.
                temp_page_data = []
                for player_element in player_elements:
                    href_path = player_element.get_attribute("href")
                    if not href_path:
                        continue
                    if href_path.startswith('/'):
                        full_hyperlink = base_link_prefix + href_path
                    else:
                        full_hyperlink = href_path
                    temp_page_data.append(
                        [full_hyperlink, extract_player_name_from_href(href_path)]
                    )

                if len(temp_page_data) < EXPECTED_ENTRIES_PER_PAGE and not is_last_expected_page:
                    print(f"Collected {len(temp_page_data)} entries, which is less than expected ({EXPECTED_ENTRIES_PER_PAGE}) on page (Offset: {current_offset}).")
                    continue  # Short page: spend another attempt

                # The final page gets its own wording because it may be short.
                if is_last_expected_page:
                    print(f"Collected {len(temp_page_data)} player entries from final page (Offset: {current_offset}).")
                else:
                    print(f"Collected {len(temp_page_data)} player entries from page (Offset: {current_offset}).")

                # Merge into the global results, de-duplicating by hyperlink.
                current_page_unique_players_added = 0
                for entry in temp_page_data:
                    if entry[0] not in seen_hyperlinks:
                        all_player_data.append(entry)
                        seen_hyperlinks.add(entry[0])
                        current_page_unique_players_added += 1

                print(f"Added {current_page_unique_players_added} new unique entries from page (Offset: {current_offset}).")
                page_scraped_successfully = True
                break

            except TimeoutException:
                print(f"Timeout while waiting for elements on page (Offset: {current_offset}) on attempt {attempt}.")
            except NoSuchElementException:
                print(f"Required element (tbody) not found on page (Offset: {current_offset}) on attempt {attempt}. Page structure might have changed or page is empty.")
            except Exception as e:
                # Best-effort scraping: any other failure just costs one attempt.
                print(f"An unexpected error occurred on page (Offset: {current_offset}) on attempt {attempt}: {e}.")

        # All attempts exhausted without success: remember the URL.
        if not page_scraped_successfully:
            print(f"Failed to scrape page (Offset: {current_offset}) after {MAX_RETRIES} retries. Recording URL for future use.")
            failed_urls.append(current_page_url)


# --- Finalizing: Close Browser and Save to CSV ---
finally:
    # Close the browser session. A failure here (e.g. the browser already
    # died) must not stop the collected data from being written below.
    try:
        driver.quit()
    except Exception as e:
        print(f"Warning: could not close the browser cleanly: {e}")

    # Save all the collected unique player data to a CSV file
    csv_filename = "sofifa_player_data_offset.csv"
    with open(csv_filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Player Hyperlink", "Player Name"])
        writer.writerows(all_player_data)

    print("\nScraping complete!")
    print(f"All collected unique player data saved to '{csv_filename}'")
    print(f"Total unique player entries extracted: {len(all_player_data)}")

    # --- Save Failed URLs to a separate CSV ---
    if failed_urls:
        failed_csv_filename = "failed_sofifa_pages.csv"
        with open(failed_csv_filename, 'w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(["Failed Page URL"])  # Header for failed URLs
            writer.writerows([url] for url in failed_urls)
        print(f"URLs of {len(failed_urls)} failed pages saved to '{failed_csv_filename}' for future review.")
    else:
        print("No pages failed to scrape after retries. No 'failed_sofifa_pages.csv' created.")

Starting to scrape player hyperlinks and names from Sofifa.com using offset pagination...

--- Scraping Page (Offset: 0, Attempting URL: https://sofifa.com/players?offset=0) ---
Collected 60 player entries from page (Offset: 0).
Added 60 new unique entries from page (Offset: 0).

--- Scraping Page (Offset: 60, Attempting URL: https://sofifa.com/players?offset=60) ---
Collected 60 player entries from page (Offset: 60).
Added 60 new unique entries from page (Offset: 60).

--- Scraping Page (Offset: 120, Attempting URL: https://sofifa.com/players?offset=120) ---
Collected 60 player entries from page (Offset: 120).
Added 55 new unique entries from page (Offset: 120).

--- Scraping Page (Offset: 180, Attempting URL: https://sofifa.com/players?offset=180) ---
Collected 60 player entries from page (Offset: 180).
Added 56 new unique entries from page (Offset: 180).

--- Scraping Page (Offset: 240, Attempting URL: https://sofifa.com/players?offset=240) ---
Collected 60 player entries from final