# SoFIFA web scraping using Selenium

### Import Required Libraries

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service as EdgeService
import pandas as pd
import time
import os
import re

## Data scraping using Selenium


### Initialize the webdriver and the environment

In [None]:
# Location of the Edge WebDriver executable on this machine
PATH_TO_DRIVER = "C:/WebDriver/msedgedriver.exe"
service = EdgeService(PATH_TO_DRIVER)

# Browser options: each command-line flag is registered in turn
options = webdriver.EdgeOptions()
for flag in ("--ignore-certificate-errors", "--disable-gpu", "--start-maximized"):
    options.add_argument(flag)

# Start the Edge session with the service and options above, then maximize its window
driver = webdriver.Edge(options=options, service=service)
driver.maximize_window()


### Setting the link and detecting the columns

In [None]:
# Sofifa player-list URL with U23 filter.
# The query string selects five leagues (lg[0]..lg[4]), sets aeh=23 (presumably the
# maximum-age filter behind "U23" - confirm against the site), and requests the extra
# table columns through the repeated showCol[] parameters.
# It ends with "offset=" so the row offset of the wanted page can be appended to it.
base_url = "https://sofifa.com/players?type=all&lg%5B0%5D=13&lg%5B1%5D=31&lg%5B2%5D=53&lg%5B3%5D=19&lg%5B4%5D=16&aeh=23&showCol%5B%5D=ae&showCol%5B%5D=hi&showCol%5B%5D=wi&showCol%5B%5D=pf&showCol%5B%5D=oa&showCol%5B%5D=pt&showCol%5B%5D=bo&showCol%5B%5D=bp&showCol%5B%5D=gu&showCol%5B%5D=vl&showCol%5B%5D=wg&showCol%5B%5D=ta&showCol%5B%5D=cr&showCol%5B%5D=fi&showCol%5B%5D=he&showCol%5B%5D=sh&showCol%5B%5D=vo&showCol%5B%5D=ts&showCol%5B%5D=dr&showCol%5B%5D=cu&showCol%5B%5D=fr&showCol%5B%5D=lo&showCol%5B%5D=bl&showCol%5B%5D=to&showCol%5B%5D=ac&showCol%5B%5D=sp&showCol%5B%5D=ag&showCol%5B%5D=re&showCol%5B%5D=ba&showCol%5B%5D=tp&showCol%5B%5D=so&showCol%5B%5D=ju&showCol%5B%5D=st&showCol%5B%5D=sr&showCol%5B%5D=ln&showCol%5B%5D=te&showCol%5B%5D=ar&showCol%5B%5D=in&showCol%5B%5D=po&showCol%5B%5D=vi&showCol%5B%5D=pe&showCol%5B%5D=cm&showCol%5B%5D=td&showCol%5B%5D=ma&showCol%5B%5D=sa&showCol%5B%5D=sl&showCol%5B%5D=tg&showCol%5B%5D=gd&showCol%5B%5D=gh&showCol%5B%5D=gc&showCol%5B%5D=gp&showCol%5B%5D=gr&showCol%5B%5D=tt&showCol%5B%5D=bs&showCol%5B%5D=wk&showCol%5B%5D=sk&showCol%5B%5D=aw&showCol%5B%5D=dw&showCol%5B%5D=ir&showCol%5B%5D=bt&showCol%5B%5D=hc&showCol%5B%5D=pac&showCol%5B%5D=sho&offset="
# Collects the scraped rows; each entry is the list of cell values of one table row
data = []
 
# Open the first results page so the table header can be read
driver.get(base_url + "0")
time.sleep(15)  # fixed pause before reading the page (presumably to let it finish loading)

# Keep the text of every header cell that is not blank
headers = driver.find_elements(By.XPATH, "//thead//th")
columns = []
for header in headers:
    label = header.text.strip()
    if label:
        columns.append(label)
print("Detected Columns:", columns)

# Sofifa doesn't label the ID column, so put "ID" first when it was not detected
if not ("ID" in columns):
    columns.insert(0, "ID")

### Scraping the data and closing the driver

In [None]:
# Scrape Player Data
# The whole loop runs inside try/finally so the browser session is always closed,
# even when a page load fails or the run is interrupted part-way.
try:
    for offset in range(0, 65):  # page index; each page is requested at offset * 60
        url = base_url + str(offset * 60)
        driver.get(url)
        time.sleep(0.8)  # Wait for JavaScript & Cloudflare check

        rows = driver.find_elements(By.XPATH, "//tbody/tr")

        for row in rows:
            try:
                attributes = row.find_elements(By.XPATH, ".//td")

                player_data = [attr.text.strip() for attr in attributes]

                # Ensure data length matches columns: pad short rows with None...
                missing = len(columns) - len(player_data)
                if missing > 0:
                    player_data.extend([None] * missing)
                # ...and trim extra values from long ones
                player_data = player_data[:len(columns)]

                data.append(player_data)

            except Exception as e:
                # Best effort: a row that cannot be read is reported and skipped
                print(f"Error processing row: {e}")

        print(f"Done for offset {offset}", end="\r")
finally:
    driver.quit()


### Save the data into its specific directory

In [None]:
# Build a DataFrame from the scraped rows, labelled with the detected columns
df = pd.DataFrame(data=data, columns=columns)

# Make sure the output folder for this season exists
year = "2020_2021"  # Adjust the year as needed
year_folder = f"./{year}"
os.makedirs(year_folder, exist_ok=True)

# Write the table to CSV without the index column
csv_path = os.path.join(year_folder, 'sofifa_u23_players.csv')
df.to_csv(csv_path, index=False, encoding='utf-8-sig')
print("\nScraping completed successfully!")


## Data cleaning and organizing

### Convert height, weight, value, wage, and other numeric strings into numbers
   

In [None]:
def clean_numeric(value):
    """Convert a scraped cell to an integer when it has a recognised numeric format.

    Recognised formats (checked in this order):
      * height / weight text containing "cm" or "kg" -> the first number in it
        (e.g. "186cm / 6'1" -> 186, "74kg / 163lbs" -> 74)
      * monetary text starting with "€", with an optional M or K suffix
        (e.g. "€39M" -> 39000000, "€89K" -> 89000, "€0" -> 0)
      * a number followed by a "+" or "-" modifier (e.g. "85+2" -> 85)

    Anything else is returned unchanged: non-string values as they are, strings
    with surrounding whitespace stripped. Text that merely contains one of the
    markers (a hyphenated name, for instance) is returned as text instead of
    raising.
    """
    if not isinstance(value, str):
        return value

    text = value.strip()

    # Height ("186cm / 6'1") or weight ("74kg / 163lbs"): keep the first number
    if 'cm' in text or 'kg' in text:
        number = re.search(r'\d+', text)
        return int(number.group()) if number else text

    # Monetary values: "€39M" -> 39000000, "€89K" -> 89000, "€0" -> 0
    if text.startswith('€'):
        amount = text.replace('€', '')
        if 'M' in amount:
            multiplier = 1_000_000
            amount = amount.replace('M', '')
        elif 'K' in amount:
            multiplier = 1_000
            amount = amount.replace('K', '')
        else:
            multiplier = 1
        try:
            # round() rather than int() so float error cannot drop a unit
            return round(float(amount) * multiplier)
        except (ValueError, OverflowError):
            return text

    # Rating with a modifier ("85+2", "70-3"): keep the leading number only.
    # Requiring leading digits keeps hyphenated text such as names untouched.
    rating = re.match(r'(\d+)\s*[+-]', text)
    if rating:
        return int(rating.group(1))

    return text

### Simplify each position into one of the four main positions, and fix the name format

In [None]:
def simplify_position(position):
    """Map a position label to one of the four main groups.

    Returns 'G' (goalkeeper), 'D' (defender), 'M' (midfielder) or 'F' (forward)
    when the upper-cased label contains one of the known codes; the groups are
    checked in that order. A label that matches none of them is returned
    upper-cased. Non-string values (e.g. None or NaN from an empty cell) are
    returned unchanged instead of raising.
    """
    if not isinstance(position, str):
        return position

    position = position.upper()
    # Order matters: the first group with a matching code wins
    groups = (
        ('G', ("GK", "GOALKEEPER")),
        ('D', ("CB", "LB", "RB", "LWB", "RWB", "DEFENDER")),
        ('M', ("CDM", "CM", "CAM", "LM", "RM", "MIDFIELDER")),
        ('F', ("ST", "CF", "LW", "RW", "FORWARD")),
    )
    for letter, codes in groups:
        if any(code in position for code in codes):
            return letter
    return position
def extract_name(full_name):
    """Return the first line of a name cell; non-string values pass through unchanged."""
    if not isinstance(full_name, str):
        return full_name
    first_line = full_name.split('\n')[0]
    return first_line

### Apply all needed functions and changes

In [None]:
def process_dataset(file_path):
    """Load the scraped CSV at file_path and return a cleaned DataFrame.

    Steps, in order:
      1. the 'Name' column is reduced to its first line (extract_name);
      2. every object-dtype column is passed through clean_numeric; a column
         whose conversion fails is left as it was;
      3. the 'Best position' column is mapped to its main group
         (simplify_position), with missing cells left untouched.

    The CSV is expected to contain 'Name' and 'Best position' columns;
    indexing a missing column raises KeyError.
    """
    df = pd.read_csv(file_path)

    # Clean the name column first
    df['Name'] = df['Name'].apply(extract_name)

    # Identify text columns for selective cleaning
    for col in df.columns:
        if df[col].dtype == 'object':
            try:
                df[col] = df[col].apply(clean_numeric)
            except (ValueError, AttributeError, TypeError, OverflowError):
                # Best effort: a column that cannot be converted stays unchanged.
                # Only conversion errors are ignored, so interrupts still propagate.
                pass

    # Simplify position column; na_action='ignore' keeps empty cells as they are
    df['Best position'] = df['Best position'].map(simplify_position, na_action='ignore')

    return df

### Apply to our scraped data  

In [None]:
# Clean the scraped CSV, then save the cleaned table.
# The frame returned by process_dataset is the one written out; writing any other
# frame would store the raw data under the "processed" file name.
f = process_dataset("./2020_2021/sofifa_u23_players.csv")
f.to_csv("./2020_2021/processed_fifa_dataset.csv", index=False)