# Transfermarkt Web Scraping - Football Player Market Value
## This script scrapes player market value data from Transfermarkt for different football leagues using requests.


### Import Required Libraries

In [1]:
import os
import requests
from lxml import html
import pandas as pd
import time


### Set the season year to scrape and the league URLs

In [None]:
# Season to scrape; it is substituted into the ``saison_id`` query parameter.
SEASON_YEAR = 2020

# Every league overview page shares this URL shape. The season placeholder is
# escaped with double braces so it survives this first ``format`` call and can
# be filled in later with ``str.format(season_year=...)``.
_LEAGUE_URL_PATTERN = (
    "https://www.transfermarkt.co.uk/{slug}/startseite/wettbewerb/{code}"
    "/plus/?saison_id={{season_year}}"
)

# One row per league: (dictionary key, URL slug, code after "/wettbewerb/").
_LEAGUES = (
    ("Laliga", "laliga", "ES1"),
    ("Laliga2", "laliga2", "ES2"),
    ("BelgianProLeague", "jupiler-pro-league", "BE1"),
    ("BrazilSerieA", "campeonato-brasileiro-serie-a", "BRA1"),
    ("Bundesliga", "bundesliga", "L1"),
    ("Bundesliga2", "2-bundesliga", "L2"),
    ("LigaProfesionalArgentina", "torneo-final", "ARGC"),
    ("Ligue1", "ligue-1", "FR1"),
    ("Ligue2", "ligue-2", "FR2"),
    ("SerieA", "serie-a", "IT1"),
    ("SerieB", "serie-b", "IT2"),
    ("Netherlands", "eredivisie", "NL1"),
    ("PrimeiraLigaPortugal", "liga-nos", "PO1"),
    ("PremierLeague", "premier-league", "GB1"),
    ("championship", "championship", "GB2"),
)

# Maps league name -> league page URL template containing "{season_year}".
league_links = {
    name: _LEAGUE_URL_PATTERN.format(slug=slug, code=code)
    for name, slug, code in _LEAGUES
}

### Helper Function - Convert Market Values
### This function converts market values from strings (e.g., `"€10m"`, `"€500k"`) to numeric values.


In [None]:
def convert_value(value):
    """Convert a market value string such as ``"€10m"`` into a number.

    The euro sign is removed and commas are treated as decimal separators.
    A trailing ``k``, ``m`` or ``bn`` suffix (in any letter case) scales the
    number by one thousand, one million or one billion respectively.

    Args:
        value: The raw text, e.g. ``"€10m"``, ``"€500k"`` or ``"-"``.
            ``None`` and empty strings are accepted.

    Returns:
        The value as a float, or ``0`` when the input is missing, is the
        ``"-"`` placeholder, or cannot be parsed as a number.
    """
    if not value or value.strip() == "-":
        return 0

    # Lower-case so that "10M" / "500K" are not silently turned into 0.
    cleaned = value.replace("€", "").replace(",", ".").strip().lower()

    # Match on the *trailing* suffix only; "bn" is listed first for clarity
    # since it is the longest suffix.
    multiplier = 1
    for suffix, factor in (("bn", 1_000_000_000), ("m", 1_000_000), ("k", 1_000)):
        if cleaned.endswith(suffix):
            cleaned = cleaned[: -len(suffix)]
            multiplier = factor
            break

    try:
        return float(cleaned) * multiplier
    except ValueError:
        # Unparseable text is reported as "no value", matching the "-" case.
        return 0


### Scrape Club Links from League Page
### This function extracts club names and their URLs for a given league.


In [None]:
def get_club_links(league_url, headers):
    """Collect the clubs listed on a league overview page.

    Args:
        league_url: League page URL with the season already filled in.
        headers: HTTP headers sent with the request.

    Returns:
        A list of ``(club_name, club_url)`` tuples. The list is empty when
        the page cannot be retrieved (network error or non-200 status).
    """
    from urllib.parse import parse_qs, urlparse

    try:
        # A timeout keeps a stalled connection from hanging the whole scrape.
        response = requests.get(league_url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to retrieve league data: {league_url} ({exc})")
        return []
    if response.status_code != 200:
        print(f"Failed to retrieve league data: {league_url}")
        return []

    # Build club URLs for the same season as the league page that was fetched,
    # so a caller scraping a season other than SEASON_YEAR gets matching
    # squads. Fall back to the module-level default if the URL has no numeric
    # ``saison_id``.
    season_values = parse_qs(urlparse(league_url).query).get("saison_id")
    if season_values and season_values[0].isdigit():
        season = season_values[0]
    else:
        season = SEASON_YEAR

    tree = html.fromstring(response.content)
    clubs = tree.xpath("//div[@id='yw1']//td[contains(@class, 'hauptlink')]/a[contains(@href, '/verein/')]")
    # Links without text are skipped because they carry no club name.
    return [
        (club.text.strip(), f"https://www.transfermarkt.co.uk{club.get('href')}/plus/?saison_id={season}")
        for club in clubs
        if club.text
    ]



### Scrape Players from Each Club
### This function extracts **player names** and their **market value**.


In [None]:
def get_players_from_club(club_name, club_url, headers):
    """Scrape player names and market values from a club squad page.

    Args:
        club_name: Club name stored in the ``"Club"`` field of every record.
        club_url: URL of the club page to fetch.
        headers: HTTP headers sent with the request.

    Returns:
        A list of dicts with the keys ``"Club"``, ``"Player"`` and
        ``"Market Value (€)"``. The list is empty when the page cannot be
        retrieved (network error or non-200 status).
    """
    try:
        # A timeout keeps a stalled connection from hanging the whole scrape.
        team_response = requests.get(club_url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to fetch data for {club_name} ({exc})")
        return []
    if team_response.status_code != 200:
        print(f"Failed to fetch data for {club_name}")
        return []

    team_tree = html.fromstring(team_response.content)
    squad_rows = team_tree.xpath("//div[@id='yw1']//table[contains(@class, 'items')]/tbody/tr")

    players_data = []
    for row in squad_rows:
        player = row.xpath(".//td[contains(@class, 'hauptlink')]/a/text()")
        value = row.xpath(".//td[contains(@class, 'rechts hauptlink')]/a/text()")
        # Rows missing either piece still produce a record, using placeholders.
        player_name = player[0].strip() if player else "Unknown Player"
        market_value = convert_value(value[0]) if value else 0
        players_data.append({"Club": club_name, "Player": player_name, "Market Value (€)": market_value})
    return players_data

###  Scrape Data for All Leagues & Save to CSV
###    This function:
###   - Iterates over each league.
###   - Scrapes club links.
###   - Scrapes player data for each club.
###   - Saves results as CSV files.


In [None]:
def scrape_and_save(league_links, season_year):
    """Scrape every league and write the results to CSV files.

    For each league the club list is fetched, then each club's players, with
    a two-second pause after every club request. A league that yields data is
    written to ``<league>_<season_year>.csv``; all leagues together are
    written to ``all_leagues_combined_<season_year>.csv``. Files go into
    ``./transfermarkt_data_<season_year>``, which is created if needed.

    Args:
        league_links: Mapping of league name to a URL template containing a
            ``{season_year}`` placeholder.
        season_year: Season used for the URLs, folder name and file names.
    """
    request_headers = {"User-Agent": "Mozilla/5.0"}
    target_dir = f"./transfermarkt_data_{season_year}"
    os.makedirs(target_dir, exist_ok=True)
    league_frames = []

    for league_name, url_template in league_links.items():
        print(f"Scraping league: {league_name}")
        season_url = url_template.format(season_year=season_year)

        rows = []
        for club_name, club_url in get_club_links(season_url, request_headers):
            print(f"Fetching data for {club_name}")
            rows += get_players_from_club(club_name, club_url, request_headers)
            time.sleep(2)  # pause between club requests

        # Nothing scraped for this league: no CSV, nothing to merge.
        if not rows:
            continue

        league_frame = pd.DataFrame(rows)
        league_csv = os.path.join(target_dir, f"{league_name}_{season_year}.csv")
        league_frame.to_csv(league_csv, index=False)
        league_frames.append(league_frame)

    if not league_frames:
        print("No data collected.")
        return

    combined = pd.concat(league_frames, ignore_index=True).fillna(0)
    combined_csv = os.path.join(target_dir, f"all_leagues_combined_{season_year}.csv")
    combined.to_csv(combined_csv, index=False)
    print("Final merged dataset saved.")


### Call the function to run the scraper

In [None]:
# Run the scrape for the configured season. This fetches every league in
# league_links over the network and writes the CSV files to disk.
scrape_and_save(league_links, SEASON_YEAR)
