# Script to scrape player salaries
# Are salaries aligned with goals, assists, minutes, etc.?


https://capology.com/uk/premier-league/salaries/

And other clubs


Collected DATA:
    
    Player

    Team

    Market value

    Age

    Contract value

    Estimated salary (if listed)

In [1]:
import re
import time
from pathlib import Path
from typing import Dict, List, Tuple


import pandas as pd
import requests
import cloudscraper




In [2]:
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
}

capology_league_pages = [
    {
        "league": "Premier League",
        "country": "England",
        "currency_hint": "£",
        "url": "https://capology.com/uk/premier-league/salaries/",
    },
    {
        "league": "La Liga",
        "country": "Spain",
        "currency_hint": "€",
        "url": "https://capology.com/es/la-liga/salaries/",
    },
    {
        "league": "Serie A",
        "country": "Italy",
        "currency_hint": "€",
        "url": "https://capology.com/it/serie-a/salaries/",
    },
    {
        "league": "Bundesliga",
        "country": "Germany",
        "currency_hint": "€",
        "url": "https://capology.com/de/1-bundesliga/salaries/",
    },
    {
        "league": "Ligue 1",
        "country": "France",
        "currency_hint": "€",
        "url": "https://capology.com/fr/ligue-1/salaries/",
    },
]

In [3]:
from bs4 import BeautifulSoup

def fetch_with_fallback(url: str, delay: float = 3.0) -> str | None:
    """Try requests first, fall back to cloudscraper on 403."""
    time.sleep(delay)
    
    # Try standard requests first
    try:
        resp = requests.get(url, headers=HEADERS, timeout=30)
        if resp.status_code == 200:
            print(f"  [requests] OK: {url}")
            return resp.text
        elif resp.status_code == 403:
            print(f"  [requests] 403 – trying cloudscraper...")
        else:
            print(f"  [requests] {resp.status_code}: {url}")
            return None
    except Exception as e:
        print(f"  [requests] Error: {e}")
    
    # Fallback to cloudscraper
    time.sleep(delay)
    try:
        scraper = cloudscraper.create_scraper()
        resp = scraper.get(url, timeout=30)
        if resp.status_code == 200:
            print(f"  [cloudscraper] OK: {url}")
            return resp.text
        else:
            print(f"  [cloudscraper] {resp.status_code}: {url}")
    except Exception as e:
        print(f"  [cloudscraper] Error: {e}")
    
    return None

In [4]:
def parse_salary_table(html: str, league: str, country: str, currency: str) -> List[Dict]:
    """Parse the salary data from Capology JavaScript variable."""
    
    # Find "var data = [" and extract until "];"
    start_idx = html.find("var data = [")
    if start_idx < 0:
        print(f"  No data array found for {league}")
        return []
    
    end_idx = html.find("];", start_idx)
    if end_idx < 0:
        print(f"  No closing ]; found for {league}")
        return []
    
    # Extract the data content (between [ and ])
    data_str = html[start_idx + len("var data = ["):end_idx]
    
    rows = []
    
    # Split by '},\n' or '},\r\n' to get individual player objects
    # Each entry ends with }, and starts with {
    player_blocks = re.split(r"\},\s*\{", data_str)
    
    for block in player_blocks:
        try:
            # Extract player name from the name field link
            name_match = re.search(r"href='/player/[^']+/'[^>]*>(?:<img[^>]+>)?([^<]+)</a>", block)
            player_name = name_match.group(1).strip() if name_match else ""
            
            # Extract team from club field
            team_match = re.search(r"'club':\s*\"<a[^>]+href='/club/[^']+/'[^>]*>([^<]+)</a>", block)
            team = team_match.group(1).strip() if team_match else ""
            
            # Extract position
            pos_match = re.search(r"'position_detail':\s*\"([^\"]+)\"", block)
            position = pos_match.group(1).strip() if pos_match else ""
            
            # Extract age
            age_match = re.search(r"'age':\s*Math\.round\(\"(\d+)\"\)", block)
            age = age_match.group(1) if age_match else ""
            
            # Extract annual gross salary based on currency
            if currency == "£":
                salary_match = re.search(r"'annual_gross_gbp':\s*accounting\.formatMoney\(\"(\d+)\"", block)
            else:
                salary_match = re.search(r"'annual_gross_eur':\s*accounting\.formatMoney\(\"(\d+)\"", block)
            
            annual_gross = int(salary_match.group(1)) if salary_match else 0
            
            # Extract weekly gross (divide by 52)
            if currency == "£":
                weekly_match = re.search(r"'weekly_gross_gbp':\s*accounting\.formatMoney\(\"(\d+)\"/52", block)
            else:
                weekly_match = re.search(r"'weekly_gross_eur':\s*accounting\.formatMoney\(\"(\d+)\"/52", block)
            
            weekly_gross = int(weekly_match.group(1)) // 52 if weekly_match else 0
            
            if not player_name:
                continue
                
            rows.append({
                "player": player_name,
                "team": team,
                "position": position,
                "age": age,
                "weekly_gross": weekly_gross,
                "annual_gross": annual_gross,
                "currency": currency,
                "league": league,
                "country": country,
            })
        except Exception as e:
            continue
    
    return rows

In [5]:
# Main scraping loop
all_salaries: List[Dict] = []

for league_info in capology_league_pages:
    league = league_info["league"]
    country = league_info["country"]
    url = league_info["url"]
    currency = league_info["currency_hint"]
    
    print(f"\n>>> Scraping {league} ({country})...")
    
    html = fetch_with_fallback(url, delay=4.0)
    if not html:
        print(f"  FAILED to fetch {league}")
        continue
    
    players = parse_salary_table(html, league, country, currency)
    print(f"  Parsed {len(players)} players from {league}")
    all_salaries.extend(players)

print(f"\n=== Total players collected: {len(all_salaries)} ===")


>>> Scraping Premier League (England)...
  [requests] OK: https://capology.com/uk/premier-league/salaries/
  Parsed 702 players from Premier League

>>> Scraping La Liga (Spain)...
  [requests] OK: https://capology.com/es/la-liga/salaries/
  Parsed 501 players from La Liga

>>> Scraping Serie A (Italy)...
  [requests] OK: https://capology.com/it/serie-a/salaries/
  Parsed 601 players from Serie A

>>> Scraping Bundesliga (Germany)...
  [requests] OK: https://capology.com/de/1-bundesliga/salaries/
  Parsed 524 players from Bundesliga

>>> Scraping Ligue 1 (France)...
  [requests] OK: https://capology.com/fr/ligue-1/salaries/
  Parsed 484 players from Ligue 1

=== Total players collected: 2812 ===


In [6]:
# Create DataFrame and preview
salaries_df = pd.DataFrame(all_salaries)
print(f"DataFrame shape: {salaries_df.shape}")
salaries_df.head(10)

DataFrame shape: (2812, 9)


Unnamed: 0,player,team,position,age,weekly_gross,annual_gross,currency,league,country
0,Erling Haaland,Manchester City,CF,25,525000,27300000,£,Premier League,England
1,Mohamed Salah,Liverpool,RW,33,400000,20800000,£,Premier League,England
2,Virgil van Dijk,Liverpool,CB,34,350000,18200000,£,Premier League,England
3,Casemiro,Manchester United,DM,33,350000,18200000,£,Premier League,England
4,Raheem Sterling,Chelsea,RW,30,325000,16900000,£,Premier League,England
5,Bruno Fernandes,Manchester United,AM,30,300000,15600000,£,Premier League,England
6,Bernardo Silva,Manchester City,AM,31,300000,15600000,£,Premier League,England
7,Omar Marmoush,Manchester City,CF,26,295000,15340000,£,Premier League,England
8,Alexander Isak,Liverpool,CF,25,280000,14560000,£,Premier League,England
9,Kai Havertz,Arsenal,CF,26,280000,14560000,£,Premier League,England


In [7]:
salaries_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2812 entries, 0 to 2811
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   player        2812 non-null   object
 1   team          2812 non-null   object
 2   position      2812 non-null   object
 3   age           2812 non-null   object
 4   weekly_gross  2812 non-null   int64 
 5   annual_gross  2812 non-null   int64 
 6   currency      2812 non-null   object
 7   league        2812 non-null   object
 8   country       2812 non-null   object
dtypes: int64(2), object(7)
memory usage: 197.9+ KB


In [8]:
playerStats_df = pd.read_csv("../data_raw/playerStats.csv")
playerStats_df.head()

Unnamed: 0,Player,Squad,Pos,Age,Gls,Ast,Min,MP,GS,xG,npxG,xAG,PrgC,PrgP,Yellow,Red,League,Country,Source,Source_URL
0,Brenden Aaronson,Leeds United,"FW,MF",25-064,1,2,911,16,10,2.1,2.1,1.5,21,26,1,0,Premier League,England,cloudscraper,https://fbref.com/en/comps/9/stats/Premier-Lea...
1,Zach Abbott,Nott'ham Forest,DF,19-226,0,0,45,1,0,0.0,0.0,0.0,0,1,0,0,Premier League,England,cloudscraper,https://fbref.com/en/comps/9/stats/Premier-Lea...
2,Joshua Acheampong,Chelsea,DF,19-234,1,0,349,5,4,0.6,0.6,0.0,4,11,0,0,Premier League,England,cloudscraper,https://fbref.com/en/comps/9/stats/Premier-Lea...
3,Tyler Adams,Bournemouth,MF,26-314,2,1,1195,15,15,0.5,0.5,0.1,12,61,5,0,Premier League,England,cloudscraper,https://fbref.com/en/comps/9/stats/Premier-Lea...
4,Tosin Adarabioyo,Chelsea,DF,28-092,0,0,561,9,5,0.1,0.1,0.0,3,18,1,0,Premier League,England,cloudscraper,https://fbref.com/en/comps/9/stats/Premier-Lea...


In [9]:
print(playerStats_df.columns)
print(salaries_df.columns)

Index(['Player', 'Squad', 'Pos', 'Age', 'Gls', 'Ast', 'Min', 'MP', 'GS', 'xG',
       'npxG', 'xAG', 'PrgC', 'PrgP', 'Yellow', 'Red', 'League', 'Country',
       'Source', 'Source_URL'],
      dtype='object')
Index(['player', 'team', 'position', 'age', 'weekly_gross', 'annual_gross',
       'currency', 'league', 'country'],
      dtype='object')


In [10]:
#merge salaries with stats
merged = pd.merge(playerStats_df, salaries_df, how='left', left_on=['Player', 'Squad'], right_on=['player', 'team'])
merged.head()

Unnamed: 0,Player,Squad,Pos,Age,Gls,Ast,Min,MP,GS,xG,...,Source_URL,player,team,position,age,weekly_gross,annual_gross,currency,league,country
0,Brenden Aaronson,Leeds United,"FW,MF",25-064,1,2,911,16,10,2.1,...,https://fbref.com/en/comps/9/stats/Premier-Lea...,,,,,,,,,
1,Zach Abbott,Nott'ham Forest,DF,19-226,0,0,45,1,0,0.0,...,https://fbref.com/en/comps/9/stats/Premier-Lea...,,,,,,,,,
2,Joshua Acheampong,Chelsea,DF,19-234,1,0,349,5,4,0.6,...,https://fbref.com/en/comps/9/stats/Premier-Lea...,,,,,,,,,
3,Tyler Adams,Bournemouth,MF,26-314,2,1,1195,15,15,0.5,...,https://fbref.com/en/comps/9/stats/Premier-Lea...,Tyler Adams,Bournemouth,DM,26.0,60000.0,3120000.0,£,Premier League,England
4,Tosin Adarabioyo,Chelsea,DF,28-092,0,0,561,9,5,0.1,...,https://fbref.com/en/comps/9/stats/Premier-Lea...,Tosin Adarabioyo,Chelsea,CB,27.0,120000.0,6240000.0,£,Premier League,England


In [11]:
merged.columns

Index(['Player', 'Squad', 'Pos', 'Age', 'Gls', 'Ast', 'Min', 'MP', 'GS', 'xG',
       'npxG', 'xAG', 'PrgC', 'PrgP', 'Yellow', 'Red', 'League', 'Country',
       'Source', 'Source_URL', 'player', 'team', 'position', 'age',
       'weekly_gross', 'annual_gross', 'currency', 'league', 'country'],
      dtype='object')

In [12]:
len(merged[merged['weekly_gross'].isna() | merged['annual_gross'].isna() ]) 
# ['weekly_gross', 'annual_gross']

860

In [13]:

print("Stats")
print(playerStats_df[['Player', 'Squad']].head(10))
print("Salaries")
print(salaries_df[['player', 'team']].head(10))

Stats
              Player            Squad
0   Brenden Aaronson     Leeds United
1        Zach Abbott  Nott'ham Forest
2  Joshua Acheampong          Chelsea
3        Tyler Adams      Bournemouth
4   Tosin Adarabioyo          Chelsea
5      Simon Adingra       Sunderland
6         Amine Adli      Bournemouth
7   Emmanuel Agbadou           Wolves
8       Nayef Aguerd         West Ham
9           Ola Aina  Nott'ham Forest
Salaries
            player               team
0   Erling Haaland    Manchester City
1    Mohamed Salah          Liverpool
2  Virgil van Dijk          Liverpool
3         Casemiro  Manchester United
4  Raheem Sterling            Chelsea
5  Bruno Fernandes  Manchester United
6   Bernardo Silva    Manchester City
7    Omar Marmoush    Manchester City
8   Alexander Isak          Liverpool
9      Kai Havertz            Arsenal


In [14]:
# Normalize names for better matching
def normalize_name(name: str) :
    """Normalize player/team name for matching."""
    if pd.isna(name):
        return ""
    # Lowercase, strip whitespace, remove accents would be ideal
    return str(name).strip().lower()

# Create normalized columns for matching
stats_df = playerStats_df.copy()
salaries_norm = salaries_df.copy()

stats_df['player_norm'] = stats_df['Player'].apply(normalize_name)
stats_df['team_norm'] = stats_df['Squad'].apply(normalize_name)

salaries_norm['player_norm'] = salaries_norm['player'].apply(normalize_name)
salaries_norm['team_norm'] = salaries_norm['team'].apply(normalize_name)

# Check unique teams in both datasets
print("Stats teams (sample):", stats_df['team_norm'].unique()[:15])
print("\nSalaries teams (sample):", salaries_norm['team_norm'].unique()[:15])

Stats teams (sample): ['leeds united' "nott'ham forest" 'chelsea' 'bournemouth' 'sunderland'
 'wolves' 'west ham' 'manchester city' 'brentford' 'everton' 'liverpool'
 'fulham' 'burnley' 'brighton' 'aston villa']

Salaries teams (sample): ['manchester city' 'liverpool' 'manchester united' 'chelsea' 'arsenal'
 'everton' 'aston villa' 'tottenham' 'newcastle' 'west ham'
 'nottingham forest' 'fulham' 'sunderland' 'wolverhampton'
 'crystal palace']


In [15]:
import difflib

# different in team names between datasets may cause mismatches. Further normalization or mapping may be needed.
# Compare team names between datasets (normalized)
teams_stats = set(stats_df['team_norm'].dropna().unique())
teams_salaries = set(salaries_norm['team_norm'].dropna().unique())

only_in_stats = sorted(teams_stats - teams_salaries)
only_in_salaries = sorted(teams_salaries - teams_stats)

print(f"Teams only in stats_df ({len(only_in_stats)}): {only_in_stats}")
print(f"Teams only in salaries_df ({len(only_in_salaries)}): {only_in_salaries}")

# Suggest close matches to help build a mapping
suggestions = {
    t: difflib.get_close_matches(t, list(teams_salaries), n=3, cutoff=0.6)
    for t in only_in_stats
}

closeMatches = pd.DataFrame(
    [(k, ", ".join(v)) for k, v in suggestions.items()],
    columns=["team_in_stats", "suggested_matches_in_salaries"]
)
closeMatches

Teams only in stats_df (21): ['alavés', 'atlético madrid', 'betis', 'dortmund', 'eint frankfurt', 'gladbach', 'hamburger sv', 'inter', 'köln', 'leeds united', 'leverkusen', 'mainz 05', 'manchester utd', 'milan', 'newcastle utd', "nott'ham forest", 'oviedo', 'paris s-g', 'rb leipzig', 'st. pauli', 'wolves']
Teams only in salaries_df (21): ['ac milan', 'alaves', 'atletico madrid', 'bayer leverkusen', 'borussia dortmund', 'eintracht frankfurt', 'hamburg', 'inter milan', 'koln', 'leeds', 'leipzig', 'mainz', 'manchester united', 'monchengladbach', 'newcastle', 'nottingham forest', 'psg', 'real betis', 'real oviedo', 'st pauli', 'wolverhampton']


Unnamed: 0,team_in_stats,suggested_matches_in_salaries
0,alavés,alaves
1,atlético madrid,"atletico madrid, real madrid"
2,betis,"real betis, brest"
3,dortmund,borussia dortmund
4,eint frankfurt,eintracht frankfurt
5,gladbach,monchengladbach
6,hamburger sv,hamburg
7,inter,inter milan
8,köln,koln
9,leeds united,manchester united


In [16]:
len(closeMatches)

21

In [17]:
# Team name mapping (stats -> salaries) - Complete list based on mismatch analysis
team_mapping = {
    # Premier League
    "nott'ham forest": "nottingham forest",
    "wolves": "wolverhampton",
    "spurs": "tottenham",
    "man city": "manchester city",
    "man united": "manchester united",
    "manchester utd": "manchester united",
    "newcastle utd": "newcastle",
    "leicester city": "leicester",
    "west ham united": "west ham",
    "brighton and hove albion": "brighton",
    "leeds united": "leeds",
    # La Liga
    "alavés": "alaves",
    "atlético madrid": "atletico madrid",
    "betis": "real betis",
    "oviedo": "real oviedo",
    # Serie A
    "inter": "inter milan",
    "milan": "ac milan",
    # Bundesliga
    "dortmund": "borussia dortmund",
    "eint frankfurt": "eintracht frankfurt",
    "gladbach": "monchengladbach",
    "hamburger sv": "hamburg",
    "köln": "koln",
    "leverkusen": "bayer leverkusen",
    "mainz 05": "mainz",
    "rb leipzig": "leipzig",
    "st. pauli": "st pauli",
    # Ligue 1
    "paris s-g": "psg",
}

# Apply mapping to stats team names
stats_df['team_norm'] = stats_df['team_norm'].replace(team_mapping)

# Merge on normalized player name and team
merged = pd.merge(
    stats_df, 
    salaries_norm[['player_norm', 'team_norm', 'position', 'age', 'weekly_gross', 'annual_gross', 'currency']], 
    on=['player_norm', 'team_norm'], 
    how='left',
    suffixes=('_stats', '_salary')
)

print(f"Stats rows: {len(stats_df)}")
print(f"Merged rows: {len(merged)}")
print(f"Matched salaries: {merged['weekly_gross'].notna().sum()}")
print(f"Missing salaries: {merged['weekly_gross'].isna().sum()}")

Stats rows: 2361
Merged rows: 2361
Matched salaries: 1935
Missing salaries: 426


In [18]:
# Preview merged data with salaries
cols_to_show = ['Player', 'Squad', 'Gls', 'Ast', 'Min', 'weekly_gross', 'annual_gross', 'currency']
merged[cols_to_show].dropna(subset=['weekly_gross']).head(15)

Unnamed: 0,Player,Squad,Gls,Ast,Min,weekly_gross,annual_gross,currency
0,Brenden Aaronson,Leeds United,1,2,911,45000.0,2340000.0,£
1,Zach Abbott,Nott'ham Forest,0,0,45,12500.0,650000.0,£
3,Tyler Adams,Bournemouth,2,1,1195,60000.0,3120000.0,£
4,Tosin Adarabioyo,Chelsea,0,0,561,120000.0,6240000.0,£
5,Simon Adingra,Sunderland,0,0,382,70000.0,3640000.0,£
6,Amine Adli,Bournemouth,1,0,394,60000.0,3120000.0,£
7,Emmanuel Agbadou,Wolves,0,0,1011,85000.0,4420000.0,£
9,Ola Aina,Nott'ham Forest,0,0,270,60000.0,3120000.0,£
10,Rayan Aït-Nouri,Manchester City,0,0,204,120000.0,6240000.0,£
11,Kristoffer Ajer,Brentford,0,1,613,40000.0,2080000.0,£


In [19]:
# Drop helper columns and save merged dataset
final_cols = [c for c in merged.columns if c not in ['player_norm', 'team_norm']]
merged_final = merged[final_cols]



In [20]:
# Save to data_raw
merged_final.to_csv("../data_raw/player_stats_with_salaries.csv", index=False)
print(f"Saved {len(merged_final)} rows to ../data_raw/player_stats_with_salaries.csv")
print(f"Match rate: {merged_final['weekly_gross'].notna().sum() / len(merged_final) * 100:.1f}%")

Saved 2361 rows to ../data_raw/player_stats_with_salaries.csv
Match rate: 82.0%
