In [1]:
pip install requests beautifulsoup4 pandas

Note: you may need to restart the kernel to use updated packages.


In [52]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

In [6]:
def get_player_stats(url):
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    table_body = soup.select_one('#yw1 > table > tbody')
    rows = table_body.find_all('tr', class_=['odd', 'even'])
    
    players = []
    tbody = soup.select_one('#yw1 > table > tbody')
    rows = tbody.find_all('tr', class_=['odd', 'even'])
    
    for row in rows:
        cols = row.find_all('td')
        name_tag = cols[1].select_one('table > tbody > tr:nth-child(1) > td.hauptlink > div:nth-child(1) > span > a')
        position_tag = cols[1].select_one('table > tbody > tr:nth-child(1) > td.zentriert.rueckennummer.bg_Torwart')
        country_tag = cols[3].find('img')
        
        player = {
            'Player': name_tag['title'].strip() if name_tag else '',
            'Position': position_tag.text.strip() if position_tag else '',
            'Kit Number': cols[2].text.strip(),
            'Nationality': country_tag['alt'].strip() if country_tag else '',
            'In Squad': cols[4].text.strip(),
            'Appearances': cols[5].text.strip(),
            'Goals': cols[6].text.strip(),
            'Assists': cols[7].text.strip(),
            'Yellow Cards': cols[8].text.strip(),
            'Second Yellows': cols[9].text.strip(),
            'Straight Reds': cols[10].text.strip(),
            'Substituted On': cols[11].text.strip(),
            'Substituted Off': cols[12].text.strip(),
            'PPG': cols[13].text.strip(),
            'Minutes Played': cols[14].text.strip()
        }
        
        # Handle "Not used during this season" cases
        if 'Not used during this season' in cols[5].text.strip():
            player.update({
                'Appearances': '0',
                'Goals': '0',
                'Assists': '0',
                'Yellow Cards': '0',
                'Second Yellows': '0',
                'Straight Reds': '0',
                'Substituted On': '0',
                'Substituted Off': '0',
                'PPG': '0',
                'Minutes Played': '0'
            })
        
        players.append(player)
    
    return players

# Example URL for Real Madrid 2023 season
url = 'https://www.transfermarkt.com/real-madrid/leistungsdaten/verein/418/plus/1?reldata=%262023'
player_stats = get_player_stats(url)

# Convert to DataFrame
df = pd.DataFrame(player_stats)
df.head(30)

Unnamed: 0,Player,Position,Kit Number,Nationality,In Squad,Appearances,Goals,Assists,Yellow Cards,Second Yellows,Straight Reds,Substituted On,Substituted Off,PPG,Minutes Played
0,,,,,Goalkeeper,31,,8,5,-,-,-,-,-,-
1,,,,,Goalkeeper,24,,55,31,-,-,2,-,-,-
2,,,,,Goalkeeper,28,,50,20,-,-,2,-,-,1
3,,,,,Goalkeeper,21,,8,Not used during this season,-,-,-,-,-,-
4,,,,,Goalkeeper,21,,3,Not used during this season,-,-,-,-,-,-
5,,,,,Goalkeeper,19,,17,Not used during this season,-,-,-,-,-,-
6,,,,,Goalkeeper,18,,21,Not used during this season,-,-,-,-,-,-
7,,,,,Centre-Back,25,,15,13,-,-,-,-,-,7
8,,,,,Centre-Back,30,,50,48,2,3,8,-,-,4
9,,,,,Left-Back,28,,44,37,1,-,6,-,-,4


In [57]:
def get_player_stats_redo(base_url, team_name, team_id, season_year):
    headers = {'User-Agent': 'Mozilla/5.0'}
    url = f"{base_url}/{team_name}/leistungsdaten/verein/{team_id}/plus/1?reldata=%26{season_year}"
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    match = re.search(r'reldata=%26(\d{4})', url)
    if match:
        year = int(match.group(1))
        season = f"{year}/{year + 1}"
    else:
        season = 'Unknown'

    # Extract the club name from the page header or URL (assuming it's in the URL for simplicity)
    club_name = soup.find('h1').text.strip() if soup.find('h1') else 'Unknown'

    
    players = []
    tbody = soup.select_one('#yw1 > table > tbody')
    rows = tbody.find_all('tr', class_=['odd', 'even'])
    
    for row in rows:
        cols = row.find_all('td')
        
        # Extracting the player name from the specific span class and a tag
        name_tag = row.select_one('span.hide-for-small > a')
        player_name = name_tag.text.strip() if name_tag else ''
            
        # Extracting the position
        position = cols[4].text.strip() if len(cols) > 4 else ''
        
        # Extracting the kit number
        kit_number = cols[0].text.strip() if len(cols) > 0 else ''
        
        # Extracting the nationality using the correct selector
        nationality_tag = row.select_one('td:nth-child(4) > img')
        nationality = nationality_tag['title'].strip() if nationality_tag else ''

        age = cols[5].text.strip() if len(cols) > 5 else ''
        
        # Extracting other statistics
        in_squad = cols[7].text.strip() if len(cols) > 7 else ''
        appearances = cols[8].text.strip() if len(cols) > 8 else ''
        goals = cols[9].text.strip() if len(cols) > 9 else ''
        assists = cols[10].text.strip() if len(cols) > 10 else ''
        yellow_cards = cols[11].text.strip() if len(cols) > 11 else ''
        second_yellows = cols[12].text.strip() if len(cols) > 12 else ''
        straight_reds = cols[13].text.strip() if len(cols) > 13 else ''
        substituted_on = cols[14].text.strip() if len(cols) > 14 else ''
        substituted_off = cols[15].text.strip() if len(cols) > 14 else ''
        ppg = cols[16].text.strip() if len(cols) > 14 else ''
        minutes_played = cols[17].text.strip() if len(cols) > 14 else ''
        
        # Handle "Not used during this season" cases
        if 'Not used during this season' in appearances:
            appearances = '0'
            goals = '0'
            assists = '0'
            yellow_cards = '0'
            second_yellows = '0'
            straight_reds = '0'
            substituted_on = '0'
            substituted_off = '0'
            ppg = '0'
            minutes_played = '0'
        
        player = {
            'Player': player_name,
            'Age': age,
            'Position': position,
            'Kit Number': kit_number,
            'Nationality': nationality,
            'In Squad': in_squad,
            'Appearances': appearances,
            'Goals': goals,
            'Assists': assists,
            'Yellow Cards': yellow_cards,
            'Second Yellows': second_yellows,
            'Straight Reds': straight_reds,
            'Substituted On': substituted_on,
            'Substituted Off': substituted_off,
            'PPG': ppg,
            'Minutes Played': minutes_played,
            'Club': club_name,
            'Season': season
        }
        
        players.append(player)
    
    return players

# Example usage for Real Madrid 2023 season
base_url = 'https://www.transfermarkt.com'
team_name = 'real-madrid'
team_id = 418
season_year = 2022
player_stats = get_player_stats_redo(base_url, team_name, team_id, season_year)

# Convert to DataFrame
df = pd.DataFrame(player_stats)
df.head(100)

Unnamed: 0,Player,Age,Position,Kit Number,Nationality,In Squad,Appearances,Goals,Assists,Yellow Cards,Second Yellows,Straight Reds,Substituted On,Substituted Off,PPG,Minutes Played,Club,Season
0,Thibaut Courtois,30,Goalkeeper,1,Belgium,50,49,-,-,1,-,-,-,-,2.14,4.470',Real Madrid,2022/2023
1,Andriy Lunin,23,Goalkeeper,13,Ukraine,59,12,-,-,-,-,-,-,-,2.17,1.080',Real Madrid,2022/2023
2,Lucas Cañizares,20,Goalkeeper,30,Spain,11,0,0,0,0,0,0,0,0,0.0,0,Real Madrid,2022/2023
3,Mario de Luis,20,Goalkeeper,43,Spain,1,0,0,0,0,0,0,0,0,0.0,0,Real Madrid,2022/2023
4,Luis López,21,Goalkeeper,26,Spain,52,0,0,0,0,0,0,0,0,0.0,0,Real Madrid,2022/2023
5,Diego Piñeiro,18,Goalkeeper,38,Spain,1,0,0,0,0,0,0,0,0,0.0,0,Real Madrid,2022/2023
6,Fran González,17,Goalkeeper,44,Spain,1,0,0,0,0,0,0,0,0,0.0,0,Real Madrid,2022/2023
7,Éder Militão,24,Centre-Back,3,Brazil,55,51,7,1,9,-,-,3,6,2.12,4.231',Real Madrid,2022/2023
8,Antonio Rüdiger,29,Centre-Back,22,Germany,58,53,2,-,2,-,-,14,2,2.08,3.848',Real Madrid,2022/2023
9,Ferland Mendy,27,Left-Back,23,France,36,28,-,1,4,-,-,3,13,2.25,2.064',Real Madrid,2022/2023


In [58]:
def collect_data_for_multiple_seasons(base_url, team_name, team_id, season_years):
    all_players = []
    
    for year in season_years:
        player_stats = get_player_stats_redo(base_url, team_name, team_id, year)
        all_players.extend(player_stats)
    
    # Convert to DataFrame
    df = pd.DataFrame(all_players)
    
    # Save to CSV
    df.to_csv(f'{team_name}_multiple_seasons_stats.csv', index=False)
    return df

In [59]:
# Example usage for Real Madrid for specific seasons
base_url = 'https://www.transfermarkt.com'
team_name = 'fc-barcelona'
team_id = 131
season_years = [2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]  # List of specific seasons you want to collect data for

df = collect_data_for_multiple_seasons(base_url, team_name, team_id, season_years)
df.head(100)

Unnamed: 0,Player,Age,Position,Kit Number,Nationality,In Squad,Appearances,Goals,Assists,Yellow Cards,Second Yellows,Straight Reds,Substituted On,Substituted Off,PPG,Minutes Played,Club,Season
0,Fernando Pacheco,18,Goalkeeper,32,Spain,1,0,0,0,0,0,0,0,0,0,0,Real Madrid,2010/2011
1,Antonio Adán,23,Goalkeeper,13,Spain,34,5,-,-,-,-,-,2,1,2.40,395',Real Madrid,2010/2011
2,Jesús Fernández,22,Goalkeeper,28,Spain,1,1,-,-,-,-,-,1,-,3.00,13',Real Madrid,2010/2011
3,Tomás Mejías,21,Goalkeeper,40,Spain,1,1,-,-,-,-,-,1,-,3.00,6',Real Madrid,2010/2011
4,Iker Casillas,29,Goalkeeper,1,Spain,54,54,-,-,4,-,1,-,-,2.39,4.802',Real Madrid,2010/2011
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Xabi Alonso,30,Defensive Midfield,14,Spain,49,47,-,8,18,-,-,6,8,1.94,3.793',Real Madrid,2012/2013
96,Michael Essien,29,Defensive Midfield,15,Ghana,43,35,2,1,7,-,-,4,7,2.23,2.702',Real Madrid,2012/2013
97,Lassana Diarra,27,Defensive Midfield,24,France,4,2,-,-,-,-,-,-,2,0.50,110',Real Madrid,2012/2013
98,Álvaro Morata,19,Centre-Forward,29,Spain,28,15,2,3,3,-,-,8,4,2.53,579',Real Madrid,2012/2013


In [60]:
df.head(50)

Unnamed: 0,Player,Age,Position,Kit Number,Nationality,In Squad,Appearances,Goals,Assists,Yellow Cards,Second Yellows,Straight Reds,Substituted On,Substituted Off,PPG,Minutes Played,Club,Season
0,Fernando Pacheco,18,Goalkeeper,32,Spain,1,0,0,0,0,0,0,0,0,0.0,0,Real Madrid,2010/2011
1,Antonio Adán,23,Goalkeeper,13,Spain,34,5,-,-,-,-,-,2,1,2.4,395',Real Madrid,2010/2011
2,Jesús Fernández,22,Goalkeeper,28,Spain,1,1,-,-,-,-,-,1,-,3.0,13',Real Madrid,2010/2011
3,Tomás Mejías,21,Goalkeeper,40,Spain,1,1,-,-,-,-,-,1,-,3.0,6',Real Madrid,2010/2011
4,Iker Casillas,29,Goalkeeper,1,Spain,54,54,-,-,4,-,1,-,-,2.39,4.802',Real Madrid,2010/2011
5,Jerzy Dudek,37,Goalkeeper,25,Poland,27,2,-,-,-,-,-,-,2,3.0,122',Real Madrid,2010/2011
6,Nacho Fernández,20,Centre-Back,35,Spain,3,2,-,-,-,-,-,-,1,1.5,142',Real Madrid,2010/2011
7,Sergio Ramos,24,Centre-Back,4,Spain,47,46,4,3,17,1,1,1,1,2.33,4.050',Real Madrid,2010/2011
8,Raúl Albiol,24,Centre-Back,18,Spain,46,32,-,-,7,-,1,8,3,2.41,2.151',Real Madrid,2010/2011
9,Marcelo,22,Left-Back,12,Brazil,53,50,5,10,8,-,-,3,5,2.5,4.239',Real Madrid,2010/2011


In [None]:
#need to get all player id's but might have to make multidimensional array for each season due to relegations and what not and the array being completely different for each league