In [1]:
pip install requests beautifulsoup4 pandas

Note: you may need to restart the kernel to use updated packages.


In [52]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

In [6]:
def get_player_stats(url):
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    table_body = soup.select_one('#yw1 > table > tbody')
    rows = table_body.find_all('tr', class_=['odd', 'even'])
    
    players = []
    tbody = soup.select_one('#yw1 > table > tbody')
    rows = tbody.find_all('tr', class_=['odd', 'even'])
    
    for row in rows:
        cols = row.find_all('td')
        name_tag = cols[1].select_one('table > tbody > tr:nth-child(1) > td.hauptlink > div:nth-child(1) > span > a')
        position_tag = cols[1].select_one('table > tbody > tr:nth-child(1) > td.zentriert.rueckennummer.bg_Torwart')
        country_tag = cols[3].find('img')
        
        player = {
            'Player': name_tag['title'].strip() if name_tag else '',
            'Position': position_tag.text.strip() if position_tag else '',
            'Kit Number': cols[2].text.strip(),
            'Nationality': country_tag['alt'].strip() if country_tag else '',
            'In Squad': cols[4].text.strip(),
            'Appearances': cols[5].text.strip(),
            'Goals': cols[6].text.strip(),
            'Assists': cols[7].text.strip(),
            'Yellow Cards': cols[8].text.strip(),
            'Second Yellows': cols[9].text.strip(),
            'Straight Reds': cols[10].text.strip(),
            'Substituted On': cols[11].text.strip(),
            'Substituted Off': cols[12].text.strip(),
            'PPG': cols[13].text.strip(),
            'Minutes Played': cols[14].text.strip()
        }
        
        # Handle "Not used during this season" cases
        if 'Not used during this season' in cols[5].text.strip():
            player.update({
                'Appearances': '0',
                'Goals': '0',
                'Assists': '0',
                'Yellow Cards': '0',
                'Second Yellows': '0',
                'Straight Reds': '0',
                'Substituted On': '0',
                'Substituted Off': '0',
                'PPG': '0',
                'Minutes Played': '0'
            })
        
        players.append(player)
    
    return players

# Example URL for Real Madrid 2023 season
url = 'https://www.transfermarkt.com/real-madrid/leistungsdaten/verein/418/plus/1?reldata=%262023'
player_stats = get_player_stats(url)

# Convert to DataFrame
df = pd.DataFrame(player_stats)
df.head(30)

Unnamed: 0,Player,Position,Kit Number,Nationality,In Squad,Appearances,Goals,Assists,Yellow Cards,Second Yellows,Straight Reds,Substituted On,Substituted Off,PPG,Minutes Played
0,,,,,Goalkeeper,31,,8,5,-,-,-,-,-,-
1,,,,,Goalkeeper,24,,55,31,-,-,2,-,-,-
2,,,,,Goalkeeper,28,,50,20,-,-,2,-,-,1
3,,,,,Goalkeeper,21,,8,Not used during this season,-,-,-,-,-,-
4,,,,,Goalkeeper,21,,3,Not used during this season,-,-,-,-,-,-
5,,,,,Goalkeeper,19,,17,Not used during this season,-,-,-,-,-,-
6,,,,,Goalkeeper,18,,21,Not used during this season,-,-,-,-,-,-
7,,,,,Centre-Back,25,,15,13,-,-,-,-,-,7
8,,,,,Centre-Back,30,,50,48,2,3,8,-,-,4
9,,,,,Left-Back,28,,44,37,1,-,6,-,-,4


In [54]:
def get_player_stats_redo(url):
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    match = re.search(r'reldata=%26(\d{4})', url)
    if match:
        year = int(match.group(1))
        season = f"{year}/{year + 1}"
    else:
        season = 'Unknown'

    # Extract the club name from the page header or URL (assuming it's in the URL for simplicity)
    club_name = soup.find('h1').text.strip() if soup.find('h1') else 'Unknown'

    
    players = []
    tbody = soup.select_one('#yw1 > table > tbody')
    rows = tbody.find_all('tr', class_=['odd', 'even'])
    
    for row in rows:
        cols = row.find_all('td')
        
        # Extracting the player name from the specific span class and a tag
        name_tag = row.select_one('span.hide-for-small > a')
        player_name = name_tag.text.strip() if name_tag else ''
            
        # Extracting the position
        position = cols[4].text.strip() if len(cols) > 4 else ''
        
        # Extracting the kit number
        kit_number = cols[0].text.strip() if len(cols) > 0 else ''
        
        # Extracting the nationality using the correct selector
        nationality_tag = row.select_one('td:nth-child(4) > img')
        nationality = nationality_tag['title'].strip() if nationality_tag else ''

        age = cols[5].text.strip() if len(cols) > 5 else ''
        
        # Extracting other statistics
        in_squad = cols[7].text.strip() if len(cols) > 7 else ''
        appearances = cols[8].text.strip() if len(cols) > 8 else ''
        goals = cols[9].text.strip() if len(cols) > 9 else ''
        assists = cols[10].text.strip() if len(cols) > 10 else ''
        yellow_cards = cols[11].text.strip() if len(cols) > 11 else ''
        second_yellows = cols[12].text.strip() if len(cols) > 12 else ''
        straight_reds = cols[13].text.strip() if len(cols) > 13 else ''
        substituted_on = cols[14].text.strip() if len(cols) > 14 else ''
        substituted_off = cols[15].text.strip() if len(cols) > 14 else ''
        ppg = cols[16].text.strip() if len(cols) > 14 else ''
        minutes_played = cols[17].text.strip() if len(cols) > 14 else ''
        
        # Handle "Not used during this season" cases
        if 'Not used during this season' in appearances:
            appearances = '0'
            goals = '0'
            assists = '0'
            yellow_cards = '0'
            second_yellows = '0'
            straight_reds = '0'
            substituted_on = '0'
            substituted_off = '0'
            ppg = '0'
            minutes_played = '0'
        
        player = {
            'Player': player_name,
            'Age': age,
            'Position': position,
            'Kit Number': kit_number,
            'Nationality': nationality,
            'In Squad': in_squad,
            'Appearances': appearances,
            'Goals': goals,
            'Assists': assists,
            'Yellow Cards': yellow_cards,
            'Second Yellows': second_yellows,
            'Straight Reds': straight_reds,
            'Substituted On': substituted_on,
            'Substituted Off': substituted_off,
            'PPG': ppg,
            'Minutes Played': minutes_played,
            'Club': club_name,
            'Season': season
        }
        
        players.append(player)
    
    return players

# Example URL for Real Madrid 2023 season
url = 'https://www.transfermarkt.com/real-madrid/leistungsdaten/verein/418/plus/1?reldata=%262023'
player_stats = get_player_stats_redo(url)

# Convert to DataFrame
df = pd.DataFrame(player_stats)
df.head(20)

Unnamed: 0,Player,Age,Position,Kit Number,Nationality,In Squad,Appearances,Goals,Assists,Yellow Cards,Second Yellows,Straight Reds,Substituted On,Substituted Off,PPG,Minutes Played,Club,Season
0,Thibaut Courtois,31,Goalkeeper,1,Belgium,8,5,-,-,-,-,-,-,1,2.6,423',Real Madrid,2023/2024
1,Andriy Lunin,24,Goalkeeper,13,Ukraine,55,31,-,-,2,-,-,-,-,2.39,2.850',Real Madrid,2023/2024
2,Kepa Arrizabalaga,28,Goalkeeper,25,Spain,50,20,-,-,2,-,-,1,-,2.55,1.767',Real Madrid,2023/2024
3,Lucas Cañizares,21,Goalkeeper,-,Spain,8,0,0,0,0,0,0,0,0,0.0,0,Real Madrid,2023/2024
4,Mario de Luis,21,Goalkeeper,-,Spain,3,0,0,0,0,0,0,0,0,0.0,0,Real Madrid,2023/2024
5,Diego Piñeiro,19,Goalkeeper,-,Spain,17,0,0,0,0,0,0,0,0,0.0,0,Real Madrid,2023/2024
6,Fran González,18,Goalkeeper,-,Spain,21,0,0,0,0,0,0,0,0,0.0,0,Real Madrid,2023/2024
7,Éder Militão,25,Centre-Back,3,Brazil,15,13,-,-,-,-,-,7,3,2.69,509',Real Madrid,2023/2024
8,Antonio Rüdiger,30,Centre-Back,22,Germany,50,48,2,3,8,-,-,4,2,2.5,4.076',Real Madrid,2023/2024
9,Ferland Mendy,28,Left-Back,23,France,44,37,1,-,6,-,-,4,13,2.41,2.835',Real Madrid,2023/2024


In [None]:
seasons = [2023, 2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010]  # Add more seasons as needed
all_stats = []

for season in seasons:
    url = f'https://www.transfermarkt.com/real-madrid/leistungsdaten/verein/418/plus/1?reldata=%26{season}'
    stats = get_player_stats_redo(url)
    for player in stats:
        player['Season'] = season
    all_stats.extend(stats)

# Convert to DataFrame
df_all = pd.DataFrame(all_stats)
print(df_all)

# Save to CSV
df_all.to_csv('real_madrid_stats_all_seasons.csv', index=False)

In [None]:
#need to get all player id's but might have to make multidimensional array for each season due to relegations and what not and the array being completely different for each league