In [2]:
import json
import os
import re
import time
from urllib.request import urlopen, Request

import requests
from bs4 import BeautifulSoup

### SCRAPER FUNCTIONS

In [3]:
def get_league_html(session, league, season, window, timeout=30):
    """
    Download one league transfers page and return it as parsed HTML.

    Parameters
    ----------
    session : object exposing a requests-style ``get(url, timeout=...)`` method
    league : competition code substituted into the URL path (e.g. 'GB1')
    season : value of the ``saison_id`` query parameter
    window : value of the ``s_w`` query parameter
    timeout : seconds to wait for the server before giving up (default 30)

    Returns
    -------
    BeautifulSoup
        The response body parsed with the built-in "html.parser".

    Raises
    ------
    Whatever ``session.get`` raises (connection errors, timeouts) and whatever
    ``resp.raise_for_status()`` raises for an HTTP error status.
    """
    # NOTE(review): the "premier-league" slug is used for every league code;
    # presumably the site resolves the page from the code, not the slug - confirm.
    URL_LEAGUE = "https://www.transfermarkt.com/premier-league/transfers/wettbewerb/{league}/plus/?saison_id={season}&s_w={window}"

    scrape_url = URL_LEAGUE.format(
        league=league,
        season=season,
        window=window
    )

    print("URL")
    print(scrape_url)

    # A timeout keeps a stalled connection from blocking the whole run.
    resp = session.get(scrape_url, timeout=timeout)

    print("RESP")
    print(resp)

    # Fail here on 4xx/5xx instead of parsing an error page and crashing
    # later with an unrelated AttributeError.
    resp.raise_for_status()

    # Name the parser explicitly: without it bs4 picks whichever parser is
    # installed (and warns), so the tree could differ between environments.
    return BeautifulSoup(resp.text, "html.parser")

In [4]:
def parse_team_id_from_url(url):
    """
    Helper to parse team id from URL.

    The id is the path segment that directly follows ``verein/``. It is
    located relative to that marker rather than by a fixed position, so both
    site-relative hrefs ('/club/transfers/verein/11/...') and absolute URLs
    are handled.

    Returns the id as a string, or None when the URL has no ``verein/<id>``
    part.
    """
    match = re.search(r'verein/([^/?#]+)', url)
    if match is None:
        return None

    return match.group(1)

def scrape_transfer_table(transfer_table_soup):
    """
    Scrape data from team transfers HTML table.

    Parameters
    ----------
    transfer_table_soup : parsed HTML element (bs4 tag) of one transfers table

    Returns
    -------
    list of dict or None
        One dict per table row with the keys player_name, player_id,
        player_age, player_nat (list), player_pos, market_val,
        counter_team_country, counter_team_name, counter_team_id,
        transfer_fee and transfer_id.
        None when the table has no <tbody>, no rows, or its first row has at
        most one cell (treated as "no transfers").

    Raises
    ------
    Any exception hit while parsing a row is re-raised after the offending
    row has been printed.
    """
    table_body = transfer_table_soup.find("tbody")
    if table_body is None:
        return None

    table_records = table_body.find_all("tr")

    # No rows with player transfers info
    if not table_records or len(table_records[0].find_all("td")) <= 1:
        return None

    records = []
    for rec in table_records:
        rec_data = {}
        try:
            # The first cell's <div> holds the player name and profile link.
            player_cell = rec.find("td").find("div")
            rec_data['player_name'] = player_cell.text.strip()
            rec_data['player_id'] = player_cell.find("a")["href"].split("spieler/")[-1]
            rec_data['player_age'] = rec.find("td", class_="zentriert alter-transfer-cell").text.strip()

            # player nationalities (one flag image per nationality)
            nationalities = rec.find(
                "td",
                class_="zentriert nat-transfer-cell"
            ).find_all("img")
            rec_data['player_nat'] = [nat['title'].strip() for nat in nationalities]

            rec_data['player_pos'] = rec.find("td", class_="kurzpos-transfer-cell zentriert").text.strip()
            rec_data['market_val'] = rec.find("td", class_="rechts mw-transfer-cell").text.strip()

            # counter team data; when the cell has no flag image / link, the
            # cell's plain text is stored instead
            counter_team = rec.find("td", class_="verein-flagge-transfer-cell")
            counter_text = counter_team.text.strip()
            counter_flag = counter_team.find("img")
            counter_link = counter_team.find("a")
            rec_data['counter_team_country'] = counter_flag["title"].strip() if counter_flag else counter_text
            rec_data['counter_team_name'] = counter_link["title"].strip() if counter_link else counter_text
            rec_data['counter_team_id'] = parse_team_id_from_url(
                counter_link["href"]
            ) if counter_link else counter_text

            # transfer data: link inside the last "rechts" cell of the row
            transfer = rec.find_all("td", class_="rechts")[-1].find("a")
            rec_data['transfer_fee'] = transfer.text.strip()
            rec_data['transfer_id'] = transfer["href"].split("transfer_id/")[-1]

        except Exception:
            # Show the row that could not be parsed, then propagate the error.
            print(rec)
            raise

        records.append(rec_data)

    return records

In [5]:
def scrape_league_season(session, league, season, window):
    """
    Collect incoming and outgoing transfers for every team on a league page.

    The page is fetched with get_league_html(session, league, season, window).
    Every element with class "box" that contains the team headline <h2> is
    treated as one team; its first table is scraped as arrivals and its
    second as departures via scrape_transfer_table.

    Returns a list with one dict per team:
        {'team': {'team_name', 'team_id', 'team_country'}, 'in': ..., 'left': ...}
    """
    headline_classes = "content-box-headline content-box-headline--inverted content-box-headline--logo"

    page = get_league_html(session, league, season, window)

    # NOTE(review): the country is read from the first flag cell on the page;
    # the same cell class is used for counter teams in the transfer tables, so
    # confirm this really is the league's country.
    flag_cell = page.find("td", class_="no-border-links verein-flagge-transfer-cell")
    league_country = flag_cell.find("img")['title'].strip()

    # Keep only the boxes that carry a team headline.
    team_boxes = [
        candidate
        for candidate in page.find_all(class_="box")
        if candidate.find("h2", class_=headline_classes)
    ]

    data = []
    for team_box in team_boxes:
        # Name comes from the first image title; the id is pulled out of the
        # box markup (an element with id="to-<team id>").
        team_name = team_box.find("img")['title'].strip()
        team_id = re.findall('id="to-(.*?)">', str(team_box))[0]

        print(f"TEAM {team_name}, ID {team_id}")

        # First table: arrivals, second table: departures.
        tables = team_box.find_all("table")
        in_transfers = scrape_transfer_table(tables[0])
        out_transfers = scrape_transfer_table(tables[1])

        team_entry = {
            'team': {
                'team_name': team_name,
                'team_id': team_id,
                'team_country': league_country
            },
            'in': in_transfers,
            'left': out_transfers
        }
        data.append(team_entry)

    return data

In [6]:
def scrape_script(leagues:list=['GB1', 'ES1', 'IT1', 'L1', 'FR1', 'PO1', 'NL1'], seasons:list=[2022, 2023, 2024], windows:list=["s", "w"], delay:float=15, output_dir:str='data'):
    """
    Scrape every league / season / window combination and save each as JSON.

    For each combination the function waits `delay` seconds, calls
    scrape_league_season, and writes the result to
    ``<output_dir>/<league>_<season>_<window>.json``. A combination whose
    scrape fails is reported and skipped; no file is written for it.

    Parameters
    ----------
    leagues : competition codes to scrape
    seasons : season ids to scrape
    windows : transfer-window codes to scrape
    delay : seconds to sleep before each request (default 15)
    output_dir : directory for the JSON files, created if missing (default 'data')

    Returns
    -------
    None
    """
    # The list defaults above are only iterated, never mutated, so sharing
    # them between calls is harmless.
    HEADERS = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
        'cache-control': 'max-age=0',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36 OPR/62.0.3331.116',
    }

    # Make sure the target directory exists before spending time on requests.
    os.makedirs(output_dir, exist_ok=True)

    # The context manager closes the session's connections when done.
    with requests.Session() as session:
        session.headers.update(HEADERS)

        for league in leagues:
            for season in seasons:
                for window in windows:

                    # Pause before every request to throttle the scraper.
                    time.sleep(delay)
                    print('SCRAPE:', league, season, window)
                    try:
                        data = scrape_league_season(session, league, season, window)
                    except Exception as exc:
                        # Skip this combination: continuing to the write
                        # below would either fail on an undefined `data` or
                        # save the previous combination's data under this
                        # combination's filename.
                        print("Data not avaible:", repr(exc))
                        continue

                    filename = os.path.join(
                        output_dir,
                        '{league}_{season}_{window}.json'.format(
                            league=league,
                            season=season,
                            window=window
                        )
                    )

                    with open(filename, 'w', encoding='utf8') as output:
                        json.dump(data, output, ensure_ascii=False)

    return None

### RUN SCRAPER

**Test**

In [7]:
# scrape_script(leagues=['GB1'], seasons=[2022], windows=["s", "w"])

**All leagues**

In [8]:
# Full run: scrape_script's default leagues, seasons and windows are used.
scrape_script()

SCRAPE: GB1 2022 s
URL
https://www.transfermarkt.com/premier-league/transfers/wettbewerb/GB1/plus/?saison_id=2022&s_w=s
RESP
<Response [200]>
TEAM Arsenal FC, ID 11
TEAM Aston Villa, ID 405
TEAM AFC Bournemouth, ID 989
TEAM Brentford FC, ID 1148
TEAM Brighton & Hove Albion, ID 1237
TEAM Chelsea FC, ID 631
TEAM Crystal Palace, ID 873
TEAM Everton FC, ID 29
TEAM Fulham FC, ID 931
TEAM Leeds United, ID 399
TEAM Leicester City, ID 1003
TEAM Liverpool FC, ID 31
TEAM Manchester City, ID 281
TEAM Manchester United, ID 985
TEAM Newcastle United, ID 762
TEAM Nottingham Forest, ID 703
TEAM Southampton FC, ID 180
TEAM Tottenham Hotspur, ID 148
TEAM West Ham United, ID 379
TEAM Wolverhampton Wanderers, ID 543
SCRAPE: GB1 2022 w
URL
https://www.transfermarkt.com/premier-league/transfers/wettbewerb/GB1/plus/?saison_id=2022&s_w=w
RESP
<Response [200]>
TEAM Arsenal FC, ID 11
TEAM Aston Villa, ID 405
TEAM AFC Bournemouth, ID 989
TEAM Brentford FC, ID 1148
TEAM Brighton & Hove Albion, ID 1237
TEAM Chels