In [1]:
from abc import ABC, abstractmethod
from dataclasses import dataclass
from collections.abc import Sequence
import httpx
import pandas as pd
from bs4 import BeautifulSoup
import time

In [2]:
team_url = "https://www.transfermarkt.co.uk/bundesliga/startseite/wettbewerb/L1/plus/?saison_id=2020"

In [4]:
team_resp = httpx.get(
    team_url,
    headers={
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, Gecko gibi) Chrome/134.0.0.0 Safari/537.36'
    },
)

In [5]:
team_html = team_resp.content

In [6]:
page_soup = BeautifulSoup(team_html, "html.parser")

In [7]:
team_info = page_soup.find_all("td", {"class": "hauptlink no-border-links"})

In [8]:
team_name = [
    td.find("a").get("href").split("/")[1] if td.find("a") else None for td in team_info
]

In [9]:
team_id = [
    td.find("a").get("href").split("/")[4] if td.find("a") else None for td in team_info
]

In [10]:
urls = []
for td in team_info:
    data = td.find('a').get('href')
    team_name = data.split('/')[1]
    team_id = data.split('/')[4]
    year = data.split('/')[6]
    
    url = f'https://www.transfermarkt.co.uk/{team_name}/kader/verein/{team_id}/saison_id/{year}/plus/1'
    
    urls.append(url)
    

In [11]:
@dataclass
class Team:
    id: str
    name: str


class Parser(ABC):
    """ABC Protocol class for parsing data from transfermarkt."""

    @abstractmethod
    def parse(self, soup: BeautifulSoup) -> pd.DataFrame:
        pass


@dataclass
class Scraper:
    """Scrape data from transfermarkt for a given team and year."""

    team: Team
    parsers: Sequence[Parser]
    year: int
    url: str = (
        "https://www.transfermarkt.co.uk/{name}/kader/verein/{id}/saison_id/{year}/plus/1"
    )

    def run(self) -> pd.DataFrame:
        """Run the scraping process."""
        url = self.url.format(name=self.team.name, id=self.team.id, year=self.year)
        print(f"Scraping: {self.team.name} - {self.year}")

        soup = self._get_soup_content(url)  # get html content from url

        data = pd.concat(
            [parser.parse(soup) for parser in self.parsers], axis=1
        )  # concatenate parsers into a dataframe

        data["season"] = self.year  # add season to dataframe
        data["team"] = self.team.name  # add team name to dataframe

        return data

    def _get_soup_content(self, url: str) -> BeautifulSoup:
        """Get the html content from a given Transfermarkt url."""
        resp = self._make_request(url)
        return BeautifulSoup(resp.content, "html.parser")

    def _make_request(self, url: str) -> httpx.Response:
        """Make a request to a given Transfermarkt url."""
        try:
            response = httpx.get(
                url,
                headers={
                    "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, Gecko gibi) Chrome/134.0.0.0 Safari/537.36'
                },
                timeout=60,
            )
            response.raise_for_status()
            return response

        except httpx.HTTPError as e:
            print(f"HTTP error occurred: {e}")
            raise e

In [12]:
class PlayerNames(Parser):
    def parse(self, soup: BeautifulSoup) -> pd.Series:
        elements = soup.find_all("img", {"class": "bilderrahmen-fixed lazy lazy"})
        names = [td.get("title") if td.get("title") else None for td in elements]
        return pd.Series(names, name="name")

In [13]:
class PlayerNumbers(Parser):
    def parse(self, soup: BeautifulSoup) -> pd.Series:
        stats = soup.find_all("td", {"class": "zentriert"})
        numbers = [stat for stat in stats[0::8]]
        numbers = [
            (
                td.find("div", class_="rn_nummer").text.strip()
                if td.find("div", class_="rn_nummer")
                else None
            )
            for td in numbers
        ]
        return pd.Series(numbers, name="number")

In [14]:
class PlayerAges(Parser):
    def parse(self, soup: BeautifulSoup) -> pd.DataFrame:
        stats = soup.find_all("td", {"class": "zentriert"})
        ages = [stat for stat in stats[1::8]]
        dob = [td.text.strip().split(" (")[0] if td.text else None for td in ages]
        age = [
            int(td.text.strip().split(" (")[1].split(")")[0]) if td.text else None
            for td in ages
        ]
        return pd.DataFrame({"dob": dob, "age": age})

In [15]:
class PlayerCountries(Parser):
    def parse(self, soup: BeautifulSoup) -> pd.Series:
        stats = soup.find_all("td", {"class": "zentriert"})
        countries = [stat for stat in stats[2::8]]
        countries = [
            td.find("img").get("title") if td.find("img") else None for td in countries
        ]
        return pd.Series(countries, name="country")

In [16]:
class CurrentClubs(Parser):
    def parse(self, soup: BeautifulSoup) -> pd.Series:
        stats = soup.find_all("td", {"class": "zentriert"})
        current_clubs = [stat for stat in stats[3::8]]
        current_clubs = [
            td.find("a").get("title") if td.find("a") else None for td in current_clubs
        ]
        return pd.Series(current_clubs, name="current_club")

In [17]:
class PlayerHeights(Parser):
    def parse(self, soup: BeautifulSoup) -> pd.Series:
        stats = soup.find_all("td", {"class": "zentriert"})
        heights = [stat for stat in stats[4::8]]
        heights = [td.text if td.text else None for td in heights]
        return pd.Series(heights, name="height")

In [18]:
class PlayerFoot(Parser):
    def parse(self, soup: BeautifulSoup) -> pd.Series:
        stats = soup.find_all("td", {"class": "zentriert"})
        foots = [stat for stat in stats[5::8]]
        foots = [td.text if td.text else None for td in foots]
        return pd.Series(foots, name="foot")

In [19]:
class PlayerJoinedDate(Parser):
    def parse(self, soup: BeautifulSoup) -> pd.Series:
        stats = soup.find_all("td", {"class": "zentriert"})
        joined_date = [stat for stat in stats[6::8]]
        joined_date = [td.text if td.text else None for td in joined_date]
        return pd.Series(joined_date, name="joined_date")

In [20]:
class PlayerSigningFee(Parser):
    def parse(self, soup: BeautifulSoup) -> pd.Series:
        stats = soup.find_all("td", {"class": "zentriert"})
        signing_info = [stat for stat in stats[7::8]]
        signing_fee = [
            td.find("a").get("title").split(": Ablöse ")[1] if td.find("a") else 0
            for td in signing_info
        ]
        return pd.Series(signing_fee, name="signing_fee")

In [21]:
class PlayerSignedFrom(Parser):
    def parse(self, soup: BeautifulSoup) -> pd.Series:
        stats = soup.find_all("td", {"class": "zentriert"})
        signing_info = [stat for stat in stats[7::8]]
        signed_from = [
            td.find("a").get("title").split(": Ablöse ")[0] if td.find("a") else None
            for td in signing_info
        ]
        return pd.Series(signed_from, name="signed_from")

In [22]:
class PlayerValues(Parser):
    def parse(self, soup: BeautifulSoup) -> pd.Series:
        values = soup.find_all("td", {"class": "rechts hauptlink"})
        values = [td.find("a").text if td.find("a") else "€0" for td in values]
        return pd.Series(values, name="value")

In [23]:
class PlayerPositions(Parser):
    def parse(self, soup: BeautifulSoup) -> pd.Series:
        pos_soup = soup.find_all("td", {"class": "posrela"})
        positions = [
            td.find_all("tr")[1].find("td").text.strip() if td.find_all("tr") else None
            for td in pos_soup
        ]
        return pd.Series(positions, name="position")

In [24]:
class TransfermarktName(Parser):
    def parse(self, soup: BeautifulSoup) -> pd.Series:
        links = soup.find_all("td", {"class": "hauptlink"})
        tm_name = [
            link.find("a")["href"].split("/")[1] if link.find("a") else None
            for link in links[::2]
        ]
        return pd.Series(tm_name, name="tm_name")

In [25]:
class TransfermarktId(Parser):
    def parse(self, soup: BeautifulSoup) -> pd.Series:
        links = soup.find_all("td", {"class": "hauptlink"})
        tm_id = [
            link.find("a")["href"].split("/")[4] if link.find("a") else None
            for link in links[::2]
        ]
        return pd.Series(tm_id, name="tm_id")

In [26]:
parsers = (
    PlayerNames(),
    PlayerNumbers(),
    PlayerAges(),
    PlayerCountries(),
    CurrentClubs(),
    PlayerHeights(),
    PlayerFoot(),
    PlayerJoinedDate(),
    PlayerSigningFee(),
    PlayerSignedFrom(),
    PlayerValues(),
    PlayerPositions(),
    TransfermarktName(),
    TransfermarktId()
)

In [27]:
def get_team_info(league: str, league_id: str, year: int) -> tuple:
    link = "https://www.transfermarkt.co.uk/{league}/startseite/wettbewerb/{league_id}/plus/?saison_id={year}"
    url = link.format(league=league, league_id=league_id, year=year)
    resp = httpx.get(
        url,
        headers={
            "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, Gecko gibi) Chrome/134.0.0.0 Safari/537.36'
        },
        timeout=20,
    )
    soup = BeautifulSoup(resp.content, "html.parser")
    team_info = soup.find_all("td", {"class": "hauptlink no-border-links"})
    team_name = [td.find('a').get('href').split('/')[1] for td in team_info]
    team_id = [td.find('a').get('href').split('/')[4] for td in team_info]
    return tuple(zip(team_name, team_id))

In [28]:
pl_teams = get_team_info('bundesliga', 'L1', 2020)

In [29]:
teams = [Team(id=id, name=name) for name, id in zip(team_name, team_id)]

In [30]:
teams

[Team(id='1', name='a'), Team(id='0', name='r')]

In [31]:
dfs = []
for name, id_ in pl_teams:
    team = Team(id=id_, name=name)
    scraper = Scraper(team=team, parsers=parsers, year=2020)
    df = scraper.run()
    dfs.append(df)
    time.sleep(5) # sleep for 5 seconds to avoid getting blocked

Scraping: fc-bayern-munchen - 2020
Scraping: borussia-dortmund - 2020
Scraping: rasenballsport-leipzig - 2020
Scraping: bayer-04-leverkusen - 2020
Scraping: borussia-monchengladbach - 2020
Scraping: eintracht-frankfurt - 2020
Scraping: vfl-wolfsburg - 2020
Scraping: hertha-bsc - 2020
Scraping: tsg-1899-hoffenheim - 2020
Scraping: vfb-stuttgart - 2020
Scraping: 1-fsv-mainz-05 - 2020
Scraping: sc-freiburg - 2020
Scraping: fc-schalke-04 - 2020
Scraping: 1-fc-koln - 2020
Scraping: sv-werder-bremen - 2020
Scraping: fc-augsburg - 2020
Scraping: 1-fc-union-berlin - 2020
Scraping: arminia-bielefeld - 2020


In [32]:
data = pd.concat(dfs)

In [33]:
data

Unnamed: 0,name,number,dob,age,country,current_club,height,foot,joined_date,signing_fee,signed_from,value,position,tm_name,tm_id,season,team
0,Manuel Neuer,1,"Mar 27, 1986",35,Germany,Bayern Munich,"1,93m",right,"Jul 1, 2011",€30.00m,FC Schalke 04,€18.00m,Goalkeeper,manuel-neuer,17259,2020,fc-bayern-munchen
1,Alexander Nübel,35,"Sep 30, 1996",24,Germany,VfB Stuttgart,"1,93m",right,"Jul 1, 2020",free transfer,FC Schalke 04,€6.00m,Goalkeeper,alexander-nubel,195778,2020,fc-bayern-munchen
2,Sven Ulreich,26,"Aug 3, 1988",32,Germany,Bayern Munich,"1,92m",right,"Jul 1, 2021",free transfer,Hamburger SV,€900k,Goalkeeper,sven-ulreich,40680,2020,fc-bayern-munchen
3,Ron-Thorben Hoffmann,39,"Apr 4, 1999",22,Germany,Eintracht Braunschweig,"1,92m",right,"Jul 1, 2018",-,FC Bayern Munich U19,€600k,Goalkeeper,ron-thorben-hoffmann,317444,2020,fc-bayern-munchen
4,Lukas Schneller,34,"Oct 26, 2001",19,Germany,1.FC Schweinfurt 05,"1,90m",left,,0,,€200k,Goalkeeper,lukas-schneller,453874,2020,fc-bayern-munchen
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28,Andreas Voglsammer,21,"Jan 9, 1992",29,Germany,Hannover 96,"1,78m",right,"Jan 20, 2016",free transfer,1.FC Heidenheim 1846,€1.70m,Centre-Forward,andreas-voglsammer,94771,2020,arminia-bielefeld
29,Sergio Córdova,18,"Aug 9, 1997",23,Venezuela,Alanyaspor,"1,88m",right,"Aug 17, 2020",?,FC Augsburg,€1.20m,Centre-Forward,sergio-cordova,377387,2020,arminia-bielefeld
30,Fabian Klos,9,"Dec 2, 1987",33,Germany,,"1,94m",right,"Jul 1, 2011",free transfer,VfL Wolfsburg II,€800k,Centre-Forward,fabian-klos,78147,2020,arminia-bielefeld
31,Jóan Símun Edmundsson,14,"Jul 26, 1991",29,Faroe Islands,KA Akureyri,"1,85m",left,"Jul 1, 2018",free transfer,Odense Boldklub,€500k,Centre-Forward,joan-simun-edmundsson,123683,2020,arminia-bielefeld


In [34]:
# Export the data DataFrame to Excel
data.to_excel("../Datas/my_2020_bundesliga_data.xlsx", index=False)