In [1]:
# Imports
import csv
import os

import bs4
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# %% Setup version

# Report the versions of the libraries this notebook was run with.
print(f'pandas version: {pd.__version__}')
print(f'bs4 version: {bs4.__version__}')
print(f'requests version: {requests.__version__}')
print(f'csv version: {csv.__version__}')

pandas version: 1.4.2
bs4 version: 4.11.1
requests version: 2.27.1
csv version: 1.0


In [3]:
def get_url(league_id):
    """Build the Transfermarkt transfers-overview URL for a competition id.

    The path slug is the fixed string 'ligue-1' for every league; only the
    trailing competition id (e.g. 'GB1') varies.
    NOTE(review): presumably the site resolves the page from the id alone and
    ignores the slug -- confirm.
    """
    base = 'https://www.transfermarkt.com/ligue-1/transfers/wettbewerb/'
    return f'{base}{league_id}'

def get_souped_page(url, timeout=30):
    """Download ``url`` with browser-like headers and return it as parsed soup.

    Many websites reject requests that do not look like they come from a
    browser, so the HTML page is requested with the headers a browser would
    send.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` can block indefinitely on a stalled connection.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the built-in 'html.parser'.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status. Failing here is
        preferable to parsing an error page and silently scraping nothing.
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36",
        "Accept-Encoding": "gzip, deflate",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "DNT": "1",
        "Connection": "close",
        # "1" is the only value defined for this header.
        "Upgrade-Insecure-Requests": "1",
    }

    page = requests.get(url, headers=headers, timeout=timeout)
    page.raise_for_status()
    return BeautifulSoup(page.content, 'html.parser')

def id_to_name(league_id):
    """Translate a Transfermarkt competition id into the league's display name.

    Returns None when the id is not one of the five known leagues.
    """
    known_leagues = (
        ('GB1', 'Premier League'),
        ('ES1', 'La Liga'),
        ('L1', 'Bundesliga'),
        ('IT1', 'Serie A'),
        ('FR1', 'Ligue 1'),
    )
    for code, label in known_leagues:
        if league_id == code:
            return label
    return None
    
# Competition id -> number of clubs competing in that league
leagues_dict = dict(GB1=20, ES1=20, L1=18, IT1=20, FR1=20)

In [4]:

def _parse_euro_amount(text):
    """Convert a money string such as '€1.50m' or '€500Th.' into whole euros.

    Anything that carries no parseable amount (for example '-') yields 0, so
    one unexpected cell cannot abort the whole scrape.
    """
    cleaned = text.strip().replace("€", "")
    # 'k' is accepted next to 'Th.' -- presumably the site's newer thousands
    # marker; TODO confirm against the live pages.
    for suffix, factor in (("m", 1000000), ("Th.", 1000), ("k", 1000)):
        if suffix in cleaned:
            try:
                # round() before int() so float artefacts such as
                # 28.999999999999996 are not truncated one unit too low.
                return int(round(float(cleaned.replace(suffix, "")) * factor))
            except ValueError:
                # The marker was part of ordinary text, not an amount.
                continue
    return 0


def _link_text(cell):
    """Return the text of the first link in ``cell``, else the cell's own text.

    A missing cell (None) yields ''. This prevents a value from a previous
    row or club from being carried over when the expected link is absent.
    """
    if cell is None:
        return ''
    if cell.a is not None:
        return cell.a.text
    return cell.get_text(strip=True)


def _parse_transfer_row(tr, club_name, is_arrival, competition):
    """Turn one table row into a CSV record, or None if it is not a player row."""
    age_cell = tr.find('td', class_='zentriert alter-transfer-cell')
    nat_cell = tr.find('td', class_='zentriert nat-transfer-cell')
    pos_cell = tr.find('td', class_='pos-transfer-cell')
    value_cell = tr.find('td', class_='rechts mw-transfer-cell')
    # Index 0 is the market-value cell (it also carries 'rechts'), index 1 the fee.
    right_cells = tr.find_all('td', class_='rechts')

    required = (tr.a, age_cell, nat_cell, pos_cell, value_cell)
    if any(part is None for part in required) or len(right_cells) < 2:
        # Placeholder or malformed row: nothing to record.
        return None

    player = tr.a.text
    age = age_cell.text
    flag = nat_cell.img
    nationality = flag.get('alt', '') if flag is not None else ''
    position = pos_cell.text
    market_value = _parse_euro_amount(value_cell.text)

    # The other club involved in the move (origin for arrivals, destination
    # for departures).
    other_club = _link_text(
        tr.find('td', class_='no-border-links verein-flagge-transfer-cell'))

    fee_text = _link_text(right_cells[1])
    # NOTE(review): only rows whose fee text contains "Loan fee" are labelled
    # 'Loan'; confirm whether loans without a fee should be labelled too.
    if "Loan fee" in fee_text:
        type_transfer = 'Loan'
        fee_text = fee_text.replace('Loan fee:', '')
    else:
        type_transfer = 'Transfer'
    fee = _parse_euro_amount(fee_text)

    if is_arrival:
        source, destination = other_club, club_name
    else:
        source, destination = club_name, other_club

    return [player, age, nationality, position, market_value,
            source, destination, fee, type_transfer, competition]


def scrape_all_transfers(leagues_dict, output_path="data/transfers_data.csv"):
    """Scrape the transfers of every club in the given leagues and save them as CSV.

    Parameters
    ----------
    leagues_dict : dict
        Maps a competition id (as accepted by ``get_url``) to the number of
        clubs in that league.
    output_path : str, optional
        Where the CSV is written. Missing parent directories are created.

    Returns
    -------
    list of list
        One record per transfer, in the column order of the CSV header:
        Player, Age, Nationality, Position, Market Value, From, To, Fee,
        Type of Transfer, Competition.
    """
    records = []
    for league_id, club_count in leagues_dict.items():
        soup = get_souped_page(get_url(league_id))
        competition = id_to_name(league_id)
        boxes = soup.find_all('div', class_='box')
        # Assumes the first four 'box' divs are not club boxes and that the
        # next `club_count` ones are -- TODO confirm against the page layout.
        clubs = boxes[4:club_count + 4]

        for club in clubs:
            # Resolved afresh for every club so a missing header can never
            # reuse the previous club's name.
            name = _link_text(club.find('h2'))

            # The first table of a club holds arrivals, any later table
            # departures. enumerate() identifies the table by position;
            # list.index() compares tags by content and would treat two
            # identical tables as the same one.
            for table_index, table in enumerate(club.find_all('tbody')):
                for tr in table.find_all('tr'):
                    record = _parse_transfer_row(
                        tr, name, table_index == 0, competition)
                    if record is not None:
                        records.append(record)

    # Create the target directory so the write cannot fail after all the
    # pages have already been downloaded.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Player', 'Age', 'Nationality', 'Position', 'Market Value', 'From',
                         'To', 'Fee', 'Type of Transfer', 'Competition'])
        writer.writerows(records)

    return records


In [5]:
# Let's scrape some transfers data.
# Requires network access: one page is fetched per league in leagues_dict.
# As a side effect the rows are also written to data/transfers_data.csv.
data= scrape_all_transfers(leagues_dict)

In [8]:
# Load the CSV written by scrape_all_transfers into a DataFrame and preview it.
df= pd.read_csv('data/transfers_data.csv')
df.head()

Unnamed: 0,Player,Age,Nationality,Position,Market Value,From,To,Fee,Type of Transfer,Competition
0,Diego Carlos,29,Brazil,Centre-Back,30000000,Sevilla FC,Aston Villa,31000000,Transfer,Premier League
1,Philippe Coutinho,30,Brazil,Left Winger,20000000,Barcelona,Aston Villa,20000000,Transfer,Premier League
2,Leander Dendoncker,27,Belgium,Defensive Midfield,28000000,Wolves,Aston Villa,15000000,Transfer,Premier League
3,Robin Olsen,32,Sweden,Goalkeeper,2000000,AS Roma,Aston Villa,3500000,Transfer,Premier League
4,Ludwig Augustinsson,28,Sweden,Left-Back,5000000,Sevilla FC,Aston Villa,500000,Loan,Premier League


In [9]:
# (number of rows, number of columns) of the loaded transfers frame
df.shape

(3079, 10)