In [3]:
import pandas as pd
import requests
from dateutil.parser import parse
import re
import numpy as np
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

# Sample player profile URL.
# NOTE(review): the same address is assigned to `link` in the next cell, and
# `link` is the name that is passed to soup() — confirm `url` is still needed.
url = 'http://www.transfermarkt.com/manuel-neuer/profil/spieler/17259'


In [4]:
# Sample player profile page that is fetched with soup() further down.
link = 'http://www.transfermarkt.com/manuel-neuer/profil/spieler/17259'

def soup(url):
    '''Download a page with a random User-Agent and return it parsed.

    Prints the User-Agent header that was sent and the HTTP status code of
    the response.

    Parameters
    ----------
    url : str
        Address of the page to fetch.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the "lxml" parser.
    '''
    # Send a single request, and make sure it carries the randomised header.
    user_agent = {'User-agent': UserAgent().random}
    print(user_agent)
    response = requests.get(url, headers=user_agent)

    print('Response code:', response.status_code)
    return BeautifulSoup(response.text, "lxml")

In [5]:
# NOTE(review): this rebinds the name `soup` from the helper function to the
# parsed page it returns. After this line soup() no longer refers to the
# function, so re-running this cell does not fetch the page again — consider
# storing the result under a different name.
soup = soup(link)

{'User-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.517 Safari/537.36'}
Response code: 200


### Functions for scraping features from each player site

In [189]:
# Labels of the profile fields to scrape with player_values().
# The same list is rebuilt locally inside create_player().
infos = ['Date of birth:','Place of birth:','Age:','Height:','Nationality:','Position:','Foot:',"""Player's agent:""",\
         'Current club:','In the team since:','Contract until:','Date of last contract extension:','Outfitter:']

def player_name(soup):
    '''Return the player's name taken from the page title.

    The name is the part of the first <title> element's text that precedes
    the first ' -'.
    '''
    title_text = soup.find_all('title')[0].text
    name, _, _ = title_text.partition(' -')
    return name

def player_values(soup, field_name):
    '''Return the stripped text of the element that follows a field label.

    `field_name` is compiled as a regular expression and used as the `text`
    filter of `soup.find`. Returns None when nothing matches or when the
    match has no following element.
    '''
    label = soup.find(text=re.compile(field_name))
    if not label:
        return None

    # this works for most of the values
    following = label.findNext()
    return following.text.strip() if following else None

def market_values(soup):
    '''Collect the cleaned market-value entries of a player page.

    For every element whose class matches 'left-td', the stripped text of
    its next sibling is read. A text containing a line break is split into
    its two lines; every part is passed through change_euro_value() and
    appended to the returned list.
    '''
    cleaned = []
    for label_cell in soup.find_all(class_=re.compile('left-td')):
        raw = label_cell.findNextSibling().text.strip()

        if '\n' not in raw:
            cleaned.append(change_euro_value(raw))
            continue

        # Exactly two lines are expected; the unpacking raises ValueError otherwise.
        first, second = raw.split('\n')
        first = first.replace('\t', '')
        cleaned.append(change_euro_value(first))
        cleaned.append(change_euro_value(second))
    return cleaned
        
def transfer_proceeds(soup):
    '''Return the cleaned transfer-proceeds total from the page's table footer.

    The value is read from the third line of the stripped text of the first
    <tfoot> element and passed through change_euro_value().
    '''
    footer_lines = soup.find('tfoot').text.strip().split('\n')
    total = footer_lines[2]
    return change_euro_value(total)

def performance(soup):
    '''Return the performance figures of a player page as a list of strings.

    Reads the cells at positions 2 to 6 of the third <table> on the page and
    removes every '.' from their text.
    '''
    # NOTE(review): table index 2 and cell slice 2:7 are tied to the page
    # layout — confirm they still select the intended cells.
    stat_cells = soup.find_all('table')[2].find_all('td')[2:7]
    return [cell.text.replace('.', '') for cell in stat_cells]

def wc_winner(soup):
    '''Tell whether the page shows a World Cup winner badge.

    Returns 1 if an <img> with alt text 'World Cup winner' is present,
    otherwise 0.
    '''
    # find() returns None when no such image exists; no second pass over all
    # images of the document is needed to decide that.
    badge = soup.find('img', {'alt': 'World Cup winner'})
    return 1 if badge is not None else 0

def cl_winner(soup):
    '''Tell whether the page shows a Champions League winner badge.

    Returns 1 if an <img> with alt text 'Champions League winner' is present,
    otherwise 0.
    '''
    # find() returns None when no such image exists; no second pass over all
    # images of the document is needed to decide that.
    badge = soup.find('img', {'alt': 'Champions League winner'})
    return 1 if badge is not None else 0

def player_international_exp(soup):
    '''Return a player's international caps as a string, or None if absent.

    Looks up the text matching 'International caps/goals:', reads the text of
    the element that follows it and returns the part before the first '/'.
    None is returned when the label or the following element is missing.
    '''
    try:
        caps_goals = soup.find(text=re.compile('International caps/goals:')).findNext().text
    except AttributeError:
        # find() or findNext() returned None: the page has no such entry.
        # Only this case is treated as "no data"; anything else (including
        # KeyboardInterrupt during a long scrape) is allowed to propagate.
        return None
    return caps_goals.split('/')[0]

def scrape_int_exp(soup):
    '''Return a (name, international appearances) tuple for a player page.

    The second element is whatever player_international_exp() returns, which
    may be None.
    '''
    caps = player_international_exp(soup)
    return (player_name(soup), caps)

In [190]:
def change_euro_value(value):
    '''Strip currency suffixes and separators from a market-value string.

    ' Th. €' is replaced by '000' and ' Mill. €' by '0000'; afterwards commas,
    tabs and newlines are removed. The result is returned as a string (it is
    not converted to int).
    '''
    substitutions = (
        (' Th. €', '000'),
        (' Mill. €', '0000'),
        (',', ''),
        ('\t', ''),
        ('\n', ''),
    )
    # Applied in order; the suffix replacements run before commas are dropped.
    for old, new in substitutions:
        value = value.replace(old, new)
    return value

def get_soup(url):
    '''Fetch `url` with a random User-Agent and return the parsed page.

    The response text is parsed with BeautifulSoup using the "lxml" parser.
    '''
    headers = {'User-agent': UserAgent().random}
    html = requests.get(url, headers=headers).text
    return BeautifulSoup(html, "lxml")


In [191]:
def create_player(soup):
    '''Assemble every scraped attribute of one player page into a flat tuple.

    The tuple holds, in this order: the player name, one value per profile
    field label listed below, the values from market_values(), the transfer
    proceeds, the values from performance(), and the World Cup and Champions
    League winner flags (0/1).

    NOTE(review): the order presumably lines up with the `player_columns`
    list defined elsewhere in the notebook — verify before labelling columns.
    '''
    field_labels = ['Date of birth:', 'Place of birth:', 'Age:', 'Height:',
                    'Nationality:', 'Position:', 'Foot:', "Player's agent:",
                    'Current club:', 'In the team since:', 'Contract until:',
                    'Date of last contract extension:', 'Outfitter:']

    # add player name
    player_info = [player_name(soup)]

    # add standard player values (None for labels missing on the page)
    player_info.extend(player_values(soup, label) for label in field_labels)

    # add market value infos
    player_info.extend(market_values(soup))

    # add transfer proceeds
    player_info.append(transfer_proceeds(soup))

    # add performance data
    player_info.extend(performance(soup))

    # add if World Cup or Champions League winner (0/1)
    player_info.append(wc_winner(soup))
    player_info.append(cl_winner(soup))

    return tuple(player_info)

### Get all links for scraping

In [192]:
def get_club_urls(soup):
    '''Collect the first link of every 'odd' and 'even' table row of a page.

    All rows with class 'odd' are processed first, then all rows with class
    'even'. Rows without a link, links without an href, and placeholder
    links ('#') are skipped.

    Returns
    -------
    list of str
        The href values exactly as they appear in the page.
    '''
    urls = []
    for row_class in ('odd', 'even'):
        for row in soup.find_all('tr', {'class': row_class}):
            anchor = row.find('a')
            if anchor is None:
                # Row without any link: nothing to collect.
                continue
            href = anchor.get('href')
            if href is None or href == '#':
                continue
            urls.append(href)
    return urls

def get_player_urls(soup):
    '''Return the absolute URLs of all players linked from a page.

    Every <a> with class 'spielprofil_tooltip' contributes its href, prefixed
    with 'http://www.transfermarkt.com'. Duplicates are dropped and the order
    of first appearance is kept.
    '''
    player_urls = []
    seen = set()  # constant-time membership test instead of scanning the list
    for anchor in soup.find_all('a', {'class': 'spielprofil_tooltip'}):
        player_url = 'http://www.transfermarkt.com' + anchor['href']
        if player_url in seen:
            continue
        seen.add(player_url)
        player_urls.append(player_url)
    return player_urls

def scrape_club(soup):
    '''Scrape every player linked from a club page.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed club page; its player links are taken from get_player_urls().

    Returns
    -------
    pandas.DataFrame
        One row per player, built from the tuples returned by
        create_player(). Columns are left unlabelled.
    '''
    # Collect all rows first and build the frame once; growing a DataFrame
    # inside the loop is quadratic and needs a pre-existing frame.
    players = []
    for player_url in get_player_urls(soup):
        player_soup = get_soup(player_url)
        players.append(create_player(player_soup))
    return pd.DataFrame(players)


### Scraping links of all clubs.
URLs of all clubs in the top two leagues of each of five countries: Germany, UK, France, Spain and Italy.

In [193]:
def URLs():
    '''Collect club and player links for the ten league codes listed below.

    Fetches one page per league and one page per club.

    Returns
    -------
    club_urls : list of list of str
        For each league, the club hrefs returned by get_club_urls().
    player_urls : list of list of str
        For each club (across all leagues, in order), the player URLs
        returned by get_player_urls().
    '''
    base_url = 'http://www.transfermarkt.com/'
    leagues = ['L1', 'L2', 'GB1', 'GB2', 'FR1', 'FR2', 'ES1', 'ES2', 'IT1', 'IT2']

    league_urls = [base_url + 'jumplist/startseite/wettbewerb/' + code for code in leagues]
    club_urls = [get_club_urls(get_soup(league_url)) for league_url in league_urls]

    # lstrip('/') keeps exactly one slash between host and path whether or
    # not the scraped club href already starts with one.
    player_urls = [
        get_player_urls(get_soup(base_url + club.lstrip('/')))
        for clubs in club_urls
        for club in clubs
    ]
    return club_urls, player_urls

In [242]:
# Collect all club and player links. URLs() fetches one page per league and
# one page per club, so this cell issues many HTTP requests.
club_urls, player_urls = URLs()

#### Scrape team by team (204 total) and save players in list 'players'. Pop the scraped teams from the link list.

In [457]:
# Accumulators for the scraping loop: `players` receives one tuple per scraped
# player, `scraped_links` receives each per-club link list once it is done.
players = []
scraped_links = []
# Number of per-club link lists still waiting to be scraped.
len(player_urls)

204

In [541]:
# Scrape club by club. A finished club is moved from `player_urls` to
# `scraped_links`, so after an error this cell can simply be re-run and
# continues with the first club that is still pending.
while len(player_urls) > 0:
    club_players = []
    for player in player_urls[0]:
        player_soup = get_soup(player)
        club_players.append(create_player(player_soup))
    # Commit a club's players only once the whole club was scraped; otherwise
    # a failure half-way through would leave partial results in `players`
    # and the retry would add those players a second time.
    players.extend(club_players)
    scraped_links.append(player_urls.pop(0))

In [77]:
# Column names for the player records.
# NOTE(review): presumably listed in the order in which create_player() fills
# its tuple — verify before using them as DataFrame columns.
player_columns = ['name','date_of_birth','place_of_birth','age','height','nationality','position', \
                 'foot','agency','current_club','in_team_since','contract_until', \
                 'date_last_contract_extension','outfitter', \
                  'current_market_value','date_last_market_value_change','highest_market_value', \
                  'date_highest_market_value', 'total_transfer_proceeds','apps_season','goals_season',\
                 'assists_season','minute_per_goal','played_minutes','wc_winner','cl_winner']

### Save and load pickles

In [370]:
import pickle
import datetime

def save_df(df):
    '''Pickle `df` to a timestamped file in the current working directory.

    The file is named 'soccer_scrape_<day>-<hour>h-<minute>m.pkl', using the
    local time without zero padding. Month and year are not part of the
    name, so a file written at the same day-of-month, hour and minute is
    overwritten.
    '''
    # Read the clock once so that day, hour and minute describe the same
    # instant even when the call happens right at a minute or hour boundary.
    now = datetime.datetime.now()
    filename = 'soccer_scrape_' + str(now.day) + '-' + str(now.hour) + 'h-' + str(now.minute) + 'm.pkl'
    with open(filename, 'wb') as picklefile:
        pickle.dump(df, picklefile)
def load_df(filename):
    with open(filename, 'rb') as picklefile: 
        last_version = pickle.load(picklefile)