In [1]:
import pandas as pd
import requests
from dateutil.parser import parse
import re
import numpy as np
# Player profile page that the following cells download and parse.
url = 'http://www.transfermarkt.com/manuel-neuer/profil/spieler/17259'

# Request with the default headers of `requests`; `response` is reassigned by a later
# cell that repeats the request with an explicit User-agent header.
response = requests.get(url)

In [215]:
from fake_useragent import UserAgent
# `ua.random` is read below to fill the User-agent request header.
ua = UserAgent()

In [216]:
# Header dict handed to requests.get(); the value comes from fake_useragent.
user_agent = {'User-agent': ua.random}
print(user_agent)

{'User-agent': 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36'}


In [217]:
# Repeat the request for `url`, this time sending the `user_agent` header dict.
response = requests.get(url, headers = user_agent)

In [218]:
# Show the HTTP status code of the last request as the cell output.
response.status_code

200

In [219]:
# Body of the response as text; this is what gets parsed into `soup`.
page = response.text

In [220]:
from bs4 import BeautifulSoup

# Parse the downloaded page using the lxml parser.
soup = BeautifulSoup(page, "lxml")

### Functions for scraping features from each player site

In [189]:
# Field labels that are searched for on a player page.
# create_player() keeps its own copy of this list.
infos = [
    'Date of birth:',
    'Place of birth:',
    'Age:',
    'Height:',
    'Nationality:',
    'Position:',
    'Foot:',
    "Player's agent:",
    'Current club:',
    'In the team since:',
    'Contract until:',
    'Date of last contract extension:',
    'Outfitter:',
]
def player_name(soup):
    """Return the player's name as found in the page's first <title> element.

    The title text is cut at the first occurrence of ' -'; everything before
    it is returned. Raises IndexError if the page has no <title>.
    """
    title_tag = soup.find_all('title')[0]
    name, _, _ = title_tag.text.partition(' -')
    return name

def player_values(soup, field_name):
    """Return the stripped text of the element that follows a field label.

    `field_name` is compiled as a regular expression and matched against the
    text nodes of `soup`. Returns None when the label is not found or when
    nothing follows it.
    """
    label = soup.find(text=re.compile(field_name))
    if not label:
        return None
    # this works for most of the values
    value_tag = label.findNext()
    return value_tag.text.strip() if value_tag else None

def market_values(soup):
    """Collect the values next to every element whose class matches 'left-td'.

    For each such element the stripped text of its next sibling is taken. A
    text containing a newline is split into exactly two parts (anything else
    raises ValueError) and contributes two entries; otherwise it contributes
    one. Every entry is passed through change_euro_value().
    """
    converted = []
    for label_cell in soup.find_all(class_=re.compile('left-td')):
        cell_text = label_cell.findNextSibling().text.strip()
        if '\n' not in cell_text:
            converted.append(change_euro_value(cell_text))
            continue
        first, second = cell_text.split('\n')
        converted.append(change_euro_value(first.replace('\t', '')))
        converted.append(change_euro_value(second))
    return converted
        
def transfer_proceeds(soup):
    """Return the third line of the first <tfoot>'s text, run through change_euro_value().

    Raises AttributeError when the page has no <tfoot> and IndexError when its
    stripped text has fewer than three lines.
    """
    footer_lines = soup.find('tfoot').text.strip().split('\n')
    return change_euro_value(footer_lines[2])

def performance(soup):
    """Return the text of cells 2..6 of the page's third table, with '.' removed.

    create_player() appends these values after the transfer proceeds; the
    position of the table and cells is hard-coded.
    """
    stats_table = soup.find_all('table')[2]
    return [cell.text.replace('.', '') for cell in stats_table.find_all('td')[2:7]]

def wc_winner(soup):
    """Return 1 if the page has an <img> with alt text 'World Cup winner', else 0.

    soup.find() returns None when nothing matches, so a None check is enough;
    no second scan over every <img> on the page is needed.
    """
    badge = soup.find('img', {'alt': 'World Cup winner'})
    return 1 if badge is not None else 0

def cl_winner(soup):
    """Return 1 if the page has an <img> with alt text 'Champions League winner', else 0.

    soup.find() returns None when nothing matches, so a None check is enough;
    no second scan over every <img> on the page is needed.
    """
    badge = soup.find('img', {'alt': 'Champions League winner'})
    return 1 if badge is not None else 0

def player_international_exp(soup):
    """Return the part before '/' of the 'International caps/goals:' value.

    Returns None when the label is missing or nothing follows it. Only
    AttributeError (calling a method on a missing element) is treated as
    "not available"; anything else, including KeyboardInterrupt during a long
    scraping run, propagates to the caller.
    """
    try:
        label = soup.find(text=re.compile('International caps/goals:'))
        return label.findNext().text.split('/')[0]
    except AttributeError:
        return None

def scrape_int_exp(soup):
    """Return a `(name, international caps)` tuple for one player page."""
    caps = player_international_exp(soup)
    return (player_name(soup), caps)

In [None]:
# Scrape (name, caps) for every player, one club at a time. A club's link list is moved
# from `player_urls` to `lks` only after all of its players have been processed.
# Needs `player_urls` and get_soup() to exist already (both are defined in other cells).
new_list, lks = [], []
while player_urls:
    for player_link in player_urls[0]:
        new_list.append(scrape_int_exp(get_soup(player_link)))
    lks.append(player_urls.pop(0))

In [190]:
# An amount such as "45,00 Mill. €" or "500 Th. €": integer part, optional
# decimals after a comma, then the unit abbreviation.
_EURO_AMOUNT = re.compile(r'(\d+)(?:,(\d+))? (Th|Mill)\. €')
# Number of digits each unit adds after the integer part.
_UNIT_EXPONENT = {'Th': 3, 'Mill': 6}


def _expand_euro_amount(match):
    """Rewrite one matched amount as a plain string of digits."""
    whole, fraction, unit = match.groups()
    exponent = _UNIT_EXPONENT[unit]
    # Pad (or cut) the decimals to exactly `exponent` digits so the result is
    # right for any number of decimal places, not only for two.
    scaled_fraction = (fraction or '').ljust(exponent, '0')[:exponent]
    return whole + scaled_fraction


def change_euro_value(value):
    """Convert a value such as '45,00 Mill. €' into a digit string ('45000000').

    The result is still a string. Amounts are scaled by their unit ('Th.' =
    thousand, 'Mill.' = million) whatever the number of decimals: the previous
    string concatenation was only right for exactly two decimals with 'Mill.'
    and for no decimals with 'Th.'. Text that is not a recognisable amount goes
    through the original replacements unchanged, so commas, tabs and newlines
    are still removed.
    """
    value = _EURO_AMOUNT.sub(_expand_euro_amount, value)
    # Original clean-up, kept as a fallback for anything the pattern above
    # did not match.
    value = value.replace(' Th. €', '000')
    value = value.replace(' Mill. €', '0000')
    value = value.replace(',', '')
    value = value.replace('\t', '')
    value = value.replace('\n', '')
    return value

def get_soup(url, timeout=30):
    """Download `url` with a random User-agent header and return the parsed page.

    Parameters
    ----------
    url : str
        Address to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to requests.get().
        Without a timeout a single stalled connection would block the whole
        scraping loop indefinitely.

    Returns
    -------
    BeautifulSoup
        The response text parsed with the lxml parser. The HTTP status code is
        not checked, so an error page is parsed like any other page.
    """
    headers = {'User-agent': UserAgent().random}
    response = requests.get(url, headers=headers, timeout=timeout)
    return BeautifulSoup(response.text, "lxml")


In [191]:
def create_player(soup):
    """Collect every scraped feature of one player page into a flat tuple.

    Order of the entries: name, the 13 labelled profile fields, the market
    value entries, total transfer proceeds, the performance figures, and the
    World Cup / Champions League winner flags (0/1).

    The tuple is meant to line up with the module-level `player_columns` list
    (26 names). That only holds when market_values() yields four entries and
    performance() five -- TODO confirm for pages with missing data.
    """
    field_labels = ['Date of birth:', 'Place of birth:', 'Age:', 'Height:', 'Nationality:',
                    'Position:', 'Foot:', """Player's agent:""", 'Current club:',
                    'In the team since:', 'Contract until:',
                    'Date of last contract extension:', 'Outfitter:']

    # player name first
    player_info = [player_name(soup)]

    # standard profile values, one per label (None when a label is missing)
    player_info.extend(player_values(soup, label) for label in field_labels)

    # market value infos
    player_info.extend(market_values(soup))

    # transfer proceeds
    player_info.append(transfer_proceeds(soup))

    # performance data
    player_info.extend(performance(soup))

    # World Cup / Champions League winner flags (0/1)
    player_info.append(wc_winner(soup))
    player_info.append(cl_winner(soup))

    return tuple(player_info)

### Get all links for scraping

In [192]:
def get_club_urls(soup):
    """Return the href of the first link in every 'odd' and 'even' table row.

    All 'odd' rows come first, then all 'even' rows (same order as before).
    Placeholder links ('#') are dropped. Rows without any <a>, or whose first
    <a> has no href, are skipped instead of raising.
    """
    urls = []
    for row_class in ('odd', 'even'):
        for row in soup.find_all('tr', {'class': row_class}):
            anchor = row.find('a')
            href = anchor.get('href') if anchor is not None else None
            if href is not None and href != '#':
                urls.append(href)
    return urls

def get_player_urls(soup):
    """Return the absolute URLs of all player links on a page, without duplicates.

    Player links are the <a> elements with class 'spielprofil_tooltip'; their
    href is appended to the site root. The first occurrence of each URL is
    kept and page order is preserved.
    """
    seen = set()
    ordered = []
    for anchor in soup.find_all('a', {'class': 'spielprofil_tooltip'}):
        full_url = 'http://www.transfermarkt.com' + anchor['href']
        if full_url in seen:
            continue
        seen.add(full_url)
        ordered.append(full_url)
    return ordered

def scrape_club(soup, columns=None):
    """Scrape every player linked from a club page and return them as a DataFrame.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed club page; its player links are found with get_player_urls().
    columns : list of str, optional
        Column names for the resulting frame. Left as None, pandas numbers the
        columns.

    Returns
    -------
    pandas.DataFrame
        One row per player, built from the tuples returned by create_player().

    The previous version could not run: it iterated over an undefined name
    (`club`) and read the local `df` before assigning it. Rows are now
    collected in a list and the frame is built once, which also avoids
    DataFrame.append (removed in pandas 2.0).
    """
    records = []
    for player_url in get_player_urls(soup):
        records.append(create_player(get_soup(player_url)))
    return pd.DataFrame(records, columns=columns)


### Scraping links of all clubs.
The top two leagues of each of these countries are covered: Germany, UK, France, Spain, Italy.

In [193]:
# League codes that get appended to the league base URL.
leagues = ['L1', 'L2', 'GB1', 'GB2', 'FR1', 'FR2', 'ES1', 'ES2', 'IT1', 'IT2']
league_urls = [
    'http://www.transfermarkt.com/jumplist/startseite/wettbewerb/' + code
    for code in leagues
]
# One list of club hrefs per league; every league page is downloaded here.
club_urls = [get_club_urls(get_soup(league_url)) for league_url in league_urls]


In [2]:
#club_urls

In [242]:
# One list of player URLs per club, flattened over all leagues; every club page is downloaded.
# NOTE(review): club hrefs are appended as-is after the trailing '/'. If they already start
# with '/', the request URL contains a double slash -- confirm the site accepts that.
player_urls = [
    get_player_urls(get_soup('http://www.transfermarkt.com/' + club))
    for clubs in club_urls
    for club in clubs
]

#### Scrape team by team and save players in list 'players'. Pop the scraped teams from the link list.

In [457]:
#players = []
#scraped_links = []
#len(player_urls)

204

All 204 teams were scraped.

In [541]:
# Scrape club by club until only 150 club link lists are left in `player_urls`; lower the
# threshold and re-run the cell to continue with the next batch.
# `players` and `scraped_links` must already exist (their initialisation is commented out
# in an earlier cell so that re-running does not wipe collected results).
while len(player_urls) > 150:
    club_links = player_urls[0]
    # Buffer the rows of the current club: if a request or parse fails part-way through,
    # nothing has been added to `players` yet and the club is still queued, so re-running
    # the cell does not produce duplicate rows.
    club_players = [create_player(get_soup(link)) for link in club_links]
    players.extend(club_players)
    scraped_links.append(player_urls.pop(0))

In [72]:
#len(players), len(player_urls), len(scraped_links)

In [77]:
# Column names for the tuples produced by create_player(), in the same order.
player_columns = [
    # identity and profile fields
    'name', 'date_of_birth', 'place_of_birth', 'age', 'height', 'nationality', 'position',
    'foot', 'agency', 'current_club', 'in_team_since', 'contract_until',
    'date_last_contract_extension', 'outfitter',
    # market value and transfers
    'current_market_value', 'date_last_market_value_change', 'highest_market_value',
    'date_highest_market_value', 'total_transfer_proceeds',
    # season performance
    'apps_season', 'goals_season', 'assists_season', 'minute_per_goal', 'played_minutes',
    # titles (0/1)
    'wc_winner', 'cl_winner',
]



### Dataframe

In [4]:
#df['contract_until'] = df['contract_until'].map(lambda x: '30.06.2019' if x == '31.06.2019' else x)
#df['minute_per_goal'] = df['minute_per_goal'].fillna(0)
#df['days_to_contract_expiry'] = df['days_to_contract_expiry'].map(lambda x: x / np.timedelta64(1, 'D'))
#df['contract_until'] = df['contract_until'].map(lambda x: x.strip('T00:00:00.000000000'))
#df[df['total_transfer_proceeds'] != int()]
#df[df['foot'] == np.isnan]
#df.columns
#df['contract_until'] = df['contract_until'].apply(pd.to_datetime, format='%Y-%m-%d', errors='ignore')
#df.iloc[:,11:12][df.iloc[:,11:12].isnull().any(axis=1)] # show NaN values

Save and load pickles

In [370]:
import pickle
import datetime

def save_df(df):
    """Pickle `df` into the working directory and return the file name used.

    The name has the form 'soccer_scrape_<day>-<hour>h-<minute>m.pkl'. All
    three parts come from a single timestamp, so they cannot disagree when the
    call happens right at a minute or hour rollover. The name holds no month
    or year, so a file written at the same day-of-month and time is overwritten.
    """
    now = datetime.datetime.now()
    filename = 'soccer_scrape_{}-{}h-{}m.pkl'.format(now.day, now.hour, now.minute)
    with open(filename, 'wb') as picklefile:
        pickle.dump(df, picklefile)
    return filename
def load_df(filename):
    """Load and return the object stored in the pickle file `filename`.

    The previous version unpickled the file but never returned the result, so
    callers always received None.
    """
    # NOTE: pickle.load can execute arbitrary code contained in the file;
    # only load pickles that were written by save_df() or another trusted source.
    with open(filename, 'rb') as picklefile:
        return pickle.load(picklefile)

In [4]:
#df = pd.read_csv('0423_2014pm.csv')

#df.to_csv('0423_2014pm.csv', index=False)
