### Import and settings

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

pd.set_option('display.max_rows', 25)
pd.set_option('display.max_columns', 50)

### Functions

In [2]:
def get_players(year: int, out=False, count=10, **kwargs):
    page_no = 0

    players = pd.DataFrame(columns = ['name', 'pos', 'card_id', 'year', 'link'])
    
    additional = ''
    
    for key in kwargs:
        additional = additional + '&' + key + '=' + str(kwargs.get(key))

    valid = True

    while valid:
        page_no += 1
        url = f'https://www.futbin.com/{year}/players?page={page_no}' + additional
        print(url)
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
        result = requests.get(url, headers=headers)
        soup = BeautifulSoup(result.content, 'html.parser')
        ply = soup.find_all("a", {"class": "player_name_players_table"})
        pos = soup.find_all("div", {"class": "font-weight-bold"})
        
        if ply:
            for j in range(len(ply)):
                players = pd.concat([pd.DataFrame([[ply[j].text, pos[j].text, ply[j].get('data-site-id'), ply[j].get('data-year'), ply[j].get('href')]], columns=players.columns), players], ignore_index=True)
        else:
            valid = False
            
        if out and page_no % count == 0:
            print(page_no)
            
        time.sleep(2)

    return players

def get_stats(players: pd.DataFrame, out=False, count=100):
    links = players['link']
    position = players['pos']

    stats = pd.DataFrame(columns = ['player_id', 'dob', 'club', 'club_id', 'league', 'league_id', 'nation', 'card_id', 'rating', 'pac', 'acceleration', 'sprint_speed', 'sho', 'positioning', 'finishing', 'shot_power', 'long_shots', 
                                    'volleys', 'penalties', 'pas', 'vision', 'crossing', 'fk_accuracy', 'short_passing', 'long_passing', 'curve', 'dri', 'agility', 'balance',
                                    'reactions', 'ball_control', 'dribbling', 'composure', 'def', 'interceptions', 'heading_acc', 'def_awareness', 
                                    'standing_tackle', 'sliding_tackle', 'phy', 'jumping', 'stamina', 'strength', 'aggression', 'gk_diving', 'gk_handling', 'gk_kicking',
                                    'gk_reflexes', 'gk_speed', 'gk_positioning'])

    for i in range(len(links)):
        url = f'https://www.futbin.com{links[i]}'
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
        result = requests.get(url, headers=headers)
        soup = BeautifulSoup(result.content, 'html.parser')
        rating = soup.find("div", {"class": "pcdisplay-rat"})
        ids = soup.find("div", {"id": "page-info"})
        stat_tab = soup.find("div", {"class": "card-body"})
        ss = stat_tab.find_all("div", {"class": "stat_val"})
        dob = soup.find("a", {"style": "color : #346fda;"}).get("title").split()[2]
        
        spec_list = soup.find("div", {"id": "info_content"})
        spec_tab = spec_list.find_all("td")
        club = ""
        league = ""
        nation = ""
        
        spec_len = len(spec_tab)
        club_id = spec_tab[spec_len-2].text
        league_id = spec_tab[spec_len-1].text
        year = int(ids.get("data-year"))
        int_rep = False
        if spec_list.find_all("th")[spec_len-14].text == "Intl. Rep ":
            int_rep = True
        if year == 23:
            if int_rep:
                club = spec_tab[spec_len-19].find("a").text
                try:
                    league = spec_tab[spec_len-17].find("a").text
                except:
                    print(url)
                nation = spec_tab[spec_len-18].find("a").text
            else:
                club = spec_tab[spec_len-18].find("a").text
                league = spec_tab[spec_len-16].find("a").text
                nation = spec_tab[spec_len-17].find("a").text
        elif year < 23 and year >= 15:
                club = spec_tab[1].find("a").text
                league = spec_tab[3].find("a").text
                nation = spec_tab[2].find("a").text
        else:
            print("L")
            
            
        if ss and position[i] != 'GK':
            stats = pd.concat([pd.DataFrame([[ids.get("data-baseid"), dob, club, club_id, league, league_id, nation, ids.get('data-id'), rating.text, ss[1].text, ss[3].text, ss[5].text, ss[7].text, ss[9].text,
                                            ss[11].text, ss[13].text, ss[15].text, ss[17].text, ss[19].text, ss[21].text, ss[23].text, ss[25].text,
                                            ss[27].text, ss[29].text, ss[31].text, ss[33].text, ss[35].text, ss[37].text, ss[39].text, ss[41].text, ss[43].text,
                                            ss[45].text, ss[47].text, ss[49].text, ss[51].text, ss[53].text, ss[55].text, ss[57].text, ss[59].text,
                                            ss[61].text, ss[63].text, ss[65].text, ss[67].text, ss[69].text, None, None, None, None, None, None]], columns=stats.columns), stats], ignore_index=True)            
        else:
            stats = pd.concat([pd.DataFrame([[ids.get("data-baseid"), dob, club, club_id, league, league_id, nation, ids.get('data-id'), rating.text, None, None, None, 
                                                None, None, None, None, None, None, None, None, None, None,
                                                None, None, None, None, None, None, None, None, None, None, None, None, None, None,
                                                None, None, None, None, None, None, None, None, ss[1].text, ss[5].text, ss[9].text, 
                                                ss[13].text, ss[17].text, ss[21].text]], columns=stats.columns), stats], ignore_index=True)

    if out and i % count == 0:
            print(i)
    
    return stats

## Fetch Players
* Gets players from the "top" 5 leagues in a given year

In [8]:
leagues = [13, 53, 31, 16, 19]

# Prem: 13
# La Liga: 53
# Serie A: 31
# ligue 1: 16
# Bundesliga: 19

dfs = []

for i in leagues:
    dfs.append(get_players(year=18, out=False, version='gold', league=i))

https://www.futbin.com/18/players?page=1&version=gold&league=13
https://www.futbin.com/18/players?page=2&version=gold&league=13
https://www.futbin.com/18/players?page=3&version=gold&league=13
https://www.futbin.com/18/players?page=4&version=gold&league=13
https://www.futbin.com/18/players?page=5&version=gold&league=13
https://www.futbin.com/18/players?page=6&version=gold&league=13
https://www.futbin.com/18/players?page=7&version=gold&league=13
https://www.futbin.com/18/players?page=8&version=gold&league=13
https://www.futbin.com/18/players?page=9&version=gold&league=13
https://www.futbin.com/18/players?page=10&version=gold&league=13
https://www.futbin.com/18/players?page=11&version=gold&league=13
https://www.futbin.com/18/players?page=12&version=gold&league=13
https://www.futbin.com/18/players?page=13&version=gold&league=13
https://www.futbin.com/18/players?page=14&version=gold&league=13
https://www.futbin.com/18/players?page=15&version=gold&league=13
https://www.futbin.com/18/players?

In [9]:
# Retrieve stats for each player fetched from the leagues
stat_dfs = []

for i in dfs:
    stat_dfs.append(get_stats(players=i, out=True))

In [10]:
# Combine the dataframes of the 5 leagues into 1
comb = []

for i in range(len(dfs)):
    comb.append(pd.merge(dfs[i], stat_dfs[i], on='card_id'))

total = pd.concat(comb)

In [11]:
total

Unnamed: 0,name,pos,card_id,year,link,player_id,dob,club,club_id,league,league_id,nation,rating,pac,acceleration,sprint_speed,sho,positioning,finishing,shot_power,long_shots,volleys,penalties,pas,vision,...,curve,dri,agility,balance,reactions,ball_control,dribbling,composure,def,interceptions,heading_acc,def_awareness,standing_tackle,sliding_tackle,phy,jumping,stamina,strength,aggression,gk_diving,gk_handling,gk_kicking,gk_reflexes,gk_speed,gk_positioning
0,Jefferson Lerma,CDM,21159,18,/18/player/21159/jefferson-lerma,213991,25-10-1994,Bournemouth,1943,Premier League,13,Colombia,75,77,76,77,64,58,56,80,71,53,58,71,66,...,67,73,76,76,72,75,71,63,69,74,58,60,75,80,80,81,90,75,78,,,,,,
1,Dwight Gayle,ST,21141,18,/18/player/21141/dwight-gayle,205670,20-10-1990,West Bromwich Albion,109,Premier League,13,England,75,83,85,82,76,78,79,73,72,68,73,55,34,...,54,73,83,80,74,71,70,76,25,35,68,12,21,13,60,71,66,62,46,,,,,,
2,Leon Balogun,CB,19927,18,/18/player/19927/leon-balogun,188182,28-06-1988,Brighton & Hove Albion,1808,Premier League,13,Nigeria,75,70,66,74,46,49,30,70,60,32,39,62,47,...,59,61,55,52,68,62,60,72,74,75,78,70,75,76,79,75,73,84,75,,,,,,
3,Matz Sels,GK,19887,18,/18/player/19887/matz-sels,199641,26-02-1992,Newcastle United,13,Premier League,13,Belgium,75,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,80,35,18,38,19,70
4,Stuart Armstrong,CAM,19865,18,/18/player/19865/stuart-armstrong,202282,30-03-1992,Southampton,17,Premier League,13,Scotland,75,84,84,84,71,73,71,75,74,62,52,71,71,...,69,75,86,62,68,76,74,72,62,65,66,58,62,59,73,73,86,69,67,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
344,Mats Hummels,CB,97,18,/18/player/97/mats-hummels,178603,16-12-1988,FC Bayern München,21,Bundesliga,19,Germany,88,64,62,65,58,56,55,71,51,60,68,75,79,...,65,72,64,58,85,77,68,91,89,89,89,85,92,90,76,68,66,85,66,,,,,,
345,Arjen Robben,RM,92,18,/18/player/92/arjen-robben,9014,23-01-1984,FC Bayern München,21,Bundesliga,19,Holland,88,86,87,86,86,85,85,87,88,86,80,81,83,...,87,90,89,91,87,89,92,86,32,39,51,29,26,26,63,61,68,67,47,,,,,,
346,Mats Hummels,CB,17873,18,/18/player/17873/mats-hummels,178603,16-12-1988,FC Bayern München,21,Bundesliga,19,Germany,89,65,62,66,58,56,55,71,51,60,68,76,80,...,65,73,64,58,86,78,68,92,90,90,90,85,93,91,77,68,66,86,66,,,,,,
347,Robert Lewandowski,ST,33,18,/18/player/33/robert-lewandowski,188545,21-08-1988,FC Bayern München,21,Bundesliga,19,Poland,91,81,79,83,88,91,91,88,83,87,81,75,78,...,77,86,78,80,91,89,85,87,38,39,85,25,42,19,82,84,79,84,80,,,,,,


In [12]:
# Output results to csv file
total.to_csv("csv files/futbin_2018.csv", index=False)