In [18]:
from helium import *
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time

In [13]:
url = 'https://www.espncricinfo.com/cricketers/team/afghanistan-40/alpha-a'
base_url = 'https://www.espncricinfo.com'
teams_url = 'https://www.espncricinfo.com/team'
records_url ='https://www.espncricinfo.com/cricketers/virat-kohli-253802/tests-odi-t20-records'

In [3]:
def get_content_from_url(url):
    response = requests.get(url)
    if response.status_code ==200:
        content = response.content
    else:
        content =  "Error finding page"
    return content

def get_content_from_url_jv(url,sleep_time = 0,scroll_down = 0):
    delay = sleep_time
    browser = start_chrome(url, headless=True)
    Config.implicit_wait_secs = delay
    helium.scroll_down(num_pixels=scroll_down)
    html = browser.page_source
    kill_browser()
    return html

In [4]:
# Finds the list of urls for each teams
def get_teams_url(url):
    content = get_content_from_url_jv(url,sleep_time=3,scroll_down=700)
    soup = BeautifulSoup(content,'html.parser')
    teams_divs = soup.find_all('div',{'class':'ds-grid'})[0]
    teams_url = []
    for team_div in teams_divs:
        # print(team_div,'\n')
        
        team_flag_url = team_div.find('img')['src']
        team_name = team_div.text
        team_url = base_url +'/cricketers' + team_div['href']
        teams_url.append({
            'team_name': team_name,
            'team_url': team_url,
            'team_flag_url':team_flag_url
        })
    return teams_url

In [5]:
get_teams_url(teams_url)

[{'team_name': 'Afghanistan',
  'team_url': 'https://www.espncricinfo.com/cricketers/team/afghanistan-40',
  'team_flag_url': 'https://img1.hscicdn.com/image/upload/f_auto,t_ds_square_w_160,q_50/lsci/db/PICTURES/CMS/321000/321005.png'},
 {'team_name': 'Australia',
  'team_url': 'https://www.espncricinfo.com/cricketers/team/australia-2',
  'team_flag_url': 'https://img1.hscicdn.com/image/upload/f_auto,t_ds_square_w_160,q_50/lsci/db/PICTURES/CMS/340400/340493.png'},
 {'team_name': 'Bangladesh',
  'team_url': 'https://www.espncricinfo.com/cricketers/team/bangladesh-25',
  'team_flag_url': 'https://img1.hscicdn.com/image/upload/f_auto,t_ds_square_w_160,q_50/lsci/db/PICTURES/CMS/341400/341456.png'},
 {'team_name': 'England',
  'team_url': 'https://www.espncricinfo.com/cricketers/team/england-1',
  'team_flag_url': 'https://img1.hscicdn.com/image/upload/f_auto,t_ds_square_w_160,q_50/lsci/db/PICTURES/CMS/313100/313114.logo.png'},
 {'team_name': 'India',
  'team_url': 'https://www.espncricinfo

In [6]:
# Finds the alphabetical links of all players in a team

def get_team_alphas(team_url):
    content = get_content_from_url(team_url)
    team_alphas = []
    if content == "Error finding page":
        return content
    soup = BeautifulSoup(content,'html.parser')
    alpha_bar = soup.find_all('div',{'class':'ds-px-4'})
    for elements in alpha_bar:
        a_tags = elements.find_all('a',{'class':'ds-h-10'})
        for tag in a_tags:
            if '/alpha-' in tag['href']:
                team_alphas.append(tag['href'])
    team_alphas = [base_url+ link for link in team_alphas]
    return team_alphas
    
    

In [8]:
# Finds all the links to player data from alphabetical links

def get_players_by_alpha(url):
    content = get_content_from_url(url)
    if content == "Error finding page":
        return content
    soup = BeautifulSoup(content, 'html.parser')
    grid = soup.find_all('div',{'class':'ds-grid'})
    players_links = []
    for div in grid:
        links = div.find_all('a',{'class':'ds-flex'})
        for link in links:
            players_links.append(base_url+link['href'])
    return players_links

In [11]:
# Finds all the player data and returns it as a dictionary
def get_player_data(player_url):
    content = get_content_from_url(player_url)
    if content == "Error finding page":
        print(content)
    soup = BeautifulSoup(content,'html.parser')
    player_data = {}
    player_profile_div = soup.find_all('div',{'class':'ds-p-4'})[0]
    for div in player_profile_div:
        title = div.find_all('p',{'class':'ds-uppercase'})
        for tag in title:
            player_key = tag.text
            player_value = tag.next_sibling.text
            player_data[player_key] = player_value
    stats = get_player_stats(player_url)
    for key,value in stats.items():
        player_data[key] = value
        
    records_url = get_player_records(soup)
    player_data['records_url'] = records_url
    player_image_urls = get_player_image(player_url)
    for key, value in player_image_urls.items():
        player_data[key] = value
    return player_data

def get_player_stats(player_url):
    tables = pd.read_html(player_url)
    if len(tables) < 2:
        return {}
    batting_fielding = tables[0]
    bowling = tables[1] 
    return {
        'batting_fielding': batting_fielding,
        'bowling': bowling
    }

def get_player_records(player_soup):
    records_url = player_soup.find_all('a',string='View more records')
    if len(records_url) == 0:
        return ''
    records_url = base_url + records_url[0]['href']
    return records_url

def get_player_image(player_url):
    browser = start_chrome(player_url, headless=True)
    html = browser.page_source
    player_soup = BeautifulSoup(html,'html.parser')
    image_div = player_soup.find('img')
    player_image = image_div['src']
    bg_image_div = player_soup.find_all('div',{'class': 'ds-bg-cover'})
    for img in bg_image_div:
        bg_image = img['style'].split('(')[1][1:-3]
    return {
        'player_image': player_image,
        'background_image': bg_image
    }

In [12]:
get_player_data('https://www.espncricinfo.com/cricketers/virat-kohli-253802')

{'Full Name': 'Virat Kohli',
 'Born': 'November 05, 1988, Delhi',
 'Age': '34y 290d',
 'Batting Style': 'Right hand Bat',
 'Bowling Style': 'Right arm Medium',
 'Playing Role': 'Top order Batter',
 'batting_fielding':    Format  Mat  Inns  NO   Runs    HS    Ave     BF      SR  100s  50s    4s  \
 0    Test  111   187  11   8676  254*  49.29  15708   55.23    29   29   966   
 1     ODI  275   265  40  12898   183  57.32  13776   93.62    46   65  1211   
 2    T20I  115   107  31   4008  122*  52.73   2905  137.96     1   37   356   
 3      FC  143   235  18  10925  254*  50.34  19611   55.70    36   37  1279   
 4  List A  309   298  43  14340   183  56.23  15312   93.65    50   73  1375   
 5     T20  374   357  68  11965  122*  41.40   8972  133.35     8   91  1069   
 
     6s   Ct  St  
 0   24  110   0  
 1  138  142   0  
 2  117   50   0  
 3   39  141   0  
 4  162  160   0  
 5  371  170   0  ,
 'bowling':    Format  Mat  Inns  Balls  Runs  Wkts   BBI   BBM     Ave  Econ   

In [102]:
get_teams_url(teams_url)

[{'team_name': 'Afghanistan',
  'team_url': 'https://www.espncricinfo.com/cricketers/team/afghanistan-40'},
 {'team_name': 'Australia',
  'team_url': 'https://www.espncricinfo.com/cricketers/team/australia-2'},
 {'team_name': 'Bangladesh',
  'team_url': 'https://www.espncricinfo.com/cricketers/team/bangladesh-25'},
 {'team_name': 'England',
  'team_url': 'https://www.espncricinfo.com/cricketers/team/england-1'},
 {'team_name': 'India',
  'team_url': 'https://www.espncricinfo.com/cricketers/team/india-6'},
 {'team_name': 'Ireland',
  'team_url': 'https://www.espncricinfo.com/cricketers/team/ireland-29'},
 {'team_name': 'New Zealand',
  'team_url': 'https://www.espncricinfo.com/cricketers/team/new-zealand-5'},
 {'team_name': 'Pakistan',
  'team_url': 'https://www.espncricinfo.com/cricketers/team/pakistan-7'},
 {'team_name': 'South Africa',
  'team_url': 'https://www.espncricinfo.com/cricketers/team/south-africa-3'},
 {'team_name': 'Sri Lanka',
  'team_url': 'https://www.espncricinfo.com/

In [19]:
teams = get_teams_url('https://www.espncricinfo.com/team')
teams_alphas = get_team_alphas(teams[1]['team_url'])
players = get_players_by_alpha(teams_alphas[0])
player_data = []
for player in players:
    print(player)
    player_data.append(get_player_data(player))
    time.sleep(1)

https://www.espncricinfo.com/cricketers/ted-a-beckett-3931
https://www.espncricinfo.com/cricketers/sean-abbott-398666
https://www.espncricinfo.com/cricketers/warwick-adlam-3940
https://www.espncricinfo.com/cricketers/ashton-agar-505120
https://www.espncricinfo.com/cricketers/wes-agar-959833
https://www.espncricinfo.com/cricketers/lachlan-aitken-1356600
https://www.espncricinfo.com/cricketers/lee-albon-53590
https://www.espncricinfo.com/cricketers/terry-alderman-3943
https://www.espncricinfo.com/cricketers/george-alexander-3944
https://www.espncricinfo.com/cricketers/harry-alexander-3945
https://www.espncricinfo.com/cricketers/sarah-aley-53670
https://www.espncricinfo.com/cricketers/frank-allan-3950
https://www.espncricinfo.com/cricketers/peter-allan-3953
https://www.espncricinfo.com/cricketers/richard-allen-323798
https://www.espncricinfo.com/cricketers/reginald-allen-3958
https://www.espncricinfo.com/cricketers/phil-alley-3961
https://www.espncricinfo.com/cricketers/jeremy-allison-428

In [20]:
player_data[7]

{'Full Name': 'Terence Michael Alderman',
 'Born': 'June 12, 1956, Subiaco, Perth, Western Australia',
 'Age': '67y 71d',
 'Batting Style': 'Right hand Bat',
 'Bowling Style': 'Right arm Fast medium',
 'Playing Role': 'Bowler',
 'batting_fielding':    Format  Mat Inns  Balls   Runs  Wkts   BBI     BBM    Ave  Econ    SR 4w  \
 0    Test   41   73  10181   4616   170  6/47  10/151  27.15  2.72  59.8  5   
 1     ODI   65   65   3371   2056    88  5/17    5/17  23.36  3.65  38.3  1   
 2      FC  245    -  48701  22701   956  8/46       -  23.74  2.79  50.9  -   
 3  List A  166    -   8829   5373   232  5/17    5/17  23.15  3.65  38.0  5   
 
    5w  10w  
 0  14    1  
 1   2    0  
 2  53    8  
 3   4    0  ,
 'bowling':    Format  Mat  Inns   NO  Runs   HS   Ave  BF     SR  100s  50s  4s 6s   Ct  \
 0    Test   41    53   22   203  26*  6.54   -      -     0    0  19  0   27   
 1     ODI   65    18    6    32   9*  2.66  97  32.98     0    0   2  0   29   
 2      FC  245   265  10

In [21]:
for player in player_data:
    print(player['Full Name'])

Edward Lambert a'Beckett
Sean Anthony Abbott
Warwick James Adlam
Ashton Charles Agar
Wesley Austin Agar
Lachlan Aitken
Leanne Margaret Albon
Terence Michael Alderman
George Alexander
Harry Houston Alexander
Sarah Elizabeth Aley
Francis Erskine Allan
Peter John Allan
Richard Allen
Reginald Charles Allen
Phillip John Sydney Alley
Jeremy Hammond Allison
Mary Allitt
Elizabeth Amos
Charlie Anderson
Matthew Allan Anderson
Timothy Laurence Anderson
Sarah Joy Andrews
Thomas David Andrews
Thomas James Edwin Andrews
Jo Angel
Austin Anlezark
Denise Audrey Annetts
Peggy Antonio
Monty Archdale
Kenneth Alan Archer
Ronald Graham Archer
Glenarvon Huntley Armstrong
Timothy John Armstrong
Warwick Windridge Armstrong
Ben Matthew Ashkenazi
Nathan William Ashley
Shaun Nicholas Austin
Clinton Auty
Riley R Ayre


In [14]:
print(teams)

NameError: name 'teams' is not defined

In [2]:
from selenium.webdriver import Chrome
from

In [3]:
def get_html(url:str):
    driver = Chrome()
    driver.get(url)
    driver.close()