This notebook is used to obtain all required data to be fed into the model. The data will be parsed from https://www.espncricinfo.com/, which is a cricket sports website, and a database. 

The outputs of this script are:

1. match_url.csv (All T20I matches played prior to the World Cup including stats from the matches)
2. match_data.csv (All players that were involved in matches recorded in 1.)
3. player_data.csv (Statistics of all players that were involved in all recorded matches)
4. player_match_data.csv (Same as 2. but includes each player's country, which is required for data preprocessing)
5. player_squad_data.csv (All statistics for the players in the squads selected for the tournament)

Note: It is not recommended to run this script, as the parsing takes significant time.

In [None]:
# Importing all required libraries
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
import pandas as pd
import re

In [None]:
# TeamData takes one team id and hands result pages 1 through 9 of that team to ObtainData.
def TeamData(team):
    """Record the results listed on pages 1-9 of one team's match history.

    `team` is converted with str() and passed to ObtainData together with the
    page number (also as a string). Rows are written to the module-level file
    handle `f`, which must already be open for writing when this is called.
    """
    team_id = str(team)
    for page_number in map(str, range(1, 10)):
        ObtainData(team_id, page_number, f)

# ObtainData downloads one page of a team's results listing and records every decided match.
# Each team page follows the same url structure; only the team id and the page number change.
# A row is recorded only when its result cell reads 'won' or 'lost'.
def ObtainData(team, page, f):
    """Append the decided matches found on one results page to `f`.

    Args:
        team: Team id as a string, inserted into the `team=` URL parameter.
        page: Page number as a string, inserted into the `page=` URL parameter.
        f: Writable text file object that receives the rows.

    For every recorded match a newline is written first, followed by the
    scorecard URL and the text of each cell of the row, each terminated by a
    comma (so every row ends with a trailing comma).

    Raises:
        IndexError: If the page has fewer than three `engineTable` tables, no
            `data1` rows, or fewer result rows / scorecard links than table rows.
    """
    no_records = 'No records available to match this query'
    url = ('https://stats.espncricinfo.com/ci/engine/stats/index.html?class=3;page=' + page
           + ';team=' + team + ';template=results;type=team;view=results')
    # The context manager closes the connection even if read() raises.
    with uReq(url) as uClient:
        raw_html = uClient.read()
    page_soup = soup(raw_html, "html.parser")

    table = page_soup.findAll("table", {"class": "engineTable"})[2].tbody
    store = table.find_all("tr")
    data = page_soup.findAll("a", string='Match scorecard')
    # attrs is a dict of attribute filters; search the document once rather
    # than once per table row.
    result_rows = page_soup.findAll("tr", {"class": "data1"})
    page_check = result_rows[0].findAll('td', {"class": "left"})[0].text
    if page_check == no_records:
        # Past the last page of results for this team: nothing to record.
        return

    for i, row in enumerate(store):
        results_check = result_rows[i].findAll('td', {"class": "left"})[1].text
        if results_check not in ('won', 'lost'):
            # Skip ties, no-results and any other undecided outcome.
            continue
        f.write("\n")
        match_url = 'https://www.espncricinfo.com' + data[i]['href']
        f.write(match_url + ",")
        for cell in row.find_all("td"):
            if cell.text == no_records:
                break
            f.write(cell.text + ",")

In [None]:
# Run TeamData for every team listed in the dictionary and store the rows in match_url.csv.
# The dictionary keys are the team ids passed to TeamData; the values are the team names.
teams = {'40': 'Afghanistan', '2' : 'Australia', '25' : 'Bangladesh', '1':'England','6':'India','29':'Ireland',
         '28':'Namibia','15':'Netherlands','5':'NewZealand','37':'Oman','7':'Pakistan','20':'P.N.G.',
         '30':'Scotland','3':'SouthAfrica','8':'SriLanka','4':'WestIndies'}

filename = "match_url.csv"
headers = "match_url,Team, Result, Margin, BR, Toss, Bat, ,Opposition, Ground, Date\n"

# TeamData writes through the module-level name `f`, so the handle must be bound to `f`.
# The with-block flushes and closes the file even if a download fails midway.
# UTF-8 matches the default decoding pandas.read_csv uses when the file is read back.
with open(filename, 'w', encoding='utf-8') as f:
    f.write(headers)
    for key in teams:
        TeamData(key)

In [None]:
# This function records the profile url of every player that appears on any recorded match scorecard.
def player_url_obtainer(match):
    """Scrape player profile links from each scorecard and write them out.

    Args:
        match: Table with a 'match_url' column holding scorecard URLs.

    Side effects:
        Appends every profile URL found to the module-level list `player_url`,
        then writes every entry of that list to the module-level file handle
        `f`, each one preceded by a newline. Duplicates are not removed here.
    """
    profile_title = re.compile(r"View full profile of ")

    # Iterate over the column values directly so the result does not depend on
    # the table having a 0..n-1 index.
    for url in match['match_url']:
        # The context manager closes the connection even if read() raises.
        with uReq(url) as uClient:
            raw_html = uClient.read()
        page_soup = soup(raw_html, "html.parser")

        name_finder = page_soup.findAll("a", {"title": profile_title})
        for link in name_finder:
            player_url.append('https://www.espncricinfo.com' + link['href'])

    f.writelines("\n" + entry for entry in player_url)

In [None]:
# Read the match_url.csv generated earlier and reduce it to one row per match so that
# its match_url column can be used by the player_url_obtainer function.
match_url = pd.read_csv('match_url.csv')
# NOTE(review): presumably the data rows carry more fields than the header line, so pandas
# reads the leading field(s) as the index; reset_index turns them back into columns — confirm.
match_url = match_url.reset_index()
match_url.columns = ['match_url','Team','Result','Margin','BR','Toss','Bat', 'Unnamed1','Opposition', 'Ground','Date','Unnamed2','Unnamed3']
match_url = match_url.drop_duplicates(subset=['match_url'])
match_url = match_url.reset_index()
match = match_url.drop(['index','Unnamed1', 'Unnamed2','Unnamed3'],axis = 1)

filename = "player_url.csv"
headers = "player_url"
# player_url_obtainer appends to the module-level list `player_url` and writes to the
# module-level handle `f`, so both names must be bound before it is called.
player_url = []
# The with-block flushes and closes the file even if a download fails midway.
with open(filename, 'w', encoding='utf-8') as f:
    f.write(headers)
    player_url_obtainer(match)

In [None]:
# Helper: writes the statistics that follow one particular 'T20I' label on a profile page.
def _write_t20i_stats(f, spans, occurrence, field_count):
    """Write the texts of the `field_count` spans that follow the n-th 'T20I' span.

    Args:
        f: Writable text file object.
        spans: Sequence of elements exposing a `.text` attribute.
        occurrence: 1-based index of the 'T20I' span to use.
        field_count: Number of following spans to write, each followed by a comma.

    Nothing is written when fewer than `occurrence` spans have the exact text
    'T20I'. An IndexError is raised if the page ends before `field_count`
    spans have been written.
    """
    seen = 0
    for position, span in enumerate(spans):
        if span.text != 'T20I':
            continue
        seen += 1
        if seen == occurrence:
            for offset in range(1, field_count + 1):
                f.write(spans[position + offset].text + ",")
            return


# Based on the player url this function takes all data from each player's profile page.
# Care has to be taken with the order of the data on the page: the first and second heading
# are each checked for the batting and the bowling title, and the matching 'T20I' entry is used.
def player_data_obtainer(playerdf, f):
    """Write one row of T20I statistics per player profile to `f`.

    Args:
        playerdf: Table with a 'player_url' column holding profile URLs.
        f: Writable text file object.

    Each row starts with a newline and a comma, followed by the player name,
    the country name and then the statistics, every value terminated by a
    comma. Batting rows contribute 14 values and bowling rows 13.

    Raises:
        IndexError: If the page lacks the expected name/country elements, a
            second 'card overflow-hidden mb-3' block, or the heading being checked.
    """
    # (heading position, heading text, which 'T20I' entry to use, number of values)
    layout = (
        (0, 'Batting & Fielding', 1, 14),
        (1, 'Batting & Fielding', 2, 14),
        (0, 'Bowling', 1, 13),
        (1, 'Bowling', 2, 13),
    )

    # Iterate over the column values directly so the result does not depend on
    # the table having a 0..n-1 index.
    for url in playerdf['player_url']:
        # The context manager closes the connection even if read() raises.
        with uReq(url) as uClient:
            raw_html = uClient.read()
        page_soup = soup(raw_html, "html.parser")

        f.write("\n,")
        f.write(page_soup.findAll("h5", {"class" : "player-card-description gray-900"})[0].text + ",")
        f.write(page_soup.findAll("span", {"class" : "player-card__country-name"})[0].text + ",")

        tablefinder = page_soup.findAll('div' ,{'class': 'card overflow-hidden mb-3'})
        headings = tablefinder[1].findAll('h5')
        # Collect the spans once; every layout entry scans the same list.
        statfinder = page_soup.findAll('span')

        for heading_index, title, occurrence, field_count in layout:
            if headings[heading_index].text == title:
                _write_t20i_stats(f, statfinder, occurrence, field_count)

In [None]:
# Load the scraped profile urls, drop repeated ones and store each player's statistics in player_data.csv.
player_url = pd.read_csv('player_url.csv')
player_url = player_url.drop_duplicates()
player_url = player_url.reset_index()

filename = "player_data.csv"
headers = ",Player_Name,Country,Mat_Bat,Inns_Bat,NO,Runs,HS,Ave,BF,SR,100s,50s,4s,6s,Ct,St,Mat_Bowl,Inns_Bowl,Balls,Runs,Wkts,BBI,BBM,Ave,Econ,SR,4w,5w,10w"
# The with-block flushes and closes the file even if a download fails midway.
with open(filename, 'w', encoding='utf-8') as f:
    f.write(headers)
    player_data_obtainer(player_url, f)

In [None]:
# Helper: normalises a player name as it appears on a scorecard.
def _clean_player_name(name):
    """Remove the literal '(c)' marker, then every character that is not a letter, digit or space.

    Surrounding whitespace is left untouched, so a name followed by ' (c)'
    keeps its trailing space.
    """
    # Plain substring removal: as a regular expression, '(c)' would be a
    # capture group matching every lowercase "c" inside the name.
    name = name.replace('(c)', '')
    return re.sub('[^A-Za-z0-9 ]+', '', name)


# From each match scorecard the players' names are obtained; additional strings attached
# to a name, such as (c), are removed.
def GetMatchInformation(match, f):
    """Write one (match url, player name) row per player link found on each scorecard.

    Args:
        match: Table with a 'match_url' column holding scorecard URLs.
        f: Writable text file object.

    Each row is written as a newline, a comma, the scorecard URL, a comma and
    the cleaned player name.
    """
    profile_title = re.compile(r"View full profile of ")

    # Iterate over the column values directly so the result does not depend on
    # the table having a 0..n-1 index.
    for url in match['match_url']:
        # The context manager closes the connection even if read() raises.
        with uReq(url) as uClient:
            raw_html = uClient.read()
        page_soup = soup(raw_html, "html.parser")

        for link in page_soup.findAll("a", {"title": profile_title}):
            f.write("\n,")
            f.write(url + ",")
            f.write(_clean_player_name(link.text))

In [None]:
# Clean the csv read back in so that one row per match, with its match_url, goes into
# the GetMatchInformation function.
match_url = pd.read_csv('match_url.csv')
# NOTE(review): presumably the data rows carry more fields than the header line, so pandas
# reads the leading field(s) as the index; reset_index turns them back into columns — confirm.
match_url = match_url.reset_index()
match_url.columns = ['match_url','Team','Result','Margin','BR','Toss','Bat', 'Unnamed1','Opposition', 'Ground','Date','Unnamed2','Unnamed3']
match_url = match_url.drop_duplicates(subset=['match_url'])
match_url = match_url.reset_index()
match = match_url.drop(['index','Unnamed1', 'Unnamed2','Unnamed3'],axis = 1)

filename = "match_data.csv"
headers = ",match_url,player_name"
# The with-block flushes and closes the file even if a download fails midway.
with open(filename, 'w', encoding='utf-8') as f:
    f.write(headers)
    GetMatchInformation(match, f)

In [None]:
# Similar to GetMatchInformation, this function takes each player's name and country
# from the player url page linked on every scorecard.
def GetPlayerInformation(match, f):
    """Write one (match url, player name, country) row per player link on each scorecard.

    Args:
        match: Table with a 'match_url' column holding scorecard URLs.
        f: Writable text file object.

    Each row is written as a newline, a comma, the scorecard URL, the player
    name and the country name, every value followed by a comma. A progress
    line "Count= <n>" is printed for each scorecard, counting from 0.

    Raises:
        IndexError: If a profile page lacks the expected name or country element.
    """
    profile_title = re.compile(r"View full profile of ")
    # Profile URL -> (name, country). A player who appears in several matches
    # is downloaded only once instead of once per appearance.
    profile_cache = {}

    for count, url in enumerate(match['match_url']):
        print("Count= " + str(count))

        # The context manager closes the connection even if read() raises.
        with uReq(url) as uClient:
            raw_html = uClient.read()
        page_soup = soup(raw_html, "html.parser")

        for link in page_soup.findAll("a", {"title": profile_title}):
            profile_url = 'https://www.espncricinfo.com' + link['href']
            if profile_url not in profile_cache:
                with uReq(profile_url) as uClient:
                    second_html = uClient.read()
                new_soup = soup(second_html, "html.parser")
                profile_cache[profile_url] = (
                    new_soup.findAll("h5", {"class" : "player-card-description gray-900"})[0].text,
                    new_soup.findAll("span", {"class" : "player-card__country-name"})[0].text,
                )
            player_name, player_country = profile_cache[profile_url]

            f.write("\n,")
            f.write(url + ",")
            f.write(player_name + ",")
            f.write(player_country + ",")

In [None]:
# Clean the match_url.csv obtained earlier: keep the columns needed, remove the leading 'v '
# from the team_2 column and keep one row per match.
all_matches_with_data_df = pd.read_csv('match_url.csv')
all_matches_with_data_df = all_matches_with_data_df.reset_index()
headers = ['match_url', 'team_1', 'Result', 'winning_margin', 'BR', 'Toss', 'Bat', 'Unnamed0', 'team_2','Ground','Date','Unnamed1','Unnamed2']
all_matches_with_data_df.columns = headers 
all_matches_with_data_df = all_matches_with_data_df.drop(['BR','Unnamed0','Ground','Unnamed1', 'Unnamed2'], axis = 1)
# Anchor the pattern so only the 'v ' prefix is removed, never a 'v ' inside a team name.
all_matches_with_data_df['team_2'] = all_matches_with_data_df['team_2'].str.replace(r'^v ', '', regex=True)
all_matches_with_data_df = all_matches_with_data_df.drop_duplicates(subset=['match_url'])
all_matches_with_data_df = all_matches_with_data_df.reset_index(drop=True)

# Get the player_match_data information
filename = "player_match_data.csv"
headers = ",match_url,player_name,player_country"
match = all_matches_with_data_df
match = match.reset_index(drop=True)
# The with-block flushes and closes the file even if a download fails midway.
with open(filename, 'w', encoding='utf-8') as f:
    f.write(headers)
    GetPlayerInformation(match, f)

In [None]:
# Find all the T20 World Cup squads by parsing the ESPNcricinfo squads page, then follow
# every squad link and record the profile url of each listed player in squads.csv.
filename = "squads.csv"
headers = "squad_url,player_url"

# The with-blocks close the file and every connection even if a download fails midway.
with open(filename, 'w', encoding='utf-8') as f:
    f.write(headers)

    url = 'https://www.espncricinfo.com/series/icc-men-s-t20-world-cup-2021-22-1267897/squads'
    with uReq(url) as uClient:
        raw_html = uClient.read()
    page_soup = soup(raw_html, "html.parser")
    squadurl = page_soup.findAll("a", {"class" : "black-link d-none d-md-inline-block pl-2"})

    for squad_link in squadurl:
        squad_url = 'https://www.espncricinfo.com' + squad_link['href']
        with uReq(squad_url) as uClient:
            sqaud_raw_html = uClient.read()
        page_soup_player = soup(sqaud_raw_html, "html.parser")
        squad_player = page_soup_player.findAll("a", {"class" : "h3 benton-bold name black-link d-inline"})

        # One row per player: the squad page url followed by the player's profile url.
        for player_link in squad_player:
            squad_player_url = 'https://www.espncricinfo.com' + player_link['href']
            f.write("\n")
            f.write(squad_url + ",")
            f.write(squad_player_url)

In [None]:
# Obtain the stats of every player in each squad using the same player_data_obtainer function as before.
player_squad_url = pd.read_csv('squads.csv')
# Only the player_url column is needed by player_data_obtainer.
player_squad_url = player_squad_url.drop(['squad_url'],axis=1)

filename = "player_squad_data.csv"
headers = ",Player_Name,Country,Mat_Bat,Inns_Bat,NO,Runs,HS,Ave,BF,SR,100s,50s,4s,6s,Ct,St,Mat_Bowl,Inns_Bowl,Balls,Runs,Wkts,BBI,BBM,Ave,Econ,SR,4w,5w,10w"
# The with-block flushes and closes the file even if a download fails midway.
with open(filename, 'w', encoding='utf-8') as f:
    f.write(headers)
    player_data_obtainer(player_squad_url, f)