# Batting collapse frequency. Are England unique?

### Research Question:
- How often do batting collapses happen?
- Does England collapse more often than other teams?
- Is this statement true? "Joe Root rarely stops a collapse, and is often a part of it"
- Which players are best at stopping a collapse?


### Methodology:
- Create a table of fall of wickets:
    - MatchID
    - Date
    - Batting team
    - Bowling team
    - Match type
    - Innings
    - Fall of Wicket 1 (runs, batsman)
    - Fall of Wicket 2 (runs, batsman)
    - etc.


### Problem breakdown:
- Data Source: howstat cricket scorecards
- Extract Fall Of Wickets from a single game
- Extract FoW from multiple games
- Extract FoW from all (relevant) games

In [10]:
import re
import urllib
import urllib.request

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from dateutil.parser import parse

In [97]:
class BaseParser:
    """Fetch the page at ``self.url`` and turn it into a BeautifulSoup tree.

    ``execute()`` runs the two steps in order and stores the result on
    ``self.soup``.
    """

    def __init__(self, url=None):
        # Page to fetch; may be left as None and assigned later (e.g. by a subclass).
        self.url = url

    def __str__(self):
        return f'Parser Class for url: {self.url}'

    def read_html(self):
        """
        Download ``self.url`` and return the raw response body.

        The url is taken from the instance rather than from a parameter, as we
        never want to read a different url than ``self.url``.

        returns: whatever the response's ``read()`` returns (bytes for urllib responses)
        raises:  ValueError if no url has been set; any error raised while opening
                 or reading the url is propagated unchanged
        """
        if not self.url:
            raise ValueError('No url set on parser; cannot read html')

        # Errors are deliberately not swallowed: a failed read used to fall through
        # to the return statement and surface as an unrelated UnboundLocalError.
        with urllib.request.urlopen(self.url) as response:
            html = response.read()
        print(f'read url: {self.url}')

        return html

    def create_soup(self, html):
        """
        Create a bs4 soup from the html using the built-in 'html.parser'.

        html: markup to parse, as returned by read_html
        returns: BeautifulSoup object
        """
        soup = BeautifulSoup(html, 'html.parser')
        print('read soup')

        return soup

    def execute(self):
        """Read the page and store its parsed soup on ``self.soup``."""
        print('Reading html')
        html_page = self.read_html()

        print('Reading soup')
        self.soup = self.create_soup(html_page)

In [98]:
# Smoke test: run the base parser against a single scorecard url.
# execute() prints its progress and stores the parsed page on a.soup.
a = BaseParser(url="http://howstat.com/cricket/Statistics/Matches/MatchScorecard.asp?MatchCode=1800")
a.execute()
#print(a)
#print(a.url)

Reading html
read url: http://howstat.com/cricket/Statistics/Matches/MatchScorecard.asp?MatchCode=1800
Reading soup
read soup


In [167]:
class MatchParser(BaseParser):
    """
    Parser for a single howstat match scorecard page, identified by its match id.

    ``execute()`` downloads the page and populates:
        match_info:      dict built from the first five label/value pairs on the page,
                         with 'Match Date:' converted to a datetime.date
        scorecards:      df of every batting scorecard found on the page
        fall_of_wickets: df of every fall of wickets record found on the page
    """

    def __init__(self, match_id):
        # Override init method to use base Match url
        super().__init__()
        self.match_id = match_id
        base_url = 'http://howstat.com/cricket/Statistics/Matches/MatchScorecard.asp?MatchCode='
        self.url = base_url + str(self.match_id)

    def __str__(self):
        # Override str method to show that this is a Match class
        return f'Match Parser Class for match_id: {self.match_id}'

    def parse_match_info(self, soup):
        """
        Pair the first five TextBlackBold8 labels with the first five TextBlack8 values.

        soup: bs4 object for the whole match page
        returns: dict of label -> value, with 'Match Date:' parsed to a datetime.date
        raises:  ValueError if the number of labels and values differ,
                 KeyError if there is no 'Match Date:' label
        """
        values = [item.text.strip() for item in soup.find_all(class_="TextBlack8")][:5]
        labels = [item.text.strip() for item in soup.find_all(class_="TextBlackBold8")][:5]

        # Fail loudly rather than silently pairing labels with the wrong values.
        if len(labels) != len(values):
            raise ValueError(
                f'Match info has {len(labels)} labels but {len(values)} values; cannot pair them up'
            )

        d_match_info = dict(zip(labels, values))
        d_match_info['Match Date:'] = parse(d_match_info['Match Date:']).date()

        return d_match_info

    def parse_single_scorecard(self, innings_section):
        """
        Takes a section of html soup containing the start of a scorecard, and builds a scorecard df upto and including the innings total.

        innings_section: bs4 object, which is of TextBlackBold8 class and contains the word "Innings" in its text.
        returns:  innings name as string, and df of scorecard
        raises:   ValueError if the rows after the header run out before a Total row is found
        """
        # The first parent is the header row of the scorecard
        header_row = innings_section.parent
        l_headers = [x.text.replace('\xa0', ' ').strip() for x in header_row.find_all('td')]

        # Loop through the siblings of the header row, collecting each row of cells,
        # until the Total row has been collected.
        l_data = []
        next_node = header_row.find_next_sibling()  # find_next_sibling() instead of next_sibling to avoid line breaks
        while next_node is not None:
            row = [x.text.replace('\xa0', ' ').replace('\r', '').replace('\n', '').strip()
                   for x in next_node.find_all('td')]
            l_data.append(row)
            if 'Total' in row:
                break
            next_node = next_node.find_next_sibling()
        else:
            # Only reached when the siblings ran out without hitting the break above.
            raise ValueError('Scorecard section has no Total row')

        # The first header cell holds the innings name; it is replaced by Player and Details columns.
        innings_name = l_headers[0].split('Innings')[0].strip()
        columns = ['Player', 'Details'] + l_headers[1:]
        df_scorecard = pd.DataFrame(l_data, columns=columns)

        return innings_name, df_scorecard

    def _add_match_metadata(self, df, d_match_info):
        """
        Add MatchId, MatchDate, Team and Innings columns and move them to the front.

        df: concatenated df that still has the 'level_0' column holding the innings key
            (e.g. 'South Africa 1st'); that column is split into Team / Innings and dropped.
        d_match_info: dict containing a 'Match Date:' entry
        returns: new df with the four metadata columns first and MatchDate as datetime
        """
        df['MatchId'] = self.match_id
        df['MatchDate'] = d_match_info['Match Date:']
        # Split on the last whitespace only, so multi-word team names stay intact.
        df[['Team', 'Innings']] = df['level_0'].str.rsplit(n=1, expand=True)
        df = df.drop(columns=['level_0'])

        # The four columns just added sit at the end; move them to the front.
        cols = list(df.columns)
        df = df[cols[-4:] + cols[:-4]].copy()

        df['MatchDate'] = pd.to_datetime(df['MatchDate'], errors='coerce', yearfirst=True)

        return df

    def clean_scorecards(self, d_scorecards, d_match_info, l_innings):
        """
        Combine the per-innings scorecard dfs into one typed df.

        d_scorecards: dict of innings name -> scorecard df
        d_match_info: dict containing a 'Match Date:' entry
        l_innings:    list of innings names, giving the order in which to combine
        returns: df with match metadata columns, ScorecardIdx, and numeric score columns
        """
        # Combine scorecards dfs, add key column from the dict
        df = pd.concat(d_scorecards, keys=l_innings).reset_index()

        df = self._add_match_metadata(df, d_match_info)
        df = df.rename(columns={'level_1': 'ScorecardIdx'})

        # Clean up data types; '% of Total' becomes a fraction
        df['% of Total'] = df['% of Total'].str.replace('%', '', regex=False)
        cols_numeric = ['ScorecardIdx', 'R', 'BF', '4s', '6s', 'SR', '% of Total']
        df[cols_numeric] = df[cols_numeric].apply(pd.to_numeric, errors='coerce')
        df['% of Total'] = df['% of Total'] / 100

        return df

    def parse_all_scorecards(self, soup):
        """
        Parse every scorecard on the page.

        Reads self.match_info, so parse_match_info must have been run and stored first
        (execute() does this).

        soup: bs4 object for the whole match page
        returns: combined scorecards df, and list of innings names in page order
        """
        # Loop through each TextBlackBold8 element
        # If the text contains the word 'Innings', the next section will be the innings scorecard: so parse it.
        d_scorecards = {}
        l_innings = []
        for item in soup.find_all(class_="TextBlackBold8"):
            item_text = item.text.replace('\xa0', ' ').strip()
            if 'Innings' not in item_text:
                continue

            innings, df_scorecard = self.parse_single_scorecard(item)
            print(f'Parsed scorecard for: {innings} innings')

            l_innings.append(innings)
            d_scorecards[innings] = df_scorecard

        df_scorecards = self.clean_scorecards(d_scorecards, self.match_info, l_innings)

        return df_scorecards, l_innings

    def parse_single_fall_of_wickets(self, fow_section):
        """
        Takes a section of html containing the start of a fall of wickets section, and reads & reformats the FoW table.

        fow_section: bs4 object, a td element containing the words "Fall of Wickets"
        returns: df of fall of wickets with columns Wicket, Runs, Player
        """
        table = fow_section.parent.find('table')  # Move up one level and select table
        tab = []

        for cell in table.find_all('td'):
            tokens = cell.text.strip().replace('\xa0', ' ').split(' ')
            score = tokens[0].split('-')
            if len(score) < 2:
                # Blank or malformed cell: no "wicket-runs" pair to record.
                continue
            tab.append({
                'Wicket': score[0],
                'Runs': score[1],
                'Player': tokens[-1],  # last word of the cell text
            })

        # Explicit columns so that an innings with no wickets still yields a well-formed df.
        df_fow = pd.DataFrame(tab, columns=['Wicket', 'Runs', 'Player'])

        return df_fow

    def clean_fall_of_wickets(self, d_fow, d_match_info, l_innings):
        """
        Combine the per-innings fall of wickets dfs into one typed df.

        d_fow:        dict of innings name -> fall of wickets df
        d_match_info: dict containing a 'Match Date:' entry
        l_innings:    list of innings names, giving the order in which to combine
        returns: df with match metadata columns and numeric Wicket / Runs columns
        """
        # Combine FoW dfs, add key column from the dict
        df = pd.concat(d_fow, keys=l_innings).reset_index()

        df = self._add_match_metadata(df, d_match_info)
        df = df.drop(columns=['level_1'])

        # Clean up data types
        cols_numeric = ['Wicket', 'Runs']
        df[cols_numeric] = df[cols_numeric].apply(pd.to_numeric, errors='coerce')

        return df

    def parse_all_fall_of_wickets(self, soup, l_innings):
        """
        Parse every Fall of Wickets section on the page and label each with its innings.

        Sections are paired with innings names by position, which assumes there is
        exactly one FoW section per parsed scorecard. Reads self.match_info.

        soup:      bs4 object for the whole match page
        l_innings: list of innings names from parse_all_scorecards
        returns: combined fall of wickets df
        raises:  ValueError if the number of FoW sections differs from the number of innings
        """
        l_fow = []
        fow_sections = soup.find_all("td", string=re.compile('Fall of Wickets'))
        for item in fow_sections:
            l_fow.append(self.parse_single_fall_of_wickets(item))
            print('Parsed FoW')

        # Without a one-to-one match the positional pairing below would be wrong.
        if len(l_fow) != len(l_innings):
            raise ValueError(
                f'Found {len(l_fow)} fall of wickets sections for {len(l_innings)} innings; cannot pair them up'
            )
        print(f'There are {len(l_fow)} innings with fall of wicket data')
        d_fow = dict(zip(l_innings, l_fow))

        return self.clean_fall_of_wickets(d_fow, self.match_info, l_innings)

    def execute(self):
        """Download the match page and populate soup, match_info, scorecards and fall_of_wickets."""
        html_page = self.read_html()
        self.soup = self.create_soup(html_page)

        self.match_info = self.parse_match_info(self.soup)
        # l_innings required for FoW parsing, but not to keep as an attribute
        self.scorecards, l_innings = self.parse_all_scorecards(self.soup)
        # l_innings req., so always execute scorecard parsing first
        self.fall_of_wickets = self.parse_all_fall_of_wickets(self.soup, l_innings)
        

In [168]:
# Parse match 2400 end to end; execute() fills b.match_info, b.scorecards and b.fall_of_wickets.
b = MatchParser(2400)
b.execute()
#print(b)
#print(b.url)

read url: http://howstat.com/cricket/Statistics/Matches/MatchScorecard.asp?MatchCode=2400
read soup
Parsed scorecard for: England 1st innings
Parsed scorecard for: South Africa 1st innings
Parsed scorecard for: England 2nd innings
Parsed scorecard for: South Africa 2nd innings
Parsed FoW
Parsed FoW
Parsed FoW
Parsed FoW
There are 4 innings with fall of wicket data


In [170]:
# Display the fall of wickets table parsed for match 2400.
b.fall_of_wickets

Unnamed: 0,MatchId,MatchDate,Team,Innings,Wicket,Runs,Player
0,2400,2020-01-03,England,1st,1,8,Crawley
1,2400,2020-01-03,England,1st,2,63,Sibley
2,2400,2020-01-03,England,1st,3,105,Root
3,2400,2020-01-03,England,1st,4,127,Denly
4,2400,2020-01-03,England,1st,5,185,Stokes
5,2400,2020-01-03,England,1st,6,221,Buttler
6,2400,2020-01-03,England,1st,7,231,Curran
7,2400,2020-01-03,England,1st,8,231,Bess
8,2400,2020-01-03,England,1st,9,234,Broad
9,2400,2020-01-03,England,1st,10,269,Anderson


In [162]:
# Size of the parsed scorecards df as reported by sys.getsizeof.
import sys
sys.getsizeof(b.scorecards)

18763

In [56]:
class Parent:
    """Minimal base class used to try out method inheritance."""

    def greet(self):
        """Print a fixed greeting identifying the parent class."""
        message = 'Hello from Parent'
        print(message)

class Child(Parent):
    """Subclass that defines no methods of its own beyond the initialiser."""

    def __init__(self):
        # Zero-argument super() resolves to the same parent initialiser.
        super().__init__()

In [57]:
# Child defines no greet(), so this call resolves to Parent.greet.
child = Child()
child.greet()

Hello from Parent


In [2]:
# Download one scorecard page and parse it into `soup`.
# The cells below read `soup`, which was not created anywhere in the notebook.
with urllib.request.urlopen("http://howstat.com/cricket/Statistics/Matches/MatchScorecard.asp?MatchCode=1800") as url:
    s = url.read()
soup = BeautifulSoup(s, 'html.parser')

In [4]:
from cricsheet.io_html import scrape_howstat

In [5]:
# Extract the match-level details from the page; d_match_info is passed to both clean-up calls later.
d_match_info = scrape_howstat.parse_match_info(soup)

In [6]:
# Walk every TextBlackBold8 element on the page. An element whose text mentions
# 'Innings' marks the start of a batting scorecard, so hand it to the parser and
# keep the result under its innings name (l_innings records the page order).
d_scorecards = {}
l_innings = []
for item in soup.find_all(class_="TextBlackBold8"):
    item_text = item.text.replace('\xa0', ' ').strip()
    if 'Innings' not in item_text:
        continue

    innings, df_scorecard = scrape_howstat.parse_scorecard(item)
    print(f'Parsed scorecard for: {innings} innings')

    d_scorecards[innings] = df_scorecard
    l_innings.append(innings)

Parsed scorecard for: Pakistan 1st innings
Parsed scorecard for: India 1st innings
Parsed scorecard for: Pakistan 2nd innings
Parsed scorecard for: India 2nd innings


In [7]:
# Loop through each Fall of Wickets section, and parse the FoW record.
# Store in a dict with keys as innings names from parsed scorecard section.
# Sections are paired with innings by position, so there must be one FoW per scorecard.
l_fow = []
fow_sections = soup.find_all("td", string=re.compile('Fall of Wickets'))
for item in fow_sections:
    df_fow = scrape_howstat.parse_fall_of_wickets(item)
    l_fow.append(df_fow)
    print('Parsed FoW')

# Fail here rather than leaving d_fow undefined for the clean-up cell further down.
if len(l_fow) != len(l_innings):
    raise ValueError(
        f'Found {len(l_fow)} fall of wickets sections for {len(l_innings)} innings; cannot pair them up'
    )
print(f'There are {len(l_fow)} innings with fall of wicket data')

# Convert list of FoW dfs to dict
d_fow = dict(zip(l_innings, l_fow))
    

Parsed FoW
Parsed FoW
Parsed FoW
Parsed FoW
There are 4 innings with fall of wicket data


In [8]:
# Pass the per-innings scorecards, the match info and the innings order to the clean-up helper.
df_scorecards = scrape_howstat.clean_scorecards(d_scorecards, d_match_info, l_innings)

In [9]:
# Same clean-up step for the fall of wickets records; note this rebinds df_fow from the loop above.
df_fow = scrape_howstat.clean_fow(d_fow, d_match_info, l_innings)

In [10]:
# Preview the first rows of the combined scorecards table.
df_scorecards.head()

Unnamed: 0,MatchDate,Team,Innings,ScorecardIdx,Player,Details,R,BF,4s,6s,SR,% of Total
0,2006-01-21,Pakistan,1st,0,Shoaib Malik,c Dravid b R P Singh,19.0,33.0,4.0,0.0,57.58,3.23%
1,2006-01-21,Pakistan,1st,1,Salman Butt,c †Dhoni b Khan,37.0,57.0,7.0,0.0,64.91,6.29%
2,2006-01-21,Pakistan,1st,2,Younis Khan,c Yuvraj Singh b R P Singh,83.0,131.0,13.0,0.0,63.36,14.12%
3,2006-01-21,Pakistan,1st,3,Mohammad Yousuf,c †Dhoni b R P Singh,65.0,119.0,8.0,1.0,54.62,11.05%
4,2006-01-21,Pakistan,1st,4,Inzamam-ul-Haq*,c †Dhoni b Khan,119.0,193.0,12.0,1.0,61.66,20.24%


In [11]:
# Preview the first rows of the combined fall of wickets table.
df_fow.head()

Unnamed: 0,MatchDate,Team,Innings,Wicket,Runs,Player
0,2006-01-21,Pakistan,1st,1,49,Malik
1,2006-01-21,Pakistan,1st,2,65,Butt
2,2006-01-21,Pakistan,1st,3,207,Khan
3,2006-01-21,Pakistan,1st,4,216,Yousuf
4,2006-01-21,Pakistan,1st,5,467,Afridi


In [12]:
# FoW: working notes on the page layout and the intended parsing procedure.
# The bare string below is the cell's last expression, so the notebook echoes it as output.
"""HTML structure: 
    innings + team name
    innings scorecard
    fall of wickets
    innings + team name
    innings scorecard
    fall of wickets
    
    etc.
    
    
Procedure:
Get this table:
    Read innings + team name (s)
    Read the scorecard (s)
    Read the FoW (s)
    Split by innings + team name.
Reformat into readable table
"""




'HTML structure: \n    innings + team name\n    innings scorecard\n    fall of wickets\n    innings + team name\n    innings scorecard\n    fall of wickets\n    \n    etc.\n    \n    \nProcedure:\nGet this table:\n    Read innings + team name (s)\n    Read the scorecard (s)\n    Read the FoW (s)\n    Split by innings + team name.\nReformat into readable table\n'