# Batting collapse frequency. Are England unique?

### Research Question:
- How often do batting collapses happen?
- Does England collapse more often than other teams?
- Is this statement true? "Joe Root rarely stops a collapse, and is often a part of it"
- Which players are best at stopping a collapse?


### Methodology:
- Create a table of fall of wicket:
    - MatchID
    - Date
    - Batting team
    - Bowling team
    - Match type
    - Innings
    - Fall of Wicket 1 (runs, batsman)
    - Fall of Wicket 2 (runs, batsman)
    - etc.


### Problem breakdown:
- Data Source: howstat cricket scorecards
- Extract Fall Of Wickets from a single game
- Extract FoW from multiple games
- Extract FoW from all (relevant) games

In [6]:
import re
import urllib
import urllib.request

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from dateutil.parser import parse

In [8]:
# Download the raw HTML of a single match scorecard (MatchCode=2418) into `s`.
# urllib.request must be imported explicitly: a bare `import urllib` does not load the submodule.
scorecard_url = "http://howstat.com/cricket/Statistics/Matches/MatchScorecard.asp?MatchCode=2418"
with urllib.request.urlopen(scorecard_url) as response:
    s = response.read()

In [9]:
# Parse the downloaded page with the 'html.parser' backend; the cells below search this tree.
soup = BeautifulSoup(s, 'html.parser')

In [None]:
# Match Info
def parse_match_info(html_scorecard):
    """
    Reads the match details from the top of a scorecard page.

    The first five TextBlackBold8 elements are taken as the field labels and the first five
    TextBlack8 elements as the matching values.

    html_scorecard: bs4 object of the whole scorecard page. Raw html (str or bytes) is also
                    accepted, and is parsed with 'html.parser' first.
    returns: dict of label -> text value, with 'Match Date:' converted to a datetime.date
    raises: ValueError if the labels and values cannot be paired one-to-one,
            KeyError if there is no 'Match Date:' label
    """
    # Work from the argument rather than the notebook-level `soup`, so any match can be parsed.
    if isinstance(html_scorecard, (str, bytes)):
        html_scorecard = BeautifulSoup(html_scorecard, 'html.parser')

    n_fields = 5
    values = [item.text.strip() for item in html_scorecard.find_all(class_="TextBlack8")][:n_fields]
    labels = [item.text.strip() for item in html_scorecard.find_all(class_="TextBlackBold8")][:n_fields]

    if len(labels) != len(values):
        raise ValueError(
            f'Match info has {len(labels)} labels but {len(values)} values; cannot pair them up'
        )

    d_match_info = dict(zip(labels, values))
    d_match_info['Match Date:'] = parse(d_match_info['Match Date:']).date()

    return d_match_info

In [None]:
def parse_scorecard(innings_section):
    """
    Takes a section of html containing the start of a scorecard, and builds a scorecard df up to and
    including the innings total.

    If the rows run out before a 'Total' row is found, the rows read so far are returned.

    innings_section: bs4 object, which is of TextBlackBold8 class and contains the word "Innings" in its text.
    returns: innings name as string, and df of scorecard (in that order)
    """
    # The parent of the innings cell is the header row of the scorecard
    header_row = innings_section.parent
    headers = [cell.text.replace('\xa0', ' ').strip() for cell in header_row.find_all('td')]

    # Walk the rows after the header, collecting each one, and stop once the Total row has been stored.
    # find_next_sibling() is used instead of next_sibling to avoid line breaks.
    data = []
    node = header_row.find_next_sibling()
    while node is not None:  # None means the table ended without a Total row
        row = [
            cell.text.replace('\xa0', ' ').replace('\r', '').replace('\n', '').strip()
            for cell in node.find_all('td')
        ]
        data.append(row)
        if 'Total' in row:
            break
        node = node.find_next_sibling()

    # The first header cell holds "<innings name> Innings"; the data rows carry a player cell and a
    # details cell in its place, so swap it for those two column names.
    innings_name = headers.pop(0).split('Innings')[0].strip()
    headers[:0] = ['Player', 'Details']
    df_scorecard = pd.DataFrame(data, columns=headers)

    return innings_name, df_scorecard


def parse_fall_of_wickets(fow_section):
    """
    Takes a section of html containing the start of a fall of wickets section, and reads & reformats the FoW table.

    Each table cell is expected to start with a "<wicket>-<runs>" token. Cells without one
    (e.g. blank cells) are skipped.

    fow_section: bs4 object, a td element containing the words "Fall of Wickets"
    returns: df of fall of wickets with columns Wicket, Runs, Player (all as text, one row per cell)
    """
    table = fow_section.parent.find('table')  # Move up one level and select table

    tab = []
    for cell in table.find_all('td'):
        words = cell.text.strip().replace('\xa0', ' ').split(' ')
        score = words[0].split('-')
        if len(score) < 2:
            # No "<wicket>-<runs>" token in this cell, so there is nothing to record
            continue
        tab.append({
            'Wicket': score[0],
            'Runs': score[1],
            # NOTE: only the last space-separated word of the cell is kept as the player
            'Player': words[-1],
        })

    # Name the columns explicitly so an innings with no rows still has the expected columns
    df_fow = pd.DataFrame(tab, columns=['Wicket', 'Runs', 'Player'])

    return df_fow


# Walk every TextBlackBold8 element on the page.
# An element whose text mentions 'Innings' is the heading of a batting scorecard, so pass it to
# parse_scorecard and keep the result: names in l_innings (page order), dfs in d_scorecards.
d_scorecards = {}
l_innings = []
for item in soup.find_all(class_="TextBlackBold8"):
    if 'Innings' not in item.text.replace('\xa0', ' ').strip():
        continue  # not an innings heading

    innings, df_scorecard = parse_scorecard(item)
    print(f'Parsed scorecard for: {innings} innings')

    l_innings.append(innings)
    d_scorecards[innings] = df_scorecard
        
        
# Loop through each Fall of Wickets section (in page order), and parse the FoW record.
l_fow = []
fow_sections = soup.find_all("td", string=re.compile('Fall of Wickets'))
for item in fow_sections:
    df_fow = parse_fall_of_wickets(item)
    l_fow.append(df_fow)
    print('Parsed FoW')

# Convert list of FoW dfs to dict, keyed by the innings names from the parsed scorecard section.
# The pairing is by position, so it is only valid when every scorecard has its own FoW section.
# Fail loudly otherwise, instead of leaving d_fow undefined or stale from an earlier run.
if len(l_fow) != len(l_innings):
    raise ValueError(
        f'Found {len(l_fow)} fall of wickets sections for {len(l_innings)} innings; '
        'cannot match them up by position'
    )
print(f'There are {len(l_fow)} innings with fall of wicket data')
d_fow = dict(zip(l_innings, l_fow))
    

In [None]:
def clean_scorecards(d_scorecards, match_date=None):
    """
    Combines the per-innings scorecards into a single df with match/team/innings columns up front.

    d_scorecards: dict of innings name ("<team> <innings>") -> scorecard df, as built from parse_scorecard
    match_date: date of the match. When omitted, the notebook-level d_match_info['Match Date:'] is used.
    returns: df with columns MatchDate, Team, Innings, ScorecardIdx, followed by the scorecard columns
    """
    if match_date is None:
        match_date = d_match_info['Match Date:']

    # Combine scorecards dfs, add key column from the dict.
    # The keys come from the dict itself, so the result cannot drift from a separate global list.
    df = pd.concat(d_scorecards, keys=list(d_scorecards)).reset_index()

    # Expand key column into Team, Innings columns (split on the last space). Add Date column.
    meta = df['level_0'].str.rsplit(n=1, expand=True)
    df[['Team', 'Innings']] = meta
    df = df.drop(columns=['level_0']).rename(columns={'level_1': 'ScorecardIdx'})
    df['MatchDate'] = match_date

    # Clean up column order: identifying columns first, everything else in its existing order
    lead = ['MatchDate', 'Team', 'Innings']
    df = df[lead + [col for col in df.columns if col not in lead]].copy()

    # Clean up data types (anything that does not convert becomes NaN / NaT)
    cols_numeric = ['ScorecardIdx', 'R', 'BF', '4s', '6s', 'SR']
    df[cols_numeric] = df[cols_numeric].apply(pd.to_numeric, errors='coerce')
    df['MatchDate'] = pd.to_datetime(df['MatchDate'], errors='coerce', yearfirst=True)

    return df

# Build the combined, typed scorecard table for this match.
df_scorecards = clean_scorecards(d_scorecards)

In [None]:
def clean_fow(d_fow, match_date=None):
    """
    Combines the per-innings fall of wickets tables into a single df with match/team/innings columns up front.

    d_fow: dict of innings name ("<team> <innings>") -> FoW df, as built from parse_fall_of_wickets
    match_date: date of the match. When omitted, the notebook-level d_match_info['Match Date:'] is used.
    returns: df with columns MatchDate, Team, Innings, followed by the FoW columns
    """
    if match_date is None:
        match_date = d_match_info['Match Date:']

    # Combine FoW dfs, add key column from the dict.
    # The keys come from the dict itself, so the result cannot drift from a separate global list.
    df = pd.concat(d_fow, keys=list(d_fow)).reset_index()

    # Expand key column into Team, Innings columns (split on the last space). Add Date column.
    meta = df['level_0'].str.rsplit(n=1, expand=True)
    df[['Team', 'Innings']] = meta
    df = df.drop(columns=['level_0', 'level_1'])
    df['MatchDate'] = match_date

    # Clean up column order: identifying columns first, everything else in its existing order
    lead = ['MatchDate', 'Team', 'Innings']
    df = df[lead + [col for col in df.columns if col not in lead]].copy()

    # Clean up data types (anything that does not convert becomes NaN / NaT)
    cols_numeric = ['Wicket', 'Runs']
    df[cols_numeric] = df[cols_numeric].apply(pd.to_numeric, errors='coerce')
    df['MatchDate'] = pd.to_datetime(df['MatchDate'], errors='coerce', yearfirst=True)

    return df

# Build the combined, typed fall of wickets table for this match.
# Note: this rebinds df_fow, which the parsing cell used for the per-innings tables.
df_fow = clean_fow(d_fow)

In [None]:
# Show the cleaned fall of wickets table as the cell output.
df_fow

In [None]:
# Show the cleaned scorecard table as the cell output.
df_scorecards

In [None]:
# FoW
"""HTML structure:
1 large table, contains: 
    innings + team name
    innings scorecard
    fall of wickets
    innings + team name
    innings scorecard
    fall of wickets
    
    etc.
    
    
Procedure:
Get this table:
    Read innings + team name (s)
    Read the scorecard (s)
    Read the FoW (s)
    Split by innings + team name.
Reformat into readable table
"""