In [5]:
import pandas as pd

In [6]:
# Relative path of the directory the fall-of-wickets CSV is read from.
filepath = '../data/processed/howstat/fall_of_wickets/'

In [7]:
# Name of the CSV file to load (presumably the fall of wickets for MatchId 2400 -- confirm).
file = 'fow_2400.csv'

In [8]:
# Load the fall-of-wickets table. The first CSV column becomes the row index and
# MatchDate is parsed as a datetime. The date column is selected by name rather
# than by position so the call keeps working if the column order changes.
# `infer_datetime_format` is not passed: it is deprecated since pandas 2.0,
# where strict format inference is the default behaviour.
df = pd.read_csv(filepath + file, index_col=0, parse_dates=['MatchDate'])

In [9]:
# Display only the rows belonging to the second innings of the match.
df.loc[df['MatchInnings'] == 2]

Unnamed: 0,MatchId,MatchDate,MatchInnings,Team,TeamInnings,Wicket,Runs,Player
10,2400,2020-01-03,2,South Africa,1st,1,26,Malan
11,2400,2020-01-03,2,South Africa,1st,2,38,Hamza
12,2400,2020-01-03,2,South Africa,1st,3,40,Plessis
13,2400,2020-01-03,2,South Africa,1st,4,157,Elgar
14,2400,2020-01-03,2,South Africa,1st,5,191,Kock
15,2400,2020-01-03,2,South Africa,1st,6,200,Dussen
16,2400,2020-01-03,2,South Africa,1st,7,207,Pretorius
17,2400,2020-01-03,2,South Africa,1st,8,215,Maharaj
18,2400,2020-01-03,2,South Africa,1st,9,215,Rabada
19,2400,2020-01-03,2,South Africa,1st,10,223,Nortje


Let's say a batting collapse is defined as losing at least 3 wickets for at most 30 runs.
We want to know for each MatchId-MatchInnings whether that innings contains a batting collapse.
We want:
- Number of batting collapses
- Batters involved and their batting positions

Say we have a game with FoW:
1-26 Malan
2-38 Hamza
3-40 Plessis

I.e. only 3 wickets fell (say, declared).
They lost 3 wickets for 14 runs (additional to when the 1st wicket fell), so this is a batting collapse.
In other words: the score was 26-0, which became 40-3, so 3 wickets lost for 14 runs

Say we have a game with FoW:
1-157
2-191
3-200
4-207

Wickets 2-4 fell for 16 runs (i.e. batters 2,3,4 lost their wickets).
This is a batting collapse.




Coding approach:
Check each group of 3 wickets. Groups will be: 5-2, 4-1, 3-0.
Create a dict {wicket: runs} so that the score at the fall of each wicket can be looked up by wicket number.


In [58]:
from collections import namedtuple

# Record describing one collapse: first and last wicket of the window, the runs
# added between those two wickets, and the positions / batters involved.
# Defined once at module level instead of being rebuilt on every call.
Collapse = namedtuple("Collapse", ["start", "end", "runs", "positions", "batters"])


def check_collapse_n_wickets(d_runs, d_players, n, max_runs=30):
    """
    Find every window of n+1 consecutive wickets that fell for at most max_runs runs.

    Parameters
    ----------
    d_runs : dict
        Maps wicket number to the team score when that wicket fell. Keys are
        expected to be 0, 1, ..., number of wickets, where key 0 is a
        placeholder entry and not a real wicket.
    d_players : dict
        Maps wicket number to the batter dismissed, using the same keys as d_runs.
    n : int
        Span of the window: wickets i-n, i-n+1, ..., i are tested together,
        i.e. n+1 wickets (n=2 tests groups of three wickets).
    max_runs : int, optional
        Largest number of runs scored between the first and the last wicket of
        the window for it to count as a collapse. Defaults to 30.

    Returns
    -------
    list of Collapse
        One Collapse(start, end, runs, positions, batters) per qualifying
        window, ordered by increasing end wicket. Empty if nothing qualifies.
    """
    l_collapses = []

    # Start at n+1: the window ending at wicket n would begin at index 0,
    # which is the placeholder entry rather than a wicket.
    for i in range(n + 1, len(d_runs)):
        first = i - n

        # Runs added between the fall of wicket `first` and the fall of wicket i,
        # e.g. n=2 and i=5 covers wickets 3, 4, 5.
        diff = d_runs[i] - d_runs[first]

        if diff <= max_runs:
            positions = list(range(first, i + 1))
            batters = [d_players[s] for s in positions]
            l_collapses.append(
                Collapse(start=first, end=i, runs=diff, positions=positions, batters=batters)
            )

    return l_collapses

In [59]:
def check_all_collapses(d_runs, d_players):
    """
    Collect the collapses of every window size and keep only the maximal ones.

    Windows are scanned with check_collapse_n_wickets for spans n = 9 down to
    n = 2 (a span of n covers n+1 wickets), longest first. A collapse whose
    positions all lie inside another, different collapse is treated as a
    "sub-collapse" and dropped, so positions 4,5,6 are not reported alongside
    4,5,6,7.

    Returns the remaining collapse records as a list, in scan order
    (longest windows first).
    """
    # gather the collapses found for each span, longest span first
    every_collapse = []
    for span in range(9, 1, -1):
        every_collapse.extend(check_collapse_n_wickets(d_runs, d_players, span))

    def is_sub_collapse(candidate):
        # True when some other collapse covers all of this one's positions
        wanted = set(candidate.positions)
        return any(
            wanted <= set(other.positions) and candidate != other
            for other in every_collapse
        )

    # keep only collapses that are not contained in a larger one
    return [found for found in every_collapse if not is_sub_collapse(found)]

In [60]:
def return_collapses(df):
    """
    Build one row per batting collapse for a single innings.

    Parameters
    ----------
    df : pandas.DataFrame
        Fall-of-wickets rows of one innings. The Runs and Player columns are
        read; a Wicket column, when present, is used to order the rows.

    Returns
    -------
    pandas.DataFrame
        One row per collapse with columns start, end, runs, positions, batters.
        When the innings has no collapse the frame is empty but still carries
        these columns, so every group yields the same schema.
    """
    # Row position is used as the wicket number below, so make the ordering
    # explicit instead of relying on the order of the incoming rows.
    innings = df
    if "Wicket" in innings.columns:
        innings = innings.sort_values("Wicket", kind="mergesort")

    # Index 0 is a placeholder (0 runs, no batter) so that index i lines up
    # with wicket number i.
    l_runs = [0] + list(innings.Runs)
    l_player = [""] + list(innings.Player)

    d_runs = dict(enumerate(l_runs))
    d_players = dict(enumerate(l_player))

    l_collapses = check_all_collapses(d_runs, d_players)

    # Column names mirror the fields of the collapse records; passing them
    # explicitly keeps the columns present even for an empty result.
    return pd.DataFrame(l_collapses, columns=["start", "end", "runs", "positions", "batters"])

In [63]:
# Run the collapse detection once per innings (match / innings number / batting team)
# and stack the per-innings results into a single table.
(
    df
    .groupby(['MatchId', 'MatchInnings', 'Team'])
    .apply(return_collapses)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,start,end,runs,positions,batters
MatchId,MatchInnings,Team,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2400,1,England,0,6.0,9.0,13.0,"[6, 7, 8, 9]","[Buttler, Curran, Bess, Broad]"
2400,2,South Africa,0,5.0,9.0,24.0,"[5, 6, 7, 8, 9]","[Kock, Dussen, Pretorius, Maharaj, Rabada]"
2400,2,South Africa,1,6.0,10.0,23.0,"[6, 7, 8, 9, 10]","[Dussen, Pretorius, Maharaj, Rabada, Nortje]"
2400,2,South Africa,2,1.0,3.0,14.0,"[1, 2, 3]","[Malan, Hamza, Plessis]"
2400,4,South Africa,0,6.0,10.0,11.0,"[6, 7, 8, 9, 10]","[Kock, Dussen, Pretorius, Nortje, Philander]"


In [10]:
#I'd like results to be returned as:
#-MatchId
#-MatchInnings
#-Team
#-CollapseCount
#-CollapseDetails:
#      (one option)
#        Start tuple
#        End tuple
#        Runs tuple
#        Batting positions tuple
#        Batters tuple

Questions to answer:
- Number of collapses by Team, by year (unique collapses, innings with a collapse)
- Positions most often involved
- Batters most often involved

In [None]:
# **Need to have long format rather than wide, so return multiple rows if multiple collapses