In [None]:
import pandas as pd

In [None]:
# Directory (relative to this notebook) that holds the howstat fall-of-wickets CSV files.
filepath = '../data/raw/csv/howstat/fall_of_wickets/'

In [None]:
# Name of the fall-of-wickets CSV to load from `filepath`.
file = 'fow_2400.csv'

In [None]:
# Load the fall-of-wickets table: the first CSV column becomes the index and
# the column at position 2 is parsed as dates.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0 (a strict version of that inference is now the default) and only
# produces a warning there.
df = pd.read_csv(filepath+file, index_col=0, parse_dates=[2])

In [None]:
# Show only the fall-of-wickets rows that belong to the second innings of the match.
df.loc[df['MatchInnings'] == 2]

Let's say a batting collapse is defined as losing at least 3 wickets for at most 30 runs.
We want to know for each MatchId-MatchInnings whether that innings contains a batting collapse.
We want:
- Number of batting collapses
- Batters involved and their batting positions

Say we have a game with FoW:
1-26 Malan
2-38 Hamza
3-40 Plessis

I.e. only 3 wickets fell (say, declared).
They lost 3 wickets for 14 runs (additional to when the 1st wicket fell), so this is a batting collapse.
In other words: the score was 26-0, which became 40-3, so 3 wickets lost for 14 runs

Say we have a game with FoW:
1-157
2-191
3-200
4-207

Wickets 2-4 fell for 16 runs (i.e. batters 2,3,4 lost their wickets).
This is a batting collapse.




Coding approach:
Check each group of 3 wickets. Groups will be: 5-2, 4-1, 3-0.
Create a dict {wicket: runs} so that the score at each fall of wicket can be looked up by wicket number.


In [None]:
# Location and name of the howstat scorecard CSV to load.
filepath_scores = '../data/raw/csv/howstat/scorecards/'
file_scores = 'scorecard_2400.csv'

# Load the scorecard: the first CSV column becomes the index and the column at
# position 2 is parsed as dates.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0 (a strict version of that inference is now the default) and only
# produces a warning there.
df_scores = pd.read_csv(filepath_scores+file_scores, index_col=0, parse_dates=[2])

In [None]:
# Show only the scorecard rows that belong to the second innings of the match.
df_scores.loc[df_scores['MatchInnings'] == 2]

In [None]:
# Keep only the identifying and batting columns needed for the analysis.
# The explicit copy makes df_scores an independent frame, so adding a column to
# it in a later cell does not raise pandas' SettingWithCopyWarning.
df_scores = df_scores[['MatchId', 'MatchInnings', 'Team', 'TeamInnings', 'Player', 'R', 'BF']].copy()

In [None]:
# Second-innings slice of the scorecard, kept under its own name for inspection.
df_scores_test = df_scores.loc[df_scores['MatchInnings'] == 2]
df_scores_test

In [None]:
# Number the rows 1, 2, 3, ... within each (MatchId, MatchInnings, Team) group.
# NOTE(review): this equals the batting position only if the scorecard rows are
# stored in batting order within each innings — confirm against the raw CSV.
df_scores['BattingPosition'] = df_scores.groupby(['MatchId','MatchInnings', 'Team']).cumcount() + 1

In [None]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

def fuzzy_merge(df_1, df_2, key1, key2, threshold=90, limit=1):
    """
    Attach to every row of df_1 the values of df_2[key2] that fuzzily match df_1[key1].

    :param df_1: the left table to join; it is NOT modified, a copy is returned
    :param df_2: the right table to join
    :param key1: key column of the left table
    :param key2: key column of the right table
    :param threshold: minimum fuzz.partial_ratio score a candidate needs to be kept
    :param limit: the amount of candidates requested per row, these are sorted high to low
    :return: copy of df_1 with an extra 'matches' column holding the kept
             candidates joined by ', ' (empty string when none reaches the threshold)
    """
    # De-duplicate the candidates while keeping first-seen order: the same name
    # appearing on many rows of df_2 would otherwise be scored again and again
    # and, for limit > 1, returned several times.
    choices = list(dict.fromkeys(df_2[key2].tolist()))

    def _matches_for(value):
        # process.extract yields (choice, score) pairs, best score first.
        candidates = process.extract(value, choices, limit=limit, scorer=fuzz.partial_ratio)
        return ', '.join([c[0] for c in candidates if c[1] >= threshold])

    # Work on a copy so the caller's frame does not silently gain a column.
    result = df_1.copy()
    result['matches'] = df_1[key1].apply(_matches_for)

    return result

In [None]:
# Fuzzy-match each fall-of-wickets player name against the scorecard player
# names, accepting matches that score at least 80.
df_merged = fuzzy_merge(df, df_scores, key1='Player', key2='Player', threshold=80)

In [None]:
# Preview the first rows to inspect the 'matches' column produced by fuzzy_merge.
df_merged.head()

In [None]:
# Left-join the scorecard onto the fall-of-wickets rows using the innings keys
# plus the fuzzy-matched name, then discard the helper columns and restore the
# original 'Player' column name.
df_merged = (
    df_merged
    .merge(
        df_scores,
        how='left',
        left_on=['MatchId', 'MatchInnings', 'Team', 'TeamInnings', 'matches'],
        right_on=['MatchId', 'MatchInnings', 'Team', 'TeamInnings', 'Player'],
    )
    .drop(columns=['Player_y', 'matches'])
    .rename(columns={"Player_x": "Player"})
)
df_merged

In [None]:
from collections import namedtuple

# Record describing one collapse. The field names become the output column
# names, so the existing "batters_positon" spelling is kept unchanged.
Collapse = namedtuple(
    "Collapse",
    ["start", "end", "runs", "wickets_lost", "batters", "batters_positon", "batters_runs", "batters_bf"],
)


def check_collapse_n_wickets(d_runs, n, max_runs=30):
    """
    Find every window of wickets i-n, ..., i (n+1 wickets) that fell for at most max_runs runs.

    :param d_runs: dict keyed by wicket number 0, 1, 2, ...; each value is a tuple whose
        element 0 is the team score at the fall of that wicket and whose elements 1-4 are
        reported as batter, batting position, batter's runs and batter's balls faced.
        Key 0 is the start-of-innings entry and is never treated as a wicket.
    :param n: distance between the first and the last wicket of a window
        (e.g. n=2 and i=5 covers wickets 3, 4, 5).
    :param max_runs: maximum number of runs added between the fall of wicket i-n and the
        fall of wicket i for the window to count as a collapse (default 30).
    :return: list of Collapse namedtuples (start wicket, end wicket, runs added, wickets in
        the window and the per-wicket batter details), ordered by end wicket.
    """
    l_collapses = []

    # Start at n+1: the window ending at wicket n would begin at entry 0,
    # which is the start of the innings rather than a fallen wicket.
    for i in range(n + 1, len(d_runs)):
        # runs added between the fall of the first and the last wicket of the window
        diff = d_runs[i][0] - d_runs[i - n][0]

        if diff <= max_runs:
            l_wickets_lost = list(range(i - n, i + 1))
            entries = [d_runs[s] for s in l_wickets_lost]
            l_collapses.append(
                Collapse(
                    start=i - n,
                    end=i,
                    runs=diff,
                    wickets_lost=l_wickets_lost,
                    batters=[e[1] for e in entries],
                    batters_positon=[e[2] for e in entries],
                    batters_runs=[e[3] for e in entries],
                    batters_bf=[e[4] for e in entries],
                )
            )

    return l_collapses

In [None]:
def check_all_collapses(d_runs):
    """
    Gather collapses for every window size and keep only the maximal ones.

    A collapse whose wickets are all contained in another, different collapse is
    dropped, so that e.g. wickets 4,5,6 and wickets 4,5,6,7 are reported once,
    as the longer collapse.

    :param d_runs: per-wicket dict as expected by check_collapse_n_wickets
    :return: list of the collapses that are not contained in another collapse,
             longest windows first
    """
    # Window sizes n = 9 down to n = 2 are tried, largest first.
    found = []
    for size in range(9, 1, -1):
        found.extend(check_collapse_n_wickets(d_runs, size))

    def is_sub_collapse(candidate):
        # True when some other collapse covers every wicket of this one.
        wickets = set(candidate.wickets_lost)
        return any(
            wickets <= set(other.wickets_lost) and candidate != other
            for other in found
        )

    return [candidate for candidate in found if not is_sub_collapse(candidate)]

In [None]:
def return_collapses(df):
    """
    Build the collapse table for one innings (one groupby group).

    Reads the Runs, Player, BattingPosition, R and BF columns of df, prepends a
    start-of-innings entry (score 0, empty strings for the batter details) and
    returns one row per collapse found by check_all_collapses.
    NOTE(review): assumes the rows of df are in fall-of-wicket order — confirm.
    """
    blank = ""
    columns = (
        [0] + list(df.Runs),
        [blank] + list(df.Player),
        [blank] + list(df.BattingPosition),
        [blank] + list(df.R),
        [blank] + list(df.BF),
    )

    # wicket number -> (team score, batter, batting position, batter runs, batter balls faced)
    d_runs = dict(enumerate(zip(*columns)))

    return pd.DataFrame(check_all_collapses(d_runs))

In [None]:
# Run return_collapses on each (MatchId, MatchInnings, Team) group; the result
# has one row per detected collapse.
df_merged.groupby(['MatchId','MatchInnings', 'Team']).apply(return_collapses)

Questions to answer:
- Number of collapses by Team, by year (unique collapses, innings with a collapse)
- Positions most often involved
- Batters most often involved