In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import ipywidgets as widgets

import warnings
warnings.filterwarnings('ignore')
pd.set_option("display.precision", 2)

In [2]:
# Read All IPL Data
deliveres = pd.read_csv("C:/Users/utkar/OneDrive/Desktop/Analytics/Data/IPL Ball-by-Ball 2008-2020.csv")
matches = pd.read_csv("C:/Users/utkar/OneDrive/Desktop/Analytics/Data/IPL Matches 2008-2020.csv")

In [3]:
# Goal 
# Get Best Bowlers against any team x, at venue y, during phase z 
# For example - Who are the best bowlers against 'RCB' at 'Chinnaswamy' during 'PowerPlay' overs
# I've used MCDM with Topsis to arrive at a score for each batsman
# The Criterion considered are Bowler Economy, Average, Strike Rate and DotPercentage
# A random(not really !!) weight is assigned to each of the above for each phase
# We've narrowed the dataset to atleast 2 innings and IPL 2016 onwards

In [4]:
def isBowlersWicket(player_dismissed, dismissal_kind): 
    if(type(player_dismissed) == str): 
        if dismissal_kind in ['caught', 'bowled', 'lbw',
       'stumped', 'caught and bowled', 'hit wicket']:
            return 1
        else: 
            return 0
    else: 
        return 0

In [5]:
# Return match phase, based on over information
def get_phase(over): 
    if (over < 6) : 
        return 'PowerPlay'
    elif (over < 15) : 
        return 'Middle'
    else:
        return 'End'

In [6]:
def getBowlerStats(df, venue, opposition, phase):

    df = df[df.venue  == venue]
    df = df[df.batting_team == opposition]
    df = df[df.phase == phase]
    
    df.reset_index(inplace = True, drop = True)
    
    df['isDot'] = df['total_runs'].apply(lambda x: 1 if x == 0 else 0)
    df['isOne'] = df['total_runs'].apply(lambda x: 1 if x == 1 else 0)
    df['isTwo'] = df['total_runs'].apply(lambda x: 1 if x == 2 else 0)
    df['isThree'] = df['total_runs'].apply(lambda x: 1 if x == 3 else 0)

    df['isFour'] = df['batsman_runs'].apply(lambda x: 1 if x == 4 else 0)
    df['isSix'] = df['batsman_runs'].apply(lambda x: 1 if x == 6 else 0)

    runs = pd.DataFrame(df.groupby('bowler')['total_runs'].sum()).reset_index().rename(columns = {'total_runs' : 'runs'})
    innings = pd.DataFrame(df.groupby('bowler')['id'].nunique()).reset_index().rename(columns = {'id' : 'innings'})
    balls = pd.DataFrame(df.groupby('bowler')['id'].count()).reset_index().rename(columns = {'id' : 'balls'})
    dismissals = pd.DataFrame(df.groupby('bowler')['isBowlerWkt'].sum()).reset_index().rename(columns = {'isBowlerWkt' : 'dismissals'})
    dots = pd.DataFrame(df.groupby('bowler')['isDot'].sum()).reset_index().rename(columns = {'isDot' : 'dots'})
    ones = pd.DataFrame(df.groupby('bowler')['isOne'].sum()).reset_index().rename(columns = {'isOne' : 'ones'})
    twos = pd.DataFrame(df.groupby('bowler')['isTwo'].sum()).reset_index().rename(columns = {'isTwo' : 'twos'})
    threes = pd.DataFrame(df.groupby('bowler')['isThree'].sum()).reset_index().rename(columns = {'isThree' : 'threes'})
    fours = pd.DataFrame(df.groupby('bowler')['isFour'].sum()).reset_index().rename(columns = {'isFour' : 'fours'})
    sixes = pd.DataFrame(df.groupby('bowler')['isSix'].sum()).reset_index().rename(columns = {'isSix' : 'sixes'})

    mega = pd.merge(runs, balls , on = 'bowler').merge(innings, on = 'bowler').merge(dismissals, on = 'bowler').merge(dots, on = 'bowler').merge(ones, on = 'bowler').merge(twos, on = 'bowler').merge(threes, on = 'bowler').merge(fours, on = 'bowler').merge(sixes, on = 'bowler')
    mega['ECO'] = mega.apply(lambda x: 6 * x['runs'] / x['balls'], axis=1)
    mega['AVG'] = mega.apply(lambda x:  x['runs'] if x['dismissals'] == 0 else x['runs'] / x['dismissals'] , axis=1)
    mega['SR'] = mega.apply(lambda x:  x['balls'] if x['dismissals'] == 0 else x['balls'] / x['dismissals'], axis=1)
    mega['DOTPER'] = mega.apply(lambda x:  x['dots'] / x['balls'], axis=1)
    
    return mega


In [7]:
def getWeights(phase): 
    if (phase == 'PowerPlay'): 
        # Giving preference to Economy and Dot Percentage in PowerPlay. 
        # The onus should be on Keeping Runs Dry
        wt_eco, wt_avg, wt_sr, wt_dp = 0.3 , 0.2 , 0.2, 0.3
    elif (phase == 'Middle'): 
        # Giving preference to Average and Strike Rates 
        # Bowlers who can get wickets in middle overs
        wt_eco, wt_avg, wt_sr, wt_dp = 0.2 , 0.3 , 0.3, 0.2
    else:
        # Major preference to Economy and Dot Percentage
        # Average and Strike Rates should have low priority as most bowlers would go for runs at this stage
        wt_eco, wt_avg, wt_sr, wt_dp = 0.4, 0.1, 0.1, 0.4
        
    return (wt_eco, wt_avg, wt_sr, wt_dp)

In [8]:
def getTopsisScore(mydf, phase):
    
    if mydf.empty: 
        print ("Empty DataFrame")
        return mydf
    
    # Square the required fields and store in another column
    mydf['calc_ECO']  = mydf['ECO'].apply(lambda x: x * x)
    mydf['calc_AVG'] = mydf['AVG'].apply(lambda x: x * x)
    mydf['calc_SR']  = mydf['SR'].apply(lambda x: x * x)
    mydf['calc_DP']  = mydf['DOTPER'].apply(lambda x: x * x)

    # Get Weights depending on phase
    wt_eco, wt_avg, wt_sr, wt_dp = getWeights(phase)
    
    # get SQRT of sum of squares
    eco_sqrt , avg_sqrt , sr_sqrt , dp_sqrt = np.sqrt(mydf[['calc_ECO' , 'calc_AVG' , 'calc_SR' , 'calc_DP']].sum(axis=0))

    #divide calculated squares with sum of squares
    mydf['calc_ECO'] = mydf['calc_ECO'].apply(lambda x : x / eco_sqrt)
    mydf['calc_AVG'] = mydf['calc_AVG'].apply(lambda x : x / avg_sqrt)
    mydf['calc_SR'] = mydf['calc_SR'].apply(lambda x : x / sr_sqrt)
    mydf['calc_DP'] = mydf['calc_DP'].apply(lambda x : x / dp_sqrt)

    # multiply each calculated value with it's weight 
    mydf['calc_ECO'] = mydf['calc_ECO'].apply(lambda x : wt_eco * x)
    mydf['calc_AVG'] = mydf['calc_AVG'].apply(lambda x : wt_avg * x)
    mydf['calc_SR'] = mydf['calc_SR'].apply(lambda x : wt_sr * x)
    mydf['calc_DP'] = mydf['calc_DP'].apply(lambda x : wt_dp * x)

    # Get best and worst of each criterion
    best_eco , worst_eco = min(mydf['calc_ECO']) , max(mydf['calc_ECO'])
    best_avg , worst_avg = min(mydf['calc_AVG']) , max(mydf['calc_AVG'])
    best_sr  , worst_sr  = min(mydf['calc_SR']) , max(mydf['calc_SR'])
    best_dp  , worst_dp  = max(mydf['calc_DP']) , min(mydf['calc_DP'])
    
    # Deviation from best - Square of difference 
    mydf['dev_best_eco']  = mydf['calc_ECO'].apply(lambda x : (x-best_eco) * (x-best_eco))
    mydf['dev_best_avg']  = mydf['calc_AVG'].apply(lambda x : (x-best_avg) * (x-best_avg))
    mydf['dev_best_sr']   = mydf['calc_SR'].apply(lambda x : (x-best_sr) * (x-best_sr))
    mydf['dev_best_dp']   = mydf['calc_DP'].apply(lambda x : (x-best_dp) * (x-best_dp))

    # SquareRoot of sum of all deviations from best
    mydf['dev_best_sqrt'] = np.sqrt(mydf.apply(lambda x: x['dev_best_eco'] + x['dev_best_avg'] + x['dev_best_sr'] + x['dev_best_dp'] , axis=1))
    
    # Deviation from Worst - Square of difference 
    mydf['dev_worst_eco']  = mydf['calc_ECO'].apply(lambda x : (x-worst_eco) * (x-worst_eco))
    mydf['dev_worst_avg']  = mydf['calc_AVG'].apply(lambda x : (x-worst_avg) * (x-worst_avg))
    mydf['dev_worst_sr']   = mydf['calc_SR'].apply(lambda x : (x-worst_sr) * (x-worst_sr))
    mydf['dev_worst_dp']   = mydf['calc_DP'].apply(lambda x : (x-worst_dp) * (x-worst_dp))

    # SquareRoot of sum of all deviations from worst
    mydf['dev_worst_sqrt'] = np.sqrt(mydf.apply(lambda x: x['dev_worst_eco'] + x['dev_worst_avg'] + x['dev_worst_sr'] + x['dev_worst_dp'] , axis=1))
    
    # Calculate Score - worst / (worst + best) .. shows how far from worst are you (i.e - closer to best)
    mydf['score'] = mydf.apply(lambda x : x['dev_worst_sqrt'] / (x['dev_worst_sqrt'] + x['dev_best_sqrt']) , axis=1)
    
    return mydf

In [9]:
# Main - Start

# Create Copies
del_df = deliveres.copy()
match_df = matches.copy()

In [10]:
# Merge two dataframes
comb = pd.merge(del_df , match_df, on = 'id', how = 'left') 

# Get Data from IPL 2016 onwards
comb = comb[(comb.id > 980901)]

In [11]:
# Add Match Phase and isBowlerWkt
comb['phase'] = comb['over'].apply(lambda x : get_phase(x))
comb['isBowlerWkt'] = comb.apply(lambda x: isBowlersWicket(x['player_dismissed'] , x['dismissal_kind']) , axis=1)

In [12]:
# For Dropdown
venue_list = comb['venue'].unique().tolist()
team_list = comb['batting_team'].unique().tolist()

venue_drop = widgets.Dropdown(options = venue_list , description='Venue' , value='M Chinnaswamy Stadium')
team_drop = widgets.Dropdown(options = team_list , description='Batting Team' , value='Royal Challengers Bangalore')
phase_drop = widgets.Dropdown(options = ['PowerPlay' , 'Middle' , 'End'] , description='Match Phase', value='PowerPlay')

In [13]:
def on_change_venue(change):
    if change['type'] == 'change' and change['name'] == 'value':
        print("Venue changed to %s" % change['new'])
        
def on_change_team(change):
    if change['type'] == 'change' and change['name'] == 'value':
        print("Batting Team changed to %s" % change['new'])
        
def on_change_phase(change):
    if change['type'] == 'change' and change['name'] == 'value':
        print("Phase changed to %s" % change['new'])

In [14]:
venue_drop.observe(on_change_venue)
display(venue_drop)

team_drop.observe(on_change_team)
display(team_drop)

phase_drop.observe(on_change_phase)
display(phase_drop)

Dropdown(description='Venue', index=2, options=('Eden Gardens', 'Punjab Cricket Association IS Bindra Stadium,…

Dropdown(description='Batting Team', index=4, options=('Kolkata Knight Riders', 'Delhi Daredevils', 'Kings XI …

Dropdown(description='Match Phase', options=('PowerPlay', 'Middle', 'End'), value='PowerPlay')

In [15]:
print ("Venue = " , venue_drop.value)
print ("Batting Team = ", team_drop.value)
print ("Match Phase = " , phase_drop.value)

Venue =  M Chinnaswamy Stadium
Batting Team =  Royal Challengers Bangalore
Match Phase =  PowerPlay


In [16]:
# Inputs
venue = venue_drop.value
bowlingTeam = team_drop.value
phase = phase_drop.value

In [17]:
# Get per Bowler DF for stadium, oppostion, phase data
mydf = getBowlerStats(comb , venue , bowlingTeam , phase)

In [18]:
# Get Medians for filter
mydf.balls.median() , mydf.innings.median()

(13.0, 1.0)

In [19]:
# Atleast median balls and innings
mydf = mydf[(mydf.balls >= (mydf.balls.median())) & (mydf.innings >= (mydf.innings.median()))]

In [20]:
# debug 
mydf.head()

Unnamed: 0,bowler,runs,balls,innings,dismissals,dots,ones,twos,threes,fours,sixes,ECO,AVG,SR,DOTPER
0,A Nehra,20,13,1,0,7,2,0,0,3,1,9.23,20.0,13.0,0.54
1,AB Dinda,19,18,1,0,6,7,4,0,1,0,6.33,19.0,18.0,0.33
2,AD Russell,11,19,1,0,9,9,1,0,0,0,3.47,11.0,19.0,0.47
5,AS Rajpoot,34,20,1,0,6,7,1,1,4,1,10.2,34.0,20.0,0.3
6,B Kumar,39,39,3,3,20,13,0,0,5,1,6.0,13.0,13.0,0.51


In [21]:
final_df = getTopsisScore(mydf,  phase)

In [22]:
final_df[['bowler','runs', 'balls', 'innings' , 'dismissals' , 'ECO', 'AVG' , 'SR' , 'DOTPER', 'score']].sort_values(by = 'score', ascending = False).reset_index(drop=True)

Unnamed: 0,bowler,runs,balls,innings,dismissals,ECO,AVG,SR,DOTPER,score
0,UT Yadav,15,19,1,2,4.74,7.5,9.5,0.58,0.98
1,DS Kulkarni,34,36,3,5,5.67,6.8,7.2,0.56,0.97
2,NB Singh,10,13,1,0,4.62,10.0,13.0,0.46,0.95
3,Sandeep Sharma,44,42,3,4,6.29,11.0,10.5,0.6,0.95
4,B Kumar,39,39,3,3,6.0,13.0,13.0,0.51,0.93
5,GJ Maxwell,13,13,1,1,6.0,13.0,13.0,0.46,0.93
6,S Nadeem,15,13,2,0,6.92,15.0,13.0,0.54,0.91
7,Rashid Khan,15,13,2,1,6.92,15.0,13.0,0.46,0.91
8,PJ Cummins,16,13,1,1,7.38,16.0,13.0,0.46,0.9
9,Mustafizur Rahman,14,18,2,0,4.67,14.0,18.0,0.5,0.88
