In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import ipywidgets as widgets

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read All IPL Data
deliveres = pd.read_csv("C:/Users/utkar/OneDrive/Desktop/Analytics/Data/IPL Ball-by-Ball 2008-2020.csv")
matches = pd.read_csv("C:/Users/utkar/OneDrive/Desktop/Analytics/Data/IPL Matches 2008-2020.csv")

In [3]:
# Goal 
# Get Best Batsman against any team x, at venue y, during phase z 
# For example - Who are the best batsmen against 'Mumbai Indians' at 'Wankhede Stadium' during 'End' overs
# I've used MCDM with Topsis to arrive at a score for each batsman
# The Criterion considered are Strike Rate, Balls Per Dismissal, Runs Per Innings and Dot Percentage
# A random(not really !!) weight is assigned to each of the above. 
# We've narrowed the dataset to atleast 2 innings and IPL 2016 onwards

#Edit - Added Dropdown widget to make making choices simpler and remove typing mistakes

In [4]:
# Return match phase, based on over information
def get_phase(over): 
    if (over < 6) : 
        return 'PowerPlay'
    elif (over < 15) : 
        return 'Middle'
    else:
        return 'End'

In [5]:
# Balls per dismissal
def get_BPD(balls, dismissals): 
    if (dismissals == 0): 
        return balls/1
    else: 
        return balls/dismissals

In [6]:
# Balls per boundary
def get_BPB(balls, boundries): 
    if (boundries == 0): 
        return balls/1
    else: 
        return balls/boundries

In [7]:
# Create a mega DataFrame for all batsmen, 
# adding all extra information about RPI, SR, BPB, BPD and DP
# inputs are venue, opposition and match phase
def getCustom(df, venue, opposition, phase): 
    
    df = df[df.venue  == venue]
    df = df[df.bowling_team == opposition]
    df = df[df.phase == phase]
    
    df.reset_index(inplace = True, drop = True)
    
    df['isDot']   = df['batsman_runs'].apply(lambda x: 1 if x == 0 else 0)
    df['isOne']   = df['batsman_runs'].apply(lambda x: 1 if x == 1 else 0)
    df['isTwo']   = df['batsman_runs'].apply(lambda x: 1 if x == 2 else 0)
    df['isThree'] = df['batsman_runs'].apply(lambda x: 1 if x == 3 else 0)
    df['isFour']  = df['batsman_runs'].apply(lambda x: 1 if x == 4 else 0)
    df['isSix']   = df['batsman_runs'].apply(lambda x: 1 if x == 6 else 0)

    runs = df.groupby('batsman').batsman_runs.sum().reset_index().rename(columns = {'batsman_runs' : 'runs'})
    balls = df.groupby('batsman').ball.count().reset_index().rename(columns = {'ball' : 'balls'})
    innings = df.groupby('batsman').id.nunique().reset_index().rename(columns = {'id' : 'innings'})
    dismissals = df.groupby('batsman').player_dismissed.count().reset_index().rename(columns = {'player_dismissed' : 'dismissals'})
    sixes = df.groupby('batsman').isSix.sum().reset_index().rename(columns = {'isSix' : 'sixes'}) 
    fours = df.groupby('batsman').isFour.sum().reset_index().rename(columns = {'isFour' : 'fours'})
    dots = df.groupby('batsman').isDot.sum().reset_index().rename(columns = {'isDot' : 'dots'})
   
    mega = pd.merge(runs, balls, on = 'batsman', how = 'left').merge(innings, on = 'batsman').merge(dismissals, on = 'batsman').merge(fours, on = 'batsman').merge(sixes , on = 'batsman').merge(dots, on = 'batsman')
    mega['RPI'] = mega['runs'] / mega['innings']
    mega['SR'] = 100 * (mega['runs'] / mega['balls'])
    mega['BPB'] = mega.apply(lambda x: get_BPB(x['balls'], x['fours'] + x['sixes']) , axis=1)
    mega['BPD'] = mega.apply(lambda x: get_BPD(x['balls'], x['dismissals']) , axis=1)
    mega['DP'] = mega.apply(lambda x: 100 * (x['dots'] / x['balls']) , axis=1)
    
    return mega

In [8]:
def getWeights(phase): 
    if (phase == 'PowerPlay'): 
        # Giving preference to Strike Rate and Dot Percentage in PowerPlay. 
        # The onus should be on scoring lots of runs, even at the cost of wickets
        wt_sr, wt_rpi, wt_bpd, wt_dp = 0.3 , 0.3 , 0.2, 0.2
    elif (phase == 'Middle'): 
        # Giving preference to RPI and BPD during middle 
        # Players who can bat long and score runs, compared to heavy strikers
        wt_sr, wt_rpi, wt_bpd, wt_dp = 0.2 , 0.3 , 0.3, 0.2
    else:
        # Major preference to Strike Rate and Dot Percentage
        # RPI and BPD are immaterial as most teams would go for runs at this stage
        wt_sr, wt_rpi, wt_bpd, wt_dp = 0.5, 0.1, 0.1, 0.3
        
    return (wt_sr, wt_rpi, wt_bpd, wt_dp)

In [9]:
def getTopsisScore(mydf, phase):
    
    if mydf.empty: 
        print ("Empty DataFrame")
        return mydf
    
    # Square the required fields and store in another column
    mydf['calc_SR']  = mydf['SR'].apply(lambda x: x * x)
    mydf['calc_RPI'] = mydf['RPI'].apply(lambda x: x * x)
    mydf['calc_BPD']  = mydf['BPD'].apply(lambda x: x * x)
    mydf['calc_DP']  = mydf['DP'].apply(lambda x: x * x)

    # Get Weights depending on phase
    wt_sr, wt_rpi, wt_bpd, wt_dp = getWeights(phase)
    
    # get SQRT of sum of squares
    sr_sqrt , rpi_sqrt , bpd_sqrt , dp_sqrt = np.sqrt(mydf[['calc_SR' , 'calc_RPI' , 'calc_BPD' , 'calc_DP']].sum(axis=0))

    #divide calculated squares with sum of squares
    mydf['calc_SR'] = mydf['calc_SR'].apply(lambda x : x / sr_sqrt)
    mydf['calc_RPI'] = mydf['calc_RPI'].apply(lambda x : x / rpi_sqrt)
    mydf['calc_BPD'] = mydf['calc_BPD'].apply(lambda x : x / bpd_sqrt)
    mydf['calc_DP'] = mydf['calc_DP'].apply(lambda x : x / dp_sqrt)

    # multiply each calculated value with it's weight 
    mydf['calc_SR'] = mydf['calc_SR'].apply(lambda x : wt_sr * x)
    mydf['calc_RPI'] = mydf['calc_RPI'].apply(lambda x : wt_rpi * x)
    mydf['calc_BPD'] = mydf['calc_BPD'].apply(lambda x : wt_bpd * x)
    mydf['calc_DP'] = mydf['calc_DP'].apply(lambda x : wt_dp * x)

    # Get best and worst of each criterion
    best_sr , worst_sr   = max(mydf['calc_SR']) , min(mydf['calc_SR'])
    best_rpi , worst_rpi = max(mydf['calc_RPI']) , min(mydf['calc_RPI'])
    best_bpd , worst_bpd = max(mydf['calc_BPD']) , min(mydf['calc_BPD'])
    best_dp , worst_dp   = min(mydf['calc_DP']) , max(mydf['calc_DP'])
    
    # Deviation from best - Square of difference 
    mydf['dev_best_sr']   = mydf['calc_SR'].apply(lambda x : (x-best_sr) * (x-best_sr))
    mydf['dev_best_rpi']  = mydf['calc_RPI'].apply(lambda x : (x-best_rpi) * (x-best_rpi))
    mydf['dev_best_bpd']  = mydf['calc_BPD'].apply(lambda x : (x-best_bpd) * (x-best_bpd))
    mydf['dev_best_dp']   = mydf['calc_DP'].apply(lambda x : (x-best_dp) * (x-best_dp))

    # SquareRoot of sum of all deviations from best
    mydf['dev_best_sqrt'] = np.sqrt(mydf.apply(lambda x: x['dev_best_sr'] + x['dev_best_rpi'] + x['dev_best_bpd'] + x['dev_best_dp'] , axis=1))
    
    # Deviation from Worst - Square of difference 
    mydf['dev_worst_sr']   = mydf['calc_SR'].apply(lambda x : (x-worst_sr) * (x-worst_sr))
    mydf['dev_worst_rpi']  = mydf['calc_RPI'].apply(lambda x : (x-worst_rpi) * (x-worst_rpi))
    mydf['dev_worst_bpd']  = mydf['calc_BPD'].apply(lambda x : (x-worst_bpd) * (x-worst_bpd))
    mydf['dev_worst_dp']   = mydf['calc_DP'].apply(lambda x : (x-worst_dp) * (x-worst_dp))

    # SquareRoot of sum of all deviations from worst
    mydf['dev_worst_sqrt'] = np.sqrt(mydf.apply(lambda x: x['dev_worst_sr'] + x['dev_worst_rpi'] + x['dev_worst_bpd'] + x['dev_worst_dp'] , axis=1))
    
    # Calculate Score - worst / (worst + best) .. shows how far from worst are you (i.e - closer to best)
    mydf['score'] = mydf.apply(lambda x : x['dev_worst_sqrt'] / (x['dev_worst_sqrt'] + x['dev_best_sqrt']) , axis=1)
    
    return mydf

In [10]:
# Main - Start

# Create Copies
del_df = deliveres.copy()
match_df = matches.copy()

In [11]:
# Merge two dataframes
comb = pd.merge(del_df , match_df, on = 'id', how = 'left') 

# Get Data from IPL 2016 onwards
comb = comb[(comb.id > 980901)]

In [12]:
# Add Match Phase
comb['phase'] = comb['over'].apply(lambda x : get_phase(x))

In [13]:
venue_list = comb['venue'].unique().tolist()
team_list = comb['bowling_team'].unique().tolist()

In [14]:
venue_drop = widgets.Dropdown(options = venue_list , description='Venue' , value='Wankhede Stadium')
team_drop = widgets.Dropdown(options = team_list , description='Bowling Team' , value='Mumbai Indians')
phase_drop = widgets.Dropdown(options = ['PowerPlay' , 'Middle' , 'End'] , description='Match Phase', value='End')

In [15]:
def on_change_venue(change):
    if change['type'] == 'change' and change['name'] == 'value':
        print("Venue changed to %s" % change['new'])
        
def on_change_team(change):
    if change['type'] == 'change' and change['name'] == 'value':
        print("Bowling Team changed to %s" % change['new'])
        
def on_change_phase(change):
    if change['type'] == 'change' and change['name'] == 'value':
        print("Phase changed to %s" % change['new'])

In [16]:
venue_drop.observe(on_change_venue)
display(venue_drop)

team_drop.observe(on_change_team)
display(team_drop)

phase_drop.observe(on_change_phase)
display(phase_drop)

Dropdown(description='Venue', index=6, options=('Eden Gardens', 'Punjab Cricket Association IS Bindra Stadium,…

Dropdown(description='Bowling Team', index=6, options=('Delhi Daredevils', 'Kolkata Knight Riders', 'Gujarat L…

Dropdown(description='Match Phase', index=2, options=('PowerPlay', 'Middle', 'End'), value='End')

In [17]:
print ("Venue = " , venue_drop.value)
print ("Bowling Team = ", team_drop.value)
print ("Match Phase = " , phase_drop.value)

Venue =  Wankhede Stadium
Bowling Team =  Mumbai Indians
Match Phase =  End


In [18]:
# Inputs
venue = venue_drop.value
bowlingTeam = team_drop.value
phase = phase_drop.value

In [19]:
# Get per batsman DF for stadium, oppostion, phase data
mydf = getCustom(comb , venue , bowlingTeam , phase)

In [20]:
# Atleast 2 innings
mydf = mydf[(mydf.innings > 1)]

In [21]:
final_df = getTopsisScore(mydf,  phase)

In [22]:
# debug 
print ("Venue = " , venue_drop.value)
print ("Bowling Team = ", team_drop.value)
print ("Match Phase = " , phase_drop.value)

final_df[['batsman','runs' , 'SR' , 'RPI' , 'BPD' , 'DP' , 'innings', 'score']].sort_values(by = 'score', ascending = False).reset_index(drop=True)

Venue =  Wankhede Stadium
Bowling Team =  Mumbai Indians
Match Phase =  End


Unnamed: 0,batsman,runs,SR,RPI,BPD,DP,innings,score
0,KL Rahul,76,230.30303,38.0,33.0,18.181818,2,0.979742
1,MK Pandey,72,225.0,36.0,32.0,15.625,2,0.949879
2,DJ Bravo,56,215.384615,28.0,13.0,38.461538,2,0.858808
3,SV Samson,29,181.25,14.5,8.0,37.5,2,0.618304
4,JJ Roy,41,178.26087,20.5,23.0,30.434783,2,0.603393
5,AJ Finch,32,177.777778,16.0,18.0,16.666667,2,0.60325
6,MK Tiwary,48,177.777778,16.0,27.0,25.925926,3,0.601895
7,P Negi,33,165.0,16.5,10.0,35.0,2,0.518664
8,KD Karthik,64,152.380952,32.0,42.0,33.333333,2,0.452405
9,YK Pathan,29,152.631579,14.5,19.0,42.105263,2,0.444426
