In [1]:
# Basics
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.neighbors import DistanceMetric
from scipy.spatial.distance import cosine

# Notebook-wide display / noise settings
import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas' SettingWithCopyWarning and library
# deprecation notices — consider narrowing the filter once the notebook is stable.
warnings.simplefilter('ignore')

# Render floats with two digits after the decimal point in DataFrame output.
pd.options.display.precision = 2

In [2]:
# Load the IPL ball-by-ball deliveries file (2008-2020 per its filename).
# NOTE(review): this is an absolute, machine-specific path; point DATA_PATH at
# your local copy (ideally relative to a configurable data directory).
DATA_PATH = "C:/Users/utkar/OneDrive/Desktop/Analytics/Data/IPL Ball-by-Ball 2008-2020.csv"
deliveres = pd.read_csv(DATA_PATH)

# Work on a copy so the raw frame stays untouched by the derived columns below.
del_df = deliveres.copy()

# Per-delivery 0/1 flags, computed with vectorized comparisons instead of a
# Python-level lambda per row (same values, same int64 dtype).
#   isDot      -> 1 when the batsman scored no runs off the delivery
#   isBoundary -> 1 when the batsman scored exactly 4 or 6 off the delivery
del_df['isDot'] = del_df['batsman_runs'].eq(0).astype('int64')
del_df['isBoundary'] = del_df['batsman_runs'].isin([4, 6]).astype('int64')

In [3]:
# Peek at the first five deliveries, including the derived isDot / isBoundary flags.
del_df.head(n=5)

Unnamed: 0,id,inning,over,ball,batsman,non_striker,bowler,batsman_runs,extra_runs,total_runs,...,is_wicket,dismissal_kind,player_dismissed,fielder,extras_type,batting_team,bowling_team,Unnamed: 18,isDot,isBoundary
0,335982,1,6,5,RT Ponting,BB McCullum,AA Noffke,1,0,1,...,0,,,,,Kolkata Knight Riders,Royal Challengers Bangalore,,0,0
1,335982,1,6,6,BB McCullum,RT Ponting,AA Noffke,1,0,1,...,0,,,,,Kolkata Knight Riders,Royal Challengers Bangalore,,0,0
2,335982,1,7,1,BB McCullum,RT Ponting,Z Khan,0,0,0,...,0,,,,,Kolkata Knight Riders,Royal Challengers Bangalore,,1,0
3,335982,1,7,2,BB McCullum,RT Ponting,Z Khan,1,0,1,...,0,,,,,Kolkata Knight Riders,Royal Challengers Bangalore,,0,0
4,335982,1,7,3,RT Ponting,BB McCullum,Z Khan,1,0,1,...,0,,,,,Kolkata Knight Riders,Royal Challengers Bangalore,,0,0


In [4]:
# Per-batsman batting profile, indexed by batsman name.
# Group once and reuse the grouping instead of re-running groupby for every
# metric, and build the frame in a single constructor call rather than filling
# an empty frame column by column.
by_batsman = del_df.groupby('batsman')

runs_scored = by_batsman['batsman_runs'].sum()
# NOTE(review): this counts every delivery row recorded against the batsman;
# if the data includes wides/no-balls they are counted too — confirm that is intended.
balls_faced = by_batsman['batsman_runs'].count()
# Number of distinct `id` values per batsman (presumably one id per match, so
# this approximates innings played — TODO confirm).
innings_played = by_batsman['id'].nunique()

final_df = pd.DataFrame({
    'ballsFaced': balls_faced,
    # Runs per 100 deliveries.
    'BattingStrikeRate': runs_scored / balls_faced * 100,
    # Runs per distinct `id`.
    'BattingRPI': runs_scored / innings_played,
    # Share of deliveries with zero batsman runs, in percent.
    'DotBallPercentage': by_batsman['isDot'].sum() / by_batsman['isDot'].count() * 100,
    # Share of deliveries hit for 4 or 6, in percent.
    'BoundaryPercentage': by_batsman['isBoundary'].sum() / by_batsman['isBoundary'].count() * 100,
})

In [5]:
# Move the 'batsman' group key out of the index into a regular column.
# Guarded and non-inplace so that re-running this cell does not insert a
# stray 'index' column on the second execution.
if 'batsman' not in final_df.columns:
    final_df = final_df.reset_index()

In [6]:
# Sanity-check the aggregated per-batsman metrics (first five rows).
final_df.head(n=5)

Unnamed: 0,batsman,ballsFaced,BattingStrikeRate,BattingRPI,DotBallPercentage,BoundaryPercentage
0,A Ashish Reddy,196,142.86,12.17,31.12,15.82
1,A Chandila,7,57.14,2.0,42.86,0.0
2,A Chopra,75,70.67,8.83,60.0,9.33
3,A Choudhary,20,125.0,8.33,20.0,10.0
4,A Dananjaya,5,80.0,4.0,40.0,0.0


In [7]:
# Keep only regular batsmen: those who have faced more than MIN_BALLS_FACED balls.
MIN_BALLS_FACED = 1000

# .copy() gives an independent frame, so the similarity columns added later are
# written to final_df itself rather than to a slice of the unfiltered frame.
final_df = final_df[final_df['ballsFaced'] > MIN_BALLS_FACED].copy()

In [8]:
# Reference row - Virat Kohli's feature vector, kept as a one-row DataFrame.
# Select by name rather than by np.isclose on BattingRPI: the third positional
# argument of np.isclose is a *relative* tolerance, so a numeric match could
# pick up other batsmen with a similar RPI or miss Kohli if the data changes.
REFERENCE_BATSMAN = 'V Kohli'
FEATURE_COLUMNS = ['BattingStrikeRate', 'BattingRPI', 'DotBallPercentage', 'BoundaryPercentage']

virat_df = final_df.loc[final_df['batsman'] == REFERENCE_BATSMAN, FEATURE_COLUMNS]

# The distance computations assume exactly one reference row; fail loudly otherwise.
if len(virat_df) != 1:
    raise ValueError(
        f"Expected exactly one row for {REFERENCE_BATSMAN!r}, found {len(virat_df)}"
    )

In [9]:
# Feature matrix used for all similarity computations. Columns are selected
# explicitly (instead of dropping 'batsman'/'ballsFaced') so that re-running
# this cell after the similarity columns exist still yields only the features.
FEATURE_COLUMNS = ['BattingStrikeRate', 'BattingRPI', 'DotBallPercentage', 'BoundaryPercentage']
data = final_df[FEATURE_COLUMNS]

# scipy's cosine() is defined for 1-D vectors; virat_df is a one-row frame,
# so flatten it once up front instead of passing the 2-D frame on every call.
if len(virat_df) != 1:
    raise ValueError(f"Expected a single reference row, found {len(virat_df)}")
virat_vector = virat_df[FEATURE_COLUMNS].to_numpy(dtype=float).ravel()

# Cosine distance of every batsman to the reference (0 means same direction).
# NOTE: values are stored as fixed-point *strings* with up to 5 decimals, so
# they display without scientific notation; this makes the column object
# dtype — convert with pd.to_numeric before doing arithmetic on it.
final_df['SimilarToViratCosine'] = data.apply(
    lambda row: np.format_float_positional(cosine(row, virat_vector), precision=5),
    axis=1,
)

In [10]:
# Ten batsmen closest to the reference by cosine distance (smallest first).
final_df.sort_values('SimilarToViratCosine').head(n=10)

Unnamed: 0,batsman,ballsFaced,BattingStrikeRate,BattingRPI,DotBallPercentage,BoundaryPercentage,SimilarToViratCosine
505,V Kohli,4609,127.53,31.95,35.26,15.32,0.0
151,F du Plessis,1828,125.93,29.9,35.83,15.21,0.0001
116,DA Warner,3819,137.58,37.0,37.42,18.46,0.00023
232,KL Rahul,1990,133.02,36.76,35.88,16.98,0.00032
209,JP Duminy,1680,120.77,27.05,33.45,12.2,0.00044
435,SE Marsh,1908,129.82,35.9,38.47,18.03,0.00053
454,SPD Smith,1858,125.57,27.13,33.15,14.53,0.00054
407,S Dhawan,4208,123.5,29.7,38.33,16.63,0.0006
238,KS Williamson,1222,132.49,31.13,31.83,15.63,0.00061
438,SK Raina,4041,132.84,28.4,34.94,17.0,0.00065


In [11]:
# Euclidean distance from every batsman's feature row to the reference row.
# NOTE(review): DistanceMetric is imported from sklearn.neighbors at the top of
# the notebook; newer scikit-learn releases expose it under sklearn.metrics —
# confirm against the installed version.
# NOTE(review): features are used on their raw scales, so the column with the
# largest numeric range contributes most to this distance — confirm that is intended.
dist = DistanceMetric.get_metric(metric='euclidean')

# pairwise(X, Y) yields one row per row of `data` and one column per row of `virat_df`.
distances_to_reference = dist.pairwise(data, virat_df)
final_df['SimilarToViratEuclidean'] = distances_to_reference

In [12]:
# Ten batsmen closest to the reference by Euclidean distance (smallest first).
final_df.sort_values('SimilarToViratEuclidean').head(n=10)

Unnamed: 0,batsman,ballsFaced,BattingStrikeRate,BattingRPI,DotBallPercentage,BoundaryPercentage,SimilarToViratCosine,SimilarToViratEuclidean
505,V Kohli,4609,127.53,31.95,35.26,15.32,0.0,0.0
151,F du Plessis,1828,125.93,29.9,35.83,15.21,0.0001,2.67
379,RG Sharma,4088,127.94,26.96,37.48,16.44,0.00083,5.59
454,SPD Smith,1858,125.57,27.13,33.15,14.53,0.00054,5.67
407,S Dhawan,4208,123.5,29.7,38.33,16.63,0.0006,5.7
238,KS Williamson,1222,132.49,31.13,31.83,15.63,0.00061,6.09
435,SE Marsh,1908,129.82,35.9,38.47,18.03,0.00053,6.21
438,SK Raina,4041,132.84,28.4,34.94,17.0,0.00065,6.61
458,SS Iyer,1802,122.09,28.21,38.35,15.09,0.00076,7.3
232,KL Rahul,1990,133.02,36.76,35.88,16.98,0.00032,7.51
