In [1]:
# Basics
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.neighbors import DistanceMetric
from scipy.spatial.distance import cosine

# Clean up options
import warnings
# Silence every warning category and show floats with four decimals in rendered frames.
warnings.simplefilter('ignore')
pd.options.display.precision = 4

In [2]:
# Read the ball-by-ball file and work on a copy so the raw frame stays untouched.
deliveres = pd.read_csv("C:/Users/utkar/OneDrive/Desktop/Analytics/Data/IPL Ball-by-Ball 2008-2020.csv")
del_df = deliveres.copy()

# 1/0 flags per delivery: no runs off the bat, and a four or six off the bat.
del_df['isDot'] = (del_df['batsman_runs'] == 0).astype('int64')
del_df['isBoundary'] = del_df['batsman_runs'].isin([4, 6]).astype('int64')

In [3]:
# Peek at the first five deliveries (head() defaults to five rows).
del_df.head()

Unnamed: 0,id,inning,over,ball,batsman,non_striker,bowler,batsman_runs,extra_runs,total_runs,...,is_wicket,dismissal_kind,player_dismissed,fielder,extras_type,batting_team,bowling_team,Unnamed: 18,isDot,isBoundary
0,335982,1,6,5,RT Ponting,BB McCullum,AA Noffke,1,0,1,...,0,,,,,Kolkata Knight Riders,Royal Challengers Bangalore,,0,0
1,335982,1,6,6,BB McCullum,RT Ponting,AA Noffke,1,0,1,...,0,,,,,Kolkata Knight Riders,Royal Challengers Bangalore,,0,0
2,335982,1,7,1,BB McCullum,RT Ponting,Z Khan,0,0,0,...,0,,,,,Kolkata Knight Riders,Royal Challengers Bangalore,,1,0
3,335982,1,7,2,BB McCullum,RT Ponting,Z Khan,1,0,1,...,0,,,,,Kolkata Knight Riders,Royal Challengers Bangalore,,0,0
4,335982,1,7,3,RT Ponting,BB McCullum,Z Khan,1,0,1,...,0,,,,,Kolkata Knight Riders,Royal Challengers Bangalore,,0,0


In [4]:
# Per-batsman summary, indexed by batsman name.
# Group once and reuse the grouped object instead of re-grouping for every metric.
by_batsman = del_df.groupby('batsman')
runs_scored = by_batsman['batsman_runs'].sum()
# NOTE(review): count() tallies every delivery row for the batsman, including any
# wides -- confirm whether those should be excluded from "balls faced".
balls_faced = by_batsman['batsman_runs'].count()

final_df = pd.DataFrame({
    'ballsFaced': balls_faced,
    # Runs per 100 deliveries.
    'BattingStrikeRate': runs_scored / balls_faced * 100,
    # Runs divided by the number of distinct match ids the batsman appears in.
    'BattingRPI': runs_scored / by_batsman['id'].nunique(),
    # Share of deliveries with no runs off the bat, as a percentage.
    'DotBallPercentage': by_batsman['isDot'].sum() / by_batsman['isDot'].count() * 100,
    # Share of deliveries hit for four or six, as a percentage.
    'BoundaryPercentage': by_batsman['isBoundary'].sum() / by_batsman['isBoundary'].count() * 100,
})

In [5]:
# Move the batsman names out of the index into an ordinary 'batsman' column.
final_df = final_df.reset_index()

In [6]:
# Show the last five summary rows (tail() defaults to five rows).
final_df.tail()

Unnamed: 0,batsman,ballsFaced,BattingStrikeRate,BattingRPI,DotBallPercentage,BoundaryPercentage
532,YV Takawale,183,104.918,19.2,54.6448,15.847
533,Yashpal Singh,67,70.1493,11.75,58.209,7.4627
534,Younis Khan,7,42.8571,3.0,57.1429,0.0
535,Yuvraj Singh,2205,124.7166,21.8254,44.263,16.5986
536,Z Khan,141,82.9787,4.3333,52.4823,9.2199


In [7]:
# Keep only batsmen who faced more than MIN_BALLS_FACED deliveries, to drop
# non-regular batsmen. Take an explicit copy so the column assignments made
# later write to an independent frame rather than to a slice of the unfiltered one.
MIN_BALLS_FACED = 500
final_df = final_df[final_df.ballsFaced > MIN_BALLS_FACED].copy()

In [8]:
# Observed range of each metric across the filtered batsmen; these eight names
# are the bounds used for min-max scaling.
min_sr, max_sr = final_df['BattingStrikeRate'].min(), final_df['BattingStrikeRate'].max()
min_rpi, max_rpi = final_df['BattingRPI'].min(), final_df['BattingRPI'].max()
min_dbp, max_dbp = final_df['DotBallPercentage'].min(), final_df['DotBallPercentage'].max()
min_bp, max_bp = final_df['BoundaryPercentage'].min(), final_df['BoundaryPercentage'].max()

# Display the (min, max) pair of every metric.
((min_sr, max_sr), (min_rpi, max_rpi), (min_dbp, max_dbp), (min_bp, max_bp))

((101.73453996983409, 171.9954648526077),
 (7.786666666666667, 37.61904761904762),
 (31.833060556464808, 50.959860383944154),
 (12.103174603174603, 27.05061082024433))

In [9]:
# Min-max scale each metric onto [0, 1] using the bounds computed beforehand:
# scaled = (value - min) / (max - min).
for column, lower, upper in [
    ('BattingStrikeRate', min_sr, max_sr),
    ('BattingRPI', min_rpi, max_rpi),
    ('DotBallPercentage', min_dbp, max_dbp),
    ('BoundaryPercentage', min_bp, max_bp),
]:
    final_df[column] = (final_df[column] - lower) / (upper - lower)

In [10]:
# Reference row - the batsman every other batsman is compared against.
selected_batsman = 'AB de Villiers'

# Keep only the metric columns of the reference batsman (a one-row frame).
reference_df = final_df[final_df.batsman == selected_batsman].drop(labels=['batsman', 'ballsFaced'], axis=1)

# Fail here with a clear message instead of letting an empty reference surface
# later as an obscure error inside the distance computations.
if reference_df.empty:
    raise ValueError(
        f"{selected_batsman!r} is not present in final_df; "
        "check the spelling and the balls-faced filter"
    )

In [11]:
# Feature matrix: one row per batsman, metric columns only.
data = final_df.drop(labels=['batsman', 'ballsFaced'], axis=1)

# scipy's `cosine` works on two 1-D vectors, so flatten the one-row reference frame.
reference_vector = reference_df.to_numpy(dtype=float).ravel()

# `cosine` returns the cosine *distance* (1 - cosine similarity): 0 means the
# same direction as the reference. The values are kept as plain floats -- not
# formatted strings -- so that later max() and arithmetic operate numerically
# and no precision is thrown away.
final_df['CosineSimilarity'] = data.apply(
    lambda row: cosine(row.to_numpy(dtype=float), reference_vector),
    axis=1,
)

In [12]:
# Euclidean distance from each batsman's metric row to the reference row.
dist = DistanceMetric.get_metric('euclidean')
distances_to_reference = dist.pairwise(data, reference_df)
final_df['EuclideanDistanceSimilarity'] = distances_to_reference

In [13]:
# Largest distance in each column, used as the normalisation denominator.
# Convert to float before taking the maximum: the builtin max() over a column
# of numeric strings compares lexicographically, not by value.
maxCosine = float(final_df['CosineSimilarity'].astype(float).max())
maxEuclidean = float(final_df['EuclideanDistanceSimilarity'].astype(float).max())
maxCosine , maxEuclidean

(0.87696, 1.259714754852849)

In [14]:
def _distance_to_score(distance, largest_distance):
    """Map a distance onto a 0-100 score: 0 distance -> 100, the largest distance -> 0."""
    return (1 - (float(distance) / largest_distance)) * 100


# Overwrite both distance columns with their 0-100 similarity scores.
final_df['CosineSimilarity'] = final_df['CosineSimilarity'].map(
    lambda distance: _distance_to_score(distance, maxCosine)
)
final_df['EuclideanDistanceSimilarity'] = final_df['EuclideanDistanceSimilarity'].map(
    lambda distance: _distance_to_score(distance, maxEuclidean)
)

In [15]:
# Ten batsmen with the highest cosine-based score, best first.
final_df.sort_values('CosineSimilarity', ascending=False).iloc[:10]

Unnamed: 0,batsman,ballsFaced,BattingStrikeRate,BattingRPI,DotBallPercentage,BoundaryPercentage,CosineSimilarity,EuclideanDistanceSimilarity
24,AB de Villiers,3264,0.6665,0.7809,0.0032,0.4713,100.0,100.0
388,RR Pant,1416,0.6417,0.7638,0.1597,0.5463,98.6522,86.0196
115,DA Miller,1374,0.4684,0.5444,0.1051,0.2274,98.0854,67.7462
438,SK Raina,4041,0.4427,0.691,0.1625,0.3277,97.1492,74.3762
75,BA Stokes,700,0.4226,0.51,0.1581,0.2511,97.0637,64.0262
238,KS Williamson,1222,0.4377,0.7826,0.0,0.236,96.3659,73.9436
298,MP Stoinis,623,0.4368,0.3974,0.1652,0.2534,95.7227,58.4813
301,MS Dhoni,3493,0.4394,0.5921,0.1692,0.2035,95.7079,65.7153
193,JC Buttler,1184,0.6124,0.747,0.336,0.569,94.7056,71.9997
454,SPD Smith,1858,0.3392,0.6483,0.0691,0.1625,94.4809,62.3936


In [16]:
# Ten batsmen with the highest Euclidean-based score, best first.
final_df.sort_values('EuclideanDistanceSimilarity', ascending=False).iloc[:10]

Unnamed: 0,batsman,ballsFaced,BattingStrikeRate,BattingRPI,DotBallPercentage,BoundaryPercentage,CosineSimilarity,EuclideanDistanceSimilarity
24,AB de Villiers,3264,0.6665,0.7809,0.0032,0.4713,100.0,100.0
388,RR Pant,1416,0.6417,0.7638,0.1597,0.5463,98.6522,86.0196
438,SK Raina,4041,0.4427,0.691,0.1625,0.3277,97.1492,74.3762
238,KS Williamson,1222,0.4377,0.7826,0.0,0.236,96.3659,73.9436
193,JC Buttler,1184,0.6124,0.747,0.336,0.569,94.7056,71.9997
116,DA Warner,3819,0.5101,0.9792,0.292,0.4253,94.1445,69.3237
232,KL Rahul,1990,0.4452,0.9713,0.2116,0.3266,93.3634,69.2976
115,DA Miller,1374,0.4684,0.5444,0.1051,0.2274,98.0854,67.7462
425,SA Yadav,1536,0.4275,0.5279,0.2486,0.4055,94.1902,65.79
301,MS Dhoni,3493,0.4394,0.5921,0.1692,0.2035,95.7079,65.7153
