In [1]:
# Basics
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# DistanceMetric moved to sklearn.metrics in scikit-learn 1.1; the
# sklearn.neighbors alias is deprecated. Prefer the new location and fall
# back to the old one so the notebook runs on both old and new releases.
try:
    from sklearn.metrics import DistanceMetric
except ImportError:
    from sklearn.neighbors import DistanceMetric
from scipy.spatial.distance import cosine

# Notebook-wide display / logging options
import warnings

# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')
# Render floats in DataFrame output with two decimal places.
pd.options.display.precision = 2

In [2]:
# Location of the ball-by-ball CSV. Kept in one constant so the notebook can
# be pointed at a different copy of the data by editing a single line.
DELIVERIES_CSV = "C:/Users/utkar/OneDrive/Desktop/Analytics/Data/IPL Ball-by-Ball 2008-2020.csv"

# Load the raw deliveries, then work on a copy so the raw frame stays intact.
deliveres = pd.read_csv(DELIVERIES_CSV)
del_df = deliveres.copy()

# Per-delivery 1/0 indicator columns, computed with vectorised comparisons
# rather than a Python-level lambda per row.
# isDot:      the batsman scored no runs off the bat on this delivery.
# isBoundary: the batsman scored exactly 4 or 6 off the bat on this delivery.
del_df['isDot'] = del_df['batsman_runs'].eq(0).astype('int64')
del_df['isBoundary'] = del_df['batsman_runs'].isin([4, 6]).astype('int64')

In [3]:
# Preview the first five rows of the working frame.
del_df.head()

Unnamed: 0,id,inning,over,ball,batsman,non_striker,bowler,batsman_runs,extra_runs,total_runs,...,is_wicket,dismissal_kind,player_dismissed,fielder,extras_type,batting_team,bowling_team,Unnamed: 18,isDot,isBoundary
0,335982,1,6,5,RT Ponting,BB McCullum,AA Noffke,1,0,1,...,0,,,,,Kolkata Knight Riders,Royal Challengers Bangalore,,0,0
1,335982,1,6,6,BB McCullum,RT Ponting,AA Noffke,1,0,1,...,0,,,,,Kolkata Knight Riders,Royal Challengers Bangalore,,0,0
2,335982,1,7,1,BB McCullum,RT Ponting,Z Khan,0,0,0,...,0,,,,,Kolkata Knight Riders,Royal Challengers Bangalore,,1,0
3,335982,1,7,2,BB McCullum,RT Ponting,Z Khan,1,0,1,...,0,,,,,Kolkata Knight Riders,Royal Challengers Bangalore,,0,0
4,335982,1,7,3,RT Ponting,BB McCullum,Z Khan,1,0,1,...,0,,,,,Kolkata Knight Riders,Royal Challengers Bangalore,,0,0


In [4]:
# Aggregate the ball-by-ball data into one row of batting metrics per batsman.
# The groupby is built once and reused instead of being recomputed per metric.
per_batsman = del_df.groupby('batsman')

# NOTE(review): every row recorded against a batsman is counted as a ball
# faced, including any deliveries that were extras - confirm whether rows
# such as wides should be excluded before computing rates.
balls_faced = per_batsman['batsman_runs'].count()
runs_scored = per_batsman['batsman_runs'].sum()
matches_batted = per_batsman['id'].nunique()  # distinct match ids per batsman

final_df = pd.DataFrame({
    'ballsFaced': balls_faced,
    # Runs per 100 balls faced.
    'BattingStrikeRate': runs_scored / balls_faced * 100,
    # Runs per distinct match id in which the batsman appears.
    'BattingRPI': runs_scored / matches_batted,
    # Share of deliveries (in percent) flagged as dot balls / boundaries.
    'DotBallPercentage': per_batsman['isDot'].sum() / per_batsman['isDot'].count() * 100,
    'BoundaryPercentage': per_batsman['isBoundary'].sum() / per_batsman['isBoundary'].count() * 100,
})

In [5]:
# Promote the 'batsman' index to a regular column. The guard makes the cell
# safe to re-run: a second reset_index() would otherwise insert a stray
# 'index' column.
if 'batsman' not in final_df.columns:
    final_df = final_df.reset_index()

In [6]:
# Show the last five rows of the per-batsman metrics table.
final_df.tail()

Unnamed: 0,batsman,ballsFaced,BattingStrikeRate,BattingRPI,DotBallPercentage,BoundaryPercentage
532,YV Takawale,183,104.92,19.2,54.64,15.85
533,Yashpal Singh,67,70.15,11.75,58.21,7.46
534,Younis Khan,7,42.86,3.0,57.14,0.0
535,Yuvraj Singh,2205,124.72,21.83,44.26,16.6
536,Z Khan,141,82.98,4.33,52.48,9.22


In [7]:
# Keep only regular batsmen: those who have faced more than MIN_BALLS_FACED balls.
MIN_BALLS_FACED = 500
# .copy() yields an independent frame, so the columns added in later cells
# are not written into a slice of the unfiltered table.
final_df = final_df[final_df.ballsFaced > MIN_BALLS_FACED].copy()

In [8]:
# Reference row - the batsman every other player is compared against.
selected_batsman = 'AB de Villiers'

reference_rows = final_df[final_df.batsman == selected_batsman]
if reference_rows.empty:
    # Fail here with a clear message instead of letting the distance
    # computations break on an empty reference further down.
    raise ValueError(
        f"{selected_batsman!r} is not present in final_df "
        "(misspelt, or removed by the balls-faced filter)"
    )

# NOTE: despite its name, virat_df holds the metric columns of
# `selected_batsman`; the name is kept because later cells refer to it.
virat_df = reference_rows.drop(columns=['batsman', 'ballsFaced'])

In [9]:
# Feature matrix: every column except the player name and the sample size.
data = final_df.drop(columns=['batsman', 'ballsFaced'])

# scipy's cosine() is defined for two 1-D vectors, so flatten the single-row
# reference frame before comparing.
reference_vector = virat_df.to_numpy(dtype=float).ravel()

# Cosine *distance* from each batsman to the reference. The values are kept
# as floats (not formatted strings) so the column stays numeric and is not
# rounded before it is normalised; a later cell rescales it into a
# similarity percentage.
final_df['CosineSimilarity'] = data.apply(
    lambda row: cosine(row, reference_vector), axis=1
)

In [10]:
# Euclidean distance from every batsman's feature row to the reference row.
euclidean_metric = DistanceMetric.get_metric('euclidean')
distance_to_reference = euclidean_metric.pairwise(data, virat_df)
# Raw distances are stored here; they are rescaled in a later cell.
final_df['EuclideanDistanceSimilarity'] = distance_to_reference

In [11]:
# Largest raw distance in each column; used as the normalisation denominator.
# Values are coerced to float before taking the maximum so the comparison is
# numeric even if the column holds text (builtin max() on strings compares
# lexicographically), and Series.max() ignores missing values.
maxCosine = float(final_df['CosineSimilarity'].astype(float).max())
maxEuclidean = float(final_df['EuclideanDistanceSimilarity'].astype(float).max())
maxCosine , maxEuclidean

(0.0282, 50.97627793586094)

In [12]:
def _to_similarity_pct(distance, largest):
    """Rescale a raw distance to a 0-100 score (0 distance -> 100, `largest` -> 0)."""
    return (1 - (float(distance) / largest)) * 100


# Overwrite each raw-distance column with its similarity percentage.
final_df['CosineSimilarity'] = final_df['CosineSimilarity'].apply(
    _to_similarity_pct, args=(maxCosine,)
)
final_df['EuclideanDistanceSimilarity'] = final_df['EuclideanDistanceSimilarity'].apply(
    _to_similarity_pct, args=(maxEuclidean,)
)

In [13]:
# Ten batsmen with the highest cosine-based similarity score.
ranked_by_cosine = final_df.sort_values('CosineSimilarity', ascending=False)
ranked_by_cosine.head(10)

Unnamed: 0,batsman,ballsFaced,BattingStrikeRate,BattingRPI,DotBallPercentage,BoundaryPercentage,CosineSimilarity,EuclideanDistanceSimilarity
24,AB de Villiers,3264,148.56,31.08,31.89,19.15,100.0,100.0
388,RR Pant,1416,146.82,30.57,34.89,20.27,99.08,92.79
238,KS Williamson,1222,132.49,31.13,31.83,15.63,97.87,67.72
438,SK Raina,4041,132.84,28.4,34.94,17.0,96.49,67.87
193,JC Buttler,1184,144.76,30.07,38.26,20.61,96.06,85.05
115,DA Miller,1374,134.64,24.03,33.84,15.5,96.03,68.33
454,SPD Smith,1858,125.57,27.13,33.15,14.53,95.99,53.28
301,MS Dhoni,3493,132.61,25.45,35.07,15.14,95.28,65.33
75,BA Stokes,700,131.43,23.0,34.86,15.86,93.97,61.84
463,SV Samson,1970,131.17,25.33,36.04,15.53,93.83,62.48


In [14]:
# Ten batsmen with the highest Euclidean-based similarity score.
ranked_by_euclidean = final_df.sort_values('EuclideanDistanceSimilarity', ascending=False)
ranked_by_euclidean.head(10)

Unnamed: 0,batsman,ballsFaced,BattingStrikeRate,BattingRPI,DotBallPercentage,BoundaryPercentage,CosineSimilarity,EuclideanDistanceSimilarity
24,AB de Villiers,3264,148.56,31.08,31.89,19.15,100.0,100.0
388,RR Pant,1416,146.82,30.57,34.89,20.27,99.08,92.79
193,JC Buttler,1184,144.76,30.07,38.26,20.61,96.06,85.05
507,V Sehwag,1833,148.83,26.23,40.21,24.0,91.91,78.84
220,KA Pollard,2107,143.47,20.99,38.59,18.7,88.97,74.22
116,DA Warner,3819,137.58,37.0,37.42,18.46,90.43,73.2
168,HH Pandya,897,150.39,18.23,36.68,19.96,86.17,72.81
161,GJ Maxwell,1013,148.57,19.29,39.29,20.73,85.74,72.52
206,JM Bairstow,576,137.15,37.62,40.1,19.1,85.74,69.59
530,YK Pathan,2330,137.51,20.94,37.42,18.07,89.57,68.57
