In [1]:
import numpy as np # for Linear algebra
import pandas as pd # for data manipulation/CSV I/O
pd.options.mode.chained_assignment = None    # To avoid the SettingWithCopyWarning

In [2]:
deliveries=pd.read_csv("dataset/deliveries.csv")
matches =pd.read_csv("dataset/matches.csv")

In [3]:
deliveries.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batsman,non_striker,bowler,is_super_over,...,bye_runs,legbye_runs,noball_runs,penalty_runs,batsman_runs,extra_runs,total_runs,player_dismissed,dismissal_kind,fielder
0,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,1,DA Warner,S Dhawan,TS Mills,0,...,0,0,0,0,0,0,0,,,
1,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,2,DA Warner,S Dhawan,TS Mills,0,...,0,0,0,0,0,0,0,,,
2,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,3,DA Warner,S Dhawan,TS Mills,0,...,0,0,0,0,4,0,4,,,
3,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,4,DA Warner,S Dhawan,TS Mills,0,...,0,0,0,0,0,0,0,,,
4,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,5,DA Warner,S Dhawan,TS Mills,0,...,0,0,0,0,0,2,2,,,


In [4]:
print(deliveries.columns)

Index(['match_id', 'inning', 'batting_team', 'bowling_team', 'over', 'ball',
       'batsman', 'non_striker', 'bowler', 'is_super_over', 'wide_runs',
       'bye_runs', 'legbye_runs', 'noball_runs', 'penalty_runs',
       'batsman_runs', 'extra_runs', 'total_runs', 'player_dismissed',
       'dismissal_kind', 'fielder'],
      dtype='object')


In [5]:
column_heads = list(set(list(deliveries.columns)).difference(set(['inning', 'non_striker', 'is_super_over', 'legbye_runs', 'bye_runs'])))

column_heads

['total_runs',
 'wide_runs',
 'batting_team',
 'noball_runs',
 'player_dismissed',
 'extra_runs',
 'ball',
 'dismissal_kind',
 'bowler',
 'penalty_runs',
 'match_id',
 'over',
 'batsman_runs',
 'bowling_team',
 'fielder',
 'batsman']

In [6]:
deliveries_bowlers = deliveries[column_heads]
deliveries_bowlers.head()

Unnamed: 0,total_runs,wide_runs,batting_team,noball_runs,player_dismissed,extra_runs,ball,dismissal_kind,bowler,penalty_runs,match_id,over,batsman_runs,bowling_team,fielder,batsman
0,0,0,Sunrisers Hyderabad,0,,0,1,,TS Mills,0,1,1,0,Royal Challengers Bangalore,,DA Warner
1,0,0,Sunrisers Hyderabad,0,,0,2,,TS Mills,0,1,1,0,Royal Challengers Bangalore,,DA Warner
2,4,0,Sunrisers Hyderabad,0,,0,3,,TS Mills,0,1,1,4,Royal Challengers Bangalore,,DA Warner
3,0,0,Sunrisers Hyderabad,0,,0,4,,TS Mills,0,1,1,0,Royal Challengers Bangalore,,DA Warner
4,2,2,Sunrisers Hyderabad,0,,2,5,,TS Mills,0,1,1,0,Royal Challengers Bangalore,,DA Warner


In [7]:
deliveries_bowlers['bowler'].nunique()

378

In [8]:
deliveries_bowlers.fillna("0", inplace=True)
deliveries_bowlers["Fours"] = np.where(deliveries_bowlers["batsman_runs"] == 4, 1, 0)
deliveries_bowlers["Sixes"] = np.where(deliveries_bowlers["batsman_runs"] == 6, 1, 0)

deliveries_bowlers['dismissal_kind'].unique()

array(['0', 'caught', 'bowled', 'run out', 'lbw', 'caught and bowled',
       'stumped', 'retired hurt', 'hit wicket', 'obstructing the field'],
      dtype=object)

In [9]:
dismissal_kind = ['caught', 'bowled', 'lbw', 'stumped', 'caught and bowled', 'hit wicket']
deliveries_bowlers["Wickets"] = np.where(deliveries_bowlers['dismissal_kind'].isin(dismissal_kind), 1, 0)

deliveries_bowlers.sample()

Unnamed: 0,total_runs,wide_runs,batting_team,noball_runs,player_dismissed,extra_runs,ball,dismissal_kind,bowler,penalty_runs,match_id,over,batsman_runs,bowling_team,fielder,batsman,Fours,Sixes,Wickets
11321,1,0,Kolkata Knight Riders,0,0,0,5,0,AR Patel,0,48,15,1,Kings XI Punjab,0,CA Lynn,0,0,0


In [10]:
bowlers_info = deliveries_bowlers.groupby(['bowler', 'match_id'], as_index=False).agg(
    {'over': lambda x: x.nunique(), 'total_runs': 'sum', 'Fours': 'sum', 'Sixes': 'sum', 'Wickets': 'sum'}).reset_index()

recent_form = bowlers_info.copy()
bowlers_info.sample()

Unnamed: 0,index,bowler,match_id,over,total_runs,Fours,Sixes,Wickets
1162,1162,BA Stokes,7940,4,29,2,1,2


In [11]:
bowlers_info['3WI'] = np.where(bowlers_info['Wickets']>2, 1, 0)
career_stats = bowlers_info.groupby('bowler', as_index=False).agg(
    {'match_id': 'count', 'total_runs': 'sum', 'over': 'sum', 'Fours': 'sum', 'Sixes': 'sum', 
     'Wickets': 'sum', '3WI': 'sum'}).reset_index()

most_wickets_in_match = bowlers_info.groupby(['bowler']).agg({'Wickets': 'max'}).reset_index()
bowlers_info = pd.merge(bowlers_info, most_wickets_in_match, on=['bowler', 'Wickets'])

least_runs_for_most_wickets = bowlers_info.groupby(['bowler']).agg({'total_runs': 'min'}).reset_index()
bowlers_info = pd.merge(bowlers_info, least_runs_for_most_wickets, on=['bowler', 'total_runs'])

bowlers_info.head()

Unnamed: 0,index,bowler,match_id,over,total_runs,Fours,Sixes,Wickets,3WI
0,8,A Ashish Reddy,376,4,25,1,1,3,1
1,21,A Chandila,365,4,13,1,0,4,1
2,35,A Choudhary,42,4,17,1,0,2,0
3,37,A Dananjaya,7902,4,48,2,4,0,0
4,39,A Flintoff,122,3,11,1,0,1,0


In [12]:
bowlers_info['BBI'] = ( bowlers_info['Wickets'].map(str)) + '/' + (bowlers_info['total_runs'].map(str) )

bowlers_info.head()

Unnamed: 0,index,bowler,match_id,over,total_runs,Fours,Sixes,Wickets,3WI,BBI
0,8,A Ashish Reddy,376,4,25,1,1,3,1,3/25
1,21,A Chandila,365,4,13,1,0,4,1,4/13
2,35,A Choudhary,42,4,17,1,0,2,0,2/17
3,37,A Dananjaya,7902,4,48,2,4,0,0,0/48
4,39,A Flintoff,122,3,11,1,0,1,0,1/11


In [13]:
bowlers_info['bowler'].count()

384

In [14]:
recent_form.head(30)
recent_form = recent_form.groupby('bowler').tail(4)
recent_form.drop(['match_id','Fours','Sixes'],axis=1,inplace=True)
recent_form = recent_form.groupby(['bowler']).agg({'over':'sum' , 'total_runs':'sum', 'Wickets':'sum'}).reset_index()
recent_form['Form_Average'] = np.where(recent_form['total_runs']/recent_form['Wickets'] == np.inf ,
                                  recent_form['total_runs'],
                                  recent_form['total_runs']/recent_form['Wickets'])
recent_form['Form_Econ'] = np.where(recent_form['total_runs']/recent_form['over'] == np.inf,
                                   0,recent_form['total_runs']/recent_form['over'])

recent_form.rename(columns={'bowler':'Bowler','total_runs':'Recent_runs', 'Wickets':'Recent_wickets'},inplace=True)
recent_form.drop(['over'],axis=1,inplace=True)
recent_form.head()

Unnamed: 0,Bowler,Recent_runs,Recent_wickets,Form_Average,Form_Econ
0,A Ashish Reddy,75,3,25.0,9.375
1,A Chandila,99,3,33.0,7.615385
2,A Choudhary,89,4,22.25,6.846154
3,A Dananjaya,48,0,48.0,12.0
4,A Flintoff,106,2,53.0,9.636364


In [15]:
bowlers_info[bowlers_info.duplicated(['bowler'], keep='first')]

Unnamed: 0,index,bowler,match_id,over,total_runs,Fours,Sixes,Wickets,3WI,BBI
161,3410,K Rabada,41,4,28,2,1,2,0,2/28
229,4679,N Rana,7908,2,11,1,0,2,0,2/11
248,5002,PJ Cummins,55,4,22,1,0,2,0,2/22
287,6195,Rashid Khan,7918,4,19,2,0,3,1,3/19
288,6204,Rashid Khan,7952,4,19,2,0,3,1,3/19
316,6605,SE Bond,230,4,24,2,0,2,0,2/24


In [16]:
bowlers_info[bowlers_info.duplicated(['bowler'], keep='last')]

Unnamed: 0,index,bowler,match_id,over,total_runs,Fours,Sixes,Wickets,3WI,BBI
160,3407,K Rabada,31,4,28,3,0,2,0,2/28
228,4678,N Rana,7896,1,11,1,1,2,0,2/11
247,4996,PJ Cummins,24,4,22,1,0,2,0,2/22
286,6176,Rashid Khan,6,4,19,1,0,3,1,3/19
287,6195,Rashid Khan,7918,4,19,2,0,3,1,3/19
315,6601,SE Bond,196,4,24,2,1,2,0,2/24


In [17]:
bowlers_info = bowlers_info.drop_duplicates('bowler', keep='last')
career_stats = pd.merge(career_stats, bowlers_info[['bowler', 'BBI']], on='bowler')

career_stats.head()

Unnamed: 0,index,bowler,match_id,total_runs,over,Fours,Sixes,Wickets,3WI,BBI
0,0,A Ashish Reddy,20,400,45,26,20,18,1,3/25
1,1,A Chandila,12,245,39,17,10,11,1,4/13
2,2,A Choudhary,5,144,17,13,6,5,0,2/17
3,3,A Dananjaya,1,48,4,2,4,0,0,0/48
4,4,A Flintoff,3,106,11,7,7,2,0,1/11


In [18]:
career_stats = career_stats.rename(columns={'bowler': 'Bowler', 'total_runs': 'Runs', 'match_id': 'Matches', 'over': 'Overs'})
career_stats['Economy'] = np.around(career_stats['Runs'] / career_stats['Overs'], 2)
career_stats['Average'] = np.where(career_stats['Wickets'] == 0 , career_stats['Runs'], 
                                   np.around(career_stats['Runs'] / career_stats['Wickets'], 2) )
career_stats['SR'] = np.where(career_stats['Wickets'] == 0 , 0, 
                              np.around((career_stats['Overs']*6) / career_stats['Wickets'], 2))

# Reordering columns for presentation


In [19]:
career_stats = pd.merge(career_stats, recent_form, on='Bowler')

sequence = ['Bowler', 'Matches', 'Wickets', 'Runs', 'Overs' , 'Average' ,'Economy', 'SR', 'BBI',  '3WI', 'Fours', 'Sixes',
           'Recent_runs','Recent_wickets','Form_Average','Form_Econ']

career_stats = career_stats[sequence]
career_stats.head()

Unnamed: 0,Bowler,Matches,Wickets,Runs,Overs,Average,Economy,SR,BBI,3WI,Fours,Sixes,Recent_runs,Recent_wickets,Form_Average,Form_Econ
0,A Ashish Reddy,20,18,400,45,22.22,8.89,15.0,3/25,1,26,20,75,3,25.0,9.375
1,A Chandila,12,11,245,39,22.27,6.28,21.27,4/13,1,17,10,99,3,33.0,7.615385
2,A Choudhary,5,5,144,17,28.8,8.47,20.4,2/17,0,13,6,89,4,22.25,6.846154
3,A Dananjaya,1,0,48,4,48.0,12.0,0.0,0/48,0,2,4,48,0,48.0,12.0
4,A Flintoff,3,2,106,11,53.0,9.64,33.0,1/11,0,7,7,106,2,53.0,9.636364


In [20]:
career_stats.sort_values(by='Wickets', ascending=False).reset_index(drop='T')[:10]


Unnamed: 0,Bowler,Matches,Wickets,Runs,Overs,Average,Economy,SR,BBI,3WI,Fours,Sixes,Recent_runs,Recent_wickets,Form_Average,Form_Econ
0,SL Malinga,110,154,3034,432,19.7,7.02,16.83,5/13,15,277,63,98,8,12.25,6.125
1,A Mishra,136,146,3576,479,24.49,7.47,19.68,5/17,15,204,160,102,6,17.0,6.8
2,PP Chawla,143,139,3742,480,26.92,7.8,20.72,4/21,11,273,150,83,3,27.666667,7.545455
3,DJ Bravo,119,136,3375,395,24.82,8.54,17.43,4/22,10,261,129,174,5,34.8,10.875
4,Harbhajan Singh,146,134,3725,519,27.8,7.18,23.24,5/18,9,268,127,93,1,93.0,13.285714
5,B Kumar,102,120,2779,380,23.16,7.31,19.0,5/20,12,274,71,106,1,106.0,6.625
6,SP Narine,97,112,2569,384,22.94,6.69,20.57,5/20,12,194,79,121,3,40.333333,7.5625
7,UT Yadav,107,111,3249,378,29.27,8.6,20.43,4/24,13,344,97,115,6,19.166667,7.666667
8,R Ashwin,122,110,2975,434,27.05,6.85,23.67,4/34,5,177,106,101,4,25.25,9.181818
9,A Nehra,88,106,2537,320,23.93,7.93,18.11,4/11,14,279,76,102,7,14.571429,7.846154


In [21]:
export_csv = career_stats.to_csv ('dataset/bowlers.csv', index = None, header=True) #Don't forget to add '.csv' at the end of the path