# Batting Order Model Development

This notebook will explore different models to get the win probability of a match given the state of the match and the next batsman to walk in to the crease. The input also contains the remaining quota of balls for different bowler types. This is a future value, but under the assumption of 5 bowlers or 4 bowlers with two part time bowlers, that future value is already available for the team. 

Input --> Batter,  Team , Ball number, Innings, Non- striker  
Ouput --> Win Probability

In [1]:
import pandas as pd

# Load the data
data = pd.read_csv('../Data/selected_data/processed_data.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 217242 entries, 0 to 217241
Data columns (total 32 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   batter               217242 non-null  object 
 1   bowler               217242 non-null  object 
 2   non_striker          217242 non-null  object 
 3   runs_by_bat          217242 non-null  int64  
 4   extra_runs           217242 non-null  int64  
 5   total_runs_delivery  217242 non-null  int64  
 6   current_team_total   217242 non-null  int64  
 7   runs_remain          217242 non-null  float64
 8   batter_runs          217242 non-null  int64  
 9   balls_faced          217242 non-null  int64  
 10  wickets_fallen       217242 non-null  int64  
 11  extra_type           217242 non-null  object 
 12  delivery             217242 non-null  float64
 13  over                 217242 non-null  int64  
 14  wicket_type          217242 non-null  object 
 15  player_out       

Selecting Player "KM Mendis"

In [14]:
# sl_batsmen = data[data['batting_team'] == "Sri Lanka"]['batter'].unique()
# sl_batsmen

array(['N Dickwella', 'WU Tharanga', 'EMDY Munaweera', 'DAS Gunaratne',
       'TAM Siriwardana', 'CK Kapugedera', 'S Prasanna', 'BKG Mendis',
       'KMDN Kulasekara', 'SL Malinga', 'MD Shanaka', 'JRMVB Sanjaya',
       'MDKJ Perera', 'NLTC Perera', 'AD Mathews', 'SMA Priyanjan',
       'I Udana', 'MD Gunathilaka', 'S Samarawickrama', 'ML Udawatte',
       'SS Pathirana', 'PC de Silva', 'A Dananjaya', 'PVD Chameera',
       'MVT Fernando', 'N Pradeep', 'LD Chandimal', 'BMAJ Mendis',
       'RAS Lakmal', 'DM de Silva', 'PHKD Mendis', 'MA Aponso',
       'PADLR Sandakan', 'WIA Fernando', 'AK Perera', 'CAK Rajitha',
       'CBRLS Kumara', 'PBB Rajapaksa', 'BOP Fernando', 'PWH de Silva',
       'GSNFG Jayasuriya', 'LD Madushanka', 'M Bhanuka', 'B Fernando',
       'P Nissanka', 'KNA Bandara', 'KIC Asalanka', 'C Karunaratne',
       'RTM Mendis', 'M Theekshana', 'JDF Vandersay', 'K Mishara',
       'J Liyanage', 'P Jayawickrama', 'N Thushara', 'PM Liyanagamage',
       'D Madushanka', 'M P

In [2]:
all_batsmen = data['batter'].unique()
all_batsmen

array(['AJ Finch', 'M Klinger', 'TM Head', 'MC Henriques', 'AJ Turner',
       'JP Faulkner', 'N Dickwella', 'WU Tharanga', 'EMDY Munaweera',
       'DAS Gunaratne', 'TAM Siriwardana', 'CK Kapugedera', 'S Prasanna',
       'BR Dunk', 'TD Paine', 'PJ Cummins', 'AJ Tye', 'JA Richardson',
       'BKG Mendis', 'KMDN Kulasekara', 'SL Malinga', 'MD Shanaka',
       'JRMVB Sanjaya', 'KL Rahul', 'AT Rayudu', 'Mandeep Singh',
       'MK Pandey', 'KM Jadhav', 'MS Dhoni', 'AR Patel', 'R Dhawan',
       'CJ Chibhabha', 'H Masakadza', 'R Mutumbami', 'Sikandar Raza',
       'MN Waller', 'CT Mutombodzi', 'E Chigumbura', 'AG Cremer',
       'N Madziva', 'PJ Moor', 'DT Tiripano', 'T Muzarabani',
       'DS Kulkarni', 'V Sibanda', 'T Maruma', 'Tamim Iqbal',
       'Imrul Kayes', 'Sabbir Rahman', 'Shakib Al Hasan', 'Soumya Sarkar',
       'Mahmudullah', 'Mosaddek Hossain', 'Mashrafe Mortaza',
       'Nurul Hasan', 'Rubel Hossain', 'NT Broom', 'KS Williamson',
       'C Munro', 'CJ Anderson', 'TC Bruce', 

In [15]:
# SL ICC T20 Team
# selected_batters = ["PWH de Silva",'KIC Asalanka','BKG Mendis',"P Nissanka",'PHKD Mendis','S Samarawickrama','AD Mathews','MD Shanaka','DM de Silva','M Theekshana','PVD Chameera','N Thushara','M Pathirana','D Madushanka']

In [4]:
def getPlayerScores(player_name: str, innings: list[int] = [1, 2] ) -> pd.DataFrame:
    # Get the data for BKG Mendis if batter is BKG Mendis or non-striker is BKG Mendis
	player_data = data.loc[
		((data['batter'] == player_name) | (data['non_striker'] == player_name)) & (data['innings'].isin(innings))
	]

	player_data.head()

	# 3 matches missing from the data
	# group data by match_id
	gp = player_data.groupby('match_id')
	cols = ['batting_team', 'batter', 'non_striker', 'batter_runs', 'balls_faced', 'wicket_type', 'won', 'innings', 'over', 'delivery', 'wickets_fallen','bowling_team','venue']
	player_scores= gp.last().loc[:, cols]

	# get the first ball he faced or at non-striker
	first_ball = gp.first().loc[:, ['over', 'delivery', 'wickets_fallen']]
	first_ball['first_ball'] = (first_ball['over'] * 6 + first_ball['delivery']).astype(int)

	player_scores['first_ball'] = first_ball['first_ball']
	player_scores['wickets_fallen'] = first_ball['wickets_fallen']

	# when BKG Mendis is the non-striker when the last ball was bowled
	# The batter_runs and balls_faced are not his, but the on_strike batter's
	# So, we need to get the last ball he faced
	# he might not even have faced a ball

	# get the last ball he faced

	matches_non_striker = player_scores[player_scores['non_striker'] == player_name].index
	
	# Sometimes the player might not even have faced a single ball
	# Eg: Afghanistan_Sri Lanka_2022-11-01 MD Shanaka not out on the non strikers end

	player_scores.loc[matches_non_striker, ['batter_runs', 'balls_faced']] = [0, 0]
	
	
	# get the last batter == player_name row from gp data
	gp = player_data[(player_data['batter'] == player_name) & (player_data['match_id'].isin(matches_non_striker))].groupby(['match_id'])
	last_batter_scores = gp.last()[['batter_runs', 'balls_faced']]	
	
	# update the rows with non_striker with correct values
	player_scores.update(last_batter_scores)
	
	



	# adding new features
	# strike rate
	player_scores['strike_rate'] = round(player_scores['batter_runs'] / player_scores['balls_faced'] * 100, 2)
	player_scores['out'] = player_scores['wicket_type'] != '0'
	player_scores['last_ball'] = (player_scores['over'] * 6 + player_scores['delivery']).astype(int)
 
 
	player_scores['batter'] = player_name
	player_scores.drop('non_striker', inplace=True, axis = 1)

	# drop over and delivery
	player_scores.drop(['over', 'delivery'], inplace=True, axis=1)
 
	return player_scores

In [17]:
# getPlayerScores('BKG Mendis')

# merged_df = pd.DataFrame()

# for player in selected_batters:
#     player_scores = getPlayerScores(player)
    
#     merged_df = pd.concat([merged_df, player_scores])



In [5]:
merged_df = pd.DataFrame()

for player in all_batsmen:
    player_scores = getPlayerScores(player)
    
    merged_df = pd.concat([merged_df, player_scores])

In [6]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14957 entries, Australia_England_2011-01-12 to India_Pakistan_2016-02-27
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   batting_team    14957 non-null  object 
 1   batter          14957 non-null  object 
 2   batter_runs     14957 non-null  int64  
 3   balls_faced     14957 non-null  int64  
 4   wicket_type     14957 non-null  object 
 5   won             14957 non-null  float64
 6   innings         14957 non-null  int64  
 7   wickets_fallen  14957 non-null  int64  
 8   bowling_team    14957 non-null  object 
 9   venue           14957 non-null  object 
 10  first_ball      14957 non-null  int32  
 11  strike_rate     14669 non-null  float64
 12  out             14957 non-null  bool   
 13  last_ball       14957 non-null  int32  
dtypes: bool(1), float64(2), int32(2), int64(4), object(5)
memory usage: 1.5+ MB


In [7]:
merged_df.loc[merged_df["strike_rate"].isnull()]

Unnamed: 0_level_0,batting_team,batter,batter_runs,balls_faced,wicket_type,won,innings,wickets_fallen,bowling_team,venue,first_ball,strike_rate,out,last_ball
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Sri Lanka_West Indies_2021-03-05,Sri Lanka,N Dickwella,0,0,0,1.0,1,6,West Indies,"Coolidge Cricket Ground, Antigua",117,,False,120
Australia_Sri Lanka_2017-02-17,Australia,TD Paine,0,0,run out,0.0,1,5,Sri Lanka,Melbourne Cricket Ground,114,,True,118
Australia_South Africa_2014-11-05,Australia,PJ Cummins,0,0,0,0.0,1,6,South Africa,Adelaide Oval,120,,False,120
Australia_Sri Lanka_2017-02-17,Australia,PJ Cummins,0,0,0,0.0,1,6,Sri Lanka,Melbourne Cricket Ground,119,,False,120
England_Australia_2015-08-31,Australia,PJ Cummins,0,0,run out,0.0,2,8,England,Sophia Gardens,119,,True,119
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
South Africa_Pakistan_2019-02-01,Pakistan,Usman Shinwari,0,0,0,0.0,2,9,South Africa,Newlands,118,,False,120
England_Pakistan_2015-11-30,Pakistan,Anwar Ali,0,0,0,-1.0,2,7,England,Sharjah Cricket Stadium,120,,False,120
Bangladesh_West Indies_2014-03-25,Bangladesh,Al-Amin Hossain,0,0,bowled,0.0,2,9,West Indies,Shere Bangla National Stadium,114,,True,115
India_Bangladesh_2019-11-10,Bangladesh,Al-Amin Hossain,0,0,bowled,0.0,2,10,India,"Vidarbha Cricket Association Stadium, Jamtha",116,,True,116


In [8]:
merged_df["strike_rate"] = merged_df["strike_rate"].fillna(0)

In [9]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14957 entries, Australia_England_2011-01-12 to India_Pakistan_2016-02-27
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   batting_team    14957 non-null  object 
 1   batter          14957 non-null  object 
 2   batter_runs     14957 non-null  int64  
 3   balls_faced     14957 non-null  int64  
 4   wicket_type     14957 non-null  object 
 5   won             14957 non-null  float64
 6   innings         14957 non-null  int64  
 7   wickets_fallen  14957 non-null  int64  
 8   bowling_team    14957 non-null  object 
 9   venue           14957 non-null  object 
 10  first_ball      14957 non-null  int32  
 11  strike_rate     14957 non-null  float64
 12  out             14957 non-null  bool   
 13  last_ball       14957 non-null  int32  
dtypes: bool(1), float64(2), int32(2), int64(4), object(5)
memory usage: 2.0+ MB


In [10]:
merged_df.head()

Unnamed: 0_level_0,batting_team,batter,batter_runs,balls_faced,wicket_type,won,innings,wickets_fallen,bowling_team,venue,first_ball,strike_rate,out,last_ball
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Australia_England_2011-01-12,Australia,AJ Finch,15,14,0,0.0,1,3,England,Adelaide Oval,91,107.14,False,120
Australia_England_2011-01-14,Australia,AJ Finch,53,33,0,1.0,1,4,England,Melbourne Cricket Ground,63,160.61,False,120
Australia_England_2014-01-29,Australia,AJ Finch,52,31,caught,1.0,1,0,England,Bellerive Oval,1,167.74,True,64
Australia_England_2014-01-31,Australia,AJ Finch,10,11,lbw,1.0,2,0,England,Melbourne Cricket Ground,1,90.91,True,31
Australia_England_2014-02-02,Australia,AJ Finch,30,21,caught,1.0,1,0,England,Stadium Australia,1,142.86,True,40


In [11]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14957 entries, Australia_England_2011-01-12 to India_Pakistan_2016-02-27
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   batting_team    14957 non-null  object 
 1   batter          14957 non-null  object 
 2   batter_runs     14957 non-null  int64  
 3   balls_faced     14957 non-null  int64  
 4   wicket_type     14957 non-null  object 
 5   won             14957 non-null  float64
 6   innings         14957 non-null  int64  
 7   wickets_fallen  14957 non-null  int64  
 8   bowling_team    14957 non-null  object 
 9   venue           14957 non-null  object 
 10  first_ball      14957 non-null  int32  
 11  strike_rate     14957 non-null  float64
 12  out             14957 non-null  bool   
 13  last_ball       14957 non-null  int32  
dtypes: bool(1), float64(2), int32(2), int64(4), object(5)
memory usage: 2.0+ MB


In [12]:
merged_df.drop(columns=["batter_runs","balls_faced","wicket_type","strike_rate","out","last_ball"],inplace=True)

In [13]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14957 entries, Australia_England_2011-01-12 to India_Pakistan_2016-02-27
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   batting_team    14957 non-null  object 
 1   batter          14957 non-null  object 
 2   won             14957 non-null  float64
 3   innings         14957 non-null  int64  
 4   wickets_fallen  14957 non-null  int64  
 5   bowling_team    14957 non-null  object 
 6   venue           14957 non-null  object 
 7   first_ball      14957 non-null  int32  
dtypes: float64(1), int32(1), int64(2), object(4)
memory usage: 1.5+ MB


In [14]:
merged_df.to_csv('./all_batters.csv',index=False)

In [15]:
from sklearn.model_selection import cross_val_score, train_test_split # type: ignore
from sklearn.ensemble import RandomForestClassifier # type: ignore
from sklearn.metrics import accuracy_score# type: ignore
RANDOM_STATE = 42
data =merged_df

# features = ['batter', 'innings', 'wickets_fallen', 'bowling_team', 'first_ball']
target = 'won'

# Preprocess the data
data = pd.get_dummies(data=data,columns=['batter', 'batting_team', 'bowling_team','venue'],dtype=int)



data.head()
data["won"].value_counts()
y = data[target]
X = data.drop(columns=[target])

model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
scores = cross_val_score(model, X, y, cv=5)

# Print the mean and standard deviation of the cross-validation scores
print('Cross-validation scores:', scores)
print('Mean cross-validation score:', scores.mean())
print('Standard deviation of cross-validation scores:', scores.std())

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))
win_probability = model.predict_proba(X_test)[:, 1]
print('Win probability:', win_probability)







Cross-validation scores: [0.82286096 0.71223262 0.74155801 0.7930458  0.78301571]
Mean cross-validation score: 0.7705426216260188
Standard deviation of cross-validation scores: 0.03908796276225078
Accuracy: 0.8131684491978609
Win probability: [0.66       0.2        0.56033333 ... 0.39       0.35       0.16      ]


In [16]:
X_test.head()

Unnamed: 0_level_0,innings,wickets_fallen,first_ball,batter_A Balbirnie,batter_A Dananjaya,batter_A Flintoff,batter_A Khan,batter_A Ndlovu,batter_A Nehra,batter_A Nel,...,"venue_Warner Park, Basseterre, St Kitts","venue_Warner Park, St Kitts",venue_Western Australia Cricket Association Ground,venue_Westpac Stadium,"venue_Windsor Park, Roseau","venue_Windsor Park, Roseau, Dominica",venue_Zahur Ahmed Chowdhury Stadium,"venue_Zahur Ahmed Chowdhury Stadium, Chattogram","venue_Zayed Cricket Stadium, Abu Dhabi",venue_Zhejiang University of Technology Cricket Field
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
India_South Africa_2015-10-02,1,1,20,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
South Africa_West Indies_2015-01-14,1,1,77,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
India_Australia_2017-10-10,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
South Africa_Ireland_2021-07-19,2,8,70,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Sri Lanka_Pakistan_2015-07-30,1,5,120,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
win_proba_df = pd.DataFrame(win_probability, columns=['win_probability'])

X_test = X_test.reset_index(drop=True)
win_proba_df = win_proba_df.reset_index(drop=True)
result = pd.concat([X_test, win_proba_df], axis=1)

result.head()

Unnamed: 0,innings,wickets_fallen,first_ball,batter_A Balbirnie,batter_A Dananjaya,batter_A Flintoff,batter_A Khan,batter_A Ndlovu,batter_A Nehra,batter_A Nel,...,"venue_Warner Park, St Kitts",venue_Western Australia Cricket Association Ground,venue_Westpac Stadium,"venue_Windsor Park, Roseau","venue_Windsor Park, Roseau, Dominica",venue_Zahur Ahmed Chowdhury Stadium,"venue_Zahur Ahmed Chowdhury Stadium, Chattogram","venue_Zayed Cricket Stadium, Abu Dhabi",venue_Zhejiang University of Technology Cricket Field,win_probability
0,1,1,20,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.66
1,1,1,77,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.2
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.560333
3,2,8,70,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.98
4,1,5,120,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.15


In [18]:
y_train.value_counts()

won
 0.0    6725
 1.0    4872
-1.0     368
Name: count, dtype: int64

In [19]:
y_test.value_counts()

won
 0.0    1685
 1.0    1209
-1.0      98
Name: count, dtype: int64