# Batting Order Model Development

This notebook will explore different models to get the win probability of a match given the state of the match and the next batsman to walk in to the crease. The input also contains the remaining quota of balls for different bowler types. This is a future value, but under the assumption of 5 bowlers or 4 bowlers with two part time bowlers, that future value is already available for the team. 

Input --> Batter,  Team , Ball number, Innings, Non- striker  
Ouput --> Win Probability

In [13]:
import pandas as pd

# Load the data
data = pd.read_csv('../Data/selected_data/processed_data.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 217242 entries, 0 to 217241
Data columns (total 32 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   batter               217242 non-null  object 
 1   bowler               217242 non-null  object 
 2   non_striker          217242 non-null  object 
 3   runs_by_bat          217242 non-null  int64  
 4   extra_runs           217242 non-null  int64  
 5   total_runs_delivery  217242 non-null  int64  
 6   current_team_total   217242 non-null  int64  
 7   runs_remain          217242 non-null  float64
 8   batter_runs          217242 non-null  int64  
 9   balls_faced          217242 non-null  int64  
 10  wickets_fallen       217242 non-null  int64  
 11  extra_type           217242 non-null  object 
 12  delivery             217242 non-null  float64
 13  over                 217242 non-null  int64  
 14  wicket_type          217242 non-null  object 
 15  player_out       

Selecting Player "KM Mendis"

In [14]:
sl_batsmen = data[data['batting_team'] == "Sri Lanka"]['batter'].unique()
sl_batsmen

array(['N Dickwella', 'WU Tharanga', 'EMDY Munaweera', 'DAS Gunaratne',
       'TAM Siriwardana', 'CK Kapugedera', 'S Prasanna', 'BKG Mendis',
       'KMDN Kulasekara', 'SL Malinga', 'MD Shanaka', 'JRMVB Sanjaya',
       'MDKJ Perera', 'NLTC Perera', 'AD Mathews', 'SMA Priyanjan',
       'I Udana', 'MD Gunathilaka', 'S Samarawickrama', 'ML Udawatte',
       'SS Pathirana', 'PC de Silva', 'A Dananjaya', 'PVD Chameera',
       'MVT Fernando', 'N Pradeep', 'LD Chandimal', 'BMAJ Mendis',
       'RAS Lakmal', 'DM de Silva', 'PHKD Mendis', 'MA Aponso',
       'PADLR Sandakan', 'WIA Fernando', 'AK Perera', 'CAK Rajitha',
       'CBRLS Kumara', 'PBB Rajapaksa', 'BOP Fernando', 'PWH de Silva',
       'GSNFG Jayasuriya', 'LD Madushanka', 'M Bhanuka', 'B Fernando',
       'P Nissanka', 'KNA Bandara', 'KIC Asalanka', 'C Karunaratne',
       'RTM Mendis', 'M Theekshana', 'JDF Vandersay', 'K Mishara',
       'J Liyanage', 'P Jayawickrama', 'N Thushara', 'PM Liyanagamage',
       'D Madushanka', 'M P

In [15]:
selected_batters = ["PWH de Silva",'KIC Asalanka','BKG Mendis',"P Nissanka",'PHKD Mendis','S Samarawickrama','AD Mathews','MD Shanaka','DM de Silva','M Theekshana','PVD Chameera','N Thushara','M Pathirana','D Madushanka']

In [16]:
def getPlayerScores(player_name: str, innings: list[int] = [1, 2] ) -> pd.DataFrame:
    # Get the data for BKG Mendis if batter is BKG Mendis or non-striker is BKG Mendis
	player_data = data.loc[
		((data['batter'] == player_name) | (data['non_striker'] == player_name)) & (data['innings'].isin(innings))
	]

	player_data.head()

	# 3 matches missing from the data
	# group data by match_id
	gp = player_data.groupby('match_id')
	cols = [ 'batter', 'non_striker', 'batter_runs', 'balls_faced', 'wicket_type', 'won', 'innings', 'over', 'delivery', 'wickets_fallen','bowling_team']
	player_scores= gp.last().loc[:, cols]

	# get the first ball he faced or at non-striker
	first_ball = gp.first().loc[:, ['over', 'delivery', 'wickets_fallen']]
	first_ball['first_ball'] = (first_ball['over'] * 6 + first_ball['delivery']).astype(int)

	player_scores['first_ball'] = first_ball['first_ball']
	player_scores['wickets_fallen'] = first_ball['wickets_fallen']

	# when BKG Mendis is the non-striker when the last ball was bowled
	# The batter_runs and balls_faced are not his, but the on_strike batter's
	# So, we need to get the last ball he faced
	# he might not even have faced a ball

	# get the last ball he faced

	matches_non_striker = player_scores[player_scores['non_striker'] == player_name].index
	
	# Sometimes the player might not even have faced a single ball
	# Eg: Afghanistan_Sri Lanka_2022-11-01 MD Shanaka not out on the non strikers end

	player_scores.loc[matches_non_striker, ['batter_runs', 'balls_faced']] = [0, 0]
	
	
	# get the last batter == player_name row from gp data
	gp = player_data[(player_data['batter'] == player_name) & (player_data['match_id'].isin(matches_non_striker))].groupby(['match_id'])
	last_batter_scores = gp.last()[['batter_runs', 'balls_faced']]	
	
	# update the rows with non_striker with correct values
	player_scores.update(last_batter_scores)
	
	



	# adding new features
	# strike rate
	player_scores['strike_rate'] = round(player_scores['batter_runs'] / player_scores['balls_faced'] * 100, 2)
	player_scores['out'] = player_scores['wicket_type'] != '0'
	player_scores['last_ball'] = (player_scores['over'] * 6 + player_scores['delivery']).astype(int)

	# drop over and delivery
	player_scores.drop(['over', 'delivery'], inplace=True, axis=1)
	return player_scores

In [17]:
# getPlayerScores('BKG Mendis')

merged_df = pd.DataFrame()

for player in selected_batters:
    player_scores = getPlayerScores(player)
    
    merged_df = pd.concat([merged_df, player_scores])



In [18]:
merged_df.head()

Unnamed: 0_level_0,batter,non_striker,batter_runs,balls_faced,wicket_type,won,innings,wickets_fallen,bowling_team,first_ball,strike_rate,out,last_ball
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Afghanistan_Sri Lanka_2022-09-03,C Karunaratne,PWH de Silva,16,9,0,1.0,2,5,Afghanistan,101,177.78,False,115
Afghanistan_Sri Lanka_2024-02-21,PWH de Silva,S Samarawickrama,13,11,caught,0.0,2,2,Afghanistan,39,118.18,True,56
Australia_Sri Lanka_2019-10-27,PWH de Silva,MD Shanaka,5,10,run out,0.0,2,5,Australia,64,50.0,True,84
Australia_Sri Lanka_2019-10-30,PWH de Silva,I Udana,10,11,stumped,0.0,1,5,Australia,66,90.91,True,88
Australia_Sri Lanka_2022-02-11,PWH de Silva,MD Shanaka,13,11,caught,0.0,2,3,Australia,63,118.18,True,87


In [19]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 464 entries, Afghanistan_Sri Lanka_2022-09-03 to Sri Lanka_New Zealand_2023-04-05
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   batter          464 non-null    object 
 1   non_striker     464 non-null    object 
 2   batter_runs     464 non-null    int64  
 3   balls_faced     464 non-null    int64  
 4   wicket_type     464 non-null    object 
 5   won             464 non-null    float64
 6   innings         464 non-null    int64  
 7   wickets_fallen  464 non-null    int64  
 8   bowling_team    464 non-null    object 
 9   first_ball      464 non-null    int32  
 10  strike_rate     459 non-null    float64
 11  out             464 non-null    bool   
 12  last_ball       464 non-null    int32  
dtypes: bool(1), float64(2), int32(2), int64(4), object(4)
memory usage: 44.0+ KB


In [20]:
merged_df.drop(columns=["batter_runs","balls_faced","wicket_type","strike_rate","out","last_ball"],inplace=True)

In [21]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 464 entries, Afghanistan_Sri Lanka_2022-09-03 to Sri Lanka_New Zealand_2023-04-05
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   batter          464 non-null    object 
 1   non_striker     464 non-null    object 
 2   won             464 non-null    float64
 3   innings         464 non-null    int64  
 4   wickets_fallen  464 non-null    int64  
 5   bowling_team    464 non-null    object 
 6   first_ball      464 non-null    int32  
dtypes: float64(1), int32(1), int64(2), object(3)
memory usage: 27.2+ KB


In [22]:
merged_df.to_csv('./selected_batters.csv',index=False)

In [23]:
from sklearn.model_selection import cross_val_score, train_test_split # type: ignore
from sklearn.ensemble import RandomForestClassifier # type: ignore
from sklearn.metrics import accuracy_score# type: ignore
RANDOM_STATE = 42
data =merged_df

features = ['batter', 'innings', 'wickets_fallen', 'bowling_team', 'first_ball']
target = 'won'

# Preprocess the data
data = pd.get_dummies(data=data,columns=['batter', 'non_striker',  'bowling_team'],dtype=int)



data.head()
data["won"].value_counts()
y = data[target]
X = data.drop(columns=[target])

model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
scores = cross_val_score(model, X, y, cv=5)

# Print the mean and standard deviation of the cross-validation scores
print('Cross-validation scores:', scores)
print('Mean cross-validation score:', scores.mean())
print('Standard deviation of cross-validation scores:', scores.std())

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))
win_probability = model.predict_proba(X_test)[:, 1]
print('Win probability:', win_probability)







Cross-validation scores: [0.65591398 0.58064516 0.37634409 0.50537634 0.65217391]
Mean cross-validation score: 0.5540906965871903
Standard deviation of cross-validation scores: 0.10455657166498811
Accuracy: 0.6881720430107527
Win probability: [0.94       0.85       0.65       0.84       0.51       0.84
 0.9        0.6        0.83       0.81       0.64       0.56333333
 0.42       0.62       0.51       0.22       0.92       0.18
 0.88       0.53       0.59666667 0.60666667 0.69       0.54
 0.85       0.5        0.15       0.79       0.61       0.81
 0.65       0.96       0.73       0.98       0.44       0.91
 0.87       0.72       0.91       0.305      0.76       0.74
 0.34       0.59       0.38       0.65       0.41       0.78
 0.31       0.68       0.61       0.59       0.75       0.8
 0.74       0.66       0.54333333 0.85       0.96       0.37483333
 0.7        0.18       0.83       0.54       0.53       0.15
 0.78       0.19       0.68       0.58       0.95       0.64
 0.54       0.

In [24]:
X_test.head()

Unnamed: 0_level_0,innings,wickets_fallen,first_ball,batter_AD Mathews,batter_AM Fernando,batter_B Fernando,batter_BKG Mendis,batter_BOP Fernando,batter_C Karunaratne,batter_CBRLS Kumara,...,bowling_team_Australia,bowling_team_Bangladesh,bowling_team_England,bowling_team_India,bowling_team_Ireland,bowling_team_New Zealand,bowling_team_Pakistan,bowling_team_South Africa,bowling_team_West Indies,bowling_team_Zimbabwe
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
England_Sri Lanka_2021-06-26,2,5,69,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
Sri Lanka_Australia_2022-10-25,1,5,97,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
Sri Lanka_New Zealand_2023-04-02,1,5,104,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
Sri Lanka_Australia_2022-02-18,1,6,106,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
Sri Lanka_New Zealand_2012-09-27,2,3,89,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [25]:
win_proba_df = pd.DataFrame(win_probability, columns=['win_probability'])

X_test = X_test.reset_index(drop=True)
win_proba_df = win_proba_df.reset_index(drop=True)
result = pd.concat([X_test, win_proba_df], axis=1)

result.head()

Unnamed: 0,innings,wickets_fallen,first_ball,batter_AD Mathews,batter_AM Fernando,batter_B Fernando,batter_BKG Mendis,batter_BOP Fernando,batter_C Karunaratne,batter_CBRLS Kumara,...,bowling_team_Bangladesh,bowling_team_England,bowling_team_India,bowling_team_Ireland,bowling_team_New Zealand,bowling_team_Pakistan,bowling_team_South Africa,bowling_team_West Indies,bowling_team_Zimbabwe,win_probability
0,2,5,69,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0.94
1,1,5,97,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.85
2,1,5,104,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0.65
3,1,6,106,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.84
4,2,3,89,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0.51
