# Batting Order Model Development

This notebook will explore different models to get the win probability of a match given the state of the match and the next batsman to walk in to the crease. The input also contains the remaining quota of balls for different bowler types. This is a future value, but under the assumption of 5 bowlers or 4 bowlers with two part time bowlers, that future value is already available for the team. 

Input --> Batter,  Team , Ball number, Innings, Non- striker  
Ouput --> Win Probability

In [1]:
import pandas as pd

# Load the data
data = pd.read_csv('../Data/selected_data/processed_data_NRR_with_venues.csv')

Selecting Player "KM Mendis"

In [2]:
all_batsmen = data['batter'].unique()
all_batsmen

array(['AJ Finch', 'M Klinger', 'TM Head', 'MC Henriques', 'AJ Turner',
       'JP Faulkner', 'N Dickwella', 'WU Tharanga', 'EMDY Munaweera',
       'DAS Gunaratne', 'TAM Siriwardana', 'CK Kapugedera', 'S Prasanna',
       'BR Dunk', 'TD Paine', 'PJ Cummins', 'AJ Tye', 'JA Richardson',
       'BKG Mendis', 'KMDN Kulasekara', 'SL Malinga', 'MD Shanaka',
       'JRMVB Sanjaya', 'KL Rahul', 'AT Rayudu', 'Mandeep Singh',
       'MK Pandey', 'KM Jadhav', 'MS Dhoni', 'AR Patel', 'R Dhawan',
       'CJ Chibhabha', 'H Masakadza', 'R Mutumbami', 'Sikandar Raza',
       'MN Waller', 'CT Mutombodzi', 'E Chigumbura', 'AG Cremer',
       'N Madziva', 'PJ Moor', 'DT Tiripano', 'T Muzarabani',
       'DS Kulkarni', 'V Sibanda', 'T Maruma', 'Tamim Iqbal',
       'Imrul Kayes', 'Sabbir Rahman', 'Shakib Al Hasan', 'Soumya Sarkar',
       'Mahmudullah', 'Mosaddek Hossain', 'Mashrafe Mortaza',
       'Nurul Hasan', 'Rubel Hossain', 'NT Broom', 'KS Williamson',
       'C Munro', 'CJ Anderson', 'TC Bruce', 

In [4]:
# SL ICC T20 Team
# selected_batters = ["PWH de Silva",'KIC Asalanka','BKG Mendis',"P Nissanka",'PHKD Mendis','S Samarawickrama','AD Mathews','MD Shanaka','DM de Silva','M Theekshana','PVD Chameera','N Thushara','M Pathirana','D Madushanka']

In [3]:
def getPlayerScores(player_name: str, innings: list[int] = [1, 2] ) -> pd.DataFrame:
    # Get the data for BKG Mendis if batter is BKG Mendis or non-striker is BKG Mendis
	player_data = data.loc[
		((data['batter'] == player_name) | (data['non_striker'] == player_name)) & (data['innings'].isin(innings))
	]

	player_data.head()

	# 3 matches missing from the data
	# group data by match_id
	gp = player_data.groupby('match_id')
	cols = ['batting_team', 'batter', 'non_striker', 'batter_runs', 'balls_faced', 'wicket_type', 'won', 'innings', 'over', 'delivery', 'wickets_fallen','bowling_team','venue']
	player_scores= gp.last().loc[:, cols]

	# get the first ball he faced or at non-striker
	first_ball = gp.first().loc[:, ['over', 'delivery', 'wickets_fallen']]
	first_ball['first_ball'] = (first_ball['over'] * 6 + first_ball['delivery']).astype(int)

	player_scores['first_ball'] = first_ball['first_ball']
	player_scores['wickets_fallen'] = first_ball['wickets_fallen']

	# when BKG Mendis is the non-striker when the last ball was bowled
	# The batter_runs and balls_faced are not his, but the on_strike batter's
	# So, we need to get the last ball he faced
	# he might not even have faced a ball

	# get the last ball he faced

	matches_non_striker = player_scores[player_scores['non_striker'] == player_name].index
	
	# Sometimes the player might not even have faced a single ball
	# Eg: Afghanistan_Sri Lanka_2022-11-01 MD Shanaka not out on the non strikers end

	player_scores.loc[matches_non_striker, ['batter_runs', 'balls_faced']] = [0, 0]
	
	
	# get the last batter == player_name row from gp data
	gp = player_data[(player_data['batter'] == player_name) & (player_data['match_id'].isin(matches_non_striker))].groupby(['match_id'])
	last_batter_scores = gp.last()[['batter_runs', 'balls_faced']]	
	
	# update the rows with non_striker with correct values
	player_scores.update(last_batter_scores)
	
	



	# adding new features
	# strike rate
	player_scores['strike_rate'] = round(player_scores['batter_runs'] / player_scores['balls_faced'] * 100, 2)
	player_scores['out'] = player_scores['wicket_type'] != '0'
	player_scores['last_ball'] = (player_scores['over'] * 6 + player_scores['delivery']).astype(int)
 
 
	player_scores['batter'] = player_name
	player_scores.drop('non_striker', inplace=True, axis = 1)

	# drop over and delivery
	player_scores.drop(['over', 'delivery'], inplace=True, axis=1)
 
	return player_scores

In [4]:
merged_df = pd.DataFrame()

for player in all_batsmen:
    player_scores = getPlayerScores(player)
    
    merged_df = pd.concat([merged_df, player_scores])
    
    


In [5]:
merged_df.drop(columns=["batter_runs","balls_faced","wicket_type","strike_rate","out","last_ball"],inplace=True)

In [6]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14778 entries, Australia_England_2011-01-12 to India_Pakistan_2016-02-27
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   batting_team    14778 non-null  object 
 1   batter          14778 non-null  object 
 2   won             14778 non-null  float64
 3   innings         14778 non-null  int64  
 4   wickets_fallen  14778 non-null  int64  
 5   bowling_team    14778 non-null  object 
 6   venue           14778 non-null  object 
 7   first_ball      14778 non-null  int32  
dtypes: float64(1), int32(1), int64(2), object(4)
memory usage: 981.4+ KB


In [7]:
merged_df.to_csv('./all_batters.csv',index=False)

In [9]:
from sklearn.model_selection import cross_val_score, train_test_split # type: ignore
from sklearn.ensemble import RandomForestClassifier # type: ignore
from sklearn.metrics import accuracy_score# type: ignore
RANDOM_STATE = 42
data =merged_df

# features = ['batter', 'innings', 'wickets_fallen', 'bowling_team', 'first_ball']
target = 'won'

# Preprocess the data
data = pd.get_dummies(data=data,columns=['batter', 'batting_team', 'bowling_team','venue'],dtype=int)



data.head()
data["won"].value_counts()
y = data[target]
X = data.drop(columns=[target])

model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
scores = cross_val_score(model, X, y, cv=5)

# Print the mean and standard deviation of the cross-validation scores
print('Cross-validation scores:', scores)
print('Mean cross-validation score:', scores.mean())
print('Standard deviation of cross-validation scores:', scores.std())

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))
win_probability = model.predict_proba(X_test)[:, 1]
print('Win probability:', win_probability)







Cross-validation scores: [0.81461434 0.71244926 0.72699594 0.77631134 0.77631134]
Mean cross-validation score: 0.761336442670733
Standard deviation of cross-validation scores: 0.03703041123003895
Accuracy: 0.803788903924222
Win probability: [0.385 0.18  0.59  ... 0.94  0.29  0.63 ]


In [10]:
X_test.head()

Unnamed: 0_level_0,innings,wickets_fallen,first_ball,batter_A Balbirnie,batter_A Dananjaya,batter_A Flintoff,batter_A Khan,batter_A Ndlovu,batter_A Nehra,batter_A Nel,...,venue_Trent Bridge,venue_University Oval,venue_Vidarbha Cricket Association Stadium,venue_WACA Ground,venue_Wankhede Stadium,venue_Warner Park Sporting Complex,venue_Windsor Park,venue_Zahur Ahmed Chowdhury Stadium,venue_Zayed Cricket Stadium,venue_Zhejiang University of Technology Cricket Field
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Australia_New Zealand_2021-03-05,1,1,16,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
England_Pakistan_2010-09-05,2,2,37,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
West Indies_England_2017-09-16,1,9,117,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Sri Lanka_Pakistan_2017-10-26,1,7,76,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Zimbabwe_Pakistan_2020-11-08,2,2,76,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
win_proba_df = pd.DataFrame(win_probability, columns=['win_probability'])

X_test = X_test.reset_index(drop=True)
win_proba_df = win_proba_df.reset_index(drop=True)
result = pd.concat([X_test, win_proba_df], axis=1)

result.head()

Unnamed: 0,innings,wickets_fallen,first_ball,batter_A Balbirnie,batter_A Dananjaya,batter_A Flintoff,batter_A Khan,batter_A Ndlovu,batter_A Nehra,batter_A Nel,...,venue_University Oval,venue_Vidarbha Cricket Association Stadium,venue_WACA Ground,venue_Wankhede Stadium,venue_Warner Park Sporting Complex,venue_Windsor Park,venue_Zahur Ahmed Chowdhury Stadium,venue_Zayed Cricket Stadium,venue_Zhejiang University of Technology Cricket Field,win_probability
0,1,1,16,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.385
1,2,2,37,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.18
2,1,9,117,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.59
3,1,7,76,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.85
4,2,2,76,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.11


In [15]:
y_train.value_counts()

In [16]:
y_test.value_counts()