# Batting Order Model Development

This notebook will explore different models to get the win probability of a match given the state of the match and the next batsman to walk in to the crease. The input also contains the remaining quota of balls for different bowler types. This is a future value, but under the assumption of 5 bowlers or 4 bowlers with two part time bowlers, that future value is already available for the team. 

Input --> Batter,  Team , Ball number, Innings, Non- striker  
Ouput --> Win Probability

In [1]:
import pandas as pd

# Load the data
data = pd.read_csv('../Data/selected_data/processed_data.csv')
venue_data = pd.read_csv('../Data/selected_data/venues.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 217242 entries, 0 to 217241
Data columns (total 32 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   batter               217242 non-null  object 
 1   bowler               217242 non-null  object 
 2   non_striker          217242 non-null  object 
 3   runs_by_bat          217242 non-null  int64  
 4   extra_runs           217242 non-null  int64  
 5   total_runs_delivery  217242 non-null  int64  
 6   current_team_total   217242 non-null  int64  
 7   runs_remain          217242 non-null  float64
 8   batter_runs          217242 non-null  int64  
 9   balls_faced          217242 non-null  int64  
 10  wickets_fallen       217242 non-null  int64  
 11  extra_type           217242 non-null  object 
 12  delivery             217242 non-null  float64
 13  over                 217242 non-null  int64  
 14  wicket_type          217242 non-null  object 
 15  player_out       

Selecting Player "KM Mendis"

In [2]:
# sl_batsmen = data[data['batting_team'] == "Sri Lanka"]['batter'].unique()
# sl_batsmen

In [3]:
all_batsmen = data['batter'].unique()
all_batsmen

array(['AJ Finch', 'M Klinger', 'TM Head', 'MC Henriques', 'AJ Turner',
       'JP Faulkner', 'N Dickwella', 'WU Tharanga', 'EMDY Munaweera',
       'DAS Gunaratne', 'TAM Siriwardana', 'CK Kapugedera', 'S Prasanna',
       'BR Dunk', 'TD Paine', 'PJ Cummins', 'AJ Tye', 'JA Richardson',
       'BKG Mendis', 'KMDN Kulasekara', 'SL Malinga', 'MD Shanaka',
       'JRMVB Sanjaya', 'KL Rahul', 'AT Rayudu', 'Mandeep Singh',
       'MK Pandey', 'KM Jadhav', 'MS Dhoni', 'AR Patel', 'R Dhawan',
       'CJ Chibhabha', 'H Masakadza', 'R Mutumbami', 'Sikandar Raza',
       'MN Waller', 'CT Mutombodzi', 'E Chigumbura', 'AG Cremer',
       'N Madziva', 'PJ Moor', 'DT Tiripano', 'T Muzarabani',
       'DS Kulkarni', 'V Sibanda', 'T Maruma', 'Tamim Iqbal',
       'Imrul Kayes', 'Sabbir Rahman', 'Shakib Al Hasan', 'Soumya Sarkar',
       'Mahmudullah', 'Mosaddek Hossain', 'Mashrafe Mortaza',
       'Nurul Hasan', 'Rubel Hossain', 'NT Broom', 'KS Williamson',
       'C Munro', 'CJ Anderson', 'TC Bruce', 

In [4]:
# SL ICC T20 Team
# selected_batters = ["PWH de Silva",'KIC Asalanka','BKG Mendis',"P Nissanka",'PHKD Mendis','S Samarawickrama','AD Mathews','MD Shanaka','DM de Silva','M Theekshana','PVD Chameera','N Thushara','M Pathirana','D Madushanka']

In [5]:
def getPlayerScores(player_name: str, innings: list[int] = [1, 2] ) -> pd.DataFrame:
    # Get the data for BKG Mendis if batter is BKG Mendis or non-striker is BKG Mendis
	player_data = data.loc[
		((data['batter'] == player_name) | (data['non_striker'] == player_name)) & (data['innings'].isin(innings))
	]

	player_data.head()

	# 3 matches missing from the data
	# group data by match_id
	gp = player_data.groupby('match_id')
	cols = ['batting_team', 'batter', 'non_striker', 'batter_runs', 'balls_faced', 'wicket_type', 'won', 'innings', 'over', 'delivery', 'wickets_fallen','bowling_team','venue']
	player_scores= gp.last().loc[:, cols]

	# get the first ball he faced or at non-striker
	first_ball = gp.first().loc[:, ['over', 'delivery', 'wickets_fallen']]
	first_ball['first_ball'] = (first_ball['over'] * 6 + first_ball['delivery']).astype(int)

	player_scores['first_ball'] = first_ball['first_ball']
	player_scores['wickets_fallen'] = first_ball['wickets_fallen']

	# when BKG Mendis is the non-striker when the last ball was bowled
	# The batter_runs and balls_faced are not his, but the on_strike batter's
	# So, we need to get the last ball he faced
	# he might not even have faced a ball

	# get the last ball he faced

	matches_non_striker = player_scores[player_scores['non_striker'] == player_name].index
	
	# Sometimes the player might not even have faced a single ball
	# Eg: Afghanistan_Sri Lanka_2022-11-01 MD Shanaka not out on the non strikers end

	player_scores.loc[matches_non_striker, ['batter_runs', 'balls_faced']] = [0, 0]
	
	
	# get the last batter == player_name row from gp data
	gp = player_data[(player_data['batter'] == player_name) & (player_data['match_id'].isin(matches_non_striker))].groupby(['match_id'])
	last_batter_scores = gp.last()[['batter_runs', 'balls_faced']]	
	
	# update the rows with non_striker with correct values
	player_scores.update(last_batter_scores)
	
	



	# adding new features
	# strike rate
	player_scores['strike_rate'] = round(player_scores['batter_runs'] / player_scores['balls_faced'] * 100, 2)
	player_scores['out'] = player_scores['wicket_type'] != '0'
	player_scores['last_ball'] = (player_scores['over'] * 6 + player_scores['delivery']).astype(int)
 
 
	player_scores['batter'] = player_name
	player_scores.drop('non_striker', inplace=True, axis = 1)

	# drop over and delivery
	player_scores.drop(['over', 'delivery'], inplace=True, axis=1)
 
	return player_scores

In [6]:
# getPlayerScores('BKG Mendis')

# merged_df = pd.DataFrame()

# for player in selected_batters:
#     player_scores = getPlayerScores(player)
    
#     merged_df = pd.concat([merged_df, player_scores])



In [19]:
# merged_df = pd.DataFrame()

# for player in all_batsmen:
#     player_scores = getPlayerScores(player)
    
#     merged_df = pd.concat([merged_df, player_scores])
    
    
merged_df = pd.read_csv("./all_batters.csv")

In [20]:
merged_df.drop(columns=["batter_runs","balls_faced","wicket_type","strike_rate","out","last_ball"],inplace=True)

KeyError: "['batter_runs', 'balls_faced', 'wicket_type', 'strike_rate', 'out', 'last_ball'] not found in axis"

In [21]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14957 entries, 0 to 14956
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   batting_team    14957 non-null  object 
 1   batter          14957 non-null  object 
 2   won             14957 non-null  float64
 3   innings         14957 non-null  int64  
 4   wickets_fallen  14957 non-null  int64  
 5   bowling_team    14957 non-null  object 
 6   venue           14957 non-null  object 
 7   first_ball      14957 non-null  int64  
dtypes: float64(1), int64(3), object(4)
memory usage: 934.9+ KB


In [24]:
# merged_df["venue_score"] = merged_df["venue"].apply(lambda x:venue_data.loc[venue_data["venue"]==x]["total_mean"])
merged_df["venue_score"] = merged_df["venue"].map(venue_data.set_index("venue")["total_mean"])


In [25]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14957 entries, 0 to 14956
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   batting_team    14957 non-null  object 
 1   batter          14957 non-null  object 
 2   won             14957 non-null  float64
 3   innings         14957 non-null  int64  
 4   wickets_fallen  14957 non-null  int64  
 5   bowling_team    14957 non-null  object 
 6   venue           14957 non-null  object 
 7   first_ball      14957 non-null  int64  
 8   venue_score     8033 non-null   float64
dtypes: float64(2), int64(3), object(4)
memory usage: 1.0+ MB


In [29]:
merged_df.loc[merged_df["venue_score"].isnull()]

Unnamed: 0,batting_team,batter,won,innings,wickets_fallen,bowling_team,venue,first_ball,venue_score
12,Australia,AJ Finch,1.0,1,0,India,"Brisbane Cricket Ground, Woolloongabba",1,
16,Australia,AJ Finch,0.0,1,0,India,"Vidarbha Cricket Association Stadium, Jamtha, ...",1,
17,Australia,AJ Finch,0.0,1,0,India,"Rajiv Gandhi International Stadium, Uppal, Hyd...",1,
18,Australia,AJ Finch,1.0,1,0,Ireland,"Brisbane Cricket Ground, Woolloongabba, Brisbane",1,
19,Australia,AJ Finch,1.0,1,0,New Zealand,Westpac Stadium,0,
...,...,...,...,...,...,...,...,...,...
14930,Australia,PM Nevill,1.0,2,7,Bangladesh,M Chinnaswamy Stadium,109,
14932,Australia,PM Nevill,0.0,1,6,India,"Punjab Cricket Association IS Bindra Stadium, ...",116,
14934,Australia,PM Nevill,1.0,2,6,Sri Lanka,R Premadasa Stadium,103,
14950,Bangladesh,Shuvagata Hom,0.0,1,3,Australia,M Chinnaswamy Stadium,57,


In [11]:
# merged_df.to_csv('./all_batters.csv',index=False)

In [12]:
# from sklearn.model_selection import cross_val_score, train_test_split # type: ignore
# from sklearn.ensemble import RandomForestClassifier # type: ignore
# from sklearn.metrics import accuracy_score# type: ignore
# RANDOM_STATE = 42
# data =merged_df

# # features = ['batter', 'innings', 'wickets_fallen', 'bowling_team', 'first_ball']
# target = 'won'

# # Preprocess the data
# data = pd.get_dummies(data=data,columns=['batter', 'batting_team', 'bowling_team','venue'],dtype=int)



# data.head()
# data["won"].value_counts()
# y = data[target]
# X = data.drop(columns=[target])

# model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
# scores = cross_val_score(model, X, y, cv=5)

# # Print the mean and standard deviation of the cross-validation scores
# print('Cross-validation scores:', scores)
# print('Mean cross-validation score:', scores.mean())
# print('Standard deviation of cross-validation scores:', scores.std())

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

# model.fit(X_train, y_train)

# y_pred = model.predict(X_test)
# print('Accuracy:', accuracy_score(y_test, y_pred))
# win_probability = model.predict_proba(X_test)[:, 1]
# print('Win probability:', win_probability)







In [13]:
# X_test.head()

In [14]:
# win_proba_df = pd.DataFrame(win_probability, columns=['win_probability'])

# X_test = X_test.reset_index(drop=True)
# win_proba_df = win_proba_df.reset_index(drop=True)
# result = pd.concat([X_test, win_proba_df], axis=1)

# result.head()

In [15]:
# y_train.value_counts()

In [16]:
# y_test.value_counts()