# Derive the augmented features from the match state

In [298]:
import pandas as pd
import numpy as np


venues_df = pd.read_csv("train_venues.csv")
player_vs_bowler_df = pd.read_csv("train_player_vs_bowler.csv")
train_set = pd.read_csv("train_set.csv")

player_vs_bowler_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2747 entries, 0 to 2746
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   bowler_type            2747 non-null   object 
 1   strike_rate            2747 non-null   float64
 2   strike_rate_std        2596 non-null   float64
 3   wickets                2747 non-null   int64  
 4   deliveries_per_wicket  2747 non-null   float64
 5   deliveries             2747 non-null   int64  
 6   batter                 2747 non-null   object 
 7   index                  0 non-null      float64
dtypes: float64(4), int64(2), object(2)
memory usage: 171.8+ KB


In [299]:
# row from the training set without the historically derived data
input_dic = {
    # "batter": "A Flintoff",
    "innings": 1,
    "wickets_fallen": 1,
    "bowling_team": "Australia",
    "batting_team": "England",
    "toss_winner": 1,
    "runs_remain": 144.0,
    "first_ball": 24,
    "current_team_total": 29,
    "is_powerplay": True,
    "Left arm Fast": 0.0,
    "Left arm Orthodox": 18.0,
    "Left arm Wrist spin": 0.0,
    "Right arm Fast": 146.0,
    "Right arm Legbreak": 0.0,
    "Right arm Offbreak": 0.0,
    "venue": "GMHBA Stadium"
}


def derive_features(input_dic: dict) -> dict:
	bowler_types = ['Left arm Fast', 'Right arm Fast', 'Left arm Orthodox',
					'Left arm Wrist spin', 'Right arm Legbreak', 'Right arm Offbreak']


	def get_venue_info():
		venue = input_dic['venue']
		del input_dic['venue']
		if venue in venues_df['venue'].values:  # Check if venue exists in DataFrame
			venue_mean_total = venues_df.loc[venues_df['venue'] == venue, 'total_mean']
			venue_first_bat_won_ratio = venues_df.loc[venues_df['venue']
													== venue, 'first_bat_won_ratio']
			# Access single value from Series
			input_dic['venue_mean_total'] = venue_mean_total.values[0]
			input_dic['venue_first_bat_won_ratio'] = venue_first_bat_won_ratio.values[0]
		else:
			input_dic['venue_mean_total'] = venues_df['total_mean'].median()
			input_dic['venue_first_bat_won_ratio'] = venues_df['first_bat_won_ratio'].median()


	def get_runs_remain():
		if input_dic['innings'] == 2:
			return
		if input_dic['innings'] == 1:
			return input_dic['venue_mean_total'] - input_dic['current_team_total']


	bowler_types = ['Left arm Fast', 'Right arm Fast', 'Left arm Orthodox',
					'Left arm Wrist spin', 'Right arm Legbreak', 'Right arm Offbreak']


	def get_bowler_type_data():
		player_vs_bowler = player_vs_bowler_df[player_vs_bowler_df['batter']
											== input_dic['batter']]
		player_vs_bowler.set_index("bowler_type", inplace=True)
		for bowler_type in bowler_types:
			if bowler_type in player_vs_bowler.index:
				input_dic[f'{bowler_type} Expected Runs'] = player_vs_bowler.loc[bowler_type,
																				'strike_rate'] / 100 * input_dic[bowler_type]
				input_dic[f'{bowler_type} Expected Wickets'] = input_dic[bowler_type] / \
					player_vs_bowler.loc[bowler_type, 'deliveries_per_wicket']
				input_dic[f'{bowler_type} Strike Rate'] = player_vs_bowler.loc[bowler_type, 'strike_rate']
				input_dic[f'{bowler_type} Deliveries Per Wicket'] = player_vs_bowler.loc[bowler_type,
																						'deliveries_per_wicket']

			else:
				input_dic[bowler_type] = 0
				input_dic[f'{bowler_type} Expected Runs'] = 0
				input_dic[f'{bowler_type} Expected Wickets'] = 0
				input_dic[f'{bowler_type} Strike Rate'] = 0
				input_dic[f'{bowler_type} Deliveries Per Wicket'] = 0




	def get_expected():
		bowler_types_expected_wickets_cols = [bowler_type + " Expected Wickets" for bowler_type in bowler_types]
		bowler_types_expected_runs_cols = [bowler_type + " Expected Runs" for bowler_type in bowler_types]

		input_dic['expected_wickets'] = sum([input_dic[key] for key in bowler_types_expected_wickets_cols])
		input_dic['expected_runs'] = sum([input_dic[key] for key in bowler_types_expected_runs_cols])

		input_dic['expected_runs'] = input_dic['expected_runs'] / np.sqrt(input_dic['expected_wickets']) 


	get_venue_info()
	get_runs_remain()
	get_bowler_type_data()
	get_expected()

	return input_dic


batter_list = ["A Flintoff","V Kohli"]

data = pd.DataFrame()

for batter in batter_list:
	batter_dic = input_dic.copy()
	batter_dic['batter'] = batter
	new_features = derive_features(batter_dic)
 
	print(new_features)
	data = pd.concat([data,pd.DataFrame([new_features])])
 
 



{'innings': 1, 'wickets_fallen': 1, 'bowling_team': 'Australia', 'batting_team': 'England', 'toss_winner': 1, 'runs_remain': 144.0, 'first_ball': 24, 'current_team_total': 29, 'is_powerplay': True, 'Left arm Fast': 0.0, 'Left arm Orthodox': 18.0, 'Left arm Wrist spin': 0, 'Right arm Fast': 146.0, 'Right arm Legbreak': 0, 'Right arm Offbreak': 0.0, 'batter': 'A Flintoff', 'venue_mean_total': 173.0, 'venue_first_bat_won_ratio': 0.0, 'Left arm Fast Expected Runs': 0.0, 'Left arm Fast Expected Wickets': 0.0, 'Left arm Fast Strike Rate': 166.667, 'Left arm Fast Deliveries Per Wicket': inf, 'Right arm Fast Expected Runs': 125.14244000000001, 'Right arm Fast Expected Wickets': 41.714285714285715, 'Right arm Fast Strike Rate': 85.714, 'Right arm Fast Deliveries Per Wicket': 3.5, 'Left arm Orthodox Expected Runs': 18.0, 'Left arm Orthodox Expected Wickets': 0.0, 'Left arm Orthodox Strike Rate': 100.0, 'Left arm Orthodox Deliveries Per Wicket': inf, 'Left arm Wrist spin Expected Runs': 0, 'Left 

In [300]:
data.replace(np.inf,120,inplace=True)

In [301]:
data_processed = pd.get_dummies(data=data,dtype=int)

In [302]:
data_processed.columns

Index(['innings', 'wickets_fallen', 'toss_winner', 'runs_remain', 'first_ball',
       'current_team_total', 'is_powerplay', 'Left arm Fast',
       'Left arm Orthodox', 'Left arm Wrist spin', 'Right arm Fast',
       'Right arm Legbreak', 'Right arm Offbreak', 'venue_mean_total',
       'venue_first_bat_won_ratio', 'Left arm Fast Expected Runs',
       'Left arm Fast Expected Wickets', 'Left arm Fast Strike Rate',
       'Left arm Fast Deliveries Per Wicket', 'Right arm Fast Expected Runs',
       'Right arm Fast Expected Wickets', 'Right arm Fast Strike Rate',
       'Right arm Fast Deliveries Per Wicket',
       'Left arm Orthodox Expected Runs', 'Left arm Orthodox Expected Wickets',
       'Left arm Orthodox Strike Rate',
       'Left arm Orthodox Deliveries Per Wicket',
       'Left arm Wrist spin Expected Runs',
       'Left arm Wrist spin Expected Wickets',
       'Left arm Wrist spin Strike Rate',
       'Left arm Wrist spin Deliveries Per Wicket',
       'Right arm Legbreak Ex

In [303]:
data_processed.head()

Unnamed: 0,innings,wickets_fallen,toss_winner,runs_remain,first_ball,current_team_total,is_powerplay,Left arm Fast,Left arm Orthodox,Left arm Wrist spin,...,Right arm Offbreak Expected Runs,Right arm Offbreak Expected Wickets,Right arm Offbreak Strike Rate,Right arm Offbreak Deliveries Per Wicket,expected_wickets,expected_runs,bowling_team_Australia,batting_team_England,batter_A Flintoff,batter_V Kohli
0,1,1,1,144.0,24,29,True,0.0,18.0,0.0,...,0.0,0.0,150.0,120.0,41.714286,22.162871,1,1,1,0
0,1,1,1,144.0,24,29,True,0.0,18.0,0.0,...,0.0,0.0,120.385,86.667,3.792371,120.776162,1,1,0,1


In [304]:
import pandas as pd

all_columns = pd.read_csv("./all_columns.csv")

In [305]:
all_columns.columns

Index(['innings', 'wickets_fallen', 'toss_winner', 'runs_remain', 'first_ball',
       'current_team_total', 'is_powerplay', 'Left arm Fast',
       'Left arm Orthodox', 'Left arm Wrist spin',
       ...
       'batter_SA Abbott', 'batter_R Dravid', 'batter_R Ashwin',
       'batter_SS Pathirana', 'batter_LA Pomersbach', 'batter_NJ Reardon',
       'batter_PJ Hughes', 'batter_KMDN Kulasekara', 'batter_L Vincent',
       'batter_HR Walsh'],
      dtype='object', length=509)

In [306]:
len(data_processed.columns)

45

In [307]:
import pandas as pd

all_cols = all_columns.columns.tolist()
data_cols = data_processed.columns.tolist()


missing_columns = set(all_cols) - set(data_cols)

for col in missing_columns:
    data_processed[col] = 0

# data_columns_all = data_columns.reindex(columns=all_columns, fill_value=0)


  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_proce

In [308]:
data_processed.columns

Index(['innings', 'wickets_fallen', 'toss_winner', 'runs_remain', 'first_ball',
       'current_team_total', 'is_powerplay', 'Left arm Fast',
       'Left arm Orthodox', 'Left arm Wrist spin',
       ...
       'batter_J Mubarak', 'batter_PJ van Biljon', 'batter_SD Hope',
       'batter_S Dhawan', 'batter_DJ Hussey', 'batter_LA Pomersbach',
       'batter_DR Flynn', 'batter_SSJ Brooks', 'batter_CJ Bowes',
       'batter_NM Coulter-Nile'],
      dtype='object', length=509)

In [309]:
data = pd.DataFrame()

for batter in batter_list:
	batter_dic = input_dic.copy()
	batter_dic['batter'] = batter
	new_features = derive_features(batter_dic)
 
	print(new_features)
	data = pd.concat([data,pd.DataFrame([new_features])])
	
data.replace(np.inf,120,inplace=True)
data.reset_index(inplace=True, drop=True)
data_processed = pd.get_dummies(data=data,dtype=int)


all_cols = all_columns.columns.tolist()
data_cols = data_processed.columns.tolist()


missing_columns = set(all_cols) - set(data_cols)

for col in missing_columns:
    data_processed[col] = 0


	
 

  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_proce

{'innings': 1, 'wickets_fallen': 1, 'bowling_team': 'Australia', 'batting_team': 'England', 'toss_winner': 1, 'runs_remain': 144.0, 'first_ball': 24, 'current_team_total': 29, 'is_powerplay': True, 'Left arm Fast': 0.0, 'Left arm Orthodox': 18.0, 'Left arm Wrist spin': 0, 'Right arm Fast': 146.0, 'Right arm Legbreak': 0, 'Right arm Offbreak': 0.0, 'batter': 'A Flintoff', 'venue_mean_total': 173.0, 'venue_first_bat_won_ratio': 0.0, 'Left arm Fast Expected Runs': 0.0, 'Left arm Fast Expected Wickets': 0.0, 'Left arm Fast Strike Rate': 166.667, 'Left arm Fast Deliveries Per Wicket': inf, 'Right arm Fast Expected Runs': 125.14244000000001, 'Right arm Fast Expected Wickets': 41.714285714285715, 'Right arm Fast Strike Rate': 85.714, 'Right arm Fast Deliveries Per Wicket': 3.5, 'Left arm Orthodox Expected Runs': 18.0, 'Left arm Orthodox Expected Wickets': 0.0, 'Left arm Orthodox Strike Rate': 100.0, 'Left arm Orthodox Deliveries Per Wicket': inf, 'Left arm Wrist spin Expected Runs': 0, 'Left 

  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_processed[col] = 0
  data_proce

In [310]:
data_processed.head()

Unnamed: 0,innings,wickets_fallen,toss_winner,runs_remain,first_ball,current_team_total,is_powerplay,Left arm Fast,Left arm Orthodox,Left arm Wrist spin,...,batter_J Mubarak,batter_PJ van Biljon,batter_SD Hope,batter_S Dhawan,batter_DJ Hussey,batter_LA Pomersbach,batter_DR Flynn,batter_SSJ Brooks,batter_CJ Bowes,batter_NM Coulter-Nile
0,1,1,1,144.0,24,29,True,0.0,18.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,144.0,24,29,True,0.0,18.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [311]:
from joblib import load

nrr_model = load("pkls/nrr_model.pkl")
batter_run_model = load("pkls/batter_runs_model.pkl")
final_team_total_model = load("pkls/final_team_total_model.pkl")
sr_model = load("pkls/strike_rate_model.pkl")

In [312]:
import pandas as pd

# Assuming data_processed is a Pandas DataFrame

# Make predictions
nrr_pred = nrr_model.predict(data_processed)
sr_pred = sr_model.predict(data_processed)
batter_run_pred = batter_run_model.predict(data_processed)
final_team_total_pred = final_team_total_model.predict(data_processed)


# Add predictions as new columns
data["nrr_predicted"] = nrr_pred
data["sr_predicted"] = sr_pred
data["batter_run_predicted"] = batter_run_pred
data["final_team_total_predicted"] = final_team_total_pred

# Now data_processed contains the original features and the predictions


data.sort_values(by='nrr_predicted', ascending=False, inplace=True)
data.reset_index(inplace=True, drop=True)
data.head()

output = data.loc[0, ['batter', 'nrr_predicted', 'sr_predicted', 'batter_run_predicted', 'final_team_total_predicted']]
output

batter                           V Kohli
nrr_predicted                   0.679731
sr_predicted                  143.294281
batter_run_predicted           50.873676
final_team_total_predicted    251.300903
Name: 0, dtype: object