# Derive the augmented features from the match state

In [84]:
import pandas as pd
import numpy as np


# Read the three training CSVs; the targets on the left are bound in the
# same order as the file names on the right.
venues_df, player_vs_bowler_df, train_set = (
	pd.read_csv(csv_name)
	for csv_name in ("train_venues.csv", "train_player_vs_bowler.csv", "train_set.csv")
)

# Show column dtypes and non-null counts of the batter-vs-bowler-type table.
player_vs_bowler_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2747 entries, 0 to 2746
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   bowler_type            2747 non-null   object 
 1   strike_rate            2747 non-null   float64
 2   strike_rate_std        2596 non-null   float64
 3   wickets                2747 non-null   int64  
 4   deliveries_per_wicket  2747 non-null   float64
 5   deliveries             2747 non-null   int64  
 6   batter                 2747 non-null   object 
 7   index                  0 non-null      float64
dtypes: float64(4), int64(2), object(2)
memory usage: 171.8+ KB


In [85]:
# One training-set row holding only the raw match state, i.e. without the
# historically derived columns. The "batter" key is intentionally absent
# (e.g. "A Flintoff"); it is filled in per batter before deriving features.
input_dic = dict(
	innings=1,
	wickets_fallen=1,
	bowling_team="Australia",
	batting_team="England",
	toss_winner=1,
	runs_remain=144.0,
	first_ball=24,
	current_team_total=29,
	is_powerplay=True,
)

# Per-bowler-type values; these keys contain spaces, so they cannot be
# passed as keyword arguments above.
input_dic.update({
	"Left arm Fast": 0.0,
	"Left arm Orthodox": 18.0,
	"Left arm Wrist spin": 0.0,
	"Right arm Fast": 146.0,
	"Right arm Legbreak": 0.0,
	"Right arm Offbreak": 0.0,
})

input_dic["venue"] = "GMHBA Stadium"


def derive_features(input_dic: dict) -> dict:
	"""Add venue, batter-vs-bowler-type and aggregate features to a match-state row.

	The row is updated in place and the same object is returned. Lookups read
	the module-level ``venues_df`` and ``player_vs_bowler_df`` frames.

	Args:
		input_dic: Match state. Keys read here: ``venue``, ``innings``,
			``current_team_total``, ``batter`` and one numeric entry per name
			in ``bowler_types`` (used as a number of deliveries in the
			formulas below).

	Returns:
		``input_dic`` itself, with these keys added or overwritten:
		``venue_mean``, ``venue_first_bat_won_ratio``, ``runs_remain`` (first
		innings only), four ``"<bowler type> ..."`` keys per bowler type,
		``expected_wickets`` and ``expected_runs``.

	Raises:
		KeyError: If ``input_dic`` lacks one of the keys listed above.
	"""
	bowler_types = ['Left arm Fast', 'Right arm Fast', 'Left arm Orthodox',
					'Left arm Wrist spin', 'Right arm Legbreak', 'Right arm Offbreak']

	def get_venue_info():
		"""Store the venue's ``total_mean`` and ``first_bat_won_ratio`` on the row."""
		venue_rows = venues_df.loc[venues_df['venue'] == input_dic['venue']]
		if not venue_rows.empty:
			# First matching row wins, as with ``.values[0]`` on the filtered column.
			input_dic['venue_mean'] = venue_rows['total_mean'].values[0]
			input_dic['venue_first_bat_won_ratio'] = venue_rows['first_bat_won_ratio'].values[0]
		else:
			# Unknown venue: fall back to the median over all known venues.
			input_dic['venue_mean'] = venues_df['total_mean'].median()
			input_dic['venue_first_bat_won_ratio'] = venues_df['first_bat_won_ratio'].median()

	def get_runs_remain():
		"""Derive ``runs_remain`` for the first innings from the venue mean.

		Previously the value was computed and returned but never stored, so the
		row kept whatever ``runs_remain`` the caller supplied. For any other
		innings the supplied value is left untouched.
		"""
		if input_dic['innings'] == 1:
			input_dic['runs_remain'] = input_dic['venue_mean'] - input_dic['current_team_total']

	def get_bowler_type_data():
		"""Add per-bowler-type strike rate, dismissal rate and expected values."""
		batter_rows = player_vs_bowler_df[player_vs_bowler_df['batter'] == input_dic['batter']]
		# Non-inplace set_index: mutating a filtered slice in place is what
		# triggers pandas' SettingWithCopyWarning.
		per_type = batter_rows.set_index("bowler_type")

		for bowler_type in bowler_types:
			if bowler_type in per_type.index:
				strike_rate = per_type.loc[bowler_type, 'strike_rate']
				deliveries_per_wicket = per_type.loc[bowler_type, 'deliveries_per_wicket']
				deliveries = input_dic[bowler_type]

				# strike_rate is divided by 100 to get runs per delivery.
				input_dic[f'{bowler_type} Expected Runs'] = strike_rate / 100 * deliveries
				# deliveries_per_wicket may be inf in the data, giving 0 expected wickets.
				input_dic[f'{bowler_type} Expected Wickets'] = deliveries / deliveries_per_wicket
				input_dic[f'{bowler_type} Strike Rate'] = strike_rate
				input_dic[f'{bowler_type} Deliveries Per Wicket'] = deliveries_per_wicket
			else:
				# No history for this batter against this bowler type: every
				# related value, including the supplied count, is zeroed.
				# NOTE(review): overwriting input_dic[bowler_type] is kept from
				# the original — confirm it matches how train_set was built.
				input_dic[bowler_type] = 0
				input_dic[f'{bowler_type} Expected Runs'] = 0
				input_dic[f'{bowler_type} Expected Wickets'] = 0
				input_dic[f'{bowler_type} Strike Rate'] = 0
				input_dic[f'{bowler_type} Deliveries Per Wicket'] = 0

	def get_expected():
		"""Aggregate the per-type expectations into ``expected_wickets`` / ``expected_runs``."""
		expected_wickets = sum(
			input_dic[f'{bowler_type} Expected Wickets'] for bowler_type in bowler_types)
		expected_runs = sum(
			input_dic[f'{bowler_type} Expected Runs'] for bowler_type in bowler_types)

		input_dic['expected_wickets'] = expected_wickets
		# NOTE(review): when expected_wickets is 0 this division yields inf or
		# nan (with a NumPy RuntimeWarning). Left as is because the intended
		# fallback is not visible here — confirm against the training pipeline.
		input_dic['expected_runs'] = expected_runs / np.sqrt(expected_wickets)

	# Order matters: runs_remain needs venue_mean, and the aggregates need the
	# per-bowler-type keys.
	get_venue_info()
	get_runs_remain()
	get_bowler_type_data()
	get_expected()

	return input_dic


# Batters to derive features for; each gets its own copy of the shared
# match-state row so the template in input_dic is never modified.
batter_list = ["A Flintoff"]

for batter in batter_list:
	# Unpacking builds a fresh dict with the batter's name added.
	batter_dic = {**input_dic, 'batter': batter}
	derived_row = derive_features(batter_dic)
	print(derived_row)




{'innings': 1, 'wickets_fallen': 1, 'bowling_team': 'Australia', 'batting_team': 'England', 'toss_winner': 1, 'runs_remain': 144.0, 'first_ball': 24, 'current_team_total': 29, 'is_powerplay': True, 'Left arm Fast': 0.0, 'Left arm Orthodox': 18.0, 'Left arm Wrist spin': 0, 'Right arm Fast': 146.0, 'Right arm Legbreak': 0, 'Right arm Offbreak': 0.0, 'venue': 'GMHBA Stadium', 'batter': 'A Flintoff', 'venue_mean': 173.0, 'venue_first_bat_won_ratio': 0.0, 'Left arm Fast Expected Runs': 0.0, 'Left arm Fast Expected Wickets': 0.0, 'Left arm Fast Strike Rate': 166.667, 'Left arm Fast Deliveries Per Wicket': inf, 'Right arm Fast Expected Runs': 125.14244000000001, 'Right arm Fast Expected Wickets': 41.714285714285715, 'Right arm Fast Strike Rate': 85.714, 'Right arm Fast Deliveries Per Wicket': 3.5, 'Left arm Orthodox Expected Runs': 18.0, 'Left arm Orthodox Expected Wickets': 0.0, 'Left arm Orthodox Strike Rate': 100.0, 'Left arm Orthodox Deliveries Per Wicket': inf, 'Left arm Wrist spin Expec