In [1]:
import math

import pandas as pd

In [2]:
# low_memory=False makes pandas infer every column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up with mixed types.
# NOTE(review): the output saved under this cell looks like the tail of a
# pandas DtypeWarning — confirm it no longer appears; if the column dtypes are
# known, passing dtype=... explicitly is the stricter alternative.
df = pd.read_csv('./combinedv2.csv', low_memory=False)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
# Drop the unused 'version', '2.2.0' and '2.1.0' columns, then every row that is
# entirely empty or has no match_id.
# errors='ignore' keeps this cell re-runnable: once the columns are gone a
# second run would otherwise raise KeyError. Chaining (instead of inplace=True)
# rebinds df to a fresh frame in one step.
df = (
    df
    .drop(columns=['version', '2.2.0', '2.1.0'], errors='ignore')
    .dropna(how='all')
    .dropna(subset=['match_id'])
)

In [4]:
# Running totals within each (match_id, batting_team) group:
# runs off the bat, extras, their sum, and the number of wickets so far.
innings_groups = df.groupby(['match_id', 'batting_team'])

bat_runs_so_far = innings_groups['runs_off_bat'].cumsum()
extras_so_far = innings_groups['extras'].cumsum()
# A row counts as a wicket whenever wicket_type is not null.
wickets_so_far = innings_groups['wicket_type'].transform(
    lambda col: col.notnull().cumsum()
)

df['bat_run_total'] = bat_runs_so_far
df['extras_total'] = extras_so_far
df['run_total'] = df['bat_run_total'] + df['extras_total']
df['wickets'] = wickets_so_far

In [5]:
# Create columns for overs left and run rate
def calc_overs_left(ball, total_overs=20):
    """Return the overs remaining after a delivery, in ``overs.balls`` form.

    Args:
        ball: Delivery identifier read as ``<over index>.<delivery number>``
            (e.g. ``0.1`` is delivery 1 of the first over).
        total_overs: Length of the innings in overs. Defaults to 20, the
            value this function previously hard-coded.

    Returns:
        float: Whole overs left plus the balls left in the current over
        expressed in tenths (``19.5`` means 19 overs and 5 balls).
    """
    over_index = math.floor(ball)
    # Read the delivery number arithmetically instead of from the last
    # character of str(ball), which depends on how the float is printed.
    delivery = round((ball - over_index) * 10)
    # Delivery numbers above 6 would make the "balls left" part negative
    # (0.7 used to yield 18.9), so clamp at zero.
    balls_left_in_over = max(0, 6 - delivery)
    return (balls_left_in_over / 10) + (total_overs - math.ceil(ball))

def calc_run_rate(ball, run_total):
    """Return the runs-per-over rate after a delivery.

    Args:
        ball: Delivery identifier read as ``<over index>.<delivery number>``.
        run_total: Cumulative runs scored up to and including this delivery.

    Returns:
        float: ``run_total`` divided by the number of balls counted so far,
        scaled to six balls. Returns ``0.0`` when no ball has been counted,
        rather than dividing by zero.
    """
    over_index = math.floor(ball)
    # Read the delivery number arithmetically instead of from the last
    # character of str(ball), which depends on how the float is printed.
    delivery = round((ball - over_index) * 10)
    balls_passed = (6 * over_index) + delivery
    if balls_passed <= 0:
        return 0.0
    return (run_total / balls_passed) * 6

# Overs remaining after each delivery.
df['overs_left'] = df['ball'].apply(calc_overs_left)
# Pair the two needed columns directly: df.apply(axis=1) would build a full
# Series for every row just to read two values, and does not return a plain
# column for an empty frame.
df['run_rate'] = [
    calc_run_rate(ball, run_total)
    for ball, run_total in zip(df['ball'], df['run_total'])
]

In [None]:
# Calculate striker and non-striker scores and strike rates
def calculate_scores(group):
    """Add running batter scores and strike rates for every delivery.

    Totals are kept per batter, keyed by the names in the ``striker`` and
    ``non_striker`` columns, so each row reports the figures of whoever those
    columns say is at each end. Nothing has to be inferred about strike
    rotation, and a new name simply starts from zero.

    Args:
        group: DataFrame of deliveries in playing order with the columns
            ``striker``, ``non_striker`` and ``runs_off_bat``.

    Returns:
        DataFrame: A copy of ``group`` with four added columns,
        ``striker_score``, ``non_striker_score``, ``striker_rate`` and
        ``non_striker_rate``, each holding the value after that delivery.
        A batter with no balls counted has a rate of ``0.0``.

    Notes:
        * Every row counts as one ball faced by the striker.
          NOTE(review): deliveries that should not count as a ball faced
          (e.g. wides) are not excluded here — confirm whether they should be.
        * A batter whose name appears again later in the same group carries
          on from their earlier totals.
    """
    runs_by_batter = {}
    balls_by_batter = {}

    def _strike_rate(batter):
        # Runs per 100 balls; 0.0 until the batter has a ball counted.
        faced = balls_by_batter.get(batter, 0)
        if not faced:
            return 0.0
        return (runs_by_batter.get(batter, 0) / faced) * 100

    striker_scores = []
    non_striker_scores = []
    striker_rates = []
    non_striker_rates = []

    # zip over the three needed columns is much cheaper than iterrows().
    deliveries = zip(group['striker'], group['non_striker'], group['runs_off_bat'])
    for striker, non_striker, runs in deliveries:
        runs_by_batter[striker] = runs_by_batter.get(striker, 0) + runs
        balls_by_batter[striker] = balls_by_batter.get(striker, 0) + 1

        striker_scores.append(runs_by_batter[striker])
        striker_rates.append(_strike_rate(striker))
        non_striker_scores.append(runs_by_batter.get(non_striker, 0))
        non_striker_rates.append(_strike_rate(non_striker))

    # Work on a copy: groupby.apply does not support functions that mutate
    # the group they are handed.
    result = group.copy()
    result['striker_score'] = striker_scores
    result['non_striker_score'] = non_striker_scores
    result['striker_rate'] = striker_rates
    result['non_striker_rate'] = non_striker_rates
    return result

# Group on the same (match_id, batting_team) keys used for the cumulative
# columns, so each calculate_scores call sees a single team's deliveries rather
# than a whole match. reset_index(drop=True) discards whatever index the
# groupby/apply produces.
df = (
    df.groupby(['match_id', 'batting_team'])
    .apply(calculate_scores)
    .reset_index(drop=True)
)
df.head(20)

In [7]:
#Calculate final score corresponding to each ball (target variable)
def final_score(group):
    """Attach the group's closing ``run_total`` to every row.

    Args:
        group: DataFrame with a ``run_total`` column, rows in playing order.

    Returns:
        DataFrame: A copy of ``group`` with a ``final_score`` column holding
        the ``run_total`` of the last row on every row. An empty group gets
        an empty ``final_score`` column instead of raising IndexError.
    """
    if len(group):
        closing_total = group['run_total'].iloc[-1]
    else:
        closing_total = float('nan')
    # One broadcast assignment replaces the old per-row loop, which rewrote the
    # identical column once for every row. assign() returns a new frame, so the
    # group handed in by groupby.apply is not mutated.
    return group.assign(final_score=closing_total)

# Run final_score once per (match_id, batting_team) group, then flatten the index.
by_innings = df.groupby(['match_id', 'batting_team'])
df = by_innings.apply(final_score).reset_index(drop=True)

In [8]:
# Write the processed frame to CSV (without the index) for use in model training
df.to_csv(path_or_buf='processed_data.csv', index=False)