# Data Science part 

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv("E:/study files/R programming/bpl_all_matches.csv")

In [3]:
df['batting_team'].unique()

array(['Rajshahi Kings', 'Comilla Victorians', 'Khulna Titans',
       'Rangpur Riders', 'Chittagong Vikings', 'Barisal Bulls',
       'Dhaka Dynamites', 'Sylhet Sixers', 'Sylhet Thunder',
       'Chattogram Challengers', 'Cumilla Warriors', 'Rangpur Rangers',
       'Dhaka Platoon', 'Rajshahi Royals', 'Khulna Tigers',
       'Fortune Barishal', 'Minister Group Dhaka', 'Sylhet Sunrisers',
       'Sylhet Strikers', 'Dhaka Dominators', 'Durdanto Dhaka',
       'Durbar Rajshahi', 'Dhaka Capital', 'Chittagong Kings',
       'Barisal Burners', 'Duronto Rajshahi', 'Khulna Royal Bengals',
       'Sylhet Royals', 'Dhaka Gladiators', 'Sylhet Super Stars'],
      dtype=object)

In [4]:
team_mapping = {
    'Dhaka Dynamites': 'Dhaka', 'Dhaka Platoon': 'Dhaka', 'Minister Group Dhaka': 'Dhaka',
    'Dhaka Dominators': 'Dhaka', 'Durdanto Dhaka': 'Dhaka', 'Dhaka Capital': 'Dhaka', 'Dhaka Gladiators': 'Dhaka',
    
    'Sylhet Sixers': 'Sylhet', 'Sylhet Thunder': 'Sylhet', 'Sylhet Sunrisers': 'Sylhet',
    'Sylhet Strikers': 'Sylhet', 'Sylhet Royals': 'Sylhet', 'Sylhet Super Stars': 'Sylhet',
    
    'Rajshahi Kings': 'Rajshahi', 'Rajshahi Royals': 'Rajshahi', 'Durbar Rajshahi': 'Rajshahi', 'Duronto Rajshahi': 'Rajshahi',
    
    'Barisal Bulls': 'Barisal', 'Fortune Barishal': 'Barisal', 'Barisal Burners': 'Barisal',
    
    'Khulna Titans': 'Khulna', 'Khulna Tigers': 'Khulna', 'Khulna Royal Bengals': 'Khulna',
    
    'Rangpur Riders': 'Rangpur', 'Rangpur Rangers': 'Rangpur',
    
    'Chittagong Vikings': 'Chattogram', 'Chattogram Challengers': 'Chattogram', 'Chittagong Kings': 'Chattogram',
    
    'Comilla Victorians': 'Comilla', 'Cumilla Warriors': 'Comilla'
}

df['batting_team'] = df['batting_team'].map(team_mapping).fillna(df['batting_team'])

In [5]:
df['batting_team'].unique()

array(['Rajshahi', 'Comilla', 'Khulna', 'Rangpur', 'Chattogram',
       'Barisal', 'Dhaka', 'Sylhet'], dtype=object)

In [6]:
def add_match_context(group):
    teams = group['batting_team'].unique()
    if len(teams) >= 2:
        team1, team2 = teams[0], teams[1]
        group['inning'] = group['batting_team'].apply(lambda x: 1 if x == team1 else 2)
        group['bowling_team'] = group['batting_team'].apply(lambda x: team2 if x == team1 else team1)
    else:
        group['inning'] = 1
        group['bowling_team'] = 'Unknown'
    return group

In [7]:
df = df.groupby('match_id', group_keys = False).apply(add_match_context)

  df = df.groupby('match_id', group_keys = False).apply(add_match_context)


In [8]:
df['current_score'] = df.groupby(['match_id', 'inning'])['runs_total'].cumsum()
df['wickets_lost'] = df.groupby(['match_id', 'inning'])['wicket'].cumsum()

df['balls_delivered'] = (df['over']*6) + df['ball']
df['balls_left'] = 120 - df['balls_delivered']
df['balls_left'] = df['balls_left'].clip(lower=0)

In [9]:
inning1_totals = df[df['inning'] == 1].groupby('match_id')['runs_total'].sum().reset_index()
inning1_totals.columns = ['match_id', 'target_score']
inning1_totals['target_score'] = inning1_totals['target_score'] + 1

In [10]:
df = df.merge(inning1_totals, on = 'match_id', how = 'left')

In [11]:
final_scores = df.groupby(['match_id', 'inning'])['current_score'].max().unstack().reset_index()
final_scores.columns = ['match_id', 'inning1_final', 'inning2_final']

match_teams = df.groupby(['match_id', 'inning'])['batting_team'].first().unstack().reset_index()
match_teams.columns = ['match_id', 'team1', 'team2']
match_summary = final_scores.merge(match_teams, on='match_id')

In [12]:
def determine_winner(row):
    if pd.isna(row['inning2_final']): 
        return row['team1']
    if row['inning2_final'] >= row['inning1_final'] + 1:
        return row['team2']
    else:
        return row['team1'] 

match_summary['winner'] = match_summary.apply(determine_winner, axis=1)
df = df.merge(match_summary[['match_id', 'winner']], on='match_id', how='left')
df['result'] = (df['batting_team'] == df['winner']).astype(int)

In [13]:
df_chase = df[df['inning'] == 2].copy()

In [14]:
df_chase['runs_left'] = df_chase['target_score'] - df['current_score']
df_chase['wickets_left'] = 10 - df_chase['wickets_lost']
df_chase['crr'] = (df_chase['current_score'] * 6) / (120 - df_chase['balls_left'])
df_chase['rrr'] = (df_chase['runs_left'] * 6) / df_chase['balls_left']
df_chase = df_chase.replace([np.inf, -np.inf], 0)

feature_columns = ['batting_team', 'bowling_team', 'runs_left', 'balls_left', 'wickets_left', 'target_score', 'crr', 'rrr', 'result']
df_chase = df_chase.dropna(subset = feature_columns)

In [15]:
df_chase.head()

Unnamed: 0,match_id,date,venue,batting_team,over,ball,batter,bowler,runs_batter,runs_extras,...,wickets_lost,balls_delivered,balls_left,target_score,winner,result,runs_left,wickets_left,crr,rrr
121,1063047,2016-11-30,"Shere Bangla National Stadium, Mirpur",Comilla,0,1,Imrul Kayes,Mohammad Sami,4,0,...,0,1,119,125,Comilla,1,121.0,10,24.0,6.10084
122,1063047,2016-11-30,"Shere Bangla National Stadium, Mirpur",Comilla,0,2,Imrul Kayes,Mohammad Sami,0,0,...,0,2,118,125,Comilla,1,121.0,10,12.0,6.152542
123,1063047,2016-11-30,"Shere Bangla National Stadium, Mirpur",Comilla,0,3,Imrul Kayes,Mohammad Sami,0,1,...,0,3,117,125,Comilla,1,120.0,10,10.0,6.153846
124,1063047,2016-11-30,"Shere Bangla National Stadium, Mirpur",Comilla,0,4,Imrul Kayes,Mohammad Sami,0,0,...,0,4,116,125,Comilla,1,120.0,10,7.5,6.206897
125,1063047,2016-11-30,"Shere Bangla National Stadium, Mirpur",Comilla,0,5,Imrul Kayes,Mohammad Sami,0,0,...,0,5,115,125,Comilla,1,120.0,10,6.0,6.26087


In [16]:
def categorize_venue(venue):
    venue = str(venue).lower()
    if 'dhaka' in venue or 'mirpur' in venue:
        return 'Dhaka'
    elif 'chittagong' in venue or 'zohur' in venue or 'zahur' in venue:
        return 'Chattogram'
    elif 'sylhet' in venue:
        return 'Sylhet'
    else:
        return 'Other'

df_chase['venue'] = df_chase['venue'].apply(categorize_venue)

print(df_chase['venue'].unique())

['Dhaka' 'Chattogram' 'Sylhet' 'Other']


# Machine learning part

In [17]:
X = df_chase[['batting_team', 'bowling_team', 'venue', 'runs_left', 'balls_left', 'wickets_left', 'target_score', 'crr', 'rrr']]
y = df_chase['result']

In [18]:
trf = ColumnTransformer([
    ('encoder', OneHotEncoder(sparse_output = False, handle_unknown = 'ignore'), ['batting_team', 'bowling_team', 'venue'])
], remainder = 'passthrough')

In [19]:
pipe = Pipeline(steps =[
    ('preprocessor', trf), 
    ('classifier', LogisticRegression(solver = 'liblinear'))
])

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [21]:
pipe.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('encoder', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,100


In [22]:
from sklearn.metrics import accuracy_score
y_test_pred = pipe.predict(X_test)
accuracy_score(y_test_pred, y_test)

0.8228421794742673

In [23]:
def predict_live(batting, bowling, venue, target, score, wickets, over, ball):
    # Calculate the numbers the model needs
    balls_delivered = (over * 6) + ball
    balls_left = 120 - balls_delivered
    runs_left = target - score
    wickets_left = 10 - wickets
    crr = (score * 6) / balls_delivered if balls_delivered > 0 else 0
    rrr = (runs_left * 6) / balls_left if balls_left > 0 else 0
    
    # Put data into a table for the model
    input_df = pd.DataFrame({
        'batting_team': [batting],
        'bowling_team': [bowling],
        'venue': [venue], 
        'runs_left': [runs_left],
        'balls_left': [balls_left],
        'wickets_left': [wickets_left],
        'target_score': [target],
        'crr': [crr],
        'rrr': [rrr]
    })
    
    # Get Probability
    win_prob = pipe.predict_proba(input_df)[0][1]
    
    print(f"--- Live Match Status ---")
    print(f"{batting} needs {runs_left} runs in {balls_left} balls with {wickets_left} wickets in hand at {venue}.")
    print(f"Win Probability: {win_prob:.1%}")
    print(f"Loss Probability: {1 - win_prob:.1%}")


In [24]:
# input:
# (batting_team, bowling_team, venue, target, current_score, wickets_lost, current_over, current_ball)
predict_live('Dhaka', 'Rajshahi', 'Sylhet', 133, 98, 5, 16, 0)

--- Live Match Status ---
Dhaka needs 35 runs in 24 balls with 5 wickets in hand at Sylhet.
Win Probability: 23.3%
Loss Probability: 76.7%
