In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingGridSearchCV
import matplotlib.pyplot as plt
import joblib
from sklearn.calibration import CalibrationDisplay

In [2]:
# Load the per-match summary table and the ball-by-ball log.
# In both files the first CSV column is used as the row index.
csv_options = {'index_col': 0}
match_df = pd.read_csv('data_files/match_details.csv', **csv_options)
ball_df = pd.read_csv('data_files/ball_by_ball.csv', **csv_options)

In [3]:
# --- Recent-form feature ----------------------------------------------------
# For each team and season, "form" is the mean of the win flag over the
# team's previous (up to) five matches, excluding the match being played.

# Chronological order so the rolling window walks matches in sequence.
match_df = match_df.sort_values('date')

sides = ('team_1', 'team_2')
for side in sides:
    # 1 when this side is listed as the winner, else 0.
    match_df[f'{side}_win'] = (match_df[side] == match_df['winner']).astype(int)

# Long format: one row per (match, team) with that team's win flag.
match_keys = ['date', 'season', 'match_id']
per_side_frames = [
    match_df[match_keys + [side, f'{side}_win']].rename(
        columns={side: 'team', f'{side}_win': 'win'}
    )
    for side in sides
]
form_df = pd.concat(per_side_frames).sort_values(by='date').reset_index(drop=True)

# Rolling win rate (window of 5, at least 1 observation) within team+season.
# The group levels are dropped so the result aligns with form_df's own index.
rolling_win_rate = (
    form_df.groupby(['team', 'season'])['win']
    .rolling(window=5, min_periods=1)
    .mean()
)
form_df['form'] = rolling_win_rate.reset_index(level=['team', 'season'], drop=True)

# Shift by one match inside each group so a match never sees its own result.
form_df['form'] = form_df.groupby(['team', 'season'])['form'].shift(1)

# A team's first match of a season has no history; its NaN becomes 0.
# (fillna is applied to the whole frame, exactly as before.)
form_df = form_df.fillna(0)[['match_id', 'team', 'form']]

# Attach the form of each side back onto the match table.
# NOTE(review): team_2 is labelled the batting (chasing) side and team_1 the
# bowling side — this assumes team_1 always bats first; confirm against the data.
for side, form_column in (('team_2', 'batting_team_form'),
                          ('team_1', 'bowling_team_form')):
    side_form = form_df.rename(columns={'team': side, 'form': form_column})
    match_df = match_df.merge(side_form, on=['match_id', side], how='left')

In [4]:
# Match-level attributes that will be joined onto every delivery.
match_level_columns = [
    'match_id', 'date', 'team_1_score', 'venue', 'winner', 'season',
    'batting_team_form', 'bowling_team_form',
]
# target_score is team_1's score plus one run.
# NOTE(review): treats team_1_score as the first-innings total — confirm.
target_columns = match_df[match_level_columns].assign(
    target_score=lambda frame: frame['team_1_score'] + 1
)

In [5]:
# Display the match-level table built in the previous cell for a visual check.
target_columns

Unnamed: 0,match_id,date,team_1_score,venue,winner,season,batting_team_form,bowling_team_form,target_score
0,1304047,2022-03-26,131,"Wankhede Stadium, Mumbai",KKR,2022,0.0,0.0,132
1,1304048,2022-03-27,177,"Brabourne Stadium, Mumbai",DC,2022,0.0,0.0,178
2,1304049,2022-03-27,205,"Dr DY Patil Sports Academy, Mumbai",PBKS,2022,0.0,0.0,206
3,1304050,2022-03-28,158,"Wankhede Stadium, Mumbai",GT,2022,0.0,0.0,159
4,1304051,2022-03-29,210,"Maharashtra Cricket Association Stadium, Pune",RR,2022,0.0,0.0,211
...,...,...,...,...,...,...,...,...,...
288,1473507,2025-05-27,227,Bharat Ratna Shri Atal Bihari Vajpayee Ekana C...,RCB,2025,0.8,0.2,228
289,1473508,2025-05-29,101,Maharaja Yadavindra Singh International Cricke...,RCB,2025,0.8,0.6,102
290,1473509,2025-05-30,228,Maharaja Yadavindra Singh International Cricke...,MI,2025,0.6,0.6,229
291,1473510,2025-06-01,203,"Narendra Modi Stadium, Ahmedabad",PBKS,2025,0.4,0.6,204


In [6]:
# Keep only deliveries whose 'inning' value is 2, then attach the match-level
# columns (target, venue, winner, form, ...) to every remaining delivery.
# NOTE(review): this cell overwrites ball_df, so re-running it without
# reloading the data would merge the match columns a second time.
second_innings = ball_df[ball_df['inning'].eq(2)]
ball_df = second_innings.merge(target_columns, on='match_id')

In [7]:
# Fold the 'New Chandigarh' label into the 'Mullanpur' label so the stadium
# is represented by a single venue category.
venue_aliases = {
    'Maharaja Yadavindra Singh International Cricket Stadium, New Chandigarh':
        'Maharaja Yadavindra Singh International Cricket Stadium, Mullanpur',
}
ball_df['venue'] = ball_df['venue'].replace(venue_aliases)

In [8]:
# --- Running state of the chase, one row per delivery -------------------------
# Deliveries must be in playing order for the cumulative sums to be meaningful.
ball_df = ball_df.sort_values(['date', 'match_id', 'over', 'ball'])
by_match = ball_df.groupby('match_id')

# Runs scored so far and runs still needed (never below zero).
current_score = by_match['total_runs'].cumsum()
runs_required = (ball_df['target_score'] - current_score).clip(lower=0)

# A delivery counts towards the ball total unless its extras_type mentions
# 'wides' or 'noballs'; missing extras_type is treated as a counted ball.
counts_as_ball = ~ball_df['extras_type'].str.contains('wides|noballs', na=False)
balls_bowled = counts_as_ball.groupby(ball_df['match_id']).cumsum()
# 120 = balls in the innings (assumes a full 20-over chase — TODO confirm
# how shortened matches should be handled).
balls_remaining = 120 - balls_bowled

wickets_taken = by_match['is_wicket'].cumsum()

ball_df = ball_df.assign(
    current_score=current_score,
    runs_required=runs_required,
    balls_bowled=balls_bowled,
    balls_remaining=balls_remaining,
    wickets_taken=wickets_taken,
    wickets_remaining=10 - wickets_taken,
    # Run rates are per over (6 balls). A zero denominator yields inf/NaN
    # here; those rows are removed in the cleaning cell further down.
    crr=(current_score * 6) / balls_bowled,
    rrr=(runs_required * 6) / balls_remaining,
    # Label: 1 when the batting side is the recorded winner.
    did_win=(ball_df['batting_team'] == ball_df['winner']).astype(int),
)

In [9]:
# Display the delivery-level frame, now including the engineered chase-state columns.
ball_df

Unnamed: 0,match_id,inning,is_super_over,batting_team,bowling_team,over,ball,batter,non_striker,bowler,...,target_score,current_score,runs_required,balls_bowled,balls_remaining,wickets_taken,wickets_remaining,crr,rrr,did_win
24812,1304047,2,False,KKR,CSK,0,1,AM Rahane,VR Iyer,TU Deshpande,...,132,0,132,1,119,0,10,0.000000,6.655462,1
24813,1304047,2,False,KKR,CSK,0,2,AM Rahane,VR Iyer,TU Deshpande,...,132,0,132,2,118,0,10,0.000000,6.711864,1
24814,1304047,2,False,KKR,CSK,0,3,AM Rahane,VR Iyer,TU Deshpande,...,132,0,132,3,117,0,10,0.000000,6.769231,1
24815,1304047,2,False,KKR,CSK,0,4,AM Rahane,VR Iyer,TU Deshpande,...,132,2,130,4,116,0,10,3.000000,6.724138,1
24816,1304047,2,False,KKR,CSK,0,5,AM Rahane,VR Iyer,TU Deshpande,...,132,6,126,5,115,0,10,7.200000,6.573913,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19262,1473511,2,False,PBKS,RCB,19,2,Shashank Singh,KA Jamieson,JR Hazlewood,...,191,162,29,116,4,7,3,8.379310,43.500000,0
19263,1473511,2,False,PBKS,RCB,19,3,Shashank Singh,KA Jamieson,JR Hazlewood,...,191,168,23,117,3,7,3,8.615385,46.000000,0
19264,1473511,2,False,PBKS,RCB,19,4,Shashank Singh,KA Jamieson,JR Hazlewood,...,191,172,19,118,2,7,3,8.745763,57.000000,0
19265,1473511,2,False,PBKS,RCB,19,5,Shashank Singh,KA Jamieson,JR Hazlewood,...,191,178,13,119,1,7,3,8.974790,78.000000,0


In [10]:
# Columns carried into modelling: identifiers (match_id, date), the
# chase-state and form features, the venue, and the did_win label.
model_columns = [
    'match_id', 'date',
    'runs_required', 'balls_remaining', 'wickets_remaining', 'target_score',
    'crr', 'rrr',
    'batting_team_form', 'bowling_team_form',
    'venue',
    'did_win',
]
# Independent copy so later cleaning does not touch ball_df.
final_df = ball_df[model_columns].copy()

In [11]:
# Display the modelling table (before rows with NaN/inf values are removed).
final_df

Unnamed: 0,match_id,date,runs_required,balls_remaining,wickets_remaining,target_score,crr,rrr,batting_team_form,bowling_team_form,venue,did_win
24812,1304047,2022-03-26,132,119,10,132,0.000000,6.655462,0.0,0.0,"Wankhede Stadium, Mumbai",1
24813,1304047,2022-03-26,132,118,10,132,0.000000,6.711864,0.0,0.0,"Wankhede Stadium, Mumbai",1
24814,1304047,2022-03-26,132,117,10,132,0.000000,6.769231,0.0,0.0,"Wankhede Stadium, Mumbai",1
24815,1304047,2022-03-26,130,116,10,132,3.000000,6.724138,0.0,0.0,"Wankhede Stadium, Mumbai",1
24816,1304047,2022-03-26,126,115,10,132,7.200000,6.573913,0.0,0.0,"Wankhede Stadium, Mumbai",1
...,...,...,...,...,...,...,...,...,...,...,...,...
19262,1473511,2025-06-03,29,4,3,191,8.379310,43.500000,0.6,0.8,"Narendra Modi Stadium, Ahmedabad",0
19263,1473511,2025-06-03,23,3,3,191,8.615385,46.000000,0.6,0.8,"Narendra Modi Stadium, Ahmedabad",0
19264,1473511,2025-06-03,19,2,3,191,8.745763,57.000000,0.6,0.8,"Narendra Modi Stadium, Ahmedabad",0
19265,1473511,2025-06-03,13,1,3,191,8.974790,78.000000,0.6,0.8,"Narendra Modi Stadium, Ahmedabad",0


In [12]:
# Remove rows the models cannot use:
#   * any missing value,
#   * any +/-inf (produced by a zero denominator in crr / rrr),
#   * deliveries with no balls remaining.
final_df = final_df.dropna()
has_infinite_value = final_df.isin([float('inf'), float('-inf')]).any(axis=1)
has_balls_left = final_df['balls_remaining'] != 0
final_df = final_df[~has_infinite_value & has_balls_left]

In [13]:
# One-hot encode 'venue'; every other column is passed through unchanged.
# handle_unknown='ignore' makes a venue unseen during fit encode as all zeros
# instead of raising at predict time.
# NOTE(review): the numeric columns reach LogisticRegression / SVC unscaled
# (StandardScaler is imported but not used) — confirm this is intentional.
venue_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[('onehot', venue_encoder, ['venue'])],
    remainder='passthrough',
    force_int_remainder_cols=False,
)

In [14]:
# Chronological hold-out: matches up to and including the split date are used
# for training, later matches for testing.
# NOTE(review): 'date' is compared against a 'YYYY-MM-DD' string; this relies
# on the column holding ISO-formatted date strings (as the displayed frames
# suggest) — confirm the dtype.
split_date = '2025-04-19'
time_train_df = final_df[final_df['date'] <= split_date]
time_test_df = final_df[final_df['date'] > split_date]

# Label and identifier columns must not reach the model. The same list is
# dropped from both splits so train and test feature matrices share one schema
# (previously 'date' was left in the test matrix only).
non_feature_columns = ['did_win', 'match_id', 'date']
X_train_time = time_train_df.drop(non_feature_columns, axis=1)
y_train_time = time_train_df['did_win']
X_test_time = time_test_df.drop(non_feature_columns, axis=1)
y_test_time = time_test_df['did_win']

In [15]:
# Candidate classifiers compared on the time-based split.
# Every stochastic estimator is seeded with 42 so results are repeatable.
# The SVC is seeded too: with probability=True its probability calibration
# uses internal cross-validation, which is only reproducible with a fixed
# random_state. This also matches the other SVC pipelines in this notebook.
models = {
    'Logistic Regression': LogisticRegression(max_iter=5000),
    'SVM': SVC(probability=True, random_state=42),
    'Random Forrest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGBoost': XGBClassifier(n_estimators=100, random_state=42),
}

In [16]:
# Fit each candidate inside the shared preprocessing pipeline and collect
# train/test metrics into one comparison table.
results = {}
for name, model in models.items():
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model),
    ])
    model_pipeline.fit(X_train_time, y_train_time)

    y_pred_train = model_pipeline.predict(X_train_time)
    y_pred = model_pipeline.predict(X_test_time)
    # Positive-class probability; not used in the metrics below.
    y_pred_proba = model_pipeline.predict_proba(X_test_time)[:, 1]

    # All metric calls use the documented (y_true, y_pred) argument order.
    # accuracy_score was previously called with the arguments swapped, which
    # gives the same number but was inconsistent with the other metrics.
    train_acc = accuracy_score(y_train_time, y_pred_train)
    test_acc = accuracy_score(y_test_time, y_pred)
    pres = precision_score(y_test_time, y_pred)
    rec = recall_score(y_test_time, y_pred)
    f1 = f1_score(y_test_time, y_pred)

    results[name] = {
        'Training Accuracy': train_acc,
        'Test Accuracy': test_acc,
        'Precision': pres,
        'Recall': rec,
        'F1 score': f1,
    }

# One row per model, one column per metric.
results_df = pd.DataFrame(results).T
results_df

Unnamed: 0,Training Accuracy,Test Accuracy,Precision,Recall,F1 score
Logistic Regression,0.800909,0.774027,0.823178,0.689345,0.75034
SVM,0.789111,0.831937,0.930674,0.711856,0.806689
Random Forrest,1.0,0.775998,0.849807,0.662331,0.744448
XGBoost,1.0,0.677674,0.754982,0.511756,0.610018


In [19]:
# Exhaustive grid search over random-forest hyper-parameters, scored by
# accuracy with 3-fold cross-validation on the training split.
rf_classifier = RandomForestClassifier(random_state=42)
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', rf_classifier),
])

# 4 x 4 x 3 x 2 = 96 candidate settings; the 'classifier__' prefix routes each
# parameter to the pipeline step named 'classifier'.
param_grid_rf = {
    'classifier__n_estimators': [100, 200, 300, 400],
    'classifier__max_depth': [5, 10, 15, 20],
    'classifier__min_samples_leaf': [5, 10, 15],
    'classifier__max_features': ['sqrt', 'log2'],
}

grid_search_rf = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=param_grid_rf,
    cv=3,
    n_jobs=-1,      # use all available cores
    verbose=2,      # one log line per fit
    scoring='accuracy',
)
grid_search_rf.fit(X_train_time, y_train_time)

# best_score_ is the mean cross-validated accuracy of the best candidate.
print(f'Best Random Forrest Parameters: {grid_search_rf.best_params_}')
print(f'Best Random Forrest Model Accuracy: {grid_search_rf.best_score_}')

Fitting 3 folds for each of 96 candidates, totalling 288 fits
[CV] END classifier__max_depth=5, classifier__max_features=sqrt, classifier__min_samples_leaf=5, classifier__n_estimators=100; total time=   0.9s
[CV] END classifier__max_depth=5, classifier__max_features=sqrt, classifier__min_samples_leaf=5, classifier__n_estimators=100; total time=   0.9s
[CV] END classifier__max_depth=5, classifier__max_features=sqrt, classifier__min_samples_leaf=5, classifier__n_estimators=100; total time=   0.9s
[CV] END classifier__max_depth=5, classifier__max_features=sqrt, classifier__min_samples_leaf=5, classifier__n_estimators=200; total time=   1.7s
[CV] END classifier__max_depth=5, classifier__max_features=sqrt, classifier__min_samples_leaf=5, classifier__n_estimators=200; total time=   1.9s
[CV] END classifier__max_depth=5, classifier__max_features=sqrt, classifier__min_samples_leaf=5, classifier__n_estimators=200; total time=   1.9s
[CV] END classifier__max_depth=5, classifier__max_features=sqr

In [21]:
# Successive-halving search over SVC hyper-parameters, optimizing F1 with
# 5-fold cross-validation on the training split.
# The pipeline is built with kernel='rbf', but the grid below overrides the
# kernel, so both 'rbf' and 'sigmoid' are evaluated.
svm_rbf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', SVC(probability=True, random_state=42, kernel='rbf'))
])
# 5 x 4 x 2 x 2 = 80 candidate settings.
svm_param_grid = {
    'classifier__C': [0.1, 1, 10, 50,100],
    'classifier__gamma': [0.01, 0.001, 'scale','auto'],
    'classifier__kernel': ['rbf','sigmoid'],
    'classifier__class_weight': [None, 'balanced']
}

svm_halving = HalvingGridSearchCV(
    estimator=svm_rbf_pipeline,
    param_grid=svm_param_grid,
    scoring='f1',
    cv=5,
    factor=2,          # keep roughly half of the candidates each iteration
    verbose=2,
    n_jobs=-1,
    random_state = 42  # fixes the subsampling done at each halving iteration
)

svm_halving.fit(X_train_time, y_train_time)

print("Best Params (Halving SVM):", svm_halving.best_params_)
# best_score_ is reported in the search's scoring metric, which is F1 here,
# so the label says F1 (it previously said "Accuracy").
print("Best F1 Score:", svm_halving.best_score_)

n_iterations: 7
n_required_iterations: 7
n_possible_iterations: 7
min_resources_: 460
max_resources_: 29499
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 80
n_resources: 460
Fitting 5 folds for each of 80 candidates, totalling 400 fits


[CV] END classifier__C=0.1, classifier__class_weight=None, classifier__gamma=0.01, classifier__kernel=rbf; total time=   0.0s
[CV] END classifier__C=0.1, classifier__class_weight=None, classifier__gamma=0.01, classifier__kernel=rbf; total time=   0.0s
[CV] END classifier__C=0.1, classifier__class_weight=None, classifier__gamma=0.01, classifier__kernel=sigmoid; total time=   0.0s
[CV] END classifier__C=0.1, classifier__class_weight=None, classifier__gamma=0.01, classifier__kernel=sigmoid; total time=   0.0s
[CV] END classifier__C=0.1, classifier__class_weight=None, classifier__gamma=0.01, classifier__kernel=sigmoid; total time=   0.0s
[CV] END classifier__C=0.1, classifier__class_weight=None, classifier__gamma=0.01, classifier__kernel=rbf; total time=   0.1s
[CV] END classifier__C=0.1, classifier__class_weight=None, classifier__gamma=0.01, classifier__kernel=sigmoid; total time=   0.0s
[CV] END classifier__C=0.1, classifier__class_weight=None, classifier__gamma=0.01, classifier__kernel=



[CV] END classifier__C=50, classifier__class_weight=balanced, classifier__gamma=scale, classifier__kernel=rbf; total time=  32.9s
[CV] END classifier__C=50, classifier__class_weight=None, classifier__gamma=scale, classifier__kernel=rbf; total time=  25.7s
[CV] END classifier__C=50, classifier__class_weight=None, classifier__gamma=scale, classifier__kernel=rbf; total time=  25.6s
[CV] END classifier__C=100, classifier__class_weight=None, classifier__gamma=scale, classifier__kernel=rbf; total time=  26.6s
[CV] END classifier__C=100, classifier__class_weight=None, classifier__gamma=scale, classifier__kernel=rbf; total time=  24.2s
[CV] END classifier__C=100, classifier__class_weight=None, classifier__gamma=scale, classifier__kernel=rbf; total time=  26.4s
[CV] END classifier__C=100, classifier__class_weight=None, classifier__gamma=scale, classifier__kernel=rbf; total time=  27.2s
[CV] END classifier__C=100, classifier__class_weight=None, classifier__gamma=scale, classifier__kernel=rbf; to

In [23]:
# Score the best pipeline found by the halving search on the held-out split.
y_pred_tuned = svm_halving.best_estimator_.predict(X_test_time)
print('Metrics for Tuned Model')
tuned_report = (
    ("Test Accuracy:", accuracy_score),
    ("Test Precision:", precision_score),
    ("Test Recall:", recall_score),
    ("Test F1:", f1_score),
)
for label, metric in tuned_report:
    print(label, metric(y_test_time, y_pred_tuned))

Metrics for Tuned Model
Test Accuracy: 0.7979300147856087
Test Precision: 0.8566243194192378
Test Recall: 0.7083541770885443
Test F1: 0.7754654983570646


In [17]:
# Refit the untuned RBF-kernel SVC on the training split and report its
# held-out metrics; this is the pipeline that gets saved to disk afterwards.
# NOTE(review): this cell's execution count (17) is lower than the tuning
# cells above it, so the notebook was not run top to bottom — re-run with
# Restart & Run All before relying on the outputs.
baseline_svc = SVC(probability=True, random_state=42, kernel='rbf')
best_time_split_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', baseline_svc),
])
best_time_split_model.fit(X_train_time, y_train_time)

print('Metrics for regular baseline model')
y_pred_svm = best_time_split_model.predict(X_test_time)
baseline_report = (
    ("Test Accuracy:", accuracy_score),
    ("Test Precision:", precision_score),
    ("Test Recall:", recall_score),
    ("Test F1:", f1_score),
)
for label, metric in baseline_report:
    print(label, metric(y_test_time, y_pred_svm))

Metrics for regular baseline model
Test Accuracy: 0.8319369147363234
Test Precision: 0.9306736429038587
Test Recall: 0.711855927963982
Test F1: 0.8066893424036281


In [18]:
# Persist the fitted pipeline (preprocessing + classifier) as a single artifact.
# The 'models/' directory must already exist.
model_path = 'models/best_chase_prediction_model.pkl'
joblib.dump(best_time_split_model, model_path)

['models/best_chase_prediction_model.pkl']