In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix, roc_curve, roc_auc_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
match_df = pd.read_csv('data_files/match_details.csv',index_col=0)
ball_df = pd.read_csv('data_files/ball_by_ball.csv',index_col=0)

In [3]:
target_columns = match_df[['match_id','team_1_score','venue','winner']].copy()
target_columns['target_score'] = target_columns['team_1_score']+1

In [4]:
ball_df = ball_df[ball_df['inning']==2]
ball_df = ball_df.merge(target_columns,on='match_id')

In [5]:
ball_df['venue'] = ball_df['venue'].replace('Maharaja Yadavindra Singh International Cricket Stadium, New Chandigarh','Maharaja Yadavindra Singh International Cricket Stadium, Mullanpur')

In [6]:
ball_df['current_score'] = ball_df.groupby('match_id')['total_runs'].cumsum()
ball_df['runs_required'] = ball_df['target_score'] - ball_df['current_score']
ball_df['runs_required'] = ball_df['runs_required'].clip(lower=0)
ball_df['balls_bowled'] = (~ball_df['extras_type'].str.contains('wides|noballs', na=False)).groupby(ball_df['match_id']).cumsum()
ball_df['balls_remaining'] = 120 - ball_df['balls_bowled']
ball_df['wickets_taken'] = ball_df.groupby('match_id')['is_wicket'].cumsum()
ball_df['wickets_remaining'] = 10 - ball_df['wickets_taken']
ball_df['crr'] = (ball_df['current_score'] * 6) / ball_df['balls_bowled']
ball_df['rrr'] = (ball_df['runs_required'] * 6) / ball_df['balls_remaining']
ball_df['did_win'] = (ball_df['batting_team'] == ball_df['winner']).astype(int)

In [7]:
final_df = ball_df[[
    'runs_required',
    'balls_remaining',
    'wickets_remaining',
    'target_score',
    'crr',
    'rrr',
    'venue',
    'did_win'
]].copy()

In [8]:
final_df

Unnamed: 0,runs_required,balls_remaining,wickets_remaining,target_score,crr,rrr,venue,did_win
0,183,119,10,183,0.000000,9.226891,Maharaja Yadavindra Singh International Cricke...,0
1,183,118,10,183,0.000000,9.305085,Maharaja Yadavindra Singh International Cricke...,0
2,183,117,10,183,0.000000,9.384615,Maharaja Yadavindra Singh International Cricke...,0
3,183,116,10,183,0.000000,9.465517,Maharaja Yadavindra Singh International Cricke...,0
4,182,115,10,183,1.200000,9.495652,Maharaja Yadavindra Singh International Cricke...,0
...,...,...,...,...,...,...,...,...
33681,2,5,5,191,9.860870,2.400000,"MA Chidambaram Stadium, Chepauk, Chennai",1
33682,1,5,5,191,9.913043,1.200000,"MA Chidambaram Stadium, Chepauk, Chennai",1
33683,1,4,4,191,9.827586,1.500000,"MA Chidambaram Stadium, Chepauk, Chennai",1
33684,1,3,4,191,9.743590,2.000000,"MA Chidambaram Stadium, Chepauk, Chennai",1


In [9]:
final_df.dropna(inplace=True)
final_df = final_df[~final_df.isin([float('inf'), float('-inf')]).any(axis=1)]
final_df = final_df[final_df['balls_remaining'] != 0]

In [10]:
X = final_df.drop('did_win', axis=1)
y = final_df['did_win']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [45]:
preprocessor = ColumnTransformer(
    transformers=[('onehot',OneHotEncoder(sparse_output=False,handle_unknown='ignore'),['venue'])],
    remainder='passthrough',
    force_int_remainder_cols=False 
)

In [46]:
models ={
    'Logistic Regression':LogisticRegression(max_iter=1000),
    'SVM':SVC(probability=True),
    'Random Forrest':RandomForestClassifier(n_estimators=100,random_state=42),
    'XGBoost':XGBClassifier(n_estimators=100,random_state=42)
}

In [None]:
results = {}
for name,model in models.items():
    model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model)])
    model_pipeline.fit(X_train,y_train)
    y_pred_train = model_pipeline.predict(X_train)
    y_pred = model_pipeline.predict(X_test)
    y_pred_proba = model_pipeline.predict_proba(X_test)[:,1]
    train_acc = accuracy_score(y_pred_train,y_train)
    test_acc = accuracy_score(y_pred,y_test)
    pres = precision_score(y_test,y_pred)
    rec = recall_score(y_test,y_pred)
    f1 = f1_score(y_test,y_pred)
    results[name] = {'Training Accuracy':train_acc,'Test Accuracy':test_acc,'Precision':pres,'Recall':rec,'F1 score':f1}

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [48]:
results_df = pd.DataFrame(results).T
results_df

Unnamed: 0,Training Accuracy,Test Accuracy,Precision,Recall,F1 score
Logistic Regression,0.802012,0.804082,0.787967,0.785159,0.786561
SVM,0.790911,0.792163,0.778583,0.765716,0.772096
Random Forrest,0.999627,0.994487,0.991933,0.996111,0.994018
XGBoost,0.998063,0.996126,0.995146,0.996436,0.99579


In [50]:
rf_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', RandomForestClassifier(random_state=42))])
param_grid_rf = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [10, 15],
    'classifier__min_samples_leaf': [5, 10]
}
grid_search_rf = GridSearchCV(estimator=rf_pipeline, param_grid=param_grid_rf, 
                              cv=3, n_jobs=-1, verbose=2, scoring='accuracy')

In [51]:
grid_search_rf.fit(X_train, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] END classifier__max_depth=10, classifier__min_samples_leaf=5, classifier__n_estimators=100; total time=   1.2s
[CV] END classifier__max_depth=10, classifier__min_samples_leaf=10, classifier__n_estimators=100; total time=   1.2s
[CV] END classifier__max_depth=10, classifier__min_samples_leaf=5, classifier__n_estimators=100; total time=   1.2s
[CV] END classifier__max_depth=10, classifier__min_samples_leaf=5, classifier__n_estimators=100; total time=   1.3s
[CV] END classifier__max_depth=10, classifier__min_samples_leaf=10, classifier__n_estimators=100; total time=   1.3s
[CV] END classifier__max_depth=10, classifier__min_samples_leaf=5, classifier__n_estimators=200; total time=   2.4s
[CV] END classifier__max_depth=10, classifier__min_samples_leaf=5, classifier__n_estimators=200; total time=   2.5s
[CV] END classifier__max_depth=10, classifier__min_samples_leaf=5, classifier__n_estimators=200; total time=   2.5s
[CV] END c

In [52]:
grid_search_rf.best_params_

{'classifier__max_depth': 15,
 'classifier__min_samples_leaf': 5,
 'classifier__n_estimators': 100}

In [53]:
grid_search_rf.best_score_

np.float64(0.9527657753778472)

In [57]:
xgb_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', XGBClassifier(random_state=42, eval_metric='logloss'))])

param_grid_xgb = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [3,4],
    'classifier__learning_rate': [0.01, 0.05]
}

grid_search_xgb = GridSearchCV(xgb_pipeline, param_grid_xgb, cv=3, n_jobs=-1, scoring='accuracy', verbose=1)
grid_search_xgb.fit(X_train, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


In [58]:
grid_search_xgb.best_params_

{'classifier__learning_rate': 0.05,
 'classifier__max_depth': 4,
 'classifier__n_estimators': 200}

In [59]:
grid_search_xgb.best_score_

np.float64(0.9144719670843268)

In [113]:
best_xgb_model = grid_search_rf.best_estimator_

In [114]:
target = 180
score = 50

overs_completed = 6
balls_of_over = 0

wickets = 3

venue = 'MA Chidambaram Stadium, Chepauk, Chennai'

runs_required = target - score
balls_bowled = (overs_completed * 6) + balls_of_over
balls_remaining = 120 - balls_bowled
wickets_remaining = 10 - wickets
crr = (score * 6) / balls_bowled if balls_bowled > 0 else 0
rrr = (runs_required * 6) / balls_remaining if balls_remaining > 0 else float('inf')

input_data = pd.DataFrame({
    'venue':[venue],
    'runs_required': [runs_required],
    'balls_remaining': [balls_remaining],
    'wickets_remaining': [wickets_remaining],
    'target_score': [target],
    'crr': [crr],
    'rrr': [rrr]
})

In [115]:
best_xgb_model.predict(input_data)

array([0])

In [116]:
win_probability = best_xgb_model.predict_proba(input_data)[0][1]
loss_probability = best_xgb_model.predict_proba(input_data)[0][0]

In [117]:
win_probability

np.float64(0.22573684944950909)

In [118]:
loss_probability

np.float64(0.774263150550491)

In [82]:
final_df

Unnamed: 0,runs_required,balls_remaining,wickets_remaining,target_score,crr,rrr,venue,did_win
0,183,119,10,183,0.000000,9.226891,Maharaja Yadavindra Singh International Cricke...,0
1,183,118,10,183,0.000000,9.305085,Maharaja Yadavindra Singh International Cricke...,0
2,183,117,10,183,0.000000,9.384615,Maharaja Yadavindra Singh International Cricke...,0
3,183,116,10,183,0.000000,9.465517,Maharaja Yadavindra Singh International Cricke...,0
4,182,115,10,183,1.200000,9.495652,Maharaja Yadavindra Singh International Cricke...,0
...,...,...,...,...,...,...,...,...
33681,2,5,5,191,9.860870,2.400000,"MA Chidambaram Stadium, Chepauk, Chennai",1
33682,1,5,5,191,9.913043,1.200000,"MA Chidambaram Stadium, Chepauk, Chennai",1
33683,1,4,4,191,9.827586,1.500000,"MA Chidambaram Stadium, Chepauk, Chennai",1
33684,1,3,4,191,9.743590,2.000000,"MA Chidambaram Stadium, Chepauk, Chennai",1
