In [39]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix, roc_curve, roc_auc_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
match_df = pd.read_csv('data_files/match_details.csv',index_col=0)
ball_df = pd.read_csv('data_files/ball_by_ball.csv',index_col=0)

In [3]:
target_columns = match_df[['match_id','team_1_score','venue','winner']].copy()
target_columns['target_score'] = target_columns['team_1_score']+1

In [4]:
ball_df = ball_df[ball_df['inning']==2]
ball_df = ball_df.merge(target_columns,on='match_id')

In [5]:
ball_df['venue'] = ball_df['venue'].replace('Maharaja Yadavindra Singh International Cricket Stadium, New Chandigarh','Maharaja Yadavindra Singh International Cricket Stadium, Mullanpur')

In [6]:
ball_df['current_score'] = ball_df.groupby('match_id')['total_runs'].cumsum()
ball_df['runs_required'] = ball_df['target_score'] - ball_df['current_score']
ball_df['runs_required'] = ball_df['runs_required'].clip(lower=0)
ball_df['balls_bowled'] = (~ball_df['extras_type'].str.contains('wides|noballs', na=False)).groupby(ball_df['match_id']).cumsum()
ball_df['balls_remaining'] = 120 - ball_df['balls_bowled']
ball_df['wickets_taken'] = ball_df.groupby('match_id')['is_wicket'].cumsum()
ball_df['wickets_remaining'] = 10 - ball_df['wickets_taken']
ball_df['crr'] = (ball_df['current_score'] * 6) / ball_df['balls_bowled']
ball_df['rrr'] = (ball_df['runs_required'] * 6) / ball_df['balls_remaining']
ball_df['did_win'] = (ball_df['batting_team'] == ball_df['winner']).astype(int)

In [7]:
final_df = ball_df[[
    'match_id',
    'runs_required',
    'balls_remaining',
    'wickets_remaining',
    'target_score',
    'crr',
    'rrr',
    'venue',
    'did_win'
]].copy()

In [8]:
final_df

Unnamed: 0,match_id,runs_required,balls_remaining,wickets_remaining,target_score,crr,rrr,venue,did_win
0,1426261,183,119,10,183,0.000000,9.226891,Maharaja Yadavindra Singh International Cricke...,0
1,1426261,183,118,10,183,0.000000,9.305085,Maharaja Yadavindra Singh International Cricke...,0
2,1426261,183,117,10,183,0.000000,9.384615,Maharaja Yadavindra Singh International Cricke...,0
3,1426261,183,116,10,183,0.000000,9.465517,Maharaja Yadavindra Singh International Cricke...,0
4,1426261,182,115,10,183,1.200000,9.495652,Maharaja Yadavindra Singh International Cricke...,0
...,...,...,...,...,...,...,...,...,...
33681,1473486,2,5,5,191,9.860870,2.400000,"MA Chidambaram Stadium, Chepauk, Chennai",1
33682,1473486,1,5,5,191,9.913043,1.200000,"MA Chidambaram Stadium, Chepauk, Chennai",1
33683,1473486,1,4,4,191,9.827586,1.500000,"MA Chidambaram Stadium, Chepauk, Chennai",1
33684,1473486,1,3,4,191,9.743590,2.000000,"MA Chidambaram Stadium, Chepauk, Chennai",1


In [9]:
final_df.dropna(inplace=True)
final_df = final_df[~final_df.isin([float('inf'), float('-inf')]).any(axis=1)]
final_df = final_df[final_df['balls_remaining'] != 0]

In [10]:
train_df = final_df[final_df['match_id']<=1473472]
test_df = final_df[final_df['match_id']>1473472]

In [11]:
X_train = train_df.drop(['did_win','match_id'], axis=1)
y_train = train_df['did_win']
X_test = test_df.drop(['did_win','match_id'], axis=1)
y_test = test_df['did_win']

In [44]:
preprocessor = ColumnTransformer(
    transformers=[('onehot',OneHotEncoder(sparse_output=False,handle_unknown='ignore'),['venue'])],
    remainder='passthrough',
    force_int_remainder_cols=False 
)

In [45]:
models ={
    'Logistic Regression':LogisticRegression(max_iter=1000),
    'SVM':SVC(probability=True),
    'Random Forrest':RandomForestClassifier(n_estimators=100,random_state=42),
    'XGBoost':XGBClassifier(n_estimators=100,random_state=42)
}

In [46]:
results = {}
for name,model in models.items():
    model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model)])
    model_pipeline.fit(X_train,y_train)
    y_pred_train = model_pipeline.predict(X_train)
    y_pred = model_pipeline.predict(X_test)
    y_pred_proba = model_pipeline.predict_proba(X_test)[:,1]
    train_acc = accuracy_score(y_pred_train,y_train)
    test_acc = accuracy_score(y_pred,y_test)
    pres = precision_score(y_test,y_pred)
    rec = recall_score(y_test,y_pred)
    f1 = f1_score(y_test,y_pred)
    results[name] = {'Training Accuracy':train_acc,'Test Accuracy':test_acc,'Precision':pres,'Recall':rec,'F1 score':f1}

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [47]:
results_df = pd.DataFrame(results).T
results_df

Unnamed: 0,Training Accuracy,Test Accuracy,Precision,Recall,F1 score
Logistic Regression,0.804622,0.759454,0.770403,0.708354,0.738077
SVM,0.791245,0.811872,0.863827,0.72036,0.785597
Random Forrest,0.999523,0.796553,0.836161,0.714857,0.770766
XGBoost,0.998366,0.760172,0.82603,0.631816,0.715986


In [48]:
rf_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', RandomForestClassifier(random_state=42))])
param_grid_rf = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [10, 15, None],
    'classifier__min_samples_leaf': [5, 10, 15]
}
grid_search_rf = GridSearchCV(estimator=rf_pipeline, param_grid=param_grid_rf, 
                              cv=3, n_jobs=-1, verbose=2, scoring='accuracy')

In [49]:
grid_search_rf.fit(X_train, y_train)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


[CV] END classifier__max_depth=10, classifier__min_samples_leaf=5, classifier__n_estimators=100; total time=   1.2s
[CV] END classifier__max_depth=10, classifier__min_samples_leaf=5, classifier__n_estimators=100; total time=   1.3s
[CV] END classifier__max_depth=10, classifier__min_samples_leaf=5, classifier__n_estimators=100; total time=   1.2s
[CV] END classifier__max_depth=10, classifier__min_samples_leaf=5, classifier__n_estimators=200; total time=   2.4s
[CV] END classifier__max_depth=10, classifier__min_samples_leaf=5, classifier__n_estimators=200; total time=   2.4s
[CV] END classifier__max_depth=10, classifier__min_samples_leaf=10, classifier__n_estimators=100; total time=   1.2s
[CV] END classifier__max_depth=10, classifier__min_samples_leaf=5, classifier__n_estimators=200; total time=   2.5s
[CV] END classifier__max_depth=10, classifier__min_samples_leaf=10, classifier__n_estimators=100; total time=   1.3s
[CV] END classifier__max_depth=10, classifier__min_samples_leaf=10, cl

In [50]:
grid_search_rf.best_params_

{'classifier__max_depth': 10,
 'classifier__min_samples_leaf': 15,
 'classifier__n_estimators': 200}

In [51]:
grid_search_rf.best_score_

np.float64(0.7472684570611662)

In [52]:
xgb_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', XGBClassifier(random_state=42, eval_metric='logloss'))])

param_grid_xgb = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [5,7,10],
    'classifier__learning_rate': [0.01, 0.05, 0.1]
}

grid_search_xgb = GridSearchCV(xgb_pipeline, param_grid_xgb, cv=3, n_jobs=-1, scoring='accuracy', verbose=1)
grid_search_xgb.fit(X_train, y_train)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


In [53]:
grid_search_xgb.best_params_

{'classifier__learning_rate': 0.01,
 'classifier__max_depth': 5,
 'classifier__n_estimators': 300}

In [54]:
grid_search_xgb.best_score_

np.float64(0.7425031485074373)

In [23]:
best_xgb_model = grid_search_rf.best_estimator_

In [24]:
target = 180
score = 50

overs_completed = 6
balls_of_over = 0

wickets = 3

venue = 'MA Chidambaram Stadium, Chepauk, Chennai'

runs_required = target - score
balls_bowled = (overs_completed * 6) + balls_of_over
balls_remaining = 120 - balls_bowled
wickets_remaining = 10 - wickets
crr = (score * 6) / balls_bowled if balls_bowled > 0 else 0
rrr = (runs_required * 6) / balls_remaining if balls_remaining > 0 else float('inf')

input_data = pd.DataFrame({
    'venue':[venue],
    'runs_required': [runs_required],
    'balls_remaining': [balls_remaining],
    'wickets_remaining': [wickets_remaining],
    'target_score': [target],
    'crr': [crr],
    'rrr': [rrr]
})

In [25]:
best_xgb_model.predict(input_data)

array([0])

In [26]:
win_probability = best_xgb_model.predict_proba(input_data)[0][1]
loss_probability = best_xgb_model.predict_proba(input_data)[0][0]

In [27]:
win_probability

np.float64(0.30766927934546834)

In [28]:
loss_probability

np.float64(0.6923307206545316)

In [29]:
final_df

Unnamed: 0,match_id,runs_required,balls_remaining,wickets_remaining,target_score,crr,rrr,venue,did_win
0,1426261,183,119,10,183,0.000000,9.226891,Maharaja Yadavindra Singh International Cricke...,0
1,1426261,183,118,10,183,0.000000,9.305085,Maharaja Yadavindra Singh International Cricke...,0
2,1426261,183,117,10,183,0.000000,9.384615,Maharaja Yadavindra Singh International Cricke...,0
3,1426261,183,116,10,183,0.000000,9.465517,Maharaja Yadavindra Singh International Cricke...,0
4,1426261,182,115,10,183,1.200000,9.495652,Maharaja Yadavindra Singh International Cricke...,0
...,...,...,...,...,...,...,...,...,...
33681,1473486,2,5,5,191,9.860870,2.400000,"MA Chidambaram Stadium, Chepauk, Chennai",1
33682,1473486,1,5,5,191,9.913043,1.200000,"MA Chidambaram Stadium, Chepauk, Chennai",1
33683,1473486,1,4,4,191,9.827586,1.500000,"MA Chidambaram Stadium, Chepauk, Chennai",1
33684,1473486,1,3,4,191,9.743590,2.000000,"MA Chidambaram Stadium, Chepauk, Chennai",1


In [57]:
svm_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', SVC(probability=True, random_state=42))])

param_grid_svm = {
    'classifier__C': [0.1, 1, 10],
    'classifier__gamma': ['scale', 'auto'],
    'classifier__kernel': ['rbf', 'linear']
}

grid_search_svm = GridSearchCV(svm_pipeline, param_grid_svm, cv=3, n_jobs=-1, scoring='accuracy', verbose=1)
grid_search_svm.fit(X_train, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits




KeyboardInterrupt: 

In [None]:
grid_search_svm.best_params_

In [None]:
grid_search_svm.best_score_

In [56]:
# --- SVM Tuning on a Sample ---
print("\nTuning SVM on a smaller sample to save time and memory...")

# Create a smaller, random sample (e.g., 20% of the data) for the grid search
X_train_sample, _, y_train_sample, _ = train_test_split(
    X_train, y_train, train_size=0.2, random_state=42, stratify=y_train
)

svm_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', SVC(probability=True, random_state=42))])

param_grid_svm = {
    'classifier__C': [0.1, 1, 10],
    'classifier__gamma': ['scale', 'auto'],
    'classifier__kernel': ['rbf', 'linear']
}

grid_search_svm = GridSearchCV(svm_pipeline, param_grid_svm, cv=3, n_jobs=-1, scoring='accuracy', verbose=1)

# Fit the search on the SMALLER sample
grid_search_svm.fit(X_train_sample, y_train_sample)

print("\nBest parameters for SVM found:", grid_search_svm.best_params_)
print(f"Best cross-validation accuracy on sample: {grid_search_svm.best_score_:.4f}")


Tuning SVM on a smaller sample to save time and memory...
Fitting 3 folds for each of 12 candidates, totalling 36 fits

Best parameters for SVM found: {'classifier__C': 0.1, 'classifier__gamma': 'scale', 'classifier__kernel': 'linear'}
Best cross-validation accuracy on sample: 0.8187
