In [None]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.calibration import CalibrationDisplay
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC, LinearSVC
from xgboost import XGBClassifier

In [None]:
# Load the per-match summary and the ball-by-ball deliveries. In both files
# the first CSV column is used as the DataFrame index.
match_df, ball_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'data_files/match_details.csv',
        'data_files/ball_by_ball.csv',
    )
)

In [None]:
# Per-match lookup table that gets merged onto the deliveries: the first
# innings score, the venue, the winner, and the chase target derived from the
# first-innings score (one run more than it).
# NOTE(review): presumably no rain-adjusted targets are handled here - confirm.
target_columns = (
    match_df.loc[:, ['match_id', 'team_1_score', 'venue', 'winner']]
    .assign(target_score=lambda matches: matches['team_1_score'] + 1)
)

In [None]:
# Keep only second-innings deliveries (the chase) and attach the per-match
# target, venue and winner to every delivery via an inner join on match_id.
second_innings = ball_df.loc[ball_df['inning'].eq(2)]
ball_df = second_innings.merge(target_columns, on='match_id')

In [None]:
# Collapse two spellings of the same venue into one label so the one-hot
# encoder later sees a single category for it.
venue_aliases = {
    'Maharaja Yadavindra Singh International Cricket Stadium, New Chandigarh':
        'Maharaja Yadavindra Singh International Cricket Stadium, Mullanpur',
}
ball_df['venue'] = ball_df['venue'].replace(venue_aliases)

In [None]:
# Build the running "state of the chase" features, one row per delivery.
# NOTE(review): the cumulative sums assume rows are already in delivery order
# within each match - confirm against the CSV.
per_match = ball_df.groupby('match_id')
runs_so_far = per_match['total_runs'].cumsum()
wickets_so_far = per_match['is_wicket'].cumsum()

# Deliveries whose extras_type mentions wides/noballs do not count as bowled;
# a missing extras_type counts as a normal delivery.
counts_as_ball = ~ball_df['extras_type'].str.contains('wides|noballs', na=False)
balls_so_far = counts_as_ball.groupby(ball_df['match_id']).cumsum()

ball_df = ball_df.assign(
    current_score=runs_so_far,
    # Never report a negative requirement once the target has been passed.
    runs_required=lambda d: (d['target_score'] - d['current_score']).clip(lower=0),
    balls_bowled=balls_so_far,
    # 120 balls per innings - presumably 20 overs of 6 balls; confirm format.
    balls_remaining=lambda d: 120 - d['balls_bowled'],
    wickets_taken=wickets_so_far,
    wickets_remaining=lambda d: 10 - d['wickets_taken'],
    # Run rates are runs per 6 balls. A zero denominator yields inf/NaN here.
    crr=lambda d: (d['current_score'] * 6) / d['balls_bowled'],
    rrr=lambda d: (d['runs_required'] * 6) / d['balls_remaining'],
    # Label: 1 when the batting (chasing) team is the recorded winner.
    did_win=lambda d: (d['batting_team'] == d['winner']).astype(int),
)

In [None]:
# Modelling table: the match id (kept for the later time-based split), the
# numeric chase features, the venue, and the did_win label.
model_columns = [
    'match_id',
    'runs_required',
    'balls_remaining',
    'wickets_remaining',
    'target_score',
    'crr',
    'rrr',
    'venue',
    'did_win',
]
final_df = ball_df.loc[:, model_columns].copy()

In [None]:
# Preview the modelling table before cleaning (rich display of the last
# expression in the cell).
final_df

In [None]:
# Drop rows that cannot be used for training: any missing value, any infinite
# value (e.g. a run rate with a zero denominator), and rows with no balls
# remaining.
row_has_nan = final_df.isna().any(axis=1)
row_has_inf = final_df.isin([float('inf'), float('-inf')]).any(axis=1)
balls_left = final_df['balls_remaining'] != 0
final_df = final_df[~(row_has_nan | row_has_inf) & balls_left]

In [None]:
# Random 80/20 split over individual deliveries. match_id is an identifier,
# not a feature, so it is dropped together with the label.
# NOTE(review): rows from one match can land on both sides of this split -
# presumably that is why a time-based split is also evaluated; confirm.
y = final_df['did_win']
X = final_df.drop(columns=['did_win', 'match_id'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# One-hot encode the venue (categories unseen at fit time are ignored at
# predict time) and pass every other column through unchanged.
# NOTE(review): StandardScaler is imported but never applied, so the numeric
# features reach the SVM / logistic regression unscaled - confirm intended.
venue_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
preprocessor = ColumnTransformer(
    [('onehot', venue_encoder, ['venue'])],
    remainder='passthrough',
    force_int_remainder_cols=False,
)

In [None]:
# Candidate classifiers, keyed by the label shown in the results tables.
models = dict([
    ('Logistic Regression', LogisticRegression(max_iter=5000)),
    ('SVM', SVC(probability=True)),
    ('Random Forrest', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('XGBoost', XGBClassifier(n_estimators=100, random_state=42)),
])

In [None]:
# Fit every candidate model on the random split and tabulate its metrics
# (one row per model in results_df).
results = {}
for name, model in models.items():
    # The pipeline wraps the shared `preprocessor` and `model` objects, so
    # each fit re-fits those same objects in place.
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model),
    ])
    model_pipeline.fit(X_train, y_train)
    y_pred_train = model_pipeline.predict(X_train)
    y_pred = model_pipeline.predict(X_test)
    # No predict_proba call here: its result was never used, and it is an
    # extra full pass over the test set for every model.
    # scikit-learn metrics take (y_true, y_pred); keep that order everywhere,
    # including accuracy, so the calls stay consistent.
    results[name] = {
        'Training Accuracy': accuracy_score(y_train, y_pred_train),
        'Test Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1 score': f1_score(y_test, y_pred),
    }
results_df = pd.DataFrame(results).T
print('Results for Random Split Data')
results_df

In [None]:
# Split by match id instead of at random: matches up to and including the
# cutoff id are used for training, later ids for testing.
# NOTE(review): presumably match_id increases with match date, which is what
# makes this a time-based split - confirm.
last_train_match_id = 1473472
non_feature_columns = ['did_win', 'match_id']

time_train_df = final_df[final_df['match_id'] <= last_train_match_id]
time_test_df = final_df[final_df['match_id'] > last_train_match_id]

X_train_time = time_train_df.drop(columns=non_feature_columns)
y_train_time = time_train_df['did_win']
X_test_time = time_test_df.drop(columns=non_feature_columns)
y_test_time = time_test_df['did_win']

In [None]:
# Fit every candidate model on the time-based split and tabulate its metrics
# (one row per model in results_df2).
results2 = {}
for name, model in models.items():
    # The pipeline wraps the shared `preprocessor` and `model` objects, so
    # each fit re-fits those same objects in place.
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model),
    ])
    model_pipeline.fit(X_train_time, y_train_time)
    y_pred_train = model_pipeline.predict(X_train_time)
    y_pred = model_pipeline.predict(X_test_time)
    # No predict_proba call here: its result was never used, and it is an
    # extra full pass over the test set for every model.
    # scikit-learn metrics take (y_true, y_pred); keep that order everywhere,
    # including accuracy, so the calls stay consistent.
    results2[name] = {
        'Training Accuracy': accuracy_score(y_train_time, y_pred_train),
        'Test Accuracy': accuracy_score(y_test_time, y_pred),
        'Precision': precision_score(y_test_time, y_pred),
        'Recall': recall_score(y_test_time, y_pred),
        'F1 score': f1_score(y_test_time, y_pred),
    }
results_df2 = pd.DataFrame(results2).T
print('Results for Time Split Data')
results_df2

In [None]:
# Tune the random forest on the random-split training data with a 3-fold,
# accuracy-scored grid search over tree count, depth and leaf size.
rf_steps = [
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
]
rf_pipeline = Pipeline(rf_steps)

# Keys use the "<step name>__<parameter>" form so they reach the classifier
# step inside the pipeline.
param_grid_rf = dict(
    classifier__n_estimators=[100, 200],
    classifier__max_depth=[10, 15],
    classifier__min_samples_leaf=[5, 10],
)

grid_search_rf = GridSearchCV(
    rf_pipeline,
    param_grid_rf,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search_rf.fit(X_train, y_train)

print(f'Best Random Forrest Parameters: {grid_search_rf.best_params_}')
print(f'Best Random Forrest Model Accuracy: {grid_search_rf.best_score_}')

In [None]:
# Tune two SVM variants on the time-split training data, both with a 3-fold,
# accuracy-scored grid search.

# RBF-kernel SVC: search over C and gamma, folds evaluated in parallel.
rbf_classifier = SVC(kernel='rbf', probability=True, random_state=42)
svm_rbf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', rbf_classifier),
])
param_grid_rbf = dict(
    classifier__C=[0.1, 1, 10],
    classifier__gamma=['scale', 'auto'],
)
grid_search_rbf = GridSearchCV(
    svm_rbf_pipeline,
    param_grid_rbf,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search_rbf.fit(X_train_time, y_train_time)
print(f"Best RBF Score: {grid_search_rbf.best_score_:.4f}")
print(f"Best RBF Params: {grid_search_rbf.best_params_}")

# Linear SVM: search over C only (no n_jobs given, so the default applies).
linear_classifier = LinearSVC(dual=False, random_state=42)
svm_linear_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', linear_classifier),
])
param_grid_linear = dict(classifier__C=[0.1, 1, 10])
grid_search_linear = GridSearchCV(
    svm_linear_pipeline,
    param_grid_linear,
    scoring='accuracy',
    cv=3,
    verbose=1,
)
grid_search_linear.fit(X_train_time, y_train_time)
print(f"Best Linear Score: {grid_search_linear.best_score_}")
print(f"Best Linear Params: {grid_search_linear.best_params_}")

In [None]:
# Best random-forest pipeline found by the grid search on the random split.
# GridSearchCV was created with its default refit=True, so best_estimator_
# has already been refit on the full (X_train, y_train) passed to
# grid_search_rf.fit; fitting it again here would only repeat that training.
best_random_split_model = grid_search_rf.best_estimator_
y_pred_rf = best_random_split_model.predict(X_test)
# accuracy_score takes (y_true, y_pred).
print(f'Best Accuracy for Random Split Data: {accuracy_score(y_test, y_pred_rf)}')

In [None]:
# Model chosen for the time-based split: an RBF-kernel SVC (with probability
# estimates enabled) behind the shared preprocessor, trained on the
# time-split training set and scored on the later matches.
# NOTE(review): C and gamma are left at their defaults rather than taken from
# grid_search_rbf.best_params_ - confirm this is intended.
time_split_classifier = SVC(kernel='rbf', probability=True, random_state=42)
best_time_split_model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', time_split_classifier),
])
best_time_split_model.fit(X_train_time, y_train_time)

y_pred_svm = best_time_split_model.predict(X_test_time)
time_split_accuracy = accuracy_score(y_pred_svm, y_test_time)
print(f'Best Accuracy for Time Split Data: {time_split_accuracy}')

In [None]:
# Compare how well calibrated the two final models are on the time-based
# hold-out set, using the predicted probability of class 1 (did_win == 1).
# NOTE(review): the random-split model was trained on a random sample of all
# rows, which can include deliveries from the matches in X_test_time, so its
# curve is presumably optimistic - confirm before drawing conclusions.
probs_time = best_time_split_model.predict_proba(X_test_time)[:, 1]
probs_random = best_random_split_model.predict_proba(X_test_time)[:, 1]

# Draw both curves on ONE explicit Axes. Without `ax=`, every
# CalibrationDisplay.from_predictions call opens its own new figure, leaving
# the sized figure empty and splitting the curves across separate plots.
fig, ax = plt.subplots(figsize=(7, 6))
for probs, label in (
    (probs_time, "Time-Split Model"),
    (probs_random, "Random-Split Model"),
):
    CalibrationDisplay.from_predictions(
        y_test_time,
        probs,
        n_bins=10,
        name=label,
        ref_line=False,  # the diagonal is drawn once, explicitly, below
        ax=ax,
    )
ax.plot([0, 1], [0, 1], "k--", label="Perfectly Calibrated")
ax.legend()
ax.set_title("Probability Calibration Curve")
plt.show()

In [None]:
# Persist the fitted time-split pipeline (preprocessor + classifier) to disk.
model_path = 'models/best_chase_prediction_model.pkl'
# joblib.dump does not create missing directories; make sure the target
# folder exists so the save cannot fail after all the training above.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_time_split_model, model_path)