In [1]:
import pandas as pd 
import numpy as np 
import sqlite3
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, roc_auc_score,accuracy_score
from sklearn.preprocessing import label_binarize
import joblib
import pickle

In [2]:
conn = sqlite3.connect('data.db')
train_data = pd.read_sql('SELECT * FROM train_data', conn)
conn.close()

#### "Predict if the match will end in a draw."

In [3]:
train_data = train_data[train_data['team2'] != 'ICC World XI']

In [4]:
train_data['is_draw'] = train_data['outcome'].apply(lambda x: 1 if x.lower() == 'draw' else 0)

In [5]:
encode = {'team1': {'Australia':1,'New Zealand':2,'West Indies':3,'Zimbabwe':4,'Bangladesh':5,'India':6,'England':7,'South Africa':8,'Pakistan':9,'Sri Lanka':10,'Ireland':11},
          'team2': {'Australia':1,'New Zealand':2,'West Indies':3,'Zimbabwe':4,'Bangladesh':5,'India':6,'England':7,'South Africa':8,'Pakistan':9,'Sri Lanka':10,'Ireland':11},
          'toss_winner': {'Australia':1,'New Zealand':2,'West Indies':3,'Zimbabwe':4,'Bangladesh':5,'India':6,'England':7,'South Africa':8,'Pakistan':9,'Sri Lanka':10,'Ireland':11},
          'winner': {'Australia':1,'New Zealand':2,'West Indies':3,'Zimbabwe':4,'Bangladesh':5,'India':6,'England':7,'South Africa':8,'Pakistan':9,'Sri Lanka':10,'Ireland':11}}
train_data.replace(encode, inplace=True)

  train_data.replace(encode, inplace=True)


In [6]:
cat_cols = ['venue', 'toss_decision']
label_encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    train_data[col] = le.fit_transform(train_data[col].astype(str))
    label_encoders[col] = le

In [7]:
X1 = train_data.drop(columns=['match_id','outcome','is_draw','winner','season'])
y1 = train_data['is_draw']

In [8]:
X_draw_train, X_draw_test, y_draw_train, y_draw_test = train_test_split(X1, y1, test_size=0.2, random_state=42)

In [9]:
algorithms = {'Logistic Regression': {"model": LogisticRegression(solver='saga', max_iter=10000),
        "params": {"penalty": ['elasticnet', 'l1', 'l2'],"l1_ratio": [0.0, 0.5, 1.0],}},

    'Decision Tree': {"model": tree.DecisionTreeClassifier(),
        "params": {"criterion": ['gini', 'entropy'],"max_depth": [1, 3, 5, 10, 15, 20, 30],"min_samples_split": [2, 5, 10],"min_samples_leaf": [1, 2, 4]}},

    'Random Forest': {"model": RandomForestClassifier(),
        "params": {"n_estimators": [100, 200, 300],"max_features": ["sqrt", "log2", None],"max_depth": [5, 10, 20, 30],"min_samples_split": [2, 5, 10]}},

    'NaiveBayes': {"model": GaussianNB(),"params": {}},

    'K-Nearest Neighbors': {"model": KNeighborsClassifier(),
        "params": {"n_neighbors": [3, 5, 10],"weights": ["uniform", "distance"],"metric": ["euclidean", "manhattan", "minkowski"]}},

    'Gradient Boost': {"model": GradientBoostingClassifier(),
        "params": {"learning_rate": np.arange(0.1, 1, 0.1),"n_estimators": [100, 200, 300],"criterion": ['friedman_mse', 'squared_error'],
            "min_samples_split": [2, 5, 10],"min_samples_leaf": [1, 2, 4],"max_depth": [3, 5, 10, 20],"max_features": ["sqrt", "log2", None]}}
}

In [10]:
draw_prediction_models = {}
draw_model_details = []

for model_name, values in algorithms.items():
    best_score = float('-inf')
    best_rscv = None

    try:
        rscv = RandomizedSearchCV(estimator=values["model"],param_distributions=values["params"],cv=5,n_iter=15,n_jobs=-1,verbose=0,random_state=42)
        rscv.fit(X_draw_train, y_draw_train)

        if rscv.best_score_ > best_score:
            best_score = rscv.best_score_
            best_rscv = rscv

    except Exception as e:
        print(f"Error with {model_name} (Draw Prediction): {e}")
        continue

    if best_rscv:
        draw_prediction_models[model_name] = best_rscv
        draw_model_details.append({"Model Name": model_name,"Best Score": best_score,"Best Parameters": best_rscv.best_params_})
        print(f"{model_name} (Draw): Best Score = {best_score:.4f}")
    else:
        print(f"{model_name} (Draw): No valid configuration found.")




Logistic Regression (Draw): Best Score = 0.8051
Decision Tree (Draw): Best Score = 0.7991
Random Forest (Draw): Best Score = 0.8021
NaiveBayes (Draw): Best Score = 0.8051




K-Nearest Neighbors (Draw): Best Score = 0.8036
Gradient Boost (Draw): Best Score = 0.7872


In [11]:
pd.set_option('display.max_colwidth', None)
pd.DataFrame(draw_model_details)

Unnamed: 0,Model Name,Best Score,Best Parameters
0,Logistic Regression,0.805064,"{'penalty': 'elasticnet', 'l1_ratio': 0.0}"
1,Decision Tree,0.799071,"{'min_samples_split': 2, 'min_samples_leaf': 4, 'max_depth': 3, 'criterion': 'entropy'}"
2,Random Forest,0.802067,"{'n_estimators': 200, 'min_samples_split': 2, 'max_features': 'log2', 'max_depth': 5}"
3,NaiveBayes,0.805064,{}
4,K-Nearest Neighbors,0.803571,"{'weights': 'uniform', 'n_neighbors': 10, 'metric': 'minkowski'}"
5,Gradient Boost,0.787164,"{'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 3, 'learning_rate': 0.1, 'criterion': 'squared_error'}"


In [12]:
draw_test_results = []

for model_name, model in draw_prediction_models.items():
    y_pred = model.predict(X_draw_test)
    
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_draw_test)[:, 1]
        try:
            roc_auc = roc_auc_score(y_draw_test, y_proba)
        except:
            roc_auc = None
    else:
        roc_auc = None

    report = classification_report(y_draw_test, y_pred, output_dict=True, zero_division=0)
    
    draw_test_results.append({"Model Name": model_name,"Test Score": model.score(X_draw_test, y_draw_test),"Precision": report["weighted avg"]["precision"],
        "Recall": report["weighted avg"]["recall"],"F1-score": report["weighted avg"]["f1-score"],"ROC AUC": roc_auc})

draw_results_df = pd.DataFrame(draw_test_results)

In [13]:
draw_results_df

Unnamed: 0,Model Name,Test Score,Precision,Recall,F1-score,ROC AUC
0,Logistic Regression,0.810651,0.657155,0.810651,0.725877,0.425411
1,Decision Tree,0.798817,0.655317,0.798817,0.719986,0.421761
2,Random Forest,0.798817,0.655317,0.798817,0.719986,0.511405
3,NaiveBayes,0.810651,0.657155,0.810651,0.725877,0.411953
4,K-Nearest Neighbors,0.816568,0.850416,0.816568,0.739733,0.559535
5,Gradient Boost,0.798817,0.705684,0.798817,0.729905,0.532162


#### "If the game produces a result, who has the upper hand?"

In [14]:
winner_train_data = train_data[train_data['is_draw'] == 0].copy()

In [15]:
X2 = winner_train_data.drop(columns=['match_id','outcome','is_draw','winner','season'])
y2 = winner_train_data['winner']

In [16]:
X_winner_train, X_winner_test, y_winner_train, y_winner_test = train_test_split(X2, y2, test_size=0.2, random_state=42)

In [17]:
winner_prediction_models = {}
winner_model_details = []

for model_name, values in algorithms.items():
    best_score = float('-inf')
    best_rscv = None

    try:
        rscv = RandomizedSearchCV(estimator=values["model"],param_distributions=values["params"],cv=5,n_iter=15,n_jobs=-1,verbose=0,random_state=42)
        rscv.fit(X_winner_train, y_winner_train)

        if rscv.best_score_ > best_score:
            best_score = rscv.best_score_
            best_rscv = rscv

    except Exception as e:
        print(f"Error with {model_name} (Winner Prediction): {e}")
        continue

    if best_rscv:
        winner_prediction_models[model_name] = best_rscv
        winner_model_details.append({"Model Name": model_name,"Best Score": best_score,"Best Parameters": best_rscv.best_params_})
        print(f"{model_name} (Winner): Best Score = {best_score:.4f}")
    else:
        print(f"{model_name} (Winner): No valid configuration found.")




Logistic Regression (Winner): Best Score = 0.3119




Decision Tree (Winner): Best Score = 0.5885




Random Forest (Winner): Best Score = 0.6347




NaiveBayes (Winner): Best Score = 0.3396




K-Nearest Neighbors (Winner): Best Score = 0.4871




Gradient Boost (Winner): Best Score = 0.6292


In [18]:
pd.set_option('display.max_colwidth', None)
pd.DataFrame(winner_model_details)

Unnamed: 0,Model Name,Best Score,Best Parameters
0,Logistic Regression,0.311859,"{'penalty': 'l1', 'l1_ratio': 0.0}"
1,Decision Tree,0.588515,"{'min_samples_split': 10, 'min_samples_leaf': 4, 'max_depth': 15, 'criterion': 'gini'}"
2,Random Forest,0.634676,"{'n_estimators': 200, 'min_samples_split': 10, 'max_features': None, 'max_depth': 20}"
3,NaiveBayes,0.339602,{}
4,K-Nearest Neighbors,0.487054,"{'weights': 'distance', 'n_neighbors': 10, 'metric': 'manhattan'}"
5,Gradient Boost,0.629154,"{'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 3, 'learning_rate': 0.1, 'criterion': 'squared_error'}"


In [29]:
winner_test_results = []

for model_name, model in winner_prediction_models.items():
    y_pred = model.predict(X_winner_test)
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_winner_test)
        y_true_bin = label_binarize(y_winner_test, classes=np.unique(y_winner_test))

    report = classification_report(y_winner_test, y_pred, output_dict=True, zero_division=0)
    
    winner_test_results.append({"Model Name": model_name,"Test Score": model.score(X_winner_test, y_winner_test),"Precision": report["weighted avg"]["precision"],
        "Recall": report["weighted avg"]["recall"],"F1-score": report["weighted avg"]["f1-score"]})

winner_results_df = pd.DataFrame(winner_test_results)

In [30]:
winner_results_df

Unnamed: 0,Model Name,Test Score,Precision,Recall,F1-score
0,Logistic Regression,0.264706,0.194086,0.264706,0.219546
1,Decision Tree,0.654412,0.6751,0.654412,0.644874
2,Random Forest,0.705882,0.724891,0.705882,0.698698
3,NaiveBayes,0.382353,0.373056,0.382353,0.3635
4,K-Nearest Neighbors,0.544118,0.525959,0.544118,0.518713
5,Gradient Boost,0.713235,0.716069,0.713235,0.706826


#### Best Models

In [31]:
joblib.dump(draw_prediction_models["K-Nearest Neighbors"], "draw_prediction.pkl")
joblib.dump(winner_prediction_models["Gradient Boost"], "winner_prediction.pkl")

['winner_prediction.pkl']

In [22]:
with open("encoders.pkl", "wb") as f:
    pickle.dump(label_encoders, f)