In [56]:
import numpy as np
import matplotlib.pyplot as mtp
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_log_error
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, matthews_corrcoef,roc_auc_score

Load Data

In [57]:
# Read the match-level dataset from the working directory and preview the first rows.
ds = pd.read_csv(filepath_or_buffer="cricket_data_train_test.csv")
ds.head(n=4)

Unnamed: 0,match_id,date,season,tournament_name,is_worldcup,match_stage,team1,team2,venue,city,...,second_innings_score,match_result,elo_team1,elo_team2,elo_diff,team1_form_5,team2_form_5,team1_form_10,team2_form_10,h2h_win_pct
0,7129583000.0,16-03-2014,2014.0,World T20,True,First Round,Hong Kong,Nepal,Zahur Ahmed Chowdhury Stadium,Chittagong,...,69.0,completed,1500.0,1500.0,0.0,0.5,0.5,0.5,0.5,0.5
1,-292451400.0,17-03-2014,2014.0,World T20,True,First Round,Ireland,Zimbabwe,Sylhet Stadium,,...,164.0,completed,1500.0,1500.0,0.0,0.5,0.5,0.5,0.5,0.5
2,-339384500.0,17-03-2014,2014.0,World T20,True,First Round,Netherlands,United Arab Emirates,Sylhet Stadium,,...,152.0,completed,1500.0,1500.0,0.0,0.5,0.5,0.5,0.5,0.5
3,-901852300.0,18-03-2014,2014.0,World T20,True,First Round,Bangladesh,Nepal,Zahur Ahmed Chowdhury Stadium,Chittagong,...,132.0,completed,1500.0,1516.0,-16.0,0.5,1.0,0.5,1.0,0.5


Feature extraction

In [58]:

# Columns used as model inputs, in the order they appear in the x matrix.
# NOTE(review): the innings*_runs/wkts/overs columns look like post-match totals;
# confirm this model is not meant to predict the winner before a match is played.
feature_names = [
    'tournament_name', 'team1', 'team2', 'venue',
    'innings1_team', 'innings1_runs', 'innings1_wkts', 'innings1_overs',
    'innings2_team', 'innings2_runs', 'innings2_wkts', 'innings2_overs',
]

# x: 2-D array with one column per feature; y: 2-D single-column array holding 'winner'.
x = ds[feature_names].to_numpy()
y = ds[['winner']].to_numpy()


Encode Data

In [59]:
# Categorical columns to one-hot encode. Their positions inside x are looked up
# from feature_names instead of being hard-coded, so the indices cannot silently
# drift out of sync if the feature list is edited.
categorical_feature_names = ['tournament_name', 'team1', 'team2', 'venue', 'innings1_team', 'innings2_team']
categorical_features_indices = [feature_names.index(name) for name in categorical_feature_names]

# One-hot encode the categorical columns; every other column passes through unchanged.
# handle_unknown='ignore' lets the fitted transformer encode later rows that contain a
# category not present in this dataset (all zeros for that feature) instead of raising.
# It does not change the result of fit_transform on the data used for fitting.
ohe = ColumnTransformer(
    [("one_hot_encoder", OneHotEncoder(handle_unknown='ignore'), categorical_features_indices)],
    remainder='passthrough',
)
x = ohe.fit_transform(x)

# Encode the target variable 'winner' as integer class ids (flattened to 1-D first).
le = LabelEncoder()
y = le.fit_transform(y.ravel())

print("Shape of x after OneHotEncoding:", x.shape)
print("Shape of y after LabelEncoding:", y.shape)

Shape of x after OneHotEncoding: (3865, 1105)
Shape of y after LabelEncoding: (3865,)


Clean Data

In [60]:
# Rows to drop: any NaN among the encoded features, or a missing target.
feature_nan_mask = np.isnan(x.toarray()).any(axis=1)

# y has already been integer-encoded by LabelEncoder, so np.isnan(y) can never be
# True (and .any() would collapse it to one scalar for the whole array anyway).
# If any winner was missing, the encoder gave NaN its own entry in le.classes_;
# look that entry up per row so those samples are actually removed.
target_nan_mask = pd.isna(le.classes_)[y]

nan_mask = feature_nan_mask | target_nan_mask

print(f"Number of samples before dropping NaNs: {x.shape[0]}")

x = x[~nan_mask]
y = y[~nan_mask]

# Identify classes with only one member and filter them out
# (a stratified split needs at least two samples per class).
class_counts = pd.Series(y).value_counts()
single_member_classes = class_counts[class_counts < 2].index

# Create a mask to keep only samples where the class count is 2 or more
valid_samples_mask = ~pd.Series(y).isin(single_member_classes)

x = x[valid_samples_mask.values]
y = y[valid_samples_mask.values]

print(f"Number of samples after dropping NaNs and single-member classes: {x.shape[0]}")

Number of samples before dropping NaNs: 3865
Number of samples after dropping NaNs and single-member classes: 3146


Split Data

In [61]:
# Hold out 20% of the samples for testing. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


Train Model

In [62]:
# Fit a decision tree on the training split (seeded so repeated runs build the same tree).
# A LogisticRegression(random_state=42, solver='liblinear') was the earlier alternative.
classifier = DecisionTreeClassifier(random_state=42)

classifier.fit(X=x_train, y=y_train)


Prediction using test data

In [63]:
# Predicted (integer-encoded) winner for every held-out sample.
y_pred = classifier.predict(X=x_test)

Evaluate model

In [64]:
# Settings shared by the class-weighted metrics: average per-class scores weighted
# by class support, and score a class as 0 where the metric would divide by zero.
weighted = {"average": 'weighted', "zero_division": 0}

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted)
recall = recall_score(y_test, y_pred, **weighted)
f1 = f1_score(y_test, y_pred, **weighted)
mcc = matthews_corrcoef(y_test, y_pred)

# Per-class probabilities are required for the multiclass AUC below.
y_proba = classifier.predict_proba(x_test)

# One-vs-rest AUC, weighted by class support to account for class imbalance.
# labels ties the probability columns to the classes the model was trained on.
auc = roc_auc_score(
    y_test,
    y_proba,
    multi_class='ovr',
    average='weighted',
    labels=classifier.classes_,
)

# Report every metric to four decimal places.
for label, value in [
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-Score", f1),
    ("MCC", mcc),
    ("AUC (Weighted)", auc),
]:
    print(f"{label}: {value:.4f}")



Accuracy: 0.6143
Precision: 0.6398
Recall: 0.6143
F1-Score: 0.6190
MCC: 0.6070
AUC (Weighted): 0.8039




Confusion matrix

In [65]:
# Rows are true classes and columns are predicted classes, both in encoded-label order.
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:", conf_matrix, sep="\n")

Confusion Matrix:
[[ 3  0  0 ...  0  0  0]
 [ 0 18  0 ...  1  0  1]
 [ 0  0  5 ...  0  0  0]
 ...
 [ 0  2  0 ... 11  0  1]
 [ 0  0  0 ...  0 11  1]
 [ 0  1  0 ...  0  0  0]]


In [66]:
import joblib

# The tree alone cannot be used on new data: its inputs must go through the same
# fitted one-hot transformer, and its integer predictions must be mapped back to
# winner names with the fitted label encoder. Persist both alongside the model.
joblib.dump(ohe, "oneHotEncoder.pkl")
joblib.dump(le, "labelEncoder.pkl")

# Persist the trained classifier (kept last so the cell still displays this file name).
# These files are pickles: only load them from a trusted source.
joblib.dump(classifier, "decisionTree.pkl")

['decisionTree.pkl']