In [31]:
import numpy as np
import matplotlib.pyplot as mtp
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_log_error
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, matthews_corrcoef,roc_auc_score

Load Data

In [32]:
# Read the train/test CSV into a DataFrame; the path is relative to the notebook's working directory.
DATA_FILE = "cricket_data_train_test.csv"
ds = pd.read_csv(DATA_FILE)
# For quick experiments, truncate to the first rows: ds = ds.head(500)


Feature extraction

In [33]:

# Columns used as model inputs. Their order matters: the encoding step
# refers to the categorical ones by position in this list.
feature_names = [
    'tournament_name',
    'team1',
    'team2',
    'venue',
    'innings1_team',
    'innings1_runs',
    'innings1_wkts',
    'innings1_overs',
    'innings2_team',
    'innings2_runs',
    'innings2_wkts',
    'innings2_overs',
]

# Feature matrix as a NumPy array, one row per record in ds.
x = ds[feature_names].to_numpy()
# Target kept as a single-column 2-D array; it is flattened when the labels are encoded.
y = ds[['winner']].to_numpy()


Encode Data

In [34]:
# Positions within feature_names of the categorical columns:
# 'tournament_name', 'team1', 'team2', 'venue', 'innings1_team', 'innings2_team'
categorical_features_indices = [0, 1, 2, 3, 4, 8]

# One-hot encode the categorical columns; the remaining columns
# (runs / wickets / overs) are passed through unchanged.
one_hot_step = ("one_hot_encoder", OneHotEncoder(), categorical_features_indices)
ohe = ColumnTransformer(transformers=[one_hot_step], remainder='passthrough')
x = ohe.fit_transform(x)

# Map each distinct 'winner' value to an integer class id.
# LabelEncoder expects a 1-D array, hence the flattening.
le = LabelEncoder()
y = le.fit_transform(np.ravel(y))

print("Shape of x after OneHotEncoding:", x.shape)
print("Shape of y after LabelEncoding:", y.shape)

Shape of x after OneHotEncoding: (3865, 1105)
Shape of y after LabelEncoding: (3865,)


Clean Data

In [35]:
# Flag samples that contain a NaN in any encoded feature column or in the label.
# The sparse feature matrix is densified only for this scan.
# The label check must stay per-sample: reducing it with .any() yields a single
# scalar that, OR-ed with the row mask, would discard EVERY row as soon as one
# label is NaN.
# NOTE: y is produced by LabelEncoder in the encoding step, so it holds integer
# class ids and this label check acts purely as a guard.
nan_mask = np.isnan(x.toarray()).any(axis=1) | np.isnan(y)

print(f"Number of samples before dropping NaNs: {x.shape[0]}")

x = x[~nan_mask]
y = y[~nan_mask]

# Find classes that occur only once; a class with a single sample cannot be
# represented in both halves of the stratified train/test split performed later.
class_counts = pd.Series(y).value_counts()
single_member_classes = class_counts[class_counts < 2].index

# Keep only samples whose class has two or more members.
valid_samples_mask = ~pd.Series(y).isin(single_member_classes)

x = x[valid_samples_mask.values]
y = y[valid_samples_mask.values]

print(f"Number of samples after dropping NaNs and single-member classes: {x.shape[0]}")

Number of samples before dropping NaNs: 3865
Number of samples after dropping NaNs and single-member classes: 3146


Split Data

In [36]:
# Hold out 20% of the samples for testing. The split is seeded for
# reproducibility and stratified on y so class proportions are preserved.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


Train Model

In [37]:
# Build and fit the classifier on the training split.
# Alternative estimators that can be swapped in for comparison:
#   LogisticRegression(random_state=42, solver='liblinear')
#   DecisionTreeClassifier(random_state=42)
#   KNeighborsClassifier(n_neighbors=5)
#   GaussianNB()
classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# The sparse one-hot matrix is converted to a dense array before fitting.
x_train_dense = x_train.toarray()
classifier.fit(x_train_dense, y_train)

Prediction using test data

In [38]:
# Predict a class id for every held-out sample (dense input, matching how the model was fitted).
x_test_dense = x_test.toarray()
y_pred = classifier.predict(x_test_dense)


Evaluate model

In [39]:

# Shared settings for the per-class metrics: average across classes weighted by
# support, and score 0 (instead of warning) where a metric is undefined.
weighted_kwargs = {"average": "weighted", "zero_division": 0}

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted_kwargs)
recall = recall_score(y_test, y_pred, **weighted_kwargs)
f1 = f1_score(y_test, y_pred, **weighted_kwargs)
mcc = matthews_corrcoef(y_test, y_pred)

# Class-membership probabilities are required for the AUC computation.
y_proba = classifier.predict_proba(x_test.toarray())

# Multiclass AUC, one-vs-rest, weighted by class support; the probability
# columns are matched to labels via classifier.classes_.
auc = roc_auc_score(
    y_test,
    y_proba,
    multi_class='ovr',
    average='weighted',
    labels=classifier.classes_,
)

# Report every metric with four decimal places.
for metric_label, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-Score", f1),
    ("MCC", mcc),
    ("AUC (Weighted)", auc),
):
    print(f"{metric_label}: {metric_value:.4f}")

Accuracy: 0.7286
Precision: 0.7326
Recall: 0.7286
F1-Score: 0.7137
MCC: 0.7234
AUC (Weighted): 0.9895




Confusion matrix

In [40]:
# Confusion matrix of true vs. predicted class ids on the test split.
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:", conf_matrix, sep="\n")

Confusion Matrix:
[[ 3  0  0 ...  0  0  0]
 [ 0 21  0 ...  1  0  0]
 [ 0  0  5 ...  0  0  0]
 ...
 [ 0  1  0 ... 10  0  0]
 [ 0  0  0 ...  0 14  0]
 [ 0  1  0 ...  0  0  0]]


In [41]:
import joblib

# Persist the fitted preprocessing objects alongside the model. The classifier
# was trained on one-hot encoded features and integer class ids, so without the
# fitted ColumnTransformer (ohe) and LabelEncoder (le) a reloaded model can
# neither be fed new raw rows nor have its predictions mapped back to labels.
joblib.dump(
    {"feature_encoder": ohe, "label_encoder": le},
    "randomForest_encoders.pkl",
    compress='zlib',
)

# Persist the trained classifier itself (zlib-compressed). Kept as the final
# expression so the cell still displays the list of written model files.
joblib.dump(classifier, "randomForest.pkl", compress='zlib')


['randomForest.pkl']