In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier, StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
# ------------------------------
# 1. Data Preprocessing
# ------------------------------

# Load the dataset
df = pd.read_csv('/kaggle/input/cricket-predictor/over_features.csv')

# Create engineered features
df['pressure_index'] = df['dot_ball_pressure'] * df['required_desired_run_rate']
df['wicket_pressure'] = df['number_of_wickets_lost'] * df['required_desired_run_rate']
df['late_over_flag'] = (df['over'] > 15).astype(int)
df['bowler_pressure'] = df['current_bowler_economy'] * (df['bowler_wickets_in_match'] + 1)  # +1 to avoid zeros
df['aggressiveness_index'] = df['striker_strike_rate'] * (df['striker_boundaries_hit'] + 1)

# Drop unneeded columns: match_id is an identifier and may not be useful.
# Also, you can decide if you want to encode categorical variables like 'team' and 'match_phase'
df = df.drop(columns=['match_id'])

# One-hot encode categorical columns if needed
df = pd.get_dummies(df, columns=['team', 'match_phase'], drop_first=True)

# Separate features and target. Our target is 'wicket_next_over'
X = df.drop(columns=['wicket_next_over'])
y = df['wicket_next_over']

In [3]:
# ------------------------------
# 2. Train-Test Split & Scaling
# ------------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale the features (important for models like SVM and KNN)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [4]:
# ------------------------------
# 2.5. Addressing Class Imbalance with SMOTE
# ------------------------------
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)

print("Class distribution before SMOTE:", np.bincount(y_train))
print("Class distribution after SMOTE:", np.bincount(y_train_balanced))

Class distribution before SMOTE: [23721 10166]
Class distribution after SMOTE: [23721 23721]


In [5]:
# ------------------------------
# 3. Model Training and Evaluation
# ------------------------------

# Prepare a dictionary to store evaluation metrics for each model
results = {}

# Helper function for printing metrics
def evaluate_model(name, y_true, y_pred, y_prob=None):
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    auc = roc_auc_score(y_true, y_prob) if y_prob is not None else None
    results[name] = {'Accuracy': acc, 'Precision': prec, 'Recall': rec, 'F1 Score': f1, 'ROC-AUC': auc}
    print(f"{name} -> Accuracy: {acc:.3f}, Precision: {prec:.3f}, Recall: {rec:.3f}, F1 Score: {f1:.3f}, ROC-AUC: {auc if auc is not None else 'N/A'}")

In [6]:
# 3.1 Linear Regression (as a baseline classifier)
lin_reg = LinearRegression()
lin_reg.fit(X_train_balanced, y_train_balanced)  # Changed here: use balanced data
y_pred_linreg = (lin_reg.predict(X_test_scaled) >= 0.5).astype(int)
evaluate_model("Linear Regression", y_test, y_pred_linreg, y_prob=lin_reg.predict(X_test_scaled))

Linear Regression -> Accuracy: 0.608, Precision: 0.378, Recall: 0.471, F1 Score: 0.419, ROC-AUC: 0.5924727976404498


In [7]:
# 3.2 Logistic Regression
log_reg = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
log_reg.fit(X_train_balanced, y_train_balanced)  # Changed here: use balanced data
y_pred_log = log_reg.predict(X_test_scaled)
y_prob_log = log_reg.predict_proba(X_test_scaled)[:, 1]
evaluate_model("Logistic Regression", y_test, y_pred_log, y_prob=y_prob_log)

Logistic Regression -> Accuracy: 0.608, Precision: 0.377, Recall: 0.472, F1 Score: 0.419, ROC-AUC: 0.5926535717650055


In [8]:
# 3.3 K-Nearest Neighbors (KNN)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_balanced, y_train_balanced)  # Changed here: use balanced data
y_pred_knn = knn.predict(X_test_scaled)
evaluate_model("KNN", y_test, y_pred_knn)

KNN -> Accuracy: 0.522, Precision: 0.319, Recall: 0.520, F1 Score: 0.395, ROC-AUC: N/A


In [9]:
# 3.4 Decision Tree
tree = DecisionTreeClassifier(class_weight='balanced', random_state=42)
tree.fit(X_train_balanced, y_train_balanced)  # Changed here: use balanced data
y_pred_tree = tree.predict(X_test_scaled)  # You can use X_test_scaled for consistency
y_prob_tree = tree.predict_proba(X_test_scaled)[:, 1]
evaluate_model("Decision Tree", y_test, y_pred_tree, y_prob=y_prob_tree)

Decision Tree -> Accuracy: 0.587, Precision: 0.323, Recall: 0.343, F1 Score: 0.333, ROC-AUC: 0.5179480179858644


In [10]:
# 3.6 Random Forest
rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
rf.fit(X_train_balanced, y_train_balanced)  # Changed here: use balanced data
y_pred_rf = rf.predict(X_test_scaled)
y_prob_rf = rf.predict_proba(X_test_scaled)[:, 1]
evaluate_model("Random Forest", y_test, y_pred_rf, y_prob=y_prob_rf)

Random Forest -> Accuracy: 0.684, Precision: 0.432, Recall: 0.164, F1 Score: 0.237, ROC-AUC: 0.5658904767527793


In [11]:
# 3.7 Ensemble Method: Voting Classifier
voting_clf = VotingClassifier(
    estimators=[('lr', log_reg), ('tree', tree), ('rf', rf)],
    voting='soft'
)
voting_clf.fit(X_train_balanced, y_train_balanced)  # Changed here: use balanced data
y_pred_voting = voting_clf.predict(X_test_scaled)
y_prob_voting = voting_clf.predict_proba(X_test_scaled)[:, 1]
evaluate_model("Voting Classifier", y_test, y_pred_voting, y_prob=y_prob_voting)

Voting Classifier -> Accuracy: 0.588, Precision: 0.324, Recall: 0.345, F1 Score: 0.334, ROC-AUC: 0.560510207601668


In [12]:
# 3.8 K-Means Clustering (Unsupervised)
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
kmeans.fit(X_train_balanced)  # Changed here: using balanced data for clustering
clusters_train = kmeans.labels_

# Map clusters to the majority class in the balanced training data
mapping = {}
for cluster in np.unique(clusters_train):
    indices = np.where(clusters_train == cluster)[0]
    majority_class = y_train_balanced.iloc[indices].mode()[0]
    mapping[cluster] = majority_class

clusters_test = kmeans.predict(X_test_scaled)
y_pred_kmeans = np.array([mapping[cluster] for cluster in clusters_test])
evaluate_model("KMeans Clustering", y_test, y_pred_kmeans)

KMeans Clustering -> Accuracy: 0.451, Precision: 0.315, Recall: 0.708, F1 Score: 0.436, ROC-AUC: N/A


In [None]:
# SVM with RBF kernel
svm_model = SVC(kernel='rbf', probability=True, class_weight='balanced', random_state=42)
svm_model.fit(X_train_balanced, y_train_balanced)
y_pred_svm = svm_model.predict(X_test_scaled)
y_prob_svm = svm_model.predict_proba(X_test_scaled)[:, 1]
evaluate_model("SVM (RBF)", y_test, y_pred_svm, y_prob=y_prob_svm)

In [None]:
# Gaussian Naive Bayes
nb_model = GaussianNB()
nb_model.fit(X_train_balanced, y_train_balanced)
y_pred_nb = nb_model.predict(X_test_scaled)
# GaussianNB supports predict_proba, so we get probabilities for ROC-AUC calculation
y_prob_nb = nb_model.predict_proba(X_test_scaled)[:, 1]
evaluate_model("Gaussian Naive Bayes", y_test, y_pred_nb, y_prob=y_prob_nb)

In [None]:
# AdaBoost using a simple Decision Tree as base estimator
ada_model = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1, class_weight='balanced', random_state=42),
    n_estimators=50,
    random_state=42
)
ada_model.fit(X_train_balanced, y_train_balanced)
y_pred_ada = ada_model.predict(X_test_scaled)
y_prob_ada = ada_model.predict_proba(X_test_scaled)[:, 1]
evaluate_model("AdaBoost", y_test, y_pred_ada, y_prob=y_prob_ada)

In [None]:
# Stacking Classifier with a mix of models
stacking_estimators = [
    ('lr', LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)),
    ('dt', DecisionTreeClassifier(class_weight='balanced', random_state=42)),
    ('rf', RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42))
]
stacking_model = StackingClassifier(
    estimators=stacking_estimators,
    final_estimator=LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
)
stacking_model.fit(X_train_balanced, y_train_balanced)
y_pred_stack = stacking_model.predict(X_test_scaled)
y_prob_stack = stacking_model.predict_proba(X_test_scaled)[:, 1]
evaluate_model("Stacking Classifier", y_test, y_pred_stack, y_prob=y_prob_stack)

In [None]:
# Define parameter grid for Logistic Regression using L1 and L2 penalties.
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear']  # liblinear supports both l1 and l2 penalties.
}

grid_log_reg = GridSearchCV(
    LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42),
    param_grid,
    cv=5,
    scoring='f1',
    n_jobs=-1
)
grid_log_reg.fit(X_train_balanced, y_train_balanced)
print("Best parameters for Logistic Regression:", grid_log_reg.best_params_)
print("Best F1 score from GridSearch:", grid_log_reg.best_score_)

# Evaluate the best logistic regression estimator on the test set.
best_log_reg = grid_log_reg.best_estimator_
y_pred_best_log = best_log_reg.predict(X_test_scaled)
y_prob_best_log = best_log_reg.predict_proba(X_test_scaled)[:, 1]
evaluate_model("Logistic Regression (Tuned)", y_test, y_pred_best_log, y_prob=y_prob_best_log)

# Additionally, adjust the decision threshold (e.g., 0.4 instead of 0.5) for the tuned logistic model.
threshold = 0.4
y_pred_thresh = (y_prob_best_log >= threshold).astype(int)
evaluate_model(f"Logistic Regression (Tuned, Threshold {threshold})", y_test, y_pred_thresh, y_prob=y_prob_best_log)

In [None]:
# ------------------------------
# 4. Display Summary of Results
# ------------------------------
print("\nSummary of Results:")
for model, metrics in results.items():
    print(f"{model}: {metrics}")