# Cricket Player Performance Prediction

In [None]:
import streamlit as st
import sklearn
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("akarshsinghh/cricket-player-performance-prediction")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Downloading to C:\Users\PSHT15H\.cache\kagglehub\datasets\akarshsinghh\cricket-player-performance-prediction\2.archive...


100%|██████████| 2.42M/2.42M [00:01<00:00, 1.39MB/s]

Extracting files...





Path to dataset files: C:\Users\PSHT15H\.cache\kagglehub\datasets\akarshsinghh\cricket-player-performance-prediction\versions\2


In [None]:
import os

# Check what files are in the downloaded directory
print(f"Files in {path}:")
for file in os.listdir(path):
    print(f"  {file}")
    
# Let's also check if there are any CSV files
csv_files = [f for f in os.listdir(path) if f.endswith('.csv')]
print(f"\nCSV files found: {csv_files}")

Files in C:\Users\PSHT15H\.cache\kagglehub\datasets\akarshsinghh\cricket-player-performance-prediction\versions\2:
  ball.csv
  bat.csv
  match.csv

CSV files found: ['ball.csv', 'bat.csv', 'match.csv']


In [None]:
# Load the match.csv file into a dataframe
match_df = pd.read_csv(os.path.join(path, 'match.csv'))

# Display basic information about the dataframe
print(f"Match dataframe shape: {match_df.shape}")
print(f"\nColumn names: {list(match_df.columns)}")
print(f"\nNumber of rows: {match_df.shape[0]}")
print(f"Number of columns: {match_df.shape[1]}")
print(match_df.head())

Match dataframe shape: (6199, 15)

Column names: ['Unnamed: 0', 'match_number', 'name', 'start_date', 'matchtype', 'series_id', 'match_detail_id', 'scorecard_id', 'title', 'runs', 'over', 'run_rate', 'match_id', 'opp_team_id', 'team_id']

Number of rows: 6199
Number of columns: 15
   Unnamed: 0  match_number                name                 start_date  \
0        1417           1.0  Bangladesh v India  2004-12-22 18:30:00+00:00   
1        1418           1.0  Bangladesh v India  2004-12-22 18:30:00+00:00   
2        1419           2.0  Bangladesh v India  2004-12-25 18:30:00+00:00   
3        1420           2.0  Bangladesh v India  2004-12-25 18:30:00+00:00   
4        1421           3.0  Bangladesh v India  2004-12-26 18:30:00+00:00   

  matchtype  series_id  match_detail_id  scorecard_id               title  \
0       odi        182              773          1495       India Innings   
1       odi        182              773          1496  Bangladesh Innings   
2       odi       

In [None]:
print(match_df.columns)


Index(['Unnamed: 0', 'match_number', 'name', 'start_date', 'matchtype',
       'series_id', 'match_detail_id', 'scorecard_id', 'title', 'runs', 'over',
       'run_rate', 'match_id', 'opp_team_id', 'team_id'],
      dtype='object')


In [None]:
drop_cols = [
    'Unnamed: 0',        # index column
    'match detail id',   # pure identifier
    'scorecard id'       # pure identifier
]

match_df = match_df.drop(columns=drop_cols, errors='ignore')


In [None]:
print("Remaining features:", match_df.shape[1] - 1)  # minus target


Remaining features: 13


In [None]:
match_df['start_date'] = pd.to_datetime(match_df['start_date'], errors='coerce')
match_df['match_year'] = match_df['start_date'].dt.year
match_df['match_month'] = match_df['start_date'].dt.month


In [None]:
# First, let's check what columns are available
print("Available columns:", match_df.columns.tolist())

# Since 'winner' column doesn't exist, let's use all columns for features
X = match_df.copy()
print("Final feature count:", X.shape[1])


Available columns: ['match_number', 'name', 'start_date', 'matchtype', 'series_id', 'match_detail_id', 'scorecard_id', 'title', 'runs', 'over', 'run_rate', 'match_id', 'opp_team_id', 'team_id', 'match_year', 'match_month']
Final feature count: 16


In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

for col in match_df.select_dtypes(include='object').columns:
    match_df[col] = le.fit_transform(match_df[col])


In [None]:
# Create a target variable based on runs scored
# Group by match_id and determine winner based on highest runs
match_results = match_df.groupby('match_id')['runs'].transform('max')
match_df['winner'] = (match_df['runs'] == match_results).astype(int)

X = match_df.drop('winner', axis=1)
y = match_df['winner']


# Train–Test Split + Scaling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Select only numeric columns for scaling
numeric_columns = X_train.select_dtypes(include=[np.number]).columns
non_numeric_columns = X_train.select_dtypes(exclude=[np.number]).columns

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[numeric_columns])
X_test_scaled = scaler.transform(X_test[numeric_columns])

# Convert scaled arrays back to DataFrames
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=numeric_columns, index=X_train.index)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=numeric_columns, index=X_test.index)

# Combine scaled numeric columns with non-numeric columns
X_train = pd.concat([X_train_scaled_df, X_train[non_numeric_columns]], axis=1)
X_test = pd.concat([X_test_scaled_df, X_test[non_numeric_columns]], axis=1)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


In [None]:
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score,
    recall_score, f1_score, matthews_corrcoef
)

def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    return {
        "Accuracy": accuracy_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_prob),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "MCC": matthews_corrcoef(y_test, y_pred)
    }


In [None]:
# First, let's properly handle non-numeric columns and missing values before training
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Debug: Check what non-numeric columns we have
print("Non-numeric columns:", non_numeric_columns.tolist())
print("\nData types in X_train:")
print(X_train.dtypes)

# Check for missing values
print(f"\nMissing values in X_train:")
print(X_train.isnull().sum())

# Create a copy for processing
X_train_processed = X_train.copy()
X_test_processed = X_test.copy()

# Handle non-numeric columns (encode them)
label_encoders = {}
for col in non_numeric_columns:
    if col in X_train_processed.columns:
        print(f"\nProcessing column: {col}")
        print(f"Data type: {X_train_processed[col].dtype}")
        
        # Convert datetime columns to numeric features
        if X_train_processed[col].dtype == 'datetime64[ns]' or pd.api.types.is_datetime64_any_dtype(X_train_processed[col]):
            print(f"Converting datetime column {col} to numeric features")
            # Extract numeric features from datetime
            X_train_processed[col + '_year'] = X_train_processed[col].dt.year
            X_train_processed[col + '_month'] = X_train_processed[col].dt.month
            X_train_processed[col + '_day'] = X_train_processed[col].dt.day
            X_test_processed[col + '_year'] = X_test_processed[col].dt.year
            X_test_processed[col + '_month'] = X_test_processed[col].dt.month
            X_test_processed[col + '_day'] = X_test_processed[col].dt.day
            # Drop the original datetime column
            X_train_processed = X_train_processed.drop(col, axis=1)
            X_test_processed = X_test_processed.drop(col, axis=1)
        else:
            # Use label encoder for other non-numeric columns
            le = LabelEncoder()
            # Handle NaN values by filling them with a placeholder
            X_train_processed[col] = X_train_processed[col].fillna('Unknown')
            X_test_processed[col] = X_test_processed[col].fillna('Unknown')
            
            # Fit on train and transform both train and test
            X_train_processed[col] = le.fit_transform(X_train_processed[col].astype(str))
            # For test set, handle any unseen labels
            test_labels = X_test_processed[col].astype(str)
            test_encoded = []
            for label in test_labels:
                if label in le.classes_:
                    test_encoded.append(le.transform([label])[0])
                else:
                    # Assign to 'Unknown' category if exists, otherwise use 0
                    if 'Unknown' in le.classes_:
                        test_encoded.append(le.transform(['Unknown'])[0])
                    else:
                        test_encoded.append(0)
            X_test_processed[col] = test_encoded
            label_encoders[col] = le

# Handle missing values in numeric columns
print(f"\nHandling missing values...")
print(f"Missing values after initial processing:")
missing_before = X_train_processed.isnull().sum()
print(missing_before[missing_before > 0])

# Use SimpleImputer to fill missing values with median for numeric columns
imputer = SimpleImputer(strategy='median')
X_train_processed = pd.DataFrame(
    imputer.fit_transform(X_train_processed), 
    columns=X_train_processed.columns,
    index=X_train_processed.index
)
X_test_processed = pd.DataFrame(
    imputer.transform(X_test_processed), 
    columns=X_test_processed.columns,
    index=X_test_processed.index
)

# Verify no missing values remain
print(f"\nMissing values after imputation:")
missing_after = X_train_processed.isnull().sum()
print(missing_after[missing_after > 0])
print(f"Total missing values: {X_train_processed.isnull().sum().sum()}")

print(f"\nProcessed X_train shape: {X_train_processed.shape}")
print(f"Processed X_test shape: {X_test_processed.shape}")
print(f"Final data types:")
print(X_train_processed.dtypes)

# Now train the models with the properly processed data
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    try:
        model.fit(X_train_processed, y_train)
        results[name] = evaluate_model(model, X_test_processed, y_test)
        print(f"{name} trained successfully!")
    except Exception as e:
        print(f"Error training {name}: {str(e)}")
        
print(f"\nTraining completed! {len(results)} models trained successfully.")

Non-numeric columns: ['start_date']

Data types in X_train:
match_number                   float64
name                           float64
matchtype                      float64
series_id                      float64
match_detail_id                float64
scorecard_id                   float64
title                          float64
runs                           float64
over                           float64
run_rate                       float64
match_id                       float64
opp_team_id                    float64
team_id                        float64
match_year                     float64
match_month                    float64
start_date         datetime64[ns, UTC]
dtype: object

Missing values in X_train:
match_number       57
name                0
matchtype           0
series_id           0
match_detail_id     0
scorecard_id        0
title               0
runs                0
over                0
run_rate            0
match_id            0
opp_team_id         0
team_id   

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression trained successfully!

Training Decision Tree...
Decision Tree trained successfully!

Training KNN...
KNN trained successfully!

Training Naive Bayes...
Naive Bayes trained successfully!

Training Random Forest...
Random Forest trained successfully!

Training XGBoost...
XGBoost trained successfully!

Training completed! 6 models trained successfully.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:
# Display model performance results
import pandas as pd

print("Model Performance Results:")
print("="*60)

# Convert results to DataFrame for better visualization
results_df = pd.DataFrame(results).T
results_df = results_df.round(4)

print(results_df)

# Find the best performing model for each metric
print("\nBest Models by Metric:")
print("-"*40)
for metric in results_df.columns:
    best_model = results_df[metric].idxmax()
    best_score = results_df[metric].max()
    print(f"{metric}: {best_model} ({best_score:.4f})")

# Overall best model based on F1 score (balanced metric)
best_overall = results_df['F1'].idxmax()
print(f"\nBest Overall Model (by F1-Score): {best_overall}")
print(f"F1-Score: {results_df.loc[best_overall, 'F1']:.4f}")

# Create a simple visualization
plt.figure(figsize=(12, 8))

# Plot accuracy for all models
plt.subplot(2, 2, 1)
results_df['Accuracy'].plot(kind='bar')
plt.title('Model Accuracy Comparison')
plt.xticks(rotation=45)
plt.ylabel('Accuracy')

# Plot F1 Score
plt.subplot(2, 2, 2)
results_df['F1'].plot(kind='bar')
plt.title('Model F1-Score Comparison')
plt.xticks(rotation=45)
plt.ylabel('F1-Score')

# Plot AUC
plt.subplot(2, 2, 3)
results_df['AUC'].plot(kind='bar')
plt.title('Model AUC Comparison')
plt.xticks(rotation=45)
plt.ylabel('AUC')

# Plot MCC
plt.subplot(2, 2, 4)
results_df['MCC'].plot(kind='bar')
plt.title('Model MCC Comparison')
plt.xticks(rotation=45)
plt.ylabel('MCC')

plt.tight_layout()
plt.show()

Model Performance Results:
                     Accuracy     AUC  Precision  Recall      F1     MCC
Logistic Regression    0.6919  0.7597     0.6731  0.6615  0.6672  0.3806
Decision Tree          0.6548  0.6542     0.6269  0.6442  0.6354  0.3079
KNN                    0.4427  0.4617     0.3978  0.3765  0.3869 -0.1233
Naive Bayes            0.6306  0.6937     0.6020  0.6166  0.6092  0.2592
Random Forest          0.6944  0.7644     0.6748  0.6667  0.6707  0.3856
XGBoost                0.7113  0.7856     0.6915  0.6891  0.6903  0.4199

Best Models by Metric:
----------------------------------------
Accuracy: XGBoost (0.7113)
AUC: XGBoost (0.7856)
Precision: XGBoost (0.6915)
Recall: XGBoost (0.6891)
F1: XGBoost (0.6903)
MCC: XGBoost (0.4199)

Best Overall Model (by F1-Score): XGBoost
F1-Score: 0.6903


  plt.show()


In [None]:
# Fix matplotlib display issue for Jupyter notebooks
%matplotlib inline
import matplotlib
matplotlib.use('Agg')  # Use non-interactive backend
import matplotlib.pyplot as plt

# Re-create the visualization with proper backend configuration
plt.figure(figsize=(12, 8))

# Plot accuracy for all models
plt.subplot(2, 2, 1)
results_df['Accuracy'].plot(kind='bar', color='skyblue')
plt.title('Model Accuracy Comparison')
plt.xticks(rotation=45)
plt.ylabel('Accuracy')
plt.grid(True, alpha=0.3)

# Plot F1 Score
plt.subplot(2, 2, 2)
results_df['F1'].plot(kind='bar', color='lightgreen')
plt.title('Model F1-Score Comparison')
plt.xticks(rotation=45)
plt.ylabel('F1-Score')
plt.grid(True, alpha=0.3)

# Plot AUC
plt.subplot(2, 2, 3)
results_df['AUC'].plot(kind='bar', color='lightcoral')
plt.title('Model AUC Comparison')
plt.xticks(rotation=45)
plt.ylabel('AUC')
plt.grid(True, alpha=0.3)

# Plot MCC
plt.subplot(2, 2, 4)
results_df['MCC'].plot(kind='bar', color='lightsalmon')
plt.title('Model MCC Comparison')
plt.xticks(rotation=45)
plt.ylabel('MCC')
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('model_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

print("Visualization saved as 'model_comparison.png'")
print("\nPlots should now display properly in the notebook!")