In [1]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
import pandas as pd
from imblearn.over_sampling import SMOTE
plt.style.use('ggplot')
pd.set_option('display.max_columns', 200)

In [3]:
ball_df = pd.read_csv('../Data/ballbyball.csv')
country_df = pd.read_csv('../Data/country.csv')
ground_df = pd.read_csv('../Data/ground.csv')
matches_df = pd.read_csv('../Data/matches.csv')
players_df = pd.read_csv('../Data/players.csv')
season_df = pd.read_csv('../Data/season.csv')
team_df = pd.read_csv('../Data/team.csv')
town_df = pd.read_csv('../Data/town.csv')

In [4]:
ball_df['totalRuns'] = ball_df['totalRuns'] - ball_df['run']
ball_df['totalWickets'] = ball_df['totalWickets'] - ball_df['isWicket'].apply(lambda x: 1 if x else 0)

In [5]:
ground_df.drop(['Ground Slug', 'Town ID', 'Capacity'], axis=1, inplace=True)

In [6]:
matches_df.rename(columns={'ground_id': 'Ground ID'}, inplace=True)

matches_df = pd.merge(matches_df, ground_df, on='Ground ID', how='inner')

In [7]:
matches_df.drop(['season_id', 'slug', 'title', 'date', 'time', 'status', 'status_description', 'winner_team_id', 'toss_winner_team_id', 'toss_choice', 'Ground ID', 'team_1_id', 'team_2_id', 'team_1_score', 'team_1_scoreInfo', 'team_2_scoreInfo'], axis=1, inplace=True)

In [8]:
players_df.drop(['Full Name', 'Image', 'DOB', 'Batting Style (s)', 'Bowling Style (s)', 'Team ID'], axis=1, inplace=True)

In [9]:
ball_df = pd.merge(ball_df, matches_df, on='match_id', how='inner')

In [10]:
ball_df.drop(['index', 'overNumber', 'ballNumber', 'oversUnique', 'penalties', 'batsmanRuns', 'outPlayerId'], axis=1, inplace=True)

In [11]:
ball_df = ball_df.merge(players_df, how='left', left_on='batsmanPlayerId', right_on='ID', suffixes=('', '_batsman'))

ball_df.rename(columns={
    'Name': 'Batsman_Name',
    'Role': 'Batsman_Role',
    'Batting Style (l)': 'Batsman_Batting_Style',
    'Bowling Style(l)': 'Batsman_Bowling_Style',
    'Playing Role': 'Batsman_Playing_Role'
}, inplace=True)

ball_df.drop(columns=['ID'], inplace=True)

ball_df = ball_df.merge(players_df, how='left', left_on='bowlerPlayerId', right_on='ID', suffixes=('', '_bowler'))

ball_df.rename(columns={
    'Name': 'Bowler_Name',
    'Role': 'Bowler_Role',
    'Batting Style (l)': 'Bowler_Batting_Style',
    'Bowling Style(l)': 'Bowler_Bowling_Style',
    'Playing Role': 'Bowler_Playing_Role'
}, inplace=True)

ball_df.drop(columns=['ID'], inplace=True)

In [12]:
ball_df.drop(columns=['batsmanPlayerId', 'bowlerPlayerId'], inplace=True)

In [13]:
ball_df.drop(columns=['Batsman_Bowling_Style', 'Bowler_Batting_Style'], inplace=True)

In [14]:
df = ball_df.dropna(subset=['pitchLength', 'pitchLine','shotType','Batsman_Name','Batsman_Batting_Style', 'Batsman_Playing_Role','Bowler_Name','Bowler_Bowling_Style','Bowler_Playing_Role'])

In [15]:
df.drop(columns=['match_id'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['match_id'], inplace=True)


In [16]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
X = df.drop('isWicket', axis=1)
y = df['isWicket']

categorical_cols = X.select_dtypes(include='object').columns
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

columns_to_scale = [col for col in X.columns if col not in ['pitchLine', 'pitchLength', 'Bowler_Bowling_Style', 'shotType']]

scaler = StandardScaler()
X_resampled[columns_to_scale] = scaler.fit_transform(X_resampled[columns_to_scale])

df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
df_resampled['isWicket'] = y_resampled

In [19]:
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import xgboost as xgb

threshold = 2
df_copy = df_resampled.copy()
df_copy['target'] = ((df_copy['isWicket'] == 1) | (df_copy['run'] <= threshold)).astype(int)

input_cols = ['inningNumber', 'oversActual', 'totalRuns', 'totalWickets', 
              'time_of_day', 'Ground Name', 'Batsman_Name', 'Batsman_Role', 
              'Batsman_Batting_Style', 'Batsman_Playing_Role']
target_outputs = ['pitchLine', 'pitchLength','Bowler_Bowling_Style', 'shotType']

df_good = df_copy[df_copy['target'] == 1]
X = df_good[input_cols]

for target in target_outputs:
    y = df_good[target]

    class_counts = y.value_counts()
    valid_classes = class_counts[class_counts > 1].index
    dropped_classes = class_counts[class_counts <= 1]

    if not dropped_classes.empty:
        print(f"Target '{target}': Dropping {len(dropped_classes)} class(es):")
        print(dropped_classes)

    mask = y.isin(valid_classes)
    X_filtered = X[mask]
    y_filtered = y[mask]

    if y_filtered.nunique() < 2:
        print(f"Skipping target '{target}' due to insufficient class diversity.\n")
        continue

    X_train, X_test, y_train, y_test = train_test_split(
        X_filtered, y_filtered, test_size=0.2, random_state=42, stratify=y_filtered
    )

    model = xgb.XGBClassifier(
        objective='multi:softmax',
        eval_metric='mlogloss',
        use_label_encoder=False,
        random_state=42
    )

    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    print(f"\nClassification report for {target} (training):")
    print(classification_report(y_train, y_train_pred))

    y_pred = model.predict(X_test)
    print(f"Classification report for {target} (testing):")
    print(classification_report(y_test, y_pred))

    model_filename = f"Models/xgb_model_{target}_safe.pkl"
    with open(model_filename, 'wb') as f:
        pickle.dump(model, f)
    print(f"Model saved to {model_filename}\n")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Classification report for pitchLine (training):
              precision    recall  f1-score   support

           0       0.91      0.44      0.59      2698
           1       0.74      0.83      0.78     18172
           2       0.74      0.74      0.74     15331
           3       0.94      0.65      0.77       945
           4       0.88      0.37      0.52      1189

    accuracy                           0.75     38335
   macro avg       0.84      0.60      0.68     38335
weighted avg       0.76      0.75      0.74     38335

Classification report for pitchLine (testing):
              precision    recall  f1-score   support

           0       0.77      0.30      0.43       675
           1       0.63      0.74      0.68      4543
           2       0.60      0.59      0.60      3833
           3       0.80      0.39      0.53       236
           4       0.57      0.17      0.26       297

    accuracy                           0.62      9584
   macro avg       0.67      0.44  

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Classification report for pitchLength (training):
              precision    recall  f1-score   support

           0       0.79      0.54      0.64      7408
           1       0.87      0.41      0.56      3763
           2       0.61      0.95      0.74     16540
           3       0.83      0.46      0.59      4036
           4       0.75      0.39      0.51      6003
           5       0.89      0.39      0.54       585

    accuracy                           0.67     38335
   macro avg       0.79      0.52      0.60     38335
weighted avg       0.72      0.67      0.65     38335

Classification report for pitchLength (testing):
              precision    recall  f1-score   support

           0       0.63      0.43      0.51      1852
           1       0.58      0.24      0.34       941
           2       0.54      0.88      0.67      4135
           3       0.65      0.31      0.42      1009
           4       0.47      0.22      0.30      1501
           5       0.21      0.0

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Classification report for Bowler_Bowling_Style (training):
              precision    recall  f1-score   support

           0       0.89      0.88      0.89       994
           1       0.83      0.79      0.81      2195
           2       0.90      0.71      0.80       461
           3       0.90      0.66      0.77       426
           4       0.87      0.77      0.82      1489
           5       0.84      0.62      0.71       827
           6       0.65      0.80      0.72      3328
           7       0.73      0.69      0.71      2271
           8       0.66      0.77      0.71      5122
           9       0.70      0.77      0.73      4615
          10       0.82      0.56      0.67      1166
          11       0.85      0.58      0.69      1819
          12       0.84      0.56      0.67      1126
          13       0.75      0.71      0.73      3771
          14       0.81      0.62      0.70       827
          15       0.70      0.74      0.72      3058
          16       0.

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Classification report for shotType (training):
              precision    recall  f1-score   support

           0       0.62      0.50      0.56      2660
           1       0.63      0.51      0.56      2478
           2       0.80      0.58      0.67       937
           3       0.52      0.63      0.57      3378
           4       0.42      0.68      0.52      4705
           5       0.80      0.56      0.66       914
           6       0.75      0.38      0.50      1516
           7       0.76      0.50      0.60      1070
           8       0.48      0.64      0.55      4355
           9       0.76      0.54      0.64      1032
          10       0.77      0.56      0.65      1048
          11       0.59      0.53      0.56      3008
          12       0.62      0.49      0.54      2092
          13       0.75      0.57      0.65       934
          14       0.80      0.56      0.66       895
          15       0.76      0.57      0.65      1169
          16       0.59      0.76

In [20]:
threshold = 2
df_copy = df_resampled.copy()
df_copy['target'] = ((df_copy['isWicket'] == 0) & (df_copy['run'] > threshold)).astype(int)

input_cols = ['inningNumber', 'oversActual', 'totalRuns', 'totalWickets', 
              'time_of_day', 'Ground Name', 'Batsman_Name', 'Batsman_Role', 
              'Batsman_Batting_Style', 'Batsman_Playing_Role']
target_outputs = ['pitchLine', 'pitchLength', 'Bowler_Bowling_Style']

df_good = df_copy[df_copy['target'] == 1]
X = df_good[input_cols]

for target in target_outputs:
    print(f"\n{'='*50}")
    print(f"Processing target: {target}")
    print(f"{'='*50}")
    
    y = df_good[target]
    
    class_counts = y.value_counts()
    print(f"Original class distribution for '{target}':")
    print(class_counts)
    
    valid_classes = class_counts[class_counts > 1].index
    dropped_classes = class_counts[class_counts <= 1]
    
    if not dropped_classes.empty:
        print(f"\nDropping {len(dropped_classes)} class(es) with insufficient samples:")
        print(dropped_classes)
    
    mask = y.isin(valid_classes)
    X_filtered = X[mask]
    y_filtered = y[mask]
    
    if y_filtered.nunique() < 2:
        print(f"Skipping target '{target}' due to insufficient class diversity.\n")
        continue
    
    le = LabelEncoder()
    y_encoded = le.fit_transform(y_filtered)
    
    X_train, X_test, y_train, y_test = train_test_split(
        X_filtered, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
    )
    
    print(f"\nTraining set has {len(X_train)} samples")
    print(f"Test set has {len(X_test)} samples")
    
    model = xgb.XGBClassifier(
        objective='multi:softmax',
        eval_metric='mlogloss',
        use_label_encoder=False,
        random_state=42
    )
    
    model.fit(X_train, y_train)
    
    y_train_pred = model.predict(X_train)
    print(f"\nClassification report for {target} (training):")
    print(classification_report(y_train, y_train_pred))
    
    y_pred = model.predict(X_test)
    print(f"Classification report for {target} (testing):")
    print(classification_report(y_test, y_pred))
    
    print("\nSample predictions (original labels):")
    original_class_mapping = {i: cls for i, cls in enumerate(le.classes_)}
    print("True vs Predicted (first 5 samples):")
    for i in range(min(5, len(y_test))):
        true_label = original_class_mapping[y_test[i]]
        pred_label = original_class_mapping[y_pred[i]]
        print(f"  True: {true_label}, Predicted: {pred_label}")
    
    feature_importance = model.feature_importances_
    sorted_idx = np.argsort(feature_importance)[::-1]
    print("\nTop 5 important features:")
    for i in range(min(5, len(input_cols))):
        print(f"  {input_cols[sorted_idx[i]]}: {feature_importance[sorted_idx[i]]:.4f}")

    model_filename = f"Models/xgb_model_{target}_avoid.pkl"
    with open(model_filename, 'wb') as f:
        pickle.dump(model, f)
    print(f"Model saved to {model_filename}\n")


Processing target: pitchLine
Original class distribution for 'pitchLine':
pitchLine
2    1881
1    1215
0     305
4     299
3      13
Name: count, dtype: int64

Training set has 2970 samples
Test set has 743 samples


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Classification report for pitchLine (training):
              precision    recall  f1-score   support

           0       1.00      0.93      0.97       244
           1       0.98      0.94      0.96       972
           2       0.95      0.99      0.97      1505
           3       1.00      1.00      1.00        10
           4       0.99      0.94      0.97       239

    accuracy                           0.96      2970
   macro avg       0.98      0.96      0.97      2970
weighted avg       0.97      0.96      0.96      2970

Classification report for pitchLine (testing):
              precision    recall  f1-score   support

           0       0.17      0.07      0.10        61
           1       0.36      0.28      0.31       243
           2       0.55      0.74      0.63       376
           3       0.00      0.00      0.00         3
           4       0.39      0.18      0.25        60

    accuracy                           0.48       743
   macro avg       0.30      0.25  

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Classification report for pitchLength (training):
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       754
           1       1.00      0.99      0.99       186
           2       0.95      0.99      0.97      1042
           3       0.99      0.94      0.96       314
           4       0.98      0.94      0.96       651
           5       1.00      1.00      1.00        23

    accuracy                           0.97      2970
   macro avg       0.98      0.97      0.98      2970
weighted avg       0.97      0.97      0.97      2970

Classification report for pitchLength (testing):
              precision    recall  f1-score   support

           0       0.30      0.29      0.29       189
           1       0.14      0.09      0.11        47
           2       0.36      0.48      0.41       260
           3       0.24      0.14      0.18        79
           4       0.27      0.23      0.25       163
           5       0.00      0.0

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Classification report for Bowler_Bowling_Style (training):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       126
           1       1.00      1.00      1.00       240
           2       1.00      1.00      1.00         9
           3       1.00      1.00      1.00         3
           4       1.00      1.00      1.00       115
           5       1.00      1.00      1.00        13
           6       0.99      1.00      1.00       179
           7       1.00      1.00      1.00        94
           8       0.99      0.99      0.99       488
           9       0.99      1.00      1.00       523
          10       1.00      1.00      1.00         3
          11       1.00      1.00      1.00       111
          12       1.00      1.00      1.00         9
          13       1.00      0.99      0.99       430
          14       1.00      1.00      1.00       250
          15       1.00      1.00      1.00        26
          16       1.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
