In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn import metrics

In [2]:
df = pd.read_csv('deliveries.csv')

In [3]:
df.shape

(17380, 22)

In [4]:
df.head()

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
0,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.1,England,New Zealand,JM Bairstow,DJ Malan,...,0,,,,,,,,,
1,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.2,England,New Zealand,JM Bairstow,DJ Malan,...,0,,,,,,,,,
2,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.3,England,New Zealand,JM Bairstow,DJ Malan,...,0,,,,,,,,,
3,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.4,England,New Zealand,DJ Malan,JM Bairstow,...,0,,,,,,,,,
4,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.5,England,New Zealand,JM Bairstow,DJ Malan,...,0,,,,,,,,,


In [5]:
df.columns

Index(['match_id', 'season', 'start_date', 'venue', 'innings', 'ball',
       'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler',
       'runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes',
       'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type',
       'other_player_dismissed'],
      dtype='object')

In [6]:
df.dtypes

match_id                    int64
season                     object
start_date                 object
venue                      object
innings                     int64
ball                      float64
batting_team               object
bowling_team               object
striker                    object
non_striker                object
bowler                     object
runs_off_bat                int64
extras                      int64
wides                     float64
noballs                   float64
byes                      float64
legbyes                   float64
penalty                   float64
wicket_type                object
player_dismissed           object
other_wicket_type         float64
other_player_dismissed    float64
dtype: object

In [7]:
selected_features = ['match_id', 'venue', 'innings', 'batting_team', 'bowling_team', 
                     'ball', 'runs_off_bat', 'wides', 'noballs', 'byes', 'legbyes', 'extras']

In [8]:
sel_df = df[selected_features]

In [9]:
sel_df[["extras", "wides", "noballs"]] = sel_df[["extras", "wides", "noballs"]].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sel_df[["extras", "wides", "noballs"]] = sel_df[["extras", "wides", "noballs"]].fillna(0)


In [10]:
sel_df["runs_on_ball"] = sel_df["runs_off_bat"] + sel_df["extras"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sel_df["runs_on_ball"] = sel_df["runs_off_bat"] + sel_df["extras"]


In [11]:
runs_per_inning = sel_df.groupby(['match_id', 'innings'])['runs_on_ball'].sum().reset_index(name="runs_per_inning")
sel_df = sel_df.merge(runs_per_inning, on=['match_id', 'innings'], how='left')

In [12]:
total_balls = sel_df.groupby(['match_id', 'innings'])['ball'].size().reset_index(name ='total_balls')
wide_balls = sel_df.groupby(['match_id', 'innings'])['wides'].apply(lambda x: (x != 0).sum()).reset_index(name="total_wides")
no_balls = sel_df.groupby(['match_id', 'innings'])['noballs'].apply(lambda x: (x != 0).sum()).reset_index(name='total_noballs')

In [13]:
sel_df = sel_df.merge(total_balls, on=['match_id', 'innings'], how='left')
sel_df = sel_df.merge(wide_balls, on=['match_id', 'innings'], how='left')
sel_df = sel_df.merge(no_balls, on=['match_id', 'innings'], how='left')

In [14]:
sel_df["Overs_Played"] = (sel_df["total_balls"] - sel_df["total_wides"] - sel_df["total_noballs"])/6

In [15]:
sel_df.head()

Unnamed: 0,match_id,venue,innings,batting_team,bowling_team,ball,runs_off_bat,wides,noballs,byes,legbyes,extras,runs_on_ball,runs_per_inning,total_balls,total_wides,total_noballs,Overs_Played
0,1,"Narendra Modi Stadium, Ahmedabad",1,England,New Zealand,0.1,0,0.0,0.0,,,0,0,282,304,4,0,50.0
1,1,"Narendra Modi Stadium, Ahmedabad",1,England,New Zealand,0.2,6,0.0,0.0,,,0,6,282,304,4,0,50.0
2,1,"Narendra Modi Stadium, Ahmedabad",1,England,New Zealand,0.3,1,0.0,0.0,,,0,1,282,304,4,0,50.0
3,1,"Narendra Modi Stadium, Ahmedabad",1,England,New Zealand,0.4,1,0.0,0.0,,,0,1,282,304,4,0,50.0
4,1,"Narendra Modi Stadium, Ahmedabad",1,England,New Zealand,0.5,4,0.0,0.0,,,0,4,282,304,4,0,50.0


In [16]:
drop_cols=['match_id','ball','runs_off_bat','wides','noballs','byes','legbyes','extras','runs_on_ball','total_balls','total_wides','total_noballs']

sel_df = sel_df.drop(drop_cols,axis='columns')
sel_df.shape

(17380, 6)

In [17]:
sel_df.head()

Unnamed: 0,venue,innings,batting_team,bowling_team,runs_per_inning,Overs_Played
0,"Narendra Modi Stadium, Ahmedabad",1,England,New Zealand,282,50.0
1,"Narendra Modi Stadium, Ahmedabad",1,England,New Zealand,282,50.0
2,"Narendra Modi Stadium, Ahmedabad",1,England,New Zealand,282,50.0
3,"Narendra Modi Stadium, Ahmedabad",1,England,New Zealand,282,50.0
4,"Narendra Modi Stadium, Ahmedabad",1,England,New Zealand,282,50.0


In [18]:
sel_df = sel_df.drop_duplicates(keep='first')

In [19]:
sel_df.shape

(64, 6)

In [20]:
sel_df.head()

Unnamed: 0,venue,innings,batting_team,bowling_team,runs_per_inning,Overs_Played
0,"Narendra Modi Stadium, Ahmedabad",1,England,New Zealand,282,50.0
304,"Narendra Modi Stadium, Ahmedabad",2,New Zealand,England,283,36.333333
525,"Rajiv Gandhi International Stadium, Uppal, Hyd...",1,Pakistan,Netherlands,286,48.833333
827,"Rajiv Gandhi International Stadium, Uppal, Hyd...",2,Netherlands,Pakistan,205,41.0
1082,"Himachal Pradesh Cricket Association Stadium, ...",1,Afghanistan,Bangladesh,156,37.333333


In [21]:
sel_df = sel_df.drop(['venue'], axis=1)

In [22]:
sel_df = sel_df.reset_index(drop=True)

In [23]:
sel_df.head(64)

Unnamed: 0,innings,batting_team,bowling_team,runs_per_inning,Overs_Played
0,1,England,New Zealand,282,50.000000
1,2,New Zealand,England,283,36.333333
2,1,Pakistan,Netherlands,286,48.833333
3,2,Netherlands,Pakistan,205,41.000000
4,1,Afghanistan,Bangladesh,156,37.333333
...,...,...,...,...,...
59,2,Afghanistan,Sri Lanka,242,45.333333
60,1,Bangladesh,Pakistan,204,45.166667
61,2,Pakistan,Bangladesh,205,32.500000
62,1,South Africa,New Zealand,357,50.000000


Analysis of Second Inning's Score

In [24]:
df_2 = sel_df[sel_df['innings'] == 2]

In [25]:
df_2 = df_2.reset_index(drop = True)

In [26]:
df_2.head(32)

Unnamed: 0,innings,batting_team,bowling_team,runs_per_inning,Overs_Played
0,2,New Zealand,England,283,36.333333
1,2,Netherlands,Pakistan,205,41.0
2,2,Bangladesh,Afghanistan,158,34.666667
3,2,Sri Lanka,South Africa,326,44.833333
4,2,India,Australia,201,41.333333
5,2,Netherlands,New Zealand,223,46.5
6,2,Bangladesh,England,227,48.333333
7,2,Pakistan,Sri Lanka,345,48.333333
8,2,India,Afghanistan,273,35.0
9,2,New Zealand,Bangladesh,248,42.833333


In [27]:
df_2.shape

(32, 5)

In [28]:
df_2.describe()

Unnamed: 0,innings,runs_per_inning,Overs_Played
count,32.0,32.0,32.0
mean,2.0,225.46875,39.979167
std,0.0,66.14158,7.837708
min,2.0,90.0,21.0
25%,2.0,175.25,34.958333
50%,2.0,219.0,41.416667
75%,2.0,271.5,46.541667
max,2.0,383.0,50.0


In [29]:
df_2.columns

Index(['innings', 'batting_team', 'bowling_team', 'runs_per_inning',
       'Overs_Played'],
      dtype='object')

In [30]:
df_2.isnull().sum()

innings            0
batting_team       0
bowling_team       0
runs_per_inning    0
Overs_Played       0
dtype: int64

Label Encoding

In [31]:
le = preprocessing.LabelEncoder()

In [32]:
df_2['batting_team'] = le.fit_transform(df_2['batting_team'])

In [33]:
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(label_mapping)

{'Afghanistan': 0, 'Australia': 1, 'Bangladesh': 2, 'England': 3, 'India': 4, 'Netherlands': 5, 'New Zealand': 6, 'Pakistan': 7, 'South Africa': 8, 'Sri Lanka': 9}


In [34]:
df_2['bowling_team'] = le.fit_transform(df_2['bowling_team'])

In [35]:
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(label_mapping)

{'Afghanistan': 0, 'Australia': 1, 'Bangladesh': 2, 'England': 3, 'India': 4, 'Netherlands': 5, 'New Zealand': 6, 'Pakistan': 7, 'South Africa': 8, 'Sri Lanka': 9}


Model Training

In [36]:
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor

In [37]:
X = df_2[['innings', 'batting_team', 'bowling_team', 'Overs_Played']]
y = df_2['runs_per_inning']

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.20, random_state=42, shuffle=True)

In [39]:
standard_scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = standard_scaler.transform(X_train)
X_test_scaled = standard_scaler.transform(X_test)

In [40]:
X_test.head()

Unnamed: 0,innings,batting_team,bowling_team,Overs_Played
29,2,0,9,45.333333
15,2,4,2,41.5
24,2,9,3,25.666667
17,2,9,5,48.333333
8,2,4,0,35.0


In [41]:
model_dict = {
    'LinearRegression': {"model": LinearRegression(), "params": {}},
    'RandomForestRegressor': {"model": RandomForestRegressor(random_state=42),
                     "params": {'n_estimators': list(range(50, 150, 5)), 'max_depth': list(range(1, 10, 2))}},
    'XGBRegressor': {"model": XGBRegressor(), "params": {'n_estimators': list(range(10, 800, 100)), 'learning_rate': [0.001, 0.01, 0.1]},
    'PolynomialFeatures': {"model": make_pipeline(PolynomialFeatures(), LinearRegression()),
                      "params": {'polynomialfeatures__degree': [2,3]}}}
}

In [48]:
def eval_models():
    model_results = pd.DataFrame()
    model_results['Train_RMSE'] = None
    model_results['Test_RMSE'] = None
    model_results['Train_MAE'] = None
    model_results['Test_MAE'] = None
    model_results['best_params'] = None
    best_test_score = math.inf

    for model_name, reg_model in model_dict.items():
        classifier = GridSearchCV(reg_model['model'], reg_model['params'], n_jobs=20, verbose=0)
        classifier.fit(X_train_scaled, y_train)
        best_model = classifier.best_estimator_

        y_train_predicted = best_model.predict(X_train_scaled)
        train_rmse = np.sqrt(mean_squared_error(y_train, y_train_predicted))
        train_mae = mean_absolute_error(y_train, y_train_predicted)

        print(model_name, train_rmse, classifier.best_params_)

        y_predicted = best_model.predict(X_test_scaled)
        test_rmse = np.sqrt(mean_squared_error(y_test, y_predicted))
        test_mae = mean_absolute_error(y_test, y_predicted)

        if test_rmse < best_test_score:
            best_test_score = test_rmse
            best_reg_model = best_model

        model_results.loc[model_name, ['Train_RMSE', 'Test_RMSE', 'Train_MAE', 'Test_MAE', 'best_params']] = [train_rmse, test_rmse, train_mae, test_mae, classifier.best_params_]

    print("Best model: ", best_reg_model)
    y_predicted = best_reg_model.predict(X_test_scaled)

    return model_results,best_reg_model

In [49]:
model_results,best_reg_model = eval_models()
model_results



LinearRegression 43.079133005414356 {}
RandomForestRegressor 42.04929313727326 {'max_depth': 1, 'n_estimators': 65}


  if is_sparse(data):


XGBRegressor 44.04501653175449 {'learning_rate': 0.001, 'n_estimators': 710}
Best model:  XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.001, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=710, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)


Unnamed: 0,Train_RMSE,Test_RMSE,Train_MAE,Test_MAE,best_params
LinearRegression,43.079133,43.699687,36.609273,33.647848,{}
RandomForestRegressor,42.049293,49.4698,33.023218,40.740021,"{'max_depth': 1, 'n_estimators': 65}"
XGBRegressor,44.045017,40.154655,33.461408,30.051189,"{'learning_rate': 0.001, 'n_estimators': 710}"


In [51]:
print(best_reg_model)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.001, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=710, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)


In [52]:
y_predicted = best_reg_model.predict(X_test_scaled)

In [53]:
print(y_predicted)

[225.65231 216.35042 181.79445 271.1229  184.165   240.59685 181.79445]


Pickle File

In [47]:
### Create a Pickle file using serialization
import pickle
pickle_out = open("score_2.pkl","wb")
pickle.dump(XGBRegressor, pickle_out)
#Serialization
pickle_out.close()