In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization
from tensorflow.keras.optimizers import Adam
import pickle

# **Scraping Data**

In [2]:
url = 'https://www.espncricinfo.com/records/tournament/bowling-most-wickets-career/icc-cricket-world-cup-2023-24-15338'
all_tables = pd.read_html(url)
current_wc_df = all_tables[0]
print(current_wc_df)

                       Player       Span  Mat  Inns  Balls  Overs Mdns  Runs  \
0               A Zampa (AUS)  2023-2023    9     9    474   79.0    1   416   
1           D Madushanka (SL)  2023-2023    9     9    470   78.2    4   525   
2              G Coetzee (SA)  2023-2023    7     7    327   54.3    1   349   
3   Shaheen Shah Afridi (PAK)  2023-2023    9     9    486   81.0    3   481   
4             JJ Bumrah (IND)  2023-2023    9     9    437   72.5    6   266   
..                        ...        ...  ...   ...    ...    ...  ...   ...   
77              V Kohli (IND)  2023-2023    9     2     21    3.3    -    15   
78             R Ashwin (IND)  2023-2023    1     1     60   10.0    1    34   
79        AL Phehlukwayo (SA)  2023-2023    1     1     42    7.0    -    36   
80       Saqib Zulfiqar (NED)  2023-2023    2     2     30    5.0    -    40   
81              JE Root (ENG)  2023-2023    9     2     61   10.1    -    67   

    Wkts   BBI    Ave  Econ     SR  4  

In [3]:
url = 'https://www.espncricinfo.com/records/trophy/bowling-most-wickets-career/world-cup-12'
all_tables = pd.read_html(url)
past_wc_df = all_tables[0]
print(past_wc_df)

                Player       Span  Mat  Inns  Balls  Overs  Mdns  Runs  Wkts  \
0     GD McGrath (AUS)  1996-2007   39    39   1955  325.5    42  1292    71   
1   M Muralidaran (SL)  1996-2011   40    39   2061  343.3    15  1335    68   
2       MA Starc (AUS)  2015-2023   26    26   1339  223.1     9  1165    59   
3      SL Malinga (SL)  2007-2019   29    28   1394  232.2    11  1281    56   
4    Wasim Akram (PAK)  1987-2003   38    36   1947  324.3    16  1311    55   
..                 ...        ...  ...   ...    ...    ...   ...   ...   ...   
95   Aaqib Javed (PAK)  1992-1996   15    15    746  124.2    13   517    18   
96   MO Odumbe (KENYA)  1996-2003   19    16    701  116.5     9   567    18   
97      GR Larsen (NZ)  1992-1999   19    19   1020  170.0    12   599    18   
98      CL Hooper (WI)  1987-2003   20    17    924  154.0     3   659    18   
99      CL Cairns (NZ)  1992-2003   28    23    880  146.4     9   755    18   

     BBI    Ave  Econ     SR  4  5  
0 

In [4]:
df1 = pd.DataFrame(current_wc_df)
df2 = pd.DataFrame(past_wc_df)

In [5]:
frames=[df1,df2]
wkts_df = pd.concat(frames)

In [6]:
wkts_df

Unnamed: 0,Player,Span,Mat,Inns,Balls,Overs,Mdns,Runs,Wkts,BBI,Ave,Econ,SR,4,5
0,A Zampa (AUS),2023-2023,9,9,474,79.0,1,416,22,4/8,18.90,5.26,21.54,3,-
1,D Madushanka (SL),2023-2023,9,9,470,78.2,4,525,21,5/80,25.00,6.70,22.38,1,1
2,G Coetzee (SA),2023-2023,7,7,327,54.3,1,349,18,4/44,19.38,6.40,18.16,1,-
3,Shaheen Shah Afridi (PAK),2023-2023,9,9,486,81.0,3,481,18,5/54,26.72,5.93,27.00,-,1
4,JJ Bumrah (IND),2023-2023,9,9,437,72.5,6,266,17,4/39,15.64,3.65,25.70,1,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Aaqib Javed (PAK),1992-1996,15,15,746,124.2,13,517,18,3/21,28.72,4.15,41.44,-,-
96,MO Odumbe (KENYA),1996-2003,19,16,701,116.5,9,567,18,4/38,31.50,4.85,38.94,1,-
97,GR Larsen (NZ),1992-1999,19,19,1020,170.0,12,599,18,3/16,33.27,3.52,56.66,-,-
98,CL Hooper (WI),1987-2003,20,17,924,154.0,3,659,18,3/42,36.61,4.27,51.33,-,-


In [7]:
wkts_df.isna().sum()

Player    0
Span      0
Mat       0
Inns      0
Balls     0
Overs     0
Mdns      0
Runs      0
Wkts      0
BBI       0
Ave       0
Econ      0
SR        0
4         0
5         0
dtype: int64

In [8]:
wkts_df.dtypes

Player     object
Span       object
Mat         int64
Inns        int64
Balls       int64
Overs     float64
Mdns       object
Runs        int64
Wkts        int64
BBI        object
Ave       float64
Econ      float64
SR        float64
4          object
5          object
dtype: object

In [9]:
wkts_df = wkts_df.replace('-', '0')

columns_to_clean = ['Mdns', 'BBI', '4', '5']

for col in columns_to_clean:
    wkts_df[col] = wkts_df[col].astype(str).str.replace(r'[*+]', '', regex=True)
    wkts_df[col] = pd.to_numeric(wkts_df[col], errors='coerce')

print(wkts_df.isna().sum())

Player      0
Span        0
Mat         0
Inns        0
Balls       0
Overs       0
Mdns        0
Runs        0
Wkts        0
BBI       182
Ave         0
Econ        0
SR          0
4           0
5           0
dtype: int64


In [10]:
wkts_df = wkts_df.rename(columns={'4': 'Four_wickets', '5': 'Five_wickets'})

In [11]:
features = ['Inns','Overs','Mdns','Econ','Four_wickets','Five_wickets']

X = wkts_df[features]
y = wkts_df['Wkts']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# **Linear Regression**

In [12]:
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

In [13]:
player_2023 = pd.DataFrame({
    'Inns' : [9, 9, 7, 9, 9],
    'Overs': [79.0, 78.2, 54.3, 81.0, 72.5],
    'Mdns': [1, 4, 1, 3, 6],
    'Econ': [5.26, 6.70, 6.40, 5.93, 3.65],
    'Four_wickets': [3, 1, 1, 0, 1],
    'Five_wickets': [0, 1, 0, 1, 0],
})

predicted_wkts_2023 = lr_model.predict(player_2023)
player_2023['Predicted_wkts_2023'] = np.ceil(predicted_wkts_2023)
print(player_2023)

   Inns  Overs  Mdns  Econ  Four_wickets  Five_wickets  Predicted_wkts_2023
0     9   79.0     1  5.26             3             0                 23.0
1     9   78.2     4  6.70             1             1                 20.0
2     7   54.3     1  6.40             1             0                 13.0
3     9   81.0     3  5.93             0             1                 17.0
4     9   72.5     6  3.65             1             0                 15.0


In [14]:
pickle.dump(lr_model, open('LR_model_wkts.pkl', 'wb'))

# **ANN**

In [15]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [16]:
model = Sequential()

model.add(Dense(units=64, activation='relu', input_shape=(len(features),)))
model.add(BatchNormalization())

model.add(Dense(units=128, activation='relu'))
model.add(BatchNormalization())

model.add(Dense(units=64, activation='relu'))
model.add(BatchNormalization())

model.add(Dense(units=1, activation='linear'))

adam_custom = Adam(learning_rate=0.001)

model.compile(optimizer=adam_custom, loss='mean_squared_error')

model.fit(X_train, y_train, batch_size=64, epochs=200, validation_data=(X_test, y_test))

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.src.callbacks.History at 0x78513043a2f0>

In [17]:
player_2023 = pd.DataFrame({
    'Inns' : [9, 9, 7, 9, 9],
    'Overs': [79.0, 78.2, 54.3, 81.0, 72.5],
    'Mdns': [1, 4, 1, 3, 6],
    'Econ': [5.26, 6.70, 6.40, 5.93, 3.65],
    'Four_wickets': [3, 1, 1, 0, 1],
    'Five_wickets': [0, 1, 0, 1, 0],
})

player_2023_scaled = sc.transform(player_2023)

predicted_wkts_2023 = model.predict(player_2023_scaled)
player_2023['Predicted_wkts_2023'] = np.ceil(predicted_wkts_2023)
print(player_2023)

   Inns  Overs  Mdns  Econ  Four_wickets  Five_wickets  Predicted_wkts_2023
0     9   79.0     1  5.26             3             0                 22.0
1     9   78.2     4  6.70             1             1                 21.0
2     7   54.3     1  6.40             1             0                 16.0
3     9   81.0     3  5.93             0             1                 19.0
4     9   72.5     6  3.65             1             0                 18.0


In [18]:
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(sc, scaler_file)

In [19]:
model.save('wkts_NN_model.h5')

  saving_api.save_model(
