In [1]:
import pickle

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import BatchNormalization, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import set_random_seed

# **Scraping Data**

In [2]:
# Batting "most runs" records page for the 2023 World Cup (per the URL path).
url = 'https://www.espncricinfo.com/records/tournament/batting-most-runs-career/icc-cricket-world-cup-2023-24-15338'
# pd.read_html parses every <table> on the page into a list of DataFrames.
all_tables = pd.read_html(url)
# Only the first table on the page is used.
# NOTE(review): assumes the batting table is always first — confirm if the page layout changes.
current_wc_df = all_tables[0]
print(current_wc_df)

                     Player       Span  Mat  Inns  NO Runs    HS    Ave   BF  \
0             V Kohli (IND)  2023-2023    9     9   3  594  103*  99.00  671   
1            Q de Kock (SA)  2023-2023    9     9   0  591   174  65.66  541   
2           R Ravindra (NZ)  2023-2023    9     9   1  565  123*  70.62  521   
3           RG Sharma (IND)  2023-2023    9     9   0  503   131  55.88  414   
4           DA Warner (AUS)  2023-2023    9     9   0  499   163  55.44  473   
..                      ...        ...  ...   ...  ..  ...   ...    ...  ...   
140      JR Hazlewood (AUS)  2023-2023    9     5   3    4     2   2.00    5   
141  Fazalhaq Farooqi (AFG)  2023-2023    6     3   2    2    2*   2.00    9   
142        LH Ferguson (NZ)  2023-2023    6     2   1    1     1   1.00    6   
143          AT Carey (AUS)  2023-2023    1     1   0    -     0   0.00    2   
144         Usama Mir (PAK)  2023-2023    4     1   0    -     0   0.00    3   

         SR 100 50  0  4s  6s  
0     8

In [3]:
# Career "most runs" records page for the World Cup trophy (per the URL path).
# This rebinds `url` and `all_tables` from the previous scraping cell.
url = 'https://www.espncricinfo.com/records/trophy/batting-most-runs-career/world-cup-12'
# pd.read_html parses every <table> on the page into a list of DataFrames.
all_tables = pd.read_html(url)
# Only the first table on the page is used.
# NOTE(review): assumes the batting table is always first — confirm if the page layout changes.
past_wc_df = all_tables[0]
print(past_wc_df)

                 Player       Span  Mat  Inns  NO  Runs    HS    Ave    BF  \
0    SR Tendulkar (IND)  1992-2011   45    44   4  2278   152  56.95  2560   
1      RT Ponting (AUS)  1996-2011   46    42   4  1743  140*  45.86  2180   
2         V Kohli (IND)  2011-2023   35    35   7  1624   107  58.00  1859   
3    KC Sangakkara (SL)  2003-2015   37    35   8  1532   124  56.74  1770   
4       DA Warner (AUS)  2015-2023   27    27   2  1491   178  59.64  1484   
..                  ...        ...  ...   ...  ..   ...   ...    ...   ...   
145  KR Rutherford (NZ)  1987-1992   14    12   2   416    75  41.60   587   
146       AH Jones (NZ)  1987-1992   13    13   2   416    78  37.81   673   
147     LRD Mendis (SL)  1975-1987   16    16   2   412    64  29.42   591   
148     PN Kirsten (SA)  1992-1992    8     8   2   410    90  68.33   616   
149  Fakhar Zaman (PAK)  2019-2023   12    12   1   406  126*  36.90   406   

         SR 100  50  0   4s  6s  
0     88.98   6  15  2  241  

In [4]:
# Wrap both scraped tables in DataFrame objects: df1 holds the 2023 tournament
# table and df2 the career table.
df1, df2 = pd.DataFrame(current_wc_df), pd.DataFrame(past_wc_df)

In [5]:
# Stack the 2023 table on top of the career table to form one training frame.
frames = [df1, df2]
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# both inputs keep their own 0-based labels, so every label below len(df1)
# appears twice and label-based lookups (e.g. runs_df.loc[0]) return two rows.
runs_df = pd.concat(frames, ignore_index=True)

In [6]:
# Display the combined frame to eyeball the concatenation result.
runs_df

Unnamed: 0,Player,Span,Mat,Inns,NO,Runs,HS,Ave,BF,SR,100,50,0,4s,6s
0,V Kohli (IND),2023-2023,9,9,3,594,103*,99.00,671,88.52,2,5,1,55,7
1,Q de Kock (SA),2023-2023,9,9,0,591,174,65.66,541,109.24,4,-,-,57,21
2,R Ravindra (NZ),2023-2023,9,9,1,565,123*,70.62,521,108.44,3,2,-,52,17
3,RG Sharma (IND),2023-2023,9,9,0,503,131,55.88,414,121.49,1,3,1,58,24
4,DA Warner (AUS),2023-2023,9,9,0,499,163,55.44,473,105.49,2,2,-,48,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,KR Rutherford (NZ),1987-1992,14,12,2,416,75,41.6,587,70.86,-,4,-,36,7
146,AH Jones (NZ),1987-1992,13,13,2,416,78,37.81,673,61.81,-,4,1,44,0
147,LRD Mendis (SL),1975-1987,16,16,2,412,64,29.42,591,69.71,-,3,1,37,4
148,PN Kirsten (SA),1992-1992,8,8,2,410,90,68.33,616,66.55,-,4,-,28,2


In [7]:
# Count missing values per column before cleaning.
runs_df.isna().sum()

Player    0
Span      0
Mat       0
Inns      0
NO        0
Runs      0
HS        0
Ave       0
BF        0
SR        0
100       0
50        0
0         0
4s        0
6s        0
dtype: int64

In [8]:
# Inspect column dtypes; columns reported as `object` still hold text and need
# converting before they can be used as numeric features.
runs_df.dtypes

Player     object
Span       object
Mat         int64
Inns        int64
NO          int64
Runs       object
HS         object
Ave        object
BF          int64
SR        float64
100        object
50         object
0          object
4s         object
6s         object
dtype: object

In [9]:
# Cells that consist solely of '-' are replaced with '0' across the whole frame.
runs_df = runs_df.replace('-', '0')

# Columns to normalise to numeric dtype.
columns_to_clean = ['Mat','Runs','HS','Ave','100','50','0','4s','6s']

for col in columns_to_clean:
    # Strip '*' / '+' markers (e.g. the '*' in '103*'), then parse the text as a
    # number; anything that still fails to parse becomes NaN.
    runs_df[col] = pd.to_numeric(
        runs_df[col].astype(str).str.replace(r'[*+]', '', regex=True),
        errors='coerce',
    )

# Per-column NaN counts reveal any value that the coercion could not parse.
print(runs_df.isna().sum())

Player    0
Span      0
Mat       0
Inns      0
NO        0
Runs      0
HS        0
Ave       0
BF        0
SR        0
100       0
50        0
0         0
4s        0
6s        0
dtype: int64


In [10]:
# Give the numerically-labelled stat columns descriptive names.
runs_df = runs_df.rename(
    {'100': 'Hundreds', '50': 'Fifties', '4s': 'Fours', '6s': 'Sixes'},
    axis='columns',
)

In [11]:
# Predictor columns used by both models below.
features = ['Inns','Ave', 'SR','Hundreds','Fifties','Fours','Sixes']

# X: predictor matrix; y: regression target (runs scored).
X, y = runs_df.loc[:, features], runs_df.loc[:, 'Runs']

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# **Linear Regression**

In [12]:
# Fit a linear-regression baseline on the training split. At this point
# X_train still holds the unscaled feature columns produced by train_test_split.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

In [13]:
# Feature rows for three 2023 batters, one record per player, with the keys in
# the same order as the `features` list used for training.
player_2023 = pd.DataFrame([
    {'Inns': 9, 'Ave': 99.00, 'SR': 88.52, 'Hundreds': 2, 'Fifties': 5, 'Fours': 55, 'Sixes': 7},
    {'Inns': 9, 'Ave': 65.66, 'SR': 109.24, 'Hundreds': 4, 'Fifties': 0, 'Fours': 57, 'Sixes': 21},
    {'Inns': 9, 'Ave': 70.62, 'SR': 108.44, 'Hundreds': 3, 'Fifties': 2, 'Fours': 52, 'Sixes': 17},
])

# The raw predictions stay in predicted_runs_2023; the new column stores them
# rounded up to whole runs.
predicted_runs_2023 = lr_model.predict(player_2023)
player_2023['Predicted_Runs_2023'] = np.ceil(predicted_runs_2023)
print(player_2023)

   Inns    Ave      SR  Hundreds  Fifties  Fours  Sixes  Predicted_Runs_2023
0     9  99.00   88.52         2        5     55      7                667.0
1     9  65.66  109.24         4        0     57     21                617.0
2     9  70.62  108.44         3        2     52     17                601.0


In [14]:
# Serialise the fitted linear-regression model to lr_model.pkl.
# The `with` block closes the file handle as soon as the dump finishes, so the
# bytes are flushed to disk deterministically instead of whenever the
# anonymous file object happens to be garbage-collected.
with open('lr_model.pkl', 'wb') as model_file:
    pickle.dump(lr_model, model_file)

# **Artificial Neural Network (ANN)**

In [15]:
# Learn the scaling statistics from the training split only, then apply the
# same transform to both splits.
# Note: X_train and X_test are rebound to the scaler's output, so re-running
# this cell without re-running the split would fit on already-scaled data.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [16]:
# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation and batch shuffling are repeatable from one run to the next.
# Without a seed every run trains a different network and the predictions
# further down change.
# NOTE(review): on GPU, fully deterministic results may need additional
# TensorFlow determinism settings — confirm if bit-exact runs are required.
set_random_seed(42)

# Feed-forward regressor: three ReLU hidden layers (256, 128, 128 units)
# followed by a single linear output unit for the predicted run total.
model = Sequential([
    Dense(256, input_shape=(X.shape[1],), activation='relu'),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(1, activation='linear'),
])

model.compile(optimizer='adam', loss='mean_squared_error')

# The held-out split is passed as validation data so that validation loss is
# reported after every epoch.
model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x7f2695226f50>

In [17]:
# Feature rows for three 2023 batters, one record per player, with the keys in
# the same order as the `features` list used for training.
player_2023 = pd.DataFrame([
    {'Inns': 9, 'Ave': 99.00, 'SR': 88.52, 'Hundreds': 2, 'Fifties': 5, 'Fours': 55, 'Sixes': 7},
    {'Inns': 9, 'Ave': 65.66, 'SR': 109.24, 'Hundreds': 4, 'Fifties': 0, 'Fours': 57, 'Sixes': 21},
    {'Inns': 9, 'Ave': 70.62, 'SR': 108.44, 'Hundreds': 3, 'Fifties': 2, 'Fours': 52, 'Sixes': 17},
])

# The network was trained on standardised inputs, so the new rows go through
# the same fitted scaler before prediction.
player_2023_scaled = sc.transform(player_2023)

# The raw predictions stay in predicted_runs_2023; the new column stores them
# rounded up to whole runs.
predicted_runs_2023 = model.predict(player_2023_scaled)
player_2023['Predicted_runs_2023'] = np.ceil(predicted_runs_2023)
print(player_2023)

   Inns    Ave      SR  Hundreds  Fifties  Fours  Sixes  Predicted_runs_2023
0     9  99.00   88.52         2        5     55      7                661.0
1     9  65.66  109.24         4        0     57     21                609.0
2     9  70.62  108.44         3        2     52     17                554.0


In [18]:
# Persist the fitted scaler so the same standardisation can be re-applied to
# new inputs before they are passed to the saved network.
with open('scaler_runs.pkl', mode='wb') as scaler_file:
    pickle.dump(sc, scaler_file)

In [19]:
# Write the trained Keras network to runs_NN_model.h5.
# NOTE(review): the recorded output shows a warning raised from
# saving_api.save_model(...) — presumably Keras' notice that HDF5 is a legacy
# format. Confirm, and update any code that loads this file before renaming it.
model.save('runs_NN_model.h5')

  saving_api.save_model(
