In [213]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import re

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')


## Loading the dataset

1. Batting: `batting_data.csv`
2. Bowling: `bowling_data.csv`
3. Player Name and ID Map: `player_team_name_id.csv`

In [214]:
# Read the batting records CSV into a DataFrame and display it.
batting_df = pd.read_csv('final_data/batting_data.csv')
batting_df

Unnamed: 0,player_id,Runs,Mins,BF,4s,6s,SR,Pos,Dismissal,Inns,Opposition,Ground,Start,odi_number
0,440970,0,-,3,0,0,0.00,3,caught,2,Kenya,Sharjah,2 Oct 2013,ODI # 3417
1,440970,13*,-,19,0,0,68.42,5,not out,2,Kenya,Sharjah,4 Oct 2013,ODI # 3418
2,440970,38*,-,67,4,0,56.71,4,not out,2,Hong Kong,Kuala Lumpur,1 May 2014,ODI # 3487
3,440970,3,-,7,0,0,42.85,4,lbw,1,U.A.E.,Kuala Lumpur,2 May 2014,ODI # 3488
4,440970,15,21,22,2,0,68.18,4,caught,1,Zimbabwe,Bulawayo,18 Jul 2014,ODI # 3503
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9326,793007,4*,4,2,1,0,200.00,11,not out,2,South Africa,Delhi,7 Oct 2023,ODI # 4661
9327,793007,DNB,-,-,-,-,-,-,-,1,Pakistan,Hyderabad,10 Oct 2023,ODI # 4665
9328,793007,0*,12,6,0,0,0.00,11,not out,1,Australia,Lucknow,16 Oct 2023,ODI # 4671
9329,793007,DNB,-,-,-,-,-,-,-,2,Netherlands,Lucknow,21 Oct 2023,ODI # 4676


In [215]:
# Read the bowling records CSV into a DataFrame and display it.
bowling_df = pd.read_csv('final_data/bowling_data.csv')
bowling_df

Unnamed: 0,player_id,overs,maidens,runs,wickets,economy,position,innings,opposition,ground,start_date,odi_number
0,440970,DNB,-,-,-,-,-,1,Kenya,Sharjah,2 Oct 2013,ODI # 3417
1,440970,DNB,-,-,-,-,-,1,Kenya,Sharjah,4 Oct 2013,ODI # 3418
2,440970,DNB,-,-,-,-,-,1,Hong Kong,Kuala Lumpur,1 May 2014,ODI # 3487
3,440970,2.0,0,17,0,8.50,6,2,U.A.E.,Kuala Lumpur,2 May 2014,ODI # 3488
4,440970,DNB,-,-,-,-,-,2,Zimbabwe,Bulawayo,18 Jul 2014,ODI # 3503
...,...,...,...,...,...,...,...,...,...,...,...,...
9326,793007,10.0,0,86,2,8.60,2,1,South Africa,Delhi,7 Oct 2023,ODI # 4661
9327,793007,9.2,0,60,2,6.42,2,2,Pakistan,Hyderabad,10 Oct 2023,ODI # 4665
9328,793007,9.0,2,38,3,4.22,2,2,Australia,Lucknow,16 Oct 2023,ODI # 4671
9329,793007,9.4,1,49,4,5.06,1,1,Netherlands,Lucknow,21 Oct 2023,ODI # 4676


In [216]:
# Read the team / player-name / player-id lookup table and display it.
player_data_df = pd.read_csv('final_data/player_team_name_id.csv')
player_data_df

Unnamed: 0,team_name,player_name,player_id
0,Afghanistan,Hashmatullah Shahidi,440970
1,Afghanistan,Rahmanullah Gurbaz,974087
2,Afghanistan,Ibrahim Zadran,921509
3,Afghanistan,Reyaz Hassan,1061090
4,Afghanistan,Rahmat Shah Zurmati,533956
...,...,...,...
145,Sri Lanka,Dunith Wellalage,1152427
146,Sri Lanka,Kasun Rajitha,499594
147,Sri Lanka,Matheesha Pathirana,1194795
148,Sri Lanka,Lahiru Kumara,784375


In [217]:
# # Collect Team Names

# team_names = player_data_df['team_name'].unique().tolist()
# team_names += bowling_df['opposition'].unique().tolist()
# team_names += batting_df['Opposition'].unique().tolist()
# team_names = list(set(team_names))
# team_names

# # Encode Team Names

# te = LabelEncoder()
# te.fit(team_names)

# batting_df['Opposition'] = te.transform(batting_df['Opposition'])
# bowling_df['opposition'] = te.transform(bowling_df['opposition'])
# player_data_df['team_name'] = te.transform(player_data_df['team_name'])


## Data Preprocessing

### Batting Data

In [218]:
# Attach the lookup columns (team_name, player_name) to every batting row.
# Left join on player_id: batting rows are kept even if the id is missing from the lookup.
batting_df = pd.merge(batting_df, player_data_df, how='left', on='player_id')
batting_df

Unnamed: 0,player_id,Runs,Mins,BF,4s,6s,SR,Pos,Dismissal,Inns,Opposition,Ground,Start,odi_number,team_name,player_name
0,440970,0,-,3,0,0,0.00,3,caught,2,Kenya,Sharjah,2 Oct 2013,ODI # 3417,Afghanistan,Hashmatullah Shahidi
1,440970,13*,-,19,0,0,68.42,5,not out,2,Kenya,Sharjah,4 Oct 2013,ODI # 3418,Afghanistan,Hashmatullah Shahidi
2,440970,38*,-,67,4,0,56.71,4,not out,2,Hong Kong,Kuala Lumpur,1 May 2014,ODI # 3487,Afghanistan,Hashmatullah Shahidi
3,440970,3,-,7,0,0,42.85,4,lbw,1,U.A.E.,Kuala Lumpur,2 May 2014,ODI # 3488,Afghanistan,Hashmatullah Shahidi
4,440970,15,21,22,2,0,68.18,4,caught,1,Zimbabwe,Bulawayo,18 Jul 2014,ODI # 3503,Afghanistan,Hashmatullah Shahidi
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9326,793007,4*,4,2,1,0,200.00,11,not out,2,South Africa,Delhi,7 Oct 2023,ODI # 4661,Sri Lanka,Dilshan Madushanka
9327,793007,DNB,-,-,-,-,-,-,-,1,Pakistan,Hyderabad,10 Oct 2023,ODI # 4665,Sri Lanka,Dilshan Madushanka
9328,793007,0*,12,6,0,0,0.00,11,not out,1,Australia,Lucknow,16 Oct 2023,ODI # 4671,Sri Lanka,Dilshan Madushanka
9329,793007,DNB,-,-,-,-,-,-,-,2,Netherlands,Lucknow,21 Oct 2023,ODI # 4676,Sri Lanka,Dilshan Madushanka


In [219]:
batting_df['player_id'].value_counts()
# Valid

253802     286
56029      261
34102      256
56143      244
56025      225
          ... 
721041       4
819507       3
1139520      2
35281        2
379887       2
Name: player_id, Length: 150, dtype: int64

#### Removing unwanted columns

In [220]:
batting_df

Unnamed: 0,player_id,Runs,Mins,BF,4s,6s,SR,Pos,Dismissal,Inns,Opposition,Ground,Start,odi_number,team_name,player_name
0,440970,0,-,3,0,0,0.00,3,caught,2,Kenya,Sharjah,2 Oct 2013,ODI # 3417,Afghanistan,Hashmatullah Shahidi
1,440970,13*,-,19,0,0,68.42,5,not out,2,Kenya,Sharjah,4 Oct 2013,ODI # 3418,Afghanistan,Hashmatullah Shahidi
2,440970,38*,-,67,4,0,56.71,4,not out,2,Hong Kong,Kuala Lumpur,1 May 2014,ODI # 3487,Afghanistan,Hashmatullah Shahidi
3,440970,3,-,7,0,0,42.85,4,lbw,1,U.A.E.,Kuala Lumpur,2 May 2014,ODI # 3488,Afghanistan,Hashmatullah Shahidi
4,440970,15,21,22,2,0,68.18,4,caught,1,Zimbabwe,Bulawayo,18 Jul 2014,ODI # 3503,Afghanistan,Hashmatullah Shahidi
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9326,793007,4*,4,2,1,0,200.00,11,not out,2,South Africa,Delhi,7 Oct 2023,ODI # 4661,Sri Lanka,Dilshan Madushanka
9327,793007,DNB,-,-,-,-,-,-,-,1,Pakistan,Hyderabad,10 Oct 2023,ODI # 4665,Sri Lanka,Dilshan Madushanka
9328,793007,0*,12,6,0,0,0.00,11,not out,1,Australia,Lucknow,16 Oct 2023,ODI # 4671,Sri Lanka,Dilshan Madushanka
9329,793007,DNB,-,-,-,-,-,-,-,2,Netherlands,Lucknow,21 Oct 2023,ODI # 4676,Sri Lanka,Dilshan Madushanka


In [221]:
# Discard the columns the batting preprocessing does not keep.
batting_df = batting_df.drop(columns=["Mins", "odi_number", "team_name"])

#### Runs

In [222]:
batting_df['Runs'].unique()

array(['0', '13*', '38*', '3', '15', '11', '31', '32', '72', '14', '21',
       '2*', '12', '23', '54', '2', '34*', '37', '58', '97*', '71', '9',
       '52', '59*', '47', '18', '4', '59', '8', '76', '82', '10', '73',
       '28', '5', '88', '1*', '38', '57', '9*', '22', '15*', '13', '51',
       '80', '48*', '127', '103', '7', '106*', '17', '53', '68', '145',
       '6', '151', '1', '65', '19', '121*', '106', '162', '98', '41*',
       '100', '75', '87', '50', '35', 'DNB', '34', '100*', '26', '16',
       '48', '30', '36', '78', '108*', 'TDNB', '44', '114', '43', '56',
       '69', '29', '33', '113', '62', '46', '24', '61', '103*', '70',
       '94', '55', '45', '77*', '83', '12*', '25', '89*', '81*', '8*',
       '67', '63*', '17*', '42', '20', '104*', '60*', '20*', '3*', '77',
       '7*', '6*', '23*', '60', '41', '46*', '14*', '116', '40', '49',
       '27*', '18*', '92', '64', '27', '50*', '25*', '5*', '86', '19*',
       '16*', '57*', '11*', '35*', '39*', '4*', '0*', '10*', '21*'

In [223]:
# We can notice *, DNB, TDNB, absent in runs

# Strip the trailing not-out marker: '23*' becomes '23'.
# - The pattern is a raw string because '\*' is an invalid escape sequence in a
#   normal string literal.
# - The result is assigned back to the column rather than using
#   replace(inplace=True) on batting_df['Runs']: that is chained assignment and
#   does not update the frame when pandas copy-on-write is enabled.
batting_df['Runs'] = batting_df['Runs'].replace(to_replace=r'\*', value='', regex=True)
batting_df['Runs'].unique()

array(['0', '13', '38', '3', '15', '11', '31', '32', '72', '14', '21',
       '2', '12', '23', '54', '34', '37', '58', '97', '71', '9', '52',
       '59', '47', '18', '4', '8', '76', '82', '10', '73', '28', '5',
       '88', '1', '57', '22', '51', '80', '48', '127', '103', '7', '106',
       '17', '53', '68', '145', '6', '151', '65', '19', '121', '162',
       '98', '41', '100', '75', '87', '50', '35', 'DNB', '26', '16', '30',
       '36', '78', '108', 'TDNB', '44', '114', '43', '56', '69', '29',
       '33', '113', '62', '46', '24', '61', '70', '94', '55', '45', '77',
       '83', '25', '89', '81', '67', '63', '42', '20', '104', '60', '116',
       '40', '49', '27', '92', '64', '86', '39', '117', '85', '99', '128',
       '96', '101', '152', '124', '102', '79', '66', '93', '95', '146',
       '74', '163', '178', '84', '122', '109', '173', '119', '156', '130',
       '179', '107', '166', '134', '105', 'absent', '126', '176', '136',
       '90', '110', '144', '125', '111', '112', 'sub',

In [224]:
# Get Non numeric values

batting_df[~batting_df['Runs'].str.isnumeric()]['Runs'].value_counts()

DNB       1830
TDNB       144
absent       5
sub          1
Name: Runs, dtype: int64

In [225]:
# Rows whose Runs value is one of these markers carry no score, so remove them.
non_batting_markers = ['DNB', 'TDNB', 'absent', 'sub']
rows_without_score = batting_df.index[batting_df['Runs'].isin(non_batting_markers)]
batting_df.drop(index=rows_without_score, inplace=True)
batting_df

Unnamed: 0,player_id,Runs,BF,4s,6s,SR,Pos,Dismissal,Inns,Opposition,Ground,Start,player_name
0,440970,0,3,0,0,0.00,3,caught,2,Kenya,Sharjah,2 Oct 2013,Hashmatullah Shahidi
1,440970,13,19,0,0,68.42,5,not out,2,Kenya,Sharjah,4 Oct 2013,Hashmatullah Shahidi
2,440970,38,67,4,0,56.71,4,not out,2,Hong Kong,Kuala Lumpur,1 May 2014,Hashmatullah Shahidi
3,440970,3,7,0,0,42.85,4,lbw,1,U.A.E.,Kuala Lumpur,2 May 2014,Hashmatullah Shahidi
4,440970,15,22,2,0,68.18,4,caught,1,Zimbabwe,Bulawayo,18 Jul 2014,Hashmatullah Shahidi
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9321,793007,4,2,1,0,200.00,11,not out,2,New Zealand,Auckland,25 Mar 2023,Dilshan Madushanka
9322,793007,1,5,0,0,20.00,11,not out,1,Netherlands,Bulawayo,30 Jun 2023,Dilshan Madushanka
9325,793007,0,0,0,0,-,11,not out,1,Netherlands,Harare,9 Jul 2023,Dilshan Madushanka
9326,793007,4,2,1,0,200.00,11,not out,2,South Africa,Delhi,7 Oct 2023,Dilshan Madushanka


In [226]:
batting_df[~batting_df['Runs'].str.isnumeric()]['Runs'].unique()

array([], dtype=object)

In [227]:
# Runs now holds only digit strings (the check in the previous cell came back empty),
# so the column can be cast to int.
batting_df = batting_df.astype({'Runs': int})
batting_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7351 entries, 0 to 9328
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   player_id    7351 non-null   int64 
 1   Runs         7351 non-null   int64 
 2   BF           7351 non-null   object
 3   4s           7351 non-null   object
 4   6s           7351 non-null   object
 5   SR           7351 non-null   object
 6   Pos          7351 non-null   object
 7   Dismissal    7351 non-null   object
 8   Inns         7351 non-null   object
 9   Opposition   7351 non-null   object
 10  Ground       7351 non-null   object
 11  Start        7351 non-null   object
 12  player_name  7351 non-null   object
dtypes: int64(2), object(11)
memory usage: 804.0+ KB


#### Data Type Conversion

In [228]:
# Balls faced: object -> int.
batting_df = batting_df.astype({'BF': int})

In [229]:
# Batting position: object -> int.
batting_df = batting_df.astype({'Pos': int})

In [230]:
batting_df['Dismissal'].value_counts()

caught            3730
not out           1395
bowled            1082
lbw                602
run out            375
stumped            145
retired notout      15
hit wicket           6
obstruct field       1
Name: Dismissal, dtype: int64

In [231]:
# NOTE(review): 'Pos' was already cast to int in an earlier cell, so this repeated
# cast changes nothing; the cell is redundant.
batting_df['Pos'] = batting_df['Pos'].astype(int)

In [232]:
# Replace rows with "-" in 4s and 6s with 0
# The result is assigned back to each column instead of calling
# replace(inplace=True) on batting_df['4s'] / batting_df['6s']: that form is
# chained assignment and leaves the frame unchanged when pandas copy-on-write
# is enabled.
batting_df['4s'] = batting_df['4s'].replace(to_replace='-', value=0)
batting_df['6s'] = batting_df['6s'].replace(to_replace='-', value=0)

In [233]:
# Boundary counts: object -> int, both columns in one cast.
batting_df = batting_df.astype({'4s': int, '6s': int})

In [234]:
batting_df['Ground'].value_counts()

Mirpur           467
Colombo (RPS)    321
Harare           237
Pallekele        187
The Oval         169
                ... 
Toronto            1
Jamshedpur         1
Gwalior            1
Lincoln            1
Ayr                1
Name: Ground, Length: 128, dtype: int64

In [235]:
# # Collect Ground Names

# ground_names = batting_df['Ground'].unique().tolist()
# ground_names += bowling_df['ground'].unique().tolist()
# ground_names = list(set(ground_names))
# ground_names

In [236]:
# ge = LabelEncoder()
# ge.fit(ground_names)
# batting_df['Ground'] = ge.transform(batting_df['Ground'])
# bowling_df['ground'] = ge.transform(bowling_df['ground'])

In [237]:
batting_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7351 entries, 0 to 9328
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   player_id    7351 non-null   int64 
 1   Runs         7351 non-null   int64 
 2   BF           7351 non-null   int64 
 3   4s           7351 non-null   int64 
 4   6s           7351 non-null   int64 
 5   SR           7351 non-null   object
 6   Pos          7351 non-null   int64 
 7   Dismissal    7351 non-null   object
 8   Inns         7351 non-null   object
 9   Opposition   7351 non-null   object
 10  Ground       7351 non-null   object
 11  Start        7351 non-null   object
 12  player_name  7351 non-null   object
dtypes: int64(6), object(7)
memory usage: 804.0+ KB


In [238]:
# Replace rows with "-" in SR with 0, then cast the column to float.
# The replacement is assigned back rather than done with replace(inplace=True)
# on batting_df['SR']: that form is chained assignment and leaves the frame
# unchanged when pandas copy-on-write is enabled, which would make the float
# cast fail on the remaining "-" strings.
batting_df['SR'] = batting_df['SR'].replace(to_replace='-', value=0).astype(float)

### Bowling Data

In [239]:
# Attach the lookup columns (team_name, player_name) to every bowling row.
# Left join on player_id: bowling rows are kept even if the id is missing from the lookup.
bowling_df = pd.merge(bowling_df, player_data_df, how='left', on='player_id')
bowling_df

Unnamed: 0,player_id,overs,maidens,runs,wickets,economy,position,innings,opposition,ground,start_date,odi_number,team_name,player_name
0,440970,DNB,-,-,-,-,-,1,Kenya,Sharjah,2 Oct 2013,ODI # 3417,Afghanistan,Hashmatullah Shahidi
1,440970,DNB,-,-,-,-,-,1,Kenya,Sharjah,4 Oct 2013,ODI # 3418,Afghanistan,Hashmatullah Shahidi
2,440970,DNB,-,-,-,-,-,1,Hong Kong,Kuala Lumpur,1 May 2014,ODI # 3487,Afghanistan,Hashmatullah Shahidi
3,440970,2.0,0,17,0,8.50,6,2,U.A.E.,Kuala Lumpur,2 May 2014,ODI # 3488,Afghanistan,Hashmatullah Shahidi
4,440970,DNB,-,-,-,-,-,2,Zimbabwe,Bulawayo,18 Jul 2014,ODI # 3503,Afghanistan,Hashmatullah Shahidi
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9326,793007,10.0,0,86,2,8.60,2,1,South Africa,Delhi,7 Oct 2023,ODI # 4661,Sri Lanka,Dilshan Madushanka
9327,793007,9.2,0,60,2,6.42,2,2,Pakistan,Hyderabad,10 Oct 2023,ODI # 4665,Sri Lanka,Dilshan Madushanka
9328,793007,9.0,2,38,3,4.22,2,2,Australia,Lucknow,16 Oct 2023,ODI # 4671,Sri Lanka,Dilshan Madushanka
9329,793007,9.4,1,49,4,5.06,1,1,Netherlands,Lucknow,21 Oct 2023,ODI # 4676,Sri Lanka,Dilshan Madushanka


#### Removing unwanted columns

In [240]:
# Discard the match identifier column.
bowling_df = bowling_df.drop(columns=["odi_number"])
bowling_df

Unnamed: 0,player_id,overs,maidens,runs,wickets,economy,position,innings,opposition,ground,start_date,team_name,player_name
0,440970,DNB,-,-,-,-,-,1,Kenya,Sharjah,2 Oct 2013,Afghanistan,Hashmatullah Shahidi
1,440970,DNB,-,-,-,-,-,1,Kenya,Sharjah,4 Oct 2013,Afghanistan,Hashmatullah Shahidi
2,440970,DNB,-,-,-,-,-,1,Hong Kong,Kuala Lumpur,1 May 2014,Afghanistan,Hashmatullah Shahidi
3,440970,2.0,0,17,0,8.50,6,2,U.A.E.,Kuala Lumpur,2 May 2014,Afghanistan,Hashmatullah Shahidi
4,440970,DNB,-,-,-,-,-,2,Zimbabwe,Bulawayo,18 Jul 2014,Afghanistan,Hashmatullah Shahidi
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9326,793007,10.0,0,86,2,8.60,2,1,South Africa,Delhi,7 Oct 2023,Sri Lanka,Dilshan Madushanka
9327,793007,9.2,0,60,2,6.42,2,2,Pakistan,Hyderabad,10 Oct 2023,Sri Lanka,Dilshan Madushanka
9328,793007,9.0,2,38,3,4.22,2,2,Australia,Lucknow,16 Oct 2023,Sri Lanka,Dilshan Madushanka
9329,793007,9.4,1,49,4,5.06,1,1,Netherlands,Lucknow,21 Oct 2023,Sri Lanka,Dilshan Madushanka


#### Overs

In [241]:
bowling_df['overs'].unique()

array(['DNB', '2.0', '1.0', 'TDNB', '3.0', '5.0', '1.5', '5.3', '6.0',
       '8.0', '4.0', '1.1', '9.0', '10.0', '9.5', '5.1', '7.0', '8.3',
       '9.4', '6.4', '5.2', '7.1', '4.1', '3.5', '8.2', '8.5', '8.1',
       '7.3', '9.2', '4.4', '4.2', '5.4', '9.3', '8.4', '7.2', '7.5',
       '4.5', '1.4', '3.2', '6.1', '6.2', '0.1', '6.5', '2.4', '0.5',
       '5.5', 'sub', '1.3', '3.1', '9.1', '6.3', '1.2', '0.2', '2.3',
       '2.5', '7.4', '0.4', '2.2', '0.3', '3.3', '4.3'], dtype=object)

In [242]:
# 'DNB', 'TDNB' and 'sub' appear in overs for rows without a bowling spell; remove those rows.
no_spell = bowling_df['overs'].isin(['DNB', 'TDNB', 'sub'])
bowling_df.drop(index=bowling_df.index[no_spell], inplace=True)

# NOTE(review): the fractional digit of the remaining values only ranges over 0-5
# (see the unique() output above), i.e. it looks like overs.balls notation; the
# float cast keeps that notation and does not convert it to decimal overs.
bowling_df['overs'] = bowling_df['overs'].astype(float)
bowling_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5171 entries, 3 to 9330
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   player_id    5171 non-null   int64  
 1   overs        5171 non-null   float64
 2   maidens      5171 non-null   object 
 3   runs         5171 non-null   object 
 4   wickets      5171 non-null   object 
 5   economy      5171 non-null   object 
 6   position     5171 non-null   object 
 7   innings      5171 non-null   object 
 8   opposition   5171 non-null   object 
 9   ground       5171 non-null   object 
 10  start_date   5171 non-null   object 
 11  team_name    5171 non-null   object 
 12  player_name  5171 non-null   object 
dtypes: float64(1), int64(1), object(11)
memory usage: 565.6+ KB


#### Runs

In [243]:
# Runs conceded: object -> int.
bowling_df = bowling_df.astype({'runs': int})
bowling_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5171 entries, 3 to 9330
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   player_id    5171 non-null   int64  
 1   overs        5171 non-null   float64
 2   maidens      5171 non-null   object 
 3   runs         5171 non-null   int64  
 4   wickets      5171 non-null   object 
 5   economy      5171 non-null   object 
 6   position     5171 non-null   object 
 7   innings      5171 non-null   object 
 8   opposition   5171 non-null   object 
 9   ground       5171 non-null   object 
 10  start_date   5171 non-null   object 
 11  team_name    5171 non-null   object 
 12  player_name  5171 non-null   object 
dtypes: float64(1), int64(2), object(10)
memory usage: 565.6+ KB


#### Wickets

In [244]:
# Wickets taken: object -> int.
bowling_df = bowling_df.astype({'wickets': int})
bowling_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5171 entries, 3 to 9330
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   player_id    5171 non-null   int64  
 1   overs        5171 non-null   float64
 2   maidens      5171 non-null   object 
 3   runs         5171 non-null   int64  
 4   wickets      5171 non-null   int64  
 5   economy      5171 non-null   object 
 6   position     5171 non-null   object 
 7   innings      5171 non-null   object 
 8   opposition   5171 non-null   object 
 9   ground       5171 non-null   object 
 10  start_date   5171 non-null   object 
 11  team_name    5171 non-null   object 
 12  player_name  5171 non-null   object 
dtypes: float64(1), int64(3), object(9)
memory usage: 565.6+ KB


In [245]:
# Economy rate: object -> float.
bowling_df = bowling_df.astype({'economy': float})
bowling_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5171 entries, 3 to 9330
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   player_id    5171 non-null   int64  
 1   overs        5171 non-null   float64
 2   maidens      5171 non-null   object 
 3   runs         5171 non-null   int64  
 4   wickets      5171 non-null   int64  
 5   economy      5171 non-null   float64
 6   position     5171 non-null   object 
 7   innings      5171 non-null   object 
 8   opposition   5171 non-null   object 
 9   ground       5171 non-null   object 
 10  start_date   5171 non-null   object 
 11  team_name    5171 non-null   object 
 12  player_name  5171 non-null   object 
dtypes: float64(2), int64(3), object(8)
memory usage: 565.6+ KB


#### Innings

In [246]:
# Innings number: object -> int.
bowling_df = bowling_df.astype({'innings': int})
bowling_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5171 entries, 3 to 9330
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   player_id    5171 non-null   int64  
 1   overs        5171 non-null   float64
 2   maidens      5171 non-null   object 
 3   runs         5171 non-null   int64  
 4   wickets      5171 non-null   int64  
 5   economy      5171 non-null   float64
 6   position     5171 non-null   object 
 7   innings      5171 non-null   int64  
 8   opposition   5171 non-null   object 
 9   ground       5171 non-null   object 
 10  start_date   5171 non-null   object 
 11  team_name    5171 non-null   object 
 12  player_name  5171 non-null   object 
dtypes: float64(2), int64(4), object(7)
memory usage: 565.6+ KB


In [247]:
# Row drops above left gaps in both indexes; renumber them from 0 and discard the old labels.
for cleaned_frame in (batting_df, bowling_df):
    cleaned_frame.reset_index(drop=True, inplace=True)

In [248]:
batting_df

Unnamed: 0,player_id,Runs,BF,4s,6s,SR,Pos,Dismissal,Inns,Opposition,Ground,Start,player_name
0,440970,0,3,0,0,0.00,3,caught,2,Kenya,Sharjah,2 Oct 2013,Hashmatullah Shahidi
1,440970,13,19,0,0,68.42,5,not out,2,Kenya,Sharjah,4 Oct 2013,Hashmatullah Shahidi
2,440970,38,67,4,0,56.71,4,not out,2,Hong Kong,Kuala Lumpur,1 May 2014,Hashmatullah Shahidi
3,440970,3,7,0,0,42.85,4,lbw,1,U.A.E.,Kuala Lumpur,2 May 2014,Hashmatullah Shahidi
4,440970,15,22,2,0,68.18,4,caught,1,Zimbabwe,Bulawayo,18 Jul 2014,Hashmatullah Shahidi
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7346,793007,4,2,1,0,200.00,11,not out,2,New Zealand,Auckland,25 Mar 2023,Dilshan Madushanka
7347,793007,1,5,0,0,20.00,11,not out,1,Netherlands,Bulawayo,30 Jun 2023,Dilshan Madushanka
7348,793007,0,0,0,0,0.00,11,not out,1,Netherlands,Harare,9 Jul 2023,Dilshan Madushanka
7349,793007,4,2,1,0,200.00,11,not out,2,South Africa,Delhi,7 Oct 2023,Dilshan Madushanka


In [249]:
def build_run_predict_dataset(player_id, batting_df, position):
    """Pair each innings of a player with the record built up in the innings before it.

    Only rows of ``batting_df`` with the given ``player_id`` and batting
    position (``Pos``) are used, in the order they appear in ``batting_df``.
    For every such innings the ``current_*`` columns summarise the *earlier*
    innings in that selection only, so the score of a row never contributes to
    its own features.

    Parameters
    ----------
    player_id : value compared against ``batting_df['player_id']``.
    batting_df : pandas.DataFrame
        Must contain the columns ``player_id``, ``player_name``, ``Start``,
        ``4s``, ``6s``, ``Dismissal``, ``Pos``, ``BF``, ``SR`` and ``Runs``
        (``Runs`` and ``SR`` numeric). The frame is not modified.
    position : value compared against ``batting_df['Pos']``.

    Returns
    -------
    pandas.DataFrame
        The remaining input columns (``Runs`` and whatever else the frame
        carries) followed by:

        * ``current_avg`` - total runs divided by number of innings
          (not-out innings are counted like any other innings)
        * ``current_avg_SR`` - mean of ``SR``
        * ``current_no_of_innings``
        * ``current_no_of_centuries`` - innings with ``Runs >= 100``
        * ``current_no_of_fifties`` - innings with ``50 <= Runs < 100``
        * ``current_no_of_zeros`` - innings with ``Runs == 0``

        The first selected innings has no history and is left out, so the
        index labels start at 1. Rows with a missing value in any column
        (including the columns that are dropped afterwards) are removed.
        ``Runs`` keeps the dtype it has in ``batting_df``; the ``current_*``
        columns are floats. The result is empty when fewer than two innings
        match.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Innings of this player at this position, renumbered 0..n-1.
    selected = (batting_df['player_id'] == player_id) & (batting_df['Pos'] == position)
    player_df = batting_df[selected].reset_index(drop=True)

    runs = player_df['Runs']
    innings_so_far = pd.Series(range(1, len(player_df) + 1), index=player_df.index)

    # Running career figures *including* each row, computed in one pass with
    # cumulative operations instead of re-aggregating the whole prefix per row.
    career_stats = pd.DataFrame({
        # fillna(0) makes a missing score add nothing to the running total.
        'current_avg': runs.fillna(0).cumsum() / innings_so_far,
        'current_avg_SR': player_df['SR'].expanding().mean(),
        'current_no_of_innings': innings_so_far,
        'current_no_of_centuries': (runs >= 100).cumsum(),
        'current_no_of_fifties': ((runs >= 50) & (runs < 100)).cumsum(),
        'current_no_of_zeros': (runs == 0).cumsum(),
    })

    # Move the figures down one row so each innings only sees what came before
    # it; the first innings gets NaN and is removed by dropna() below.
    career_stats = career_stats.shift(1)

    player_df = pd.concat([player_df, career_stats], axis=1)
    player_df = player_df.dropna()

    return player_df.drop(
        columns=['player_id', 'player_name', 'Start', '4s', '6s', 'Dismissal', 'Pos', 'BF', 'SR']
    )



In [267]:
# Example: Virat Kohli (player_id 253802) batting at position 3
player_df = build_run_predict_dataset(player_id=253802, batting_df=batting_df, position=3)
player_df

Unnamed: 0,Runs,Inns,Opposition,Ground,current_avg,current_avg_SR,current_no_of_innings,current_no_of_centuries,current_no_of_fifties,current_no_of_zeros
1,9.0,1,Sri Lanka,Mirpur,10.000000,62.500000,1.0,0.0,0.0,0.0
2,91.0,2,Bangladesh,Mirpur,9.500000,68.750000,2.0,0.0,0.0,0.0
3,71.0,2,Sri Lanka,Mirpur,36.666667,75.570000,3.0,0.0,1.0,0.0
4,102.0,2,Bangladesh,Mirpur,45.250000,82.780000,4.0,0.0,2.0,0.0
5,2.0,1,Sri Lanka,Mirpur,56.600000,87.696000,5.0,1.0,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...
214,85.0,2,Australia,Chennai,51.224299,79.340841,214.0,40.0,56.0,15.0
215,55.0,2,Afghanistan,Delhi,51.381395,79.312605,215.0,40.0,57.0,15.0
216,16.0,2,Pakistan,Ahmedabad,51.398148,79.400093,216.0,40.0,58.0,15.0
217,103.0,2,Bangladesh,Pune,51.235023,79.443779,217.0,40.0,58.0,15.0
