In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')


## Loading the dataset

1. Batting: `batting_data.csv`
2. Bowling: `bowling_data.csv`
3. Player Name and ID Map: `player_team_name_id.csv`

In [2]:
batting_df = pd.read_csv('final_data/batting_data.csv')
batting_df

Unnamed: 0,player_id,Runs,Mins,BF,4s,6s,SR,Pos,Dismissal,Inns,Opposition,Ground,Start,odi_number
0,440970,0,-,3,0,0,0.00,3,caught,2,Kenya,Sharjah,2 Oct 2013,ODI # 3417
1,440970,13*,-,19,0,0,68.42,5,not out,2,Kenya,Sharjah,4 Oct 2013,ODI # 3418
2,440970,38*,-,67,4,0,56.71,4,not out,2,Hong Kong,Kuala Lumpur,1 May 2014,ODI # 3487
3,440970,3,-,7,0,0,42.85,4,lbw,1,U.A.E.,Kuala Lumpur,2 May 2014,ODI # 3488
4,440970,15,21,22,2,0,68.18,4,caught,1,Zimbabwe,Bulawayo,18 Jul 2014,ODI # 3503
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9326,793007,4*,4,2,1,0,200.00,11,not out,2,South Africa,Delhi,7 Oct 2023,ODI # 4661
9327,793007,DNB,-,-,-,-,-,-,-,1,Pakistan,Hyderabad,10 Oct 2023,ODI # 4665
9328,793007,0*,12,6,0,0,0.00,11,not out,1,Australia,Lucknow,16 Oct 2023,ODI # 4671
9329,793007,DNB,-,-,-,-,-,-,-,2,Netherlands,Lucknow,21 Oct 2023,ODI # 4676


In [3]:
bowling_df = pd.read_csv('final_data/bowling_data.csv')
bowling_df

Unnamed: 0,player_id,overs,maidens,runs,wickets,economy,position,innings,opposition,ground,start_date,odi_number
0,440970,DNB,-,-,-,-,-,1,Kenya,Sharjah,2 Oct 2013,ODI # 3417
1,440970,DNB,-,-,-,-,-,1,Kenya,Sharjah,4 Oct 2013,ODI # 3418
2,440970,DNB,-,-,-,-,-,1,Hong Kong,Kuala Lumpur,1 May 2014,ODI # 3487
3,440970,2.0,0,17,0,8.50,6,2,U.A.E.,Kuala Lumpur,2 May 2014,ODI # 3488
4,440970,DNB,-,-,-,-,-,2,Zimbabwe,Bulawayo,18 Jul 2014,ODI # 3503
...,...,...,...,...,...,...,...,...,...,...,...,...
9326,793007,10.0,0,86,2,8.60,2,1,South Africa,Delhi,7 Oct 2023,ODI # 4661
9327,793007,9.2,0,60,2,6.42,2,2,Pakistan,Hyderabad,10 Oct 2023,ODI # 4665
9328,793007,9.0,2,38,3,4.22,2,2,Australia,Lucknow,16 Oct 2023,ODI # 4671
9329,793007,9.4,1,49,4,5.06,1,1,Netherlands,Lucknow,21 Oct 2023,ODI # 4676


In [4]:
player_data_df = pd.read_csv('final_data/player_team_name_id.csv')
player_data_df

Unnamed: 0,team_name,player_name,player_id
0,Afghanistan,Hashmatullah Shahidi,440970
1,Afghanistan,Rahmanullah Gurbaz,974087
2,Afghanistan,Ibrahim Zadran,921509
3,Afghanistan,Reyaz Hassan,1061090
4,Afghanistan,Rahmat Shah Zurmati,533956
...,...,...,...
145,Sri Lanka,Dunith Wellalage,1152427
146,Sri Lanka,Kasun Rajitha,499594
147,Sri Lanka,Matheesha Pathirana,1194795
148,Sri Lanka,Lahiru Kumara,784375


In [5]:
# Collect Team Names

team_names = player_data_df['team_name'].unique().tolist()
team_names += bowling_df['opposition'].unique().tolist()
team_names += batting_df['Opposition'].unique().tolist()
team_names = list(set(team_names))
team_names

# Encode Team Names

le = LabelEncoder()
le.fit(team_names)
batting_df['Opposition'] = le.transform(batting_df['Opposition'])
bowling_df['opposition'] = le.transform(bowling_df['opposition'])
player_data_df['team_name'] = le.transform(player_data_df['team_name'])


## Data Preprocessing

### Batting Data

In [6]:
batting_df = batting_df.merge(player_data_df, on='player_id', how='left')
batting_df

Unnamed: 0,player_id,Runs,Mins,BF,4s,6s,SR,Pos,Dismissal,Inns,Opposition,Ground,Start,odi_number,team_name,player_name
0,440970,0,-,3,0,0,0.00,3,caught,2,9,Sharjah,2 Oct 2013,ODI # 3417,0,Hashmatullah Shahidi
1,440970,13*,-,19,0,0,68.42,5,not out,2,9,Sharjah,4 Oct 2013,ODI # 3418,0,Hashmatullah Shahidi
2,440970,38*,-,67,4,0,56.71,4,not out,2,6,Kuala Lumpur,1 May 2014,ODI # 3487,0,Hashmatullah Shahidi
3,440970,3,-,7,0,0,42.85,4,lbw,1,18,Kuala Lumpur,2 May 2014,ODI # 3488,0,Hashmatullah Shahidi
4,440970,15,21,22,2,0,68.18,4,caught,1,21,Bulawayo,18 Jul 2014,ODI # 3503,0,Hashmatullah Shahidi
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9326,793007,4*,4,2,1,0,200.00,11,not out,2,16,Delhi,7 Oct 2023,ODI # 4661,17,Dilshan Madushanka
9327,793007,DNB,-,-,-,-,-,-,-,1,14,Hyderabad,10 Oct 2023,ODI # 4665,17,Dilshan Madushanka
9328,793007,0*,12,6,0,0,0.00,11,not out,1,1,Lucknow,16 Oct 2023,ODI # 4671,17,Dilshan Madushanka
9329,793007,DNB,-,-,-,-,-,-,-,2,11,Lucknow,21 Oct 2023,ODI # 4676,17,Dilshan Madushanka


In [7]:
batting_df['player_id'].value_counts()
# Valid

player_id
253802     286
56029      261
34102      256
56143      244
56025      225
          ... 
721041       4
819507       3
1139520      2
35281        2
379887       2
Name: count, Length: 150, dtype: int64

#### Removing unwanted columns

In [8]:
batting_df

Unnamed: 0,player_id,Runs,Mins,BF,4s,6s,SR,Pos,Dismissal,Inns,Opposition,Ground,Start,odi_number,team_name,player_name
0,440970,0,-,3,0,0,0.00,3,caught,2,9,Sharjah,2 Oct 2013,ODI # 3417,0,Hashmatullah Shahidi
1,440970,13*,-,19,0,0,68.42,5,not out,2,9,Sharjah,4 Oct 2013,ODI # 3418,0,Hashmatullah Shahidi
2,440970,38*,-,67,4,0,56.71,4,not out,2,6,Kuala Lumpur,1 May 2014,ODI # 3487,0,Hashmatullah Shahidi
3,440970,3,-,7,0,0,42.85,4,lbw,1,18,Kuala Lumpur,2 May 2014,ODI # 3488,0,Hashmatullah Shahidi
4,440970,15,21,22,2,0,68.18,4,caught,1,21,Bulawayo,18 Jul 2014,ODI # 3503,0,Hashmatullah Shahidi
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9326,793007,4*,4,2,1,0,200.00,11,not out,2,16,Delhi,7 Oct 2023,ODI # 4661,17,Dilshan Madushanka
9327,793007,DNB,-,-,-,-,-,-,-,1,14,Hyderabad,10 Oct 2023,ODI # 4665,17,Dilshan Madushanka
9328,793007,0*,12,6,0,0,0.00,11,not out,1,1,Lucknow,16 Oct 2023,ODI # 4671,17,Dilshan Madushanka
9329,793007,DNB,-,-,-,-,-,-,-,2,11,Lucknow,21 Oct 2023,ODI # 4676,17,Dilshan Madushanka


In [9]:
batting_df.drop(columns=["Mins", "4s", "6s", "SR", "odi_number"], inplace=True)

#### Runs

In [10]:
batting_df['Runs'].unique()

array(['0', '13*', '38*', '3', '15', '11', '31', '32', '72', '14', '21',
       '2*', '12', '23', '54', '2', '34*', '37', '58', '97*', '71', '9',
       '52', '59*', '47', '18', '4', '59', '8', '76', '82', '10', '73',
       '28', '5', '88', '1*', '38', '57', '9*', '22', '15*', '13', '51',
       '80', '48*', '127', '103', '7', '106*', '17', '53', '68', '145',
       '6', '151', '1', '65', '19', '121*', '106', '162', '98', '41*',
       '100', '75', '87', '50', '35', 'DNB', '34', '100*', '26', '16',
       '48', '30', '36', '78', '108*', 'TDNB', '44', '114', '43', '56',
       '69', '29', '33', '113', '62', '46', '24', '61', '103*', '70',
       '94', '55', '45', '77*', '83', '12*', '25', '89*', '81*', '8*',
       '67', '63*', '17*', '42', '20', '104*', '60*', '20*', '3*', '77',
       '7*', '6*', '23*', '60', '41', '46*', '14*', '116', '40', '49',
       '27*', '18*', '92', '64', '27', '50*', '25*', '5*', '86', '19*',
       '16*', '57*', '11*', '35*', '39*', '4*', '0*', '10*', '21*'

In [11]:
# We can notice *, DNB, TDNB, absent in runs

# Replace *23 with 23
batting_df['Runs'].replace(to_replace='\*', value='', regex=True, inplace=True)
batting_df['Runs'].unique()

array(['0', '13', '38', '3', '15', '11', '31', '32', '72', '14', '21',
       '2', '12', '23', '54', '34', '37', '58', '97', '71', '9', '52',
       '59', '47', '18', '4', '8', '76', '82', '10', '73', '28', '5',
       '88', '1', '57', '22', '51', '80', '48', '127', '103', '7', '106',
       '17', '53', '68', '145', '6', '151', '65', '19', '121', '162',
       '98', '41', '100', '75', '87', '50', '35', 'DNB', '26', '16', '30',
       '36', '78', '108', 'TDNB', '44', '114', '43', '56', '69', '29',
       '33', '113', '62', '46', '24', '61', '70', '94', '55', '45', '77',
       '83', '25', '89', '81', '67', '63', '42', '20', '104', '60', '116',
       '40', '49', '27', '92', '64', '86', '39', '117', '85', '99', '128',
       '96', '101', '152', '124', '102', '79', '66', '93', '95', '146',
       '74', '163', '178', '84', '122', '109', '173', '119', '156', '130',
       '179', '107', '166', '134', '105', 'absent', '126', '176', '136',
       '90', '110', '144', '125', '111', '112', 'sub',

In [12]:
# Get Non numeric values

batting_df[~batting_df['Runs'].str.isnumeric()]['Runs'].value_counts()

Runs
DNB       1830
TDNB       144
absent       5
sub          1
Name: count, dtype: int64

In [13]:
# Remove rows with DNB, TDNB, absent, sub

batting_df.drop(batting_df[batting_df['Runs'].isin(['DNB', 'TDNB', 'absent', 'sub'])].index, inplace=True)
batting_df

Unnamed: 0,player_id,Runs,BF,Pos,Dismissal,Inns,Opposition,Ground,Start,team_name,player_name
0,440970,0,3,3,caught,2,9,Sharjah,2 Oct 2013,0,Hashmatullah Shahidi
1,440970,13,19,5,not out,2,9,Sharjah,4 Oct 2013,0,Hashmatullah Shahidi
2,440970,38,67,4,not out,2,6,Kuala Lumpur,1 May 2014,0,Hashmatullah Shahidi
3,440970,3,7,4,lbw,1,18,Kuala Lumpur,2 May 2014,0,Hashmatullah Shahidi
4,440970,15,22,4,caught,1,21,Bulawayo,18 Jul 2014,0,Hashmatullah Shahidi
...,...,...,...,...,...,...,...,...,...,...,...
9321,793007,4,2,11,not out,2,12,Auckland,25 Mar 2023,17,Dilshan Madushanka
9322,793007,1,5,11,not out,1,11,Bulawayo,30 Jun 2023,17,Dilshan Madushanka
9325,793007,0,0,11,not out,1,11,Harare,9 Jul 2023,17,Dilshan Madushanka
9326,793007,4,2,11,not out,2,16,Delhi,7 Oct 2023,17,Dilshan Madushanka


In [14]:
batting_df[~batting_df['Runs'].str.isnumeric()]['Runs'].unique()

array([], dtype=object)

In [15]:
# Change data type of runs to int

batting_df['Runs'] = batting_df['Runs'].astype(int)
batting_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7351 entries, 0 to 9328
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   player_id    7351 non-null   int64 
 1   Runs         7351 non-null   int32 
 2   BF           7351 non-null   object
 3   Pos          7351 non-null   object
 4   Dismissal    7351 non-null   object
 5   Inns         7351 non-null   object
 6   Opposition   7351 non-null   int32 
 7   Ground       7351 non-null   object
 8   Start        7351 non-null   object
 9   team_name    7351 non-null   int32 
 10  player_name  7351 non-null   object
dtypes: int32(3), int64(1), object(7)
memory usage: 603.0+ KB


#### Data Type Conversion

In [16]:
batting_df['BF'] = batting_df['BF'].astype(int)

In [17]:
batting_df['Pos'] = batting_df['Pos'].astype(int)

In [18]:
batting_df['Dismissal'].value_counts()

Dismissal
caught            3730
not out           1395
bowled            1082
lbw                602
run out            375
stumped            145
retired notout      15
hit wicket           6
obstruct field       1
Name: count, dtype: int64

In [19]:
batting_df['Pos'] = batting_df['Pos'].astype(int)

In [20]:
batting_df['Ground'].value_counts()

Ground
Mirpur           467
Colombo (RPS)    321
Harare           237
Pallekele        187
The Oval         169
                ... 
Toronto            1
Jamshedpur         1
Gwalior            1
Lincoln            1
Ayr                1
Name: count, Length: 128, dtype: int64

In [21]:
# Collect Ground Names

ground_names = batting_df['Ground'].unique().tolist()
ground_names += bowling_df['ground'].unique().tolist()
ground_names = list(set(ground_names))
ground_names

['Mohali',
 "St John's",
 'Napier',
 'Chennai',
 'Gqeberha',
 'Wellington',
 'Pune',
 'Hamilton',
 'Basseterre',
 'Potchefstroom',
 'Edinburgh',
 'ICCA Dubai',
 'Dharamsala',
 'Dubai (DSC)',
 'Rotterdam',
 'East London',
 'King City (NW)',
 'Gwalior',
 'Benoni',
 'Bengaluru',
 'Multan',
 'Bulawayo',
 'Southampton',
 'Roseau',
 'Kingston',
 'Vadodara',
 'Kingstown',
 'Jaipur',
 'Sharjah',
 'Gros Islet',
 'Ayr',
 'Utrecht',
 'Bloemfontein',
 'Eden Gardens',
 'Fatullah',
 'Amstelveen',
 'Auckland',
 'Greater Noida',
 'Whangarei',
 'Tarouba',
 'Khulna',
 'The Hague',
 'Bridgetown',
 'Visakhapatnam',
 'Johannesburg',
 'Nelson',
 'Dunedin',
 'Hambantota',
 'Paarl',
 'Faisalabad',
 'Cape Town',
 'Mirpur',
 'Galle',
 'Bristol',
 'Bogra',
 'Canberra',
 'Nottingham',
 'Pallekele',
 'Wankhede',
 'Guwahati',
 'The Oval',
 'Christchurch',
 'Abu Dhabi',
 'Deventer',
 'Toronto',
 'Delhi',
 'Rawalpindi',
 'Ahmedabad',
 'Kochi',
 'Manchester',
 'Darwin',
 'Adelaide',
 'Glasgow',
 'Nagpur',
 'Ranchi',
 

In [22]:
ge = LabelEncoder()
ge.fit(ground_names)
batting_df['Ground'] = ge.transform(batting_df['Ground'])
bowling_df['ground'] = ge.transform(bowling_df['ground'])

In [23]:
batting_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7351 entries, 0 to 9328
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   player_id    7351 non-null   int64 
 1   Runs         7351 non-null   int32 
 2   BF           7351 non-null   int32 
 3   Pos          7351 non-null   int32 
 4   Dismissal    7351 non-null   object
 5   Inns         7351 non-null   object
 6   Opposition   7351 non-null   int32 
 7   Ground       7351 non-null   int32 
 8   Start        7351 non-null   object
 9   team_name    7351 non-null   int32 
 10  player_name  7351 non-null   object
dtypes: int32(6), int64(1), object(4)
memory usage: 516.9+ KB


### Bowling Data

In [24]:
bowling_df = bowling_df.merge(player_data_df, on='player_id', how='left')
bowling_df

Unnamed: 0,player_id,overs,maidens,runs,wickets,economy,position,innings,opposition,ground,start_date,odi_number,team_name,player_name
0,440970,DNB,-,-,-,-,-,1,9,109,2 Oct 2013,ODI # 3417,0,Hashmatullah Shahidi
1,440970,DNB,-,-,-,-,-,1,9,109,4 Oct 2013,ODI # 3418,0,Hashmatullah Shahidi
2,440970,DNB,-,-,-,-,-,1,6,77,1 May 2014,ODI # 3487,0,Hashmatullah Shahidi
3,440970,2.0,0,17,0,8.50,6,2,18,77,2 May 2014,ODI # 3488,0,Hashmatullah Shahidi
4,440970,DNB,-,-,-,-,-,2,21,19,18 Jul 2014,ODI # 3503,0,Hashmatullah Shahidi
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9326,793007,10.0,0,86,2,8.60,2,1,16,38,7 Oct 2023,ODI # 4661,17,Dilshan Madushanka
9327,793007,9.2,0,60,2,6.42,2,2,14,63,10 Oct 2023,ODI # 4665,17,Dilshan Madushanka
9328,793007,9.0,2,38,3,4.22,2,2,1,82,16 Oct 2023,ODI # 4671,17,Dilshan Madushanka
9329,793007,9.4,1,49,4,5.06,1,1,11,82,21 Oct 2023,ODI # 4676,17,Dilshan Madushanka


#### Removing unwanted columns

In [25]:
bowling_df.drop(columns=["maidens", "position", "odi_number"], inplace=True)
bowling_df

Unnamed: 0,player_id,overs,runs,wickets,economy,innings,opposition,ground,start_date,team_name,player_name
0,440970,DNB,-,-,-,1,9,109,2 Oct 2013,0,Hashmatullah Shahidi
1,440970,DNB,-,-,-,1,9,109,4 Oct 2013,0,Hashmatullah Shahidi
2,440970,DNB,-,-,-,1,6,77,1 May 2014,0,Hashmatullah Shahidi
3,440970,2.0,17,0,8.50,2,18,77,2 May 2014,0,Hashmatullah Shahidi
4,440970,DNB,-,-,-,2,21,19,18 Jul 2014,0,Hashmatullah Shahidi
...,...,...,...,...,...,...,...,...,...,...,...
9326,793007,10.0,86,2,8.60,1,16,38,7 Oct 2023,17,Dilshan Madushanka
9327,793007,9.2,60,2,6.42,2,14,63,10 Oct 2023,17,Dilshan Madushanka
9328,793007,9.0,38,3,4.22,2,1,82,16 Oct 2023,17,Dilshan Madushanka
9329,793007,9.4,49,4,5.06,1,11,82,21 Oct 2023,17,Dilshan Madushanka


#### Overs

In [26]:
bowling_df['overs'].unique()

array(['DNB', '2.0', '1.0', 'TDNB', '3.0', '5.0', '1.5', '5.3', '6.0',
       '8.0', '4.0', '1.1', '9.0', '10.0', '9.5', '5.1', '7.0', '8.3',
       '9.4', '6.4', '5.2', '7.1', '4.1', '3.5', '8.2', '8.5', '8.1',
       '7.3', '9.2', '4.4', '4.2', '5.4', '9.3', '8.4', '7.2', '7.5',
       '4.5', '1.4', '3.2', '6.1', '6.2', '0.1', '6.5', '2.4', '0.5',
       '5.5', 'sub', '1.3', '3.1', '9.1', '6.3', '1.2', '0.2', '2.3',
       '2.5', '7.4', '0.4', '2.2', '0.3', '3.3', '4.3'], dtype=object)

In [27]:
# We can notice DNB, TDNB, sub in overs. Drop them

bowling_df.drop(bowling_df[bowling_df['overs'].isin(['DNB', 'TDNB', 'sub'])].index, inplace=True)
bowling_df['overs'] = bowling_df['overs'].astype(float)
bowling_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5171 entries, 3 to 9330
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   player_id    5171 non-null   int64  
 1   overs        5171 non-null   float64
 2   runs         5171 non-null   object 
 3   wickets      5171 non-null   object 
 4   economy      5171 non-null   object 
 5   innings      5171 non-null   object 
 6   opposition   5171 non-null   int32  
 7   ground       5171 non-null   int32  
 8   start_date   5171 non-null   object 
 9   team_name    5171 non-null   int32  
 10  player_name  5171 non-null   object 
dtypes: float64(1), int32(3), int64(1), object(6)
memory usage: 424.2+ KB


#### Runs

In [28]:
bowling_df['runs'] = bowling_df['runs'].astype(int)
bowling_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5171 entries, 3 to 9330
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   player_id    5171 non-null   int64  
 1   overs        5171 non-null   float64
 2   runs         5171 non-null   int32  
 3   wickets      5171 non-null   object 
 4   economy      5171 non-null   object 
 5   innings      5171 non-null   object 
 6   opposition   5171 non-null   int32  
 7   ground       5171 non-null   int32  
 8   start_date   5171 non-null   object 
 9   team_name    5171 non-null   int32  
 10  player_name  5171 non-null   object 
dtypes: float64(1), int32(4), int64(1), object(5)
memory usage: 404.0+ KB


#### Wickets

In [29]:
bowling_df['wickets'] = bowling_df['wickets'].astype(int)
bowling_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5171 entries, 3 to 9330
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   player_id    5171 non-null   int64  
 1   overs        5171 non-null   float64
 2   runs         5171 non-null   int32  
 3   wickets      5171 non-null   int32  
 4   economy      5171 non-null   object 
 5   innings      5171 non-null   object 
 6   opposition   5171 non-null   int32  
 7   ground       5171 non-null   int32  
 8   start_date   5171 non-null   object 
 9   team_name    5171 non-null   int32  
 10  player_name  5171 non-null   object 
dtypes: float64(1), int32(5), int64(1), object(4)
memory usage: 383.8+ KB


In [30]:
bowling_df['economy'] = bowling_df['economy'].astype(float)
bowling_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5171 entries, 3 to 9330
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   player_id    5171 non-null   int64  
 1   overs        5171 non-null   float64
 2   runs         5171 non-null   int32  
 3   wickets      5171 non-null   int32  
 4   economy      5171 non-null   float64
 5   innings      5171 non-null   object 
 6   opposition   5171 non-null   int32  
 7   ground       5171 non-null   int32  
 8   start_date   5171 non-null   object 
 9   team_name    5171 non-null   int32  
 10  player_name  5171 non-null   object 
dtypes: float64(2), int32(5), int64(1), object(3)
memory usage: 383.8+ KB


#### Innings

In [31]:
bowling_df['innings'] = bowling_df['innings'].astype(int)
bowling_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5171 entries, 3 to 9330
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   player_id    5171 non-null   int64  
 1   overs        5171 non-null   float64
 2   runs         5171 non-null   int32  
 3   wickets      5171 non-null   int32  
 4   economy      5171 non-null   float64
 5   innings      5171 non-null   int32  
 6   opposition   5171 non-null   int32  
 7   ground       5171 non-null   int32  
 8   start_date   5171 non-null   object 
 9   team_name    5171 non-null   int32  
 10  player_name  5171 non-null   object 
dtypes: float64(2), int32(6), int64(1), object(2)
memory usage: 363.6+ KB


### Label Encoding

We have already encoded the `team_name`, `ground_name` beforehand to avoid any data integrity issues.

#### Batting Data

In [32]:
dismissal_encoder = LabelEncoder()
dismissal_encoder.fit(batting_df['Dismissal'])
batting_df['Dismissal'] = dismissal_encoder.transform(batting_df['Dismissal'])
batting_df

Unnamed: 0,player_id,Runs,BF,Pos,Dismissal,Inns,Opposition,Ground,Start,team_name,player_name
0,440970,0,3,3,1,2,9,109,2 Oct 2013,0,Hashmatullah Shahidi
1,440970,13,19,5,4,2,9,109,4 Oct 2013,0,Hashmatullah Shahidi
2,440970,38,67,4,4,2,6,77,1 May 2014,0,Hashmatullah Shahidi
3,440970,3,7,4,3,1,18,77,2 May 2014,0,Hashmatullah Shahidi
4,440970,15,22,4,1,1,21,19,18 Jul 2014,0,Hashmatullah Shahidi
...,...,...,...,...,...,...,...,...,...,...,...
9321,793007,4,2,11,4,2,12,6,25 Mar 2023,17,Dilshan Madushanka
9322,793007,1,5,11,4,1,11,19,30 Jun 2023,17,Dilshan Madushanka
9325,793007,0,0,11,4,1,11,61,9 Jul 2023,17,Dilshan Madushanka
9326,793007,4,2,11,4,2,16,38,7 Oct 2023,17,Dilshan Madushanka


## Feature Selection

### Batting Data

From the above heatmap, we can see that Runs and BF are highly correlated. So we can drop one of them

In [33]:
# From the above heatmap, we can see that Runs and BF are highly correlated. So we can drop one of them

batting_df.drop(columns=['BF'], inplace=True)
batting_df

Unnamed: 0,player_id,Runs,Pos,Dismissal,Inns,Opposition,Ground,Start,team_name,player_name
0,440970,0,3,1,2,9,109,2 Oct 2013,0,Hashmatullah Shahidi
1,440970,13,5,4,2,9,109,4 Oct 2013,0,Hashmatullah Shahidi
2,440970,38,4,4,2,6,77,1 May 2014,0,Hashmatullah Shahidi
3,440970,3,4,3,1,18,77,2 May 2014,0,Hashmatullah Shahidi
4,440970,15,4,1,1,21,19,18 Jul 2014,0,Hashmatullah Shahidi
...,...,...,...,...,...,...,...,...,...,...
9321,793007,4,11,4,2,12,6,25 Mar 2023,17,Dilshan Madushanka
9322,793007,1,11,4,1,11,19,30 Jun 2023,17,Dilshan Madushanka
9325,793007,0,11,4,1,11,61,9 Jul 2023,17,Dilshan Madushanka
9326,793007,4,11,4,2,16,38,7 Oct 2023,17,Dilshan Madushanka


In [34]:
# batting_df.drop(columns=['player_name'], inplace=True)
# batting_df

### Bowling Data

In [35]:
# bowling_df.drop(columns=['player_name'], inplace=True)
# bowling_df

1. From the above heatmap, we can see that overs and runs are highly correlated. So we can drop one of them
2. Also, economy and runs are highly correlated. So we can drop one of them

In [36]:
# bowling_df.drop(columns=['overs'], inplace=True)
# bowling_df.drop(columns=['runs'], inplace=True)
# bowling_df


## Regression

### Tasks

1. Predict the runs batsman will score in a match.
2. Predict the wickets a bowler will take in a match.
3. Predict the economy of a bowler in a match.
4. Predict the type of dismissal of a batsman in a match.

### Models

1. Linear Regression
2. Decision Tree Regression
3. Random Forest Regression
4. Logistic Regression

`Use the batting_df and bowling_df to train and test the models.`

In [37]:
batting_df

Unnamed: 0,player_id,Runs,Pos,Dismissal,Inns,Opposition,Ground,Start,team_name,player_name
0,440970,0,3,1,2,9,109,2 Oct 2013,0,Hashmatullah Shahidi
1,440970,13,5,4,2,9,109,4 Oct 2013,0,Hashmatullah Shahidi
2,440970,38,4,4,2,6,77,1 May 2014,0,Hashmatullah Shahidi
3,440970,3,4,3,1,18,77,2 May 2014,0,Hashmatullah Shahidi
4,440970,15,4,1,1,21,19,18 Jul 2014,0,Hashmatullah Shahidi
...,...,...,...,...,...,...,...,...,...,...
9321,793007,4,11,4,2,12,6,25 Mar 2023,17,Dilshan Madushanka
9322,793007,1,11,4,1,11,19,30 Jun 2023,17,Dilshan Madushanka
9325,793007,0,11,4,1,11,61,9 Jul 2023,17,Dilshan Madushanka
9326,793007,4,11,4,2,16,38,7 Oct 2023,17,Dilshan Madushanka


In [38]:
batting_df

Unnamed: 0,player_id,Runs,Pos,Dismissal,Inns,Opposition,Ground,Start,team_name,player_name
0,440970,0,3,1,2,9,109,2 Oct 2013,0,Hashmatullah Shahidi
1,440970,13,5,4,2,9,109,4 Oct 2013,0,Hashmatullah Shahidi
2,440970,38,4,4,2,6,77,1 May 2014,0,Hashmatullah Shahidi
3,440970,3,4,3,1,18,77,2 May 2014,0,Hashmatullah Shahidi
4,440970,15,4,1,1,21,19,18 Jul 2014,0,Hashmatullah Shahidi
...,...,...,...,...,...,...,...,...,...,...
9321,793007,4,11,4,2,12,6,25 Mar 2023,17,Dilshan Madushanka
9322,793007,1,11,4,1,11,19,30 Jun 2023,17,Dilshan Madushanka
9325,793007,0,11,4,1,11,61,9 Jul 2023,17,Dilshan Madushanka
9326,793007,4,11,4,2,16,38,7 Oct 2023,17,Dilshan Madushanka


In [39]:
le.inverse_transform([0,1])

array(['Afghanistan', 'Australia'], dtype='<U12')

In [40]:
from sklearn.model_selection import train_test_split
x=batting_df.drop(['Runs','Dismissal','Inns','Start','player_name'],axis='columns')
x
y=batting_df['Runs']
y

0        0
1       13
2       38
3        3
4       15
        ..
9321     4
9322     1
9325     0
9326     4
9328     0
Name: Runs, Length: 7351, dtype: int32

In [41]:
x

Unnamed: 0,player_id,Pos,Opposition,Ground,team_name
0,440970,3,9,109,0
1,440970,5,9,109,0
2,440970,4,6,77,0
3,440970,4,18,77,0
4,440970,4,21,19,0
...,...,...,...,...,...
9321,793007,11,12,6,17
9322,793007,11,11,19,17
9325,793007,11,11,61,17
9326,793007,11,16,38,17


In [42]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=10)

In [43]:
# n=int(input())
# m=int(input())
# g=int(input())
# team_name=int(input())
# o=int(input())
# # n=6
# # m=5
# # g=109
# # t=0
# # o=9





ValueError: invalid literal for int() with base 10: ''

In [None]:
# df=batting_df[batting_df['team_name']==t]
# arr=df['player_id'].unique()
# arr

In [44]:
bowling_df

Unnamed: 0,player_id,overs,runs,wickets,economy,innings,opposition,ground,start_date,team_name,player_name
3,440970,2.0,17,0,8.50,2,18,77,2 May 2014,0,Hashmatullah Shahidi
12,440970,1.0,8,0,8.00,2,21,61,16 Feb 2017,0,Hashmatullah Shahidi
129,533956,3.0,10,0,3.33,1,15,109,6 Mar 2013,0,Rahmat Shah Zurmati
130,533956,5.0,22,0,4.40,1,15,109,8 Mar 2013,0,Rahmat Shah Zurmati
131,533956,1.5,7,1,3.81,1,9,109,2 Oct 2013,0,Rahmat Shah Zurmati
...,...,...,...,...,...,...,...,...,...,...,...
9326,793007,10.0,86,2,8.60,1,16,38,7 Oct 2023,17,Dilshan Madushanka
9327,793007,9.2,60,2,6.42,2,14,63,10 Oct 2023,17,Dilshan Madushanka
9328,793007,9.0,38,3,4.22,2,1,82,16 Oct 2023,17,Dilshan Madushanka
9329,793007,9.4,49,4,5.06,1,11,82,21 Oct 2023,17,Dilshan Madushanka


In [45]:
bd=batting_df
bd

Unnamed: 0,player_id,Runs,Pos,Dismissal,Inns,Opposition,Ground,Start,team_name,player_name
0,440970,0,3,1,2,9,109,2 Oct 2013,0,Hashmatullah Shahidi
1,440970,13,5,4,2,9,109,4 Oct 2013,0,Hashmatullah Shahidi
2,440970,38,4,4,2,6,77,1 May 2014,0,Hashmatullah Shahidi
3,440970,3,4,3,1,18,77,2 May 2014,0,Hashmatullah Shahidi
4,440970,15,4,1,1,21,19,18 Jul 2014,0,Hashmatullah Shahidi
...,...,...,...,...,...,...,...,...,...,...
9321,793007,4,11,4,2,12,6,25 Mar 2023,17,Dilshan Madushanka
9322,793007,1,11,4,1,11,19,30 Jun 2023,17,Dilshan Madushanka
9325,793007,0,11,4,1,11,61,9 Jul 2023,17,Dilshan Madushanka
9326,793007,4,11,4,2,16,38,7 Oct 2023,17,Dilshan Madushanka


In [46]:
# le=LabelEncoder()
# le.fit(bd)
# bd['Dismissal']=le.fit_transform(bd['Dismissal'])
# bd
# Assuming bd['Dismissal'] is the column you want to encode
le = LabelEncoder()
bd['Dismissal'] = le.fit_transform(bd['Dismissal'])
bd



Unnamed: 0,player_id,Runs,Pos,Dismissal,Inns,Opposition,Ground,Start,team_name,player_name
0,440970,0,3,1,2,9,109,2 Oct 2013,0,Hashmatullah Shahidi
1,440970,13,5,4,2,9,109,4 Oct 2013,0,Hashmatullah Shahidi
2,440970,38,4,4,2,6,77,1 May 2014,0,Hashmatullah Shahidi
3,440970,3,4,3,1,18,77,2 May 2014,0,Hashmatullah Shahidi
4,440970,15,4,1,1,21,19,18 Jul 2014,0,Hashmatullah Shahidi
...,...,...,...,...,...,...,...,...,...,...
9321,793007,4,11,4,2,12,6,25 Mar 2023,17,Dilshan Madushanka
9322,793007,1,11,4,1,11,19,30 Jun 2023,17,Dilshan Madushanka
9325,793007,0,11,4,1,11,61,9 Jul 2023,17,Dilshan Madushanka
9326,793007,4,11,4,2,16,38,7 Oct 2023,17,Dilshan Madushanka


In [47]:
x=batting_df.drop(['Runs','Dismissal','Start','team_name','player_name'],axis='columns')
x

Unnamed: 0,player_id,Pos,Inns,Opposition,Ground
0,440970,3,2,9,109
1,440970,5,2,9,109
2,440970,4,2,6,77
3,440970,4,1,18,77
4,440970,4,1,21,19
...,...,...,...,...,...
9321,793007,11,2,12,6
9322,793007,11,1,11,19
9325,793007,11,1,11,61
9326,793007,11,2,16,38


In [48]:
y=batting_df['Dismissal']
y

0       1
1       4
2       4
3       3
4       1
       ..
9321    4
9322    4
9325    4
9326    4
9328    4
Name: Dismissal, Length: 7351, dtype: int64

In [49]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.1,random_state=10)

In [50]:
from sklearn.linear_model import LogisticRegression 
log_reg=LogisticRegression(solver='lbfgs',random_state=42)
log_reg.fit(x_train,y_train)

In [51]:
id=int(input())
pos=int(input())
inns=int(input())
opp=int(input())
ground=int(input())

# id=440970
# pos=3
# inns=2
# opp=9
# ground=109
print(log_reg.predict([[id,pos,inns,opp,ground]]))





ValueError: invalid literal for int() with base 10: ''

In [52]:
y_pred=log_reg.predict(x_test)

In [53]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.46603260869565216


In [54]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(x_train, y_train)

In [55]:
print(model.predict([[id,pos,inns,opp,ground]]))

NameError: name 'pos' is not defined

In [56]:
y_pred=model.predict(x_test)

In [57]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.3328804347826087


In [58]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
clf.fit(x_train, y_train)

In [59]:
y_pred = clf.predict(x_test)

In [60]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.4986413043478261


In [61]:
from sklearn.metrics import accuracy_score, classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.01      0.02       115
           1       0.49      0.96      0.65       343
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00        57
           4       0.55      0.26      0.35       147
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00        52
           8       0.00      0.00      0.00        20

    accuracy                           0.50       736
   macro avg       0.26      0.15      0.13       736
weighted avg       0.50      0.50      0.38       736



In [62]:
# id=int(input())
# pos=int(input())
# inns=int(input())
# opp=int(input())
# ground=int(input())

id=440970
pos=3
inns=2
opp=9
ground=109



In [63]:
df=bd[batting_df['player_id']==id]
df

Unnamed: 0,player_id,Runs,Pos,Dismissal,Inns,Opposition,Ground,Start,team_name,player_name
0,440970,0,3,1,2,9,109,2 Oct 2013,0,Hashmatullah Shahidi
1,440970,13,5,4,2,9,109,4 Oct 2013,0,Hashmatullah Shahidi
2,440970,38,4,4,2,6,77,1 May 2014,0,Hashmatullah Shahidi
3,440970,3,4,3,1,18,77,2 May 2014,0,Hashmatullah Shahidi
4,440970,15,4,1,1,21,19,18 Jul 2014,0,Hashmatullah Shahidi
...,...,...,...,...,...,...,...,...,...,...
64,440970,18,4,1,1,2,40,7 Oct 2023,0,Hashmatullah Shahidi
65,440970,80,4,3,1,7,38,11 Oct 2023,0,Hashmatullah Shahidi
66,440970,14,4,0,1,5,38,15 Oct 2023,0,Hashmatullah Shahidi
67,440970,8,4,1,2,12,28,18 Oct 2023,0,Hashmatullah Shahidi


In [64]:
x=df.drop(['Runs','Dismissal','Start','team_name','player_name'],axis='columns')
x

Unnamed: 0,player_id,Pos,Inns,Opposition,Ground
0,440970,3,2,9,109
1,440970,5,2,9,109
2,440970,4,2,6,77
3,440970,4,1,18,77
4,440970,4,1,21,19
...,...,...,...,...,...
64,440970,4,1,2,40
65,440970,4,1,7,38
66,440970,4,1,5,38
67,440970,4,2,12,28


In [65]:
y=df['Dismissal']
y

0     1
1     4
2     4
3     3
4     1
     ..
64    1
65    3
66    0
67    1
68    4
Name: Dismissal, Length: 69, dtype: int64

In [66]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.1,random_state=10)

In [67]:
from sklearn.linear_model import LogisticRegression 
log_reg=LogisticRegression(solver='lbfgs',random_state=42)
log_reg.fit(x_train,y_train)

In [68]:
y_pred=log_reg.predict(x_test)

In [69]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.5714285714285714


In [70]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(x_train, y_train)

In [71]:
print(model.predict([[id,pos,inns,opp,ground]]))

[1]


In [72]:
y_pred=model.predict(x_test)

In [73]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.14285714285714285


In [74]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
clf.fit(x_train, y_train)

In [75]:
y_pred = clf.predict(x_test)

In [76]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.42857142857142855


In [77]:
from scipy.stats import randint

param_dist = {"max_depth": [3, None],
              "max_features": randint(1, 9),
              "min_samples_leaf": randint(1, 9),
              "criterion": ["gini", "entropy"]}

In [78]:
from sklearn.model_selection import RandomizedSearchCV
tree = DecisionTreeClassifier()
tree_cv = RandomizedSearchCV(tree, param_dist, cv = 5)
 
tree_cv.fit(x, y)
print("Tuned Decision Tree Parameters: {}".format(tree_cv.best_params_))
print("Best score is {}".format(tree_cv.best_score_))

Tuned Decision Tree Parameters: {'criterion': 'gini', 'max_depth': 3, 'max_features': 2, 'min_samples_leaf': 6}
Best score is 0.49230769230769234


In [79]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
 
# Creating the hyperparameter grid
c_space = np.logspace(-5, 8, 15)
param_grid = {'C': c_space}
 
# Instantiating logistic regression classifier
logreg = LogisticRegression()
 
# Instantiating the GridSearchCV object
logreg_cv = GridSearchCV(logreg, param_grid, cv = 5)
 
logreg_cv.fit(x, y)
 
# Print the tuned parameters and score
print("Tuned Logistic Regression Parameters: {}".format(logreg_cv.best_params_)) 
print("Best score is {}".format(logreg_cv.best_score_))

Tuned Logistic Regression Parameters: {'C': 1e-05}
Best score is 0.49230769230769234


In [83]:
from sklearn.datasets import load_breast_cancer
import matplotlib.pyplot as plt
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.svm import SVC
 
# Load the datasets

 
#Build the model
svm = SVC(kernel="rbf", gamma=0.5, C=1.0)
# Trained the model
svm.fit(x_train,y_train)
 
# # Plot Decision Boundary
# DecisionBoundaryDisplay.from_estimator(
#         svm,
#         x,
#         response_method="predict",
#         cmap=plt.cm.Spectral,
#         alpha=0.8,
#         xlabel=cancer.feature_names[0],
#         ylabel=cancer.feature_names[1],
#     )
 
# # Scatter plot
# plt.scatter(X[:, 0], X[:, 1], 
#             c=y, 
#             s=20, edgecolors="k")
# plt.show()

y_pred=svm.predict(x_test);
print(f"accuracy: {accuracy_score(y_test,y_pred)}")

accuracy: 0.5714285714285714


In [87]:
x

Unnamed: 0,player_id,Pos,Inns,Opposition,Ground
0,440970,3,2,9,109
1,440970,5,2,9,109
2,440970,4,2,6,77
3,440970,4,1,18,77
4,440970,4,1,21,19
...,...,...,...,...,...
64,440970,4,1,2,40
65,440970,4,1,7,38
66,440970,4,1,5,38
67,440970,4,2,12,28
