# <span style="background-color:white; color:black;"> Evidence-based Cricket analytics:</span>
# Evaluating India’s **Batting, Bowling & All-Round** Performance in Tests Using ESPN Cricinfo Data (2013–2022)

## `Problem Statement:`
### To evaluate the performance of Indian men’s cricket players in the **`Test`** format across the world during the period 2013–2022, and to identify the most effective role-based combination of players that forms the best possible Playing XI using data-driven analytics.

In [1]:
import warnings
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
warnings.filterwarnings("ignore")

## Fetching Test Batters Data

In [2]:
# Statsguru query on ESPNcricinfo that returns the batting records we scrape
# (type=batting, span 01 Jan 2013 - 31 Dec 2022). Split across lines for readability;
# adjacent string literals are joined into one URL.
url = (
    'https://stats.espncricinfo.com/ci/engine/stats/index.html'
    '?class=1;spanmax1=31+Dec+2022;spanmin1=01+Jan+2013;spanval1=span'
    ';team=6;template=results;type=batting'
)

In [3]:
# Browser-like request headers so the site serves the page instead of blocking
# the scraper. Header values were suggested by ChatGPT.

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Referer": "https://www.espncricinfo.com/"
}

# The timeout stops the cell from hanging forever if the server never answers;
# raise_for_status() fails loudly on a 4xx/5xx response instead of handing an
# error page to the parser further down.
test_batters_data = requests.get(url, headers=headers, timeout=30)
test_batters_data.raise_for_status()
test_batters_data

<Response [200]>

In [4]:
# Sanity check that HTML came back. Only the first 500 bytes are shown so the
# whole page is not dumped into the notebook output.
test_batters_data.content[:500]

b'\n<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n<!-- hostname: web04, edition-view: , country: unknown, cluster: www, created: 2025-12-09 06:43:27 -->\n<html xmlns="http://www.w3.org/1999/xhtml" xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://opengraphprotocol.org/schema/" xmlns:fb="http://developers.facebook.com/schema/" >\n<head>\n <script type="text/javascript">var _sf_startpt=(new Date()).getTime()</script>\n <meta name="google-site-verification" content="ZxdgH3XglRg0Bsy-Ho2RnO3EE4nRs53FloLS6fkt_nc" />\n <meta\n        name="viewport"\n        content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=0"\n      />\n <title>Batting records | Test matches | Cricinfo Statsguru | ESPNcricinfo.com</title>\n <meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n <meta name="keywords" content="" />\n\n \n <meta name="description" content="" />\n<!--[if IE 9]>\

In [5]:
# Parse the raw HTML into a BeautifulSoup tree so that tables, rows and cells can be
# located by tag and class.
# The parser is named explicitly: when it is omitted BeautifulSoup picks whichever
# parser happens to be installed, so the parsed tree can differ between machines.

soup = BeautifulSoup(test_batters_data.content, "html.parser")

In [6]:
# Collect every <table> element that carries the "engineTable" CSS class,
# then show how many were found.

table = soup.find_all("table", attrs={"class": "engineTable"})
len(table)

6

In [7]:
# Print each table's index next to its caption (title) so the table we need can be
# identified by position. Tables without a <caption> are reported as "No caption".

for position, candidate in enumerate(table):
    title_tag = candidate.find("caption")  # <caption> is the tag holding a table's title

    if title_tag:
        label = title_tag.get_text(strip=True)  # strip=True trims surrounding whitespace
    else:
        label = "No caption"

    print(position, label)


0 No caption
1 No caption
2 Overall figures
3 No caption
4 No caption
5 No caption


In [8]:
# We need to install lxml so that we can Read / parse HTML pages, Extract tables from HTML and Allow "pandas.read_html()" to work properly(It helps scrapping tobe done smoothly).

!pip install lxml



In [9]:
# Select the required table by the index found in the caption listing above
# (still a BeautifulSoup tag at this point).
main_table = table[2]

# str(main_table) serialises the table back into an HTML string.
# It is wrapped in StringIO because passing literal HTML text straight to
# pd.read_html() is deprecated; a file-like object is the supported input.
# pd.read_html() returns a list of DataFrames, even when there is only one table.

df_list = pd.read_html(StringIO(str(main_table)))

# The list holds a single frame here, so take the first entry.
test_batters_df = df_list[0]

test_batters_df.head()


Unnamed: 0,Player,Span,Mat,Inns,NO,Runs,HS,Ave,BF,SR,100,50,0,4s,6s,Unnamed: 15
0,V Kohli,2013-2022,90,152,9,7228,254*,50.54,12681,56.99,24,23,12,809,20,
1,CA Pujara,2013-2022,89,153,8,6253,204,43.12,14212,43.99,16,33,11,755,14,
2,AM Rahane,2013-2022,82,140,12,4931,188,38.52,9972,49.44,12,25,10,560,34,
3,M Vijay,2013-2018,49,85,1,3373,167,40.15,7345,45.92,11,13,7,399,30,
4,RG Sharma,2013-2022,45,77,9,3137,212,46.13,5625,55.76,8,14,4,335,64,


In [10]:
# Drop the empty trailing column that read_html added ("Unnamed: 15").
# errors='ignore' keeps the cell safe to re-run once the column is already gone.
test_batters_df = test_batters_df.drop(columns='Unnamed: 15', errors='ignore')
test_batters_df.head()

Unnamed: 0,Player,Span,Mat,Inns,NO,Runs,HS,Ave,BF,SR,100,50,0,4s,6s
0,V Kohli,2013-2022,90,152,9,7228,254*,50.54,12681,56.99,24,23,12,809,20
1,CA Pujara,2013-2022,89,153,8,6253,204,43.12,14212,43.99,16,33,11,755,14
2,AM Rahane,2013-2022,82,140,12,4931,188,38.52,9972,49.44,12,25,10,560,34
3,M Vijay,2013-2018,49,85,1,3373,167,40.15,7345,45.92,11,13,7,399,30
4,RG Sharma,2013-2022,45,77,9,3137,212,46.13,5625,55.76,8,14,4,335,64


In [11]:
# Check dtypes and non-null counts. Columns reported as `object` still hold text
# and will need cleaning before they can be used numerically.
test_batters_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Player  50 non-null     object 
 1   Span    50 non-null     object 
 2   Mat     50 non-null     int64  
 3   Inns    50 non-null     int64  
 4   NO      50 non-null     int64  
 5   Runs    50 non-null     int64  
 6   HS      50 non-null     object 
 7   Ave     50 non-null     object 
 8   BF      50 non-null     int64  
 9   SR      50 non-null     float64
 10  100     50 non-null     int64  
 11  50      50 non-null     int64  
 12  0       50 non-null     int64  
 13  4s      50 non-null     int64  
 14  6s      50 non-null     int64  
dtypes: float64(1), int64(10), object(4)
memory usage: 6.0+ KB


In [12]:
# List the raw column labels so they can be mapped to descriptive names.
test_batters_df.columns

Index(['Player', 'Span', 'Mat', 'Inns', 'NO', 'Runs', 'HS', 'Ave', 'BF', 'SR',
       '100', '50', '0', '4s', '6s'],
      dtype='object')

In [13]:
# Give the abbreviated scorecard headers descriptive names.
# An explicit old -> new mapping is used instead of overwriting .columns by position:
# it cannot silently mislabel a column if the scraped table's column order changes,
# and the cell stays safe to re-run. Labels not listed here are kept as they are.

test_batters_df = test_batters_df.rename(columns={
    'Player': 'Player Name',
    'Mat': 'Matches',
    'Inns': 'Innings',
    'NO': 'Not Outs',
    'HS': 'Highest Score',
    'Ave': 'Avg',
    'BF': 'Balls Faced',
    'SR': 'Strike Rate',
})

test_batters_df.head()

Unnamed: 0,Player Name,Span,Matches,Innings,Not Outs,Runs,Highest Score,Avg,Balls Faced,Strike Rate,100,50,0,4s,6s
0,V Kohli,2013-2022,90,152,9,7228,254*,50.54,12681,56.99,24,23,12,809,20
1,CA Pujara,2013-2022,89,153,8,6253,204,43.12,14212,43.99,16,33,11,755,14
2,AM Rahane,2013-2022,82,140,12,4931,188,38.52,9972,49.44,12,25,10,560,34
3,M Vijay,2013-2018,49,85,1,3373,167,40.15,7345,45.92,11,13,7,399,30
4,RG Sharma,2013-2022,45,77,9,3137,212,46.13,5625,55.76,8,14,4,335,64


In [14]:
# Reusable helper that bundles the steps above: download the page, parse the HTML,
# locate the stats table by its caption and convert it to a DataFrame.

def extract_cricinfo_table(url, tag='table', _class_='engineTable',
                           caption_text='Overall figures', timeout=30):
    """Download a page and return the captioned table on it as a DataFrame.

    The request is sent with the notebook-level ``headers`` dictionary, so that
    variable must be defined before this function is called.

    Parameters
    ----------
    url : str
        Address of the page to download.
    tag : str, default 'table'
        HTML tag of the candidate elements.
    _class_ : str, default 'engineTable'
        CSS class the candidate elements must carry.
    caption_text : str, default 'Overall figures'
        Exact caption (after whitespace stripping) of the table to extract.
    timeout : float, default 30
        Seconds to wait for the server before giving up.

    Returns
    -------
    pandas.DataFrame
        The first table pandas extracts from the matched element.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    ValueError
        If no candidate element has the requested caption.
    """
    # The timeout prevents an unresponsive server from hanging the notebook.
    data = requests.get(url, headers=headers, timeout=timeout)
    print(data)
    # Fail on an error status here, rather than later with a misleading
    # "no table found" message caused by parsing an error page.
    data.raise_for_status()

    # Name the parser explicitly so the parsed tree does not depend on which
    # optional parsers are installed.
    soup = BeautifulSoup(data.content, "html.parser")

    main_table = None
    for candidate in soup.find_all(tag, class_=_class_):
        caption = candidate.find("caption")
        if caption and caption.get_text(strip=True) == caption_text:
            main_table = candidate
            break

    if main_table is None:
        raise ValueError(f"No table found with caption '{caption_text}'!")

    # Literal HTML strings are deprecated as read_html input; wrap in StringIO.
    df_list = pd.read_html(StringIO(str(main_table)))
    return df_list[0]
        

## Fetching Test Bowlers Data

In [15]:
# Same Statsguru query as the batting page, but with type=bowling.
# Adjacent string literals are joined into a single URL.
test_bowlers_df = extract_cricinfo_table(
    'https://stats.espncricinfo.com/ci/engine/stats/index.html'
    '?class=1;spanmax1=31+Dec+2022;spanmin1=01+Jan+2013;spanval1=span'
    ';team=6;template=results;type=bowling'
)
test_bowlers_df.head()

<Response [200]>


Unnamed: 0,Player,Span,Mat,Inns,Overs,Mdns,Runs,Wkts,BBI,BBM,Ave,Econ,SR,5,10,Unnamed: 15
0,R Ashwin,2013-2022,76,143,3264.5,693,8870,386,7/59,13/140,22.97,2.71,50.7,25,6,
1,RA Jadeja,2013-2022,59,112,2388.3,584,5863,239,7/48,10/154,24.53,2.45,59.9,10,1,
2,Mohammed Shami,2013-2022,60,114,1804.3,341,5931,216,6/56,9/118,27.45,3.28,50.1,6,0,
3,I Sharma,2013-2021,58,107,1648.5,353,4872,174,7/74,9/78,28.0,2.95,56.8,8,0,
4,UT Yadav,2014-2022,45,90,1172.0,210,3944,133,6/88,10/133,29.65,3.36,52.8,2,1,


In [16]:
# Drop the empty trailing column that read_html added ("Unnamed: 15").
# errors='ignore' keeps the cell safe to re-run once the column is already gone.
test_bowlers_df = test_bowlers_df.drop(columns='Unnamed: 15', errors='ignore')
test_bowlers_df.head()

Unnamed: 0,Player,Span,Mat,Inns,Overs,Mdns,Runs,Wkts,BBI,BBM,Ave,Econ,SR,5,10
0,R Ashwin,2013-2022,76,143,3264.5,693,8870,386,7/59,13/140,22.97,2.71,50.7,25,6
1,RA Jadeja,2013-2022,59,112,2388.3,584,5863,239,7/48,10/154,24.53,2.45,59.9,10,1
2,Mohammed Shami,2013-2022,60,114,1804.3,341,5931,216,6/56,9/118,27.45,3.28,50.1,6,0
3,I Sharma,2013-2021,58,107,1648.5,353,4872,174,7/74,9/78,28.0,2.95,56.8,8,0
4,UT Yadav,2014-2022,45,90,1172.0,210,3944,133,6/88,10/133,29.65,3.36,52.8,2,1


In [17]:
# List the raw column labels so they can be mapped to descriptive names.
test_bowlers_df.columns

Index(['Player', 'Span', 'Mat', 'Inns', 'Overs', 'Mdns', 'Runs', 'Wkts', 'BBI',
       'BBM', 'Ave', 'Econ', 'SR', '5', '10'],
      dtype='object')

In [18]:
# Give the abbreviated bowling headers descriptive names.
# An explicit old -> new mapping is used instead of overwriting .columns by position:
# it cannot silently mislabel a column if the scraped table's column order changes,
# and the cell stays safe to re-run. Labels not listed here are kept as they are.
test_bowlers_df = test_bowlers_df.rename(columns={
    'Player': 'Player Name',
    'Mat': 'Matches',
    'Inns': 'Innings',
    'Mdns': 'Maidens',
    'Wkts': 'Wickets',
    'BBI': 'Best Bowling(Innings)',
    'BBM': 'Best Bowling(Match)',
    'Ave': 'Avg',
    'Econ': 'Econ Rate',
    'SR': 'Bowling Strike Rate',
    '5': '5w in an Innings',
    '10': '10w in a Match',
})

test_bowlers_df.head()

Unnamed: 0,Player Name,Span,Matches,Innings,Overs,Maidens,Runs,Wickets,Best Bowling(Innings),Best Bowling(Match),Avg,Econ Rate,Bowling Strike Rate,5w in an Innings,10w in a Match
0,R Ashwin,2013-2022,76,143,3264.5,693,8870,386,7/59,13/140,22.97,2.71,50.7,25,6
1,RA Jadeja,2013-2022,59,112,2388.3,584,5863,239,7/48,10/154,24.53,2.45,59.9,10,1
2,Mohammed Shami,2013-2022,60,114,1804.3,341,5931,216,6/56,9/118,27.45,3.28,50.1,6,0
3,I Sharma,2013-2021,58,107,1648.5,353,4872,174,7/74,9/78,28.0,2.95,56.8,8,0
4,UT Yadav,2014-2022,45,90,1172.0,210,3944,133,6/88,10/133,29.65,3.36,52.8,2,1


In [19]:
# Check dtypes and non-null counts. Most bowling statistics come back as `object`
# rather than numeric; presumably some rows hold a non-numeric placeholder such as
# "-" (TODO confirm), so these columns need cleaning before analysis.
test_bowlers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Player Name            50 non-null     object
 1   Span                   50 non-null     object
 2   Matches                50 non-null     int64 
 3   Innings                50 non-null     object
 4   Overs                  50 non-null     object
 5   Maidens                50 non-null     object
 6   Runs                   50 non-null     object
 7   Wickets                50 non-null     object
 8   Best Bowling(Innings)  50 non-null     object
 9   Best Bowling(Match)    50 non-null     object
 10  Avg                    50 non-null     object
 11  Econ Rate              50 non-null     object
 12  Bowling Strike Rate    50 non-null     object
 13  5w in an Innings       50 non-null     object
 14  10w in a Match         50 non-null     object
dtypes: int64(1), object(14)
memory usage: 6.0+ KB

## Fetching Test All-Rounders Data

In [20]:
# Same Statsguru query as the batting page, but with type=allround.
# Adjacent string literals are joined into a single URL.
test_allrounders_df = extract_cricinfo_table(
    'https://stats.espncricinfo.com/ci/engine/stats/index.html'
    '?class=1;spanmax1=31+Dec+2022;spanmin1=01+Jan+2013;spanval1=span'
    ';team=6;template=results;type=allround'
)
test_allrounders_df.head()

<Response [200]>


Unnamed: 0,Player,Span,Mat,Runs,HS,Bat Av,100,Wkts,BBI,Bowl Av,5,Ct,St,Ave Diff,Unnamed: 14
0,V Kohli,2013-2022,90,7228,254*,50.54,24,0,-,-,0,86,0,-,
1,CA Pujara,2013-2022,89,6253,204,43.12,16,0,-,-,0,56,0,-,
2,AM Rahane,2013-2022,82,4931,188,38.52,12,-,-,-,-,99,0,-,
3,R Ashwin,2013-2022,76,2447,124,24.96,4,386,7/59,22.97,25,29,0,1.99,
4,Mohammed Shami,2013-2022,60,685,56*,11.61,0,216,6/56,27.45,6,16,0,-15.84,


In [21]:
# Drop the empty trailing column that read_html added ("Unnamed: 14").
# errors='ignore' keeps the cell safe to re-run once the column is already gone.
test_allrounders_df = test_allrounders_df.drop(columns='Unnamed: 14', errors='ignore')
test_allrounders_df.columns

Index(['Player', 'Span', 'Mat', 'Runs', 'HS', 'Bat Av', '100', 'Wkts', 'BBI',
       'Bowl Av', '5', 'Ct', 'St', 'Ave Diff'],
      dtype='object')

In [22]:
# Give the abbreviated all-round headers descriptive names.
# An explicit old -> new mapping is used instead of overwriting .columns by position:
# it cannot silently mislabel a column if the scraped table's column order changes,
# and the cell stays safe to re-run. Labels not listed here are kept as they are.
test_allrounders_df = test_allrounders_df.rename(columns={
    'Player': 'Player Name',
    'Mat': 'Matches',
    'HS': 'Highest Score',
    'Bat Av': 'Batting Avg',
    'BBI': 'Best Bowling(Innings)',
    'Bowl Av': 'Bowling Avg',
    '5': '5w in an Innings',
    'Ct': 'Catches Taken',
    'St': 'Stumpings Made',
    'Ave Diff': 'Avg Diff',
})

test_allrounders_df.head()

Unnamed: 0,Player Name,Span,Matches,Runs,Highest Score,Batting Avg,100,Wkts,Best Bowling(Innings),Bowling Avg,5w in an Innings,Catches Taken,Stumpings Made,Avg Diff
0,V Kohli,2013-2022,90,7228,254*,50.54,24,0,-,-,0,86,0,-
1,CA Pujara,2013-2022,89,6253,204,43.12,16,0,-,-,0,56,0,-
2,AM Rahane,2013-2022,82,4931,188,38.52,12,-,-,-,-,99,0,-
3,R Ashwin,2013-2022,76,2447,124,24.96,4,386,7/59,22.97,25,29,0,1.99
4,Mohammed Shami,2013-2022,60,685,56*,11.61,0,216,6/56,27.45,6,16,0,-15.84


In [23]:
# Check dtypes and non-null counts. Several statistics are `object` rather than
# numeric; the preview above shows "-" placeholders in the bowling columns, so these
# need cleaning before analysis.
test_allrounders_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Player Name            50 non-null     object
 1   Span                   50 non-null     object
 2   Matches                50 non-null     int64 
 3   Runs                   50 non-null     int64 
 4   Highest Score          50 non-null     object
 5   Batting Avg            50 non-null     object
 6   100                    50 non-null     int64 
 7   Wkts                   50 non-null     object
 8   Best Bowling(Innings)  50 non-null     object
 9   Bowling Avg            50 non-null     object
 10  5w in an Innings       50 non-null     object
 11  Catches Taken          50 non-null     int64 
 12  Stumpings Made         50 non-null     int64 
 13  Avg Diff               50 non-null     object
dtypes: int64(5), object(9)
memory usage: 5.6+ KB


In [24]:
## Saving every DataFrame as a CSV file (row index left out)

csv_exports = {
    'Test_Batters_Performance.csv': test_batters_df,
    'Test_Bowlers_Performance.csv': test_bowlers_df,
    'Test_Allrounders_Performance.csv': test_allrounders_df,
}

for csv_name, frame in csv_exports.items():
    frame.to_csv(csv_name, index=False)