# Data Scraping with BeautifulSoup and Requests

This notebook demonstrates how to scrape cricket statistics from ESPN Cricinfo using BeautifulSoup and Requests.

## Importing Libraries

We start by importing the necessary libraries.

In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

# **test on base requests**

The plain `requests` call below returns a 403 response, which shows that the site has bot detection.

In [None]:
# Probe the stats page with the plain `requests` library (no browser impersonation).
# Status codes: 403 -> access denied, 200 -> access granted.
requests.get('https://stats.espncricinfo.com/ci/engine/stats/index.html?class=3;spanmin1=24+Nov+2012;spanval1=span;team=7;template=results;type=allround')

<Response [403]>

To bypass the bot detection, we use [`curl_cffi`](https://www.reddit.com/r/webscraping/comments/15xb82e/comment/jx594uu/?utm_source=share&utm_medium=web3x&utm_name=web3xcss&utm_term=1&utm_content=share_button).

In [None]:
!pip install curl_cffi




In [None]:
from curl_cffi import requests

In [None]:

# NOTE: `requests` now refers to curl_cffi's module (imported in the cell above),
# which shadows the standard `requests` library imported at the top of the notebook.
# Same URL as the earlier probe, but asking curl_cffi to impersonate Chrome 110;
# the recorded output for this cell is a 200 response instead of the earlier 403.
response = requests.get('https://stats.espncricinfo.com/ci/engine/stats/index.html?class=3;spanmin1=24+Nov+2012;spanval1=span;team=7;template=results;type=allround',impersonate="chrome110")
response

<Response [200]>

Since we now get a 200 response, we have been granted access thanks to `curl_cffi`.


## URLs for Different Formats

We define the base URLs for the different cricket formats and stat types (all-round, batting, bowling), with stats starting from 24 November 2012.

In [None]:

# Every stats URL shares the same query string; only `class` and `type` vary.
# In the table below, class 1/2/3 is paired with the Test/ODI/T20I labels.
_STATS_URL_TEMPLATE = (
    "https://stats.espncricinfo.com/ci/engine/stats/index.html"
    "?class={match_class};spanmin1=24+Nov+2012;spanval1=span;team=7"
    ";template=results;type={stat_type}"
)

# Maps each output name (later used as the CSV file stem) to its stats URL.
urls = {
    label: _STATS_URL_TEMPLATE.format(match_class=match_class, stat_type=stat_type)
    for label, match_class, stat_type in (
        ("TEST_ALL", 1, "allround"),
        ("ODI_ALL", 2, "allround"),
        ("T20Is_ALL", 3, "allround"),
        ("Tests_bat", 1, "batting"),
        ("ODIs_bat", 2, "batting"),
        ("T20Is_bat", 3, "batting"),
        ("Tests_bowl", 1, "bowling"),
        ("ODIs_bowl", 2, "bowling"),
        ("T20Is_bowl", 3, "bowling"),
    )
}

## Scraping Functions

We define two functions: `scrape_page`, which scrapes a single page, and `scrape_all_pages`, which handles multiple pages.

In [None]:
#single page
def scrape_page(url, table_index=2):
    """Scrape one stats results page into a DataFrame.

    Downloads ``url`` with curl_cffi's Chrome impersonation, picks one of the
    ``<table class="engineTable">`` elements and converts it: the text of its
    ``<th>`` cells becomes the column names and each ``<tr class="data1">``
    becomes one row of stripped cell text.

    Args:
        url: Address of the results page to download.
        table_index: Zero-based position of the wanted table among the page's
            ``engineTable`` tables. Defaults to 2, the table this function
            has always read.

    Returns:
        A ``pandas.DataFrame`` of strings; it is empty when the table
        contains no ``data1`` rows.

    Raises:
        IndexError: If the page has too few ``engineTable`` tables.
        Exception: Whatever ``response.raise_for_status()`` raises when the
            server answers with an error status (e.g. 403).
    """
    response = requests.get(url, impersonate='chrome110')
    # Fail loudly on an error response (e.g. the 403 bot-detection page)
    # instead of trying to parse a page that has no stats tables.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    tables = soup.find_all('table', class_='engineTable')
    if len(tables) <= table_index:
        # Same exception type as the bare list indexing used before,
        # but with a message that says what was actually missing.
        raise IndexError(
            "expected at least " + str(table_index + 1)
            + " 'engineTable' tables but found " + str(len(tables))
            + " at " + str(url)
        )
    table = tables[table_index]

    # Column names
    headers = [th.get_text(strip=True) for th in table.find_all('th')]
    # Data rows
    rows = [
        [td.get_text(strip=True) for td in tr.find_all('td')]
        for tr in table.find_all('tr', class_='data1')
    ]
    return pd.DataFrame(rows, columns=headers)


# Handle multiple pages: each listing has only 2 pages, so max_pages defaults to 2
def scrape_all_pages(base, max_pages=2):
    """Scrape consecutive result pages and stack them into one DataFrame.

    Pages are requested as ``base + ";page=<n>"`` for n = 1, 2, ... The loop
    ends after ``max_pages`` pages, at the first page whose DataFrame is
    empty, or at the first exception (which is printed, not re-raised).

    Args:
        base: Stats URL without the ``;page=`` suffix.
        max_pages: Highest page number to request.

    Returns:
        The concatenation of all collected pages with a fresh index, or an
        empty ``pandas.DataFrame`` when nothing was collected.
    """
    collected = []
    page_no = 1
    while page_no <= max_pages:
        try:
            page_url = base + ";page=" + str(page_no)
            print(f"Scraping {page_url}")
            page_df = scrape_page(page_url)
            # An empty page means there is nothing more to fetch
            if page_df.empty:
                break
            collected.append(page_df)
            page_no += 1
        except Exception as err:
            print(f"Error on page {page_no}: {err}")
            break

    # Stack whatever was gathered; fall back to an empty frame
    return pd.concat(collected, ignore_index=True) if collected else pd.DataFrame()


## Iterate through the URLs, scrape the data, and save it to .csv files

In [None]:
# Scrape every configured stats URL and save each result as <name>.csv.
for name, url in urls.items():
    # Build the file name up front: both the success and the "no data"
    # message need it (previously it was only set in the success branch,
    # so the "no data" branch raised NameError or named the wrong file).
    file_name = name + ".csv"
    print("Processing " + file_name)
    data_df = scrape_all_pages(url)

    if not data_df.empty:
        data_df.to_csv(file_name, index=False)
        print(file_name + "  SAVED!!!!!")
    else:
        print("NO DATA FOUND FOR " + file_name)

print("Scraping completed.")

ProcessingTEST_ALL.csv
Scraping https://stats.espncricinfo.com/ci/engine/stats/index.html?class=1;spanmin1=24+Nov+2012;spanval1=span;team=7;template=results;type=allround;page=1
Scraping https://stats.espncricinfo.com/ci/engine/stats/index.html?class=1;spanmin1=24+Nov+2012;spanval1=span;team=7;template=results;type=allround;page=2
TEST_ALL.csv  SAVED!!!!!
ProcessingODI_ALL.csv
Scraping https://stats.espncricinfo.com/ci/engine/stats/index.html?class=2;spanmin1=24+Nov+2012;spanval1=span;team=7;template=results;type=allround;page=1
Scraping https://stats.espncricinfo.com/ci/engine/stats/index.html?class=2;spanmin1=24+Nov+2012;spanval1=span;team=7;template=results;type=allround;page=2
ODI_ALL.csv  SAVED!!!!!
ProcessingT20Is_ALL.csv
Scraping https://stats.espncricinfo.com/ci/engine/stats/index.html?class=3;spanmin1=24+Nov+2012;spanval1=span;team=7;template=results;type=allround;page=1
Scraping https://stats.espncricinfo.com/ci/engine/stats/index.html?class=3;spanmin1=24+Nov+2012;spanval1=sp

### **SCRAPE INDIVIDUAL STATS FROM THE PLAYER PAGE. IT HAS ONLY ONE PAGE, SO THERE IS NO NEED TO HANDLE MULTIPLE PAGES**

In [None]:

def scrape_page_individual(url):
    """Scrape a player's stats table from a single player page.

    Downloads ``url`` with curl_cffi's Chrome impersonation and reads the
    fourth ``<table class="engineTable">`` on the page (index 3): the text of
    its ``<th>`` cells becomes the column names and each
    ``<tr class="data1">`` becomes one row of stripped cell text.

    Args:
        url: Address of the player page to download (the URLs used in this
            notebook request ``view=match``).

    Returns:
        A ``pandas.DataFrame`` of strings; it is empty when the table
        contains no ``data1`` rows.

    Raises:
        IndexError: If the page has fewer than four ``engineTable`` tables.
        Exception: Whatever ``response.raise_for_status()`` raises when the
            server answers with an error status (e.g. 403).
    """
    response = requests.get(url, impersonate='chrome110')
    # Fail loudly on an error response (e.g. the 403 bot-detection page)
    # instead of trying to parse a page that has no stats tables.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    tables = soup.find_all('table', class_='engineTable')
    if len(tables) < 4:
        # Same exception type as the bare list indexing used before,
        # but with a message that says what was actually missing.
        raise IndexError(
            "expected at least 4 'engineTable' tables but found "
            + str(len(tables)) + " at " + str(url)
        )
    matches_table = tables[3]

    # Column names
    headers = [th.get_text(strip=True) for th in matches_table.find_all('th')]
    # Data rows
    rows = [
        [cell.get_text(strip=True) for cell in tr.find_all('td')]
        for tr in matches_table.find_all('tr', class_='data1')
    ]
    return pd.DataFrame(rows, columns=headers)

### **WE SCRAPE BABAR AZAM's BATTING STATS SINCE THE START OF HIS CAREER IN EACH FORMAT**

In [None]:
# Babar Azam's batting pages (player id 348144 in the URL), one per format.
# The `class` query value runs 1, 2, 3 alongside the TEST, ODI, T20I labels.
BEST_BATSMAN = {
    f"{fmt_label}(BABAR AZAM)": (
        "https://stats.espncricinfo.com/ci/engine/player/348144.html"
        f"?class={match_class};template=results;type=batting;view=match"
    )
    for match_class, fmt_label in enumerate(("TEST", "ODI", "T20I"), start=1)
}


In [None]:
# Scrape each of the batsman's format pages and save it as <name>.csv.
for name, url in BEST_BATSMAN.items():
    # Build the file name up front: both the success and the "no data"
    # message need it (previously it was only set in the success branch,
    # so the "no data" branch raised NameError or named the wrong file).
    file_name = name + ".csv"
    print("Processing " + file_name)
    data_df = scrape_page_individual(url)  # player stats fit on a single page

    if not data_df.empty:
        data_df.to_csv(file_name, index=False)
        print(file_name + "  SAVED!!!!!")
    else:
        print("NO DATA FOUND FOR " + file_name)

print("Scraping completed.")

ProcessingTEST(BABAR AZAM).csv
TEST(BABAR AZAM).csv  SAVED!!!!!
ProcessingODI(BABAR AZAM).csv
ODI(BABAR AZAM).csv  SAVED!!!!!
ProcessingT20I(BABAR AZAM).csv
T20I(BABAR AZAM).csv  SAVED!!!!!
Scraping completed.


### **WE SCRAPE SHAHEEN SHAH AFRIDI's STATS SINCE THE START OF HIS CAREER IN EACH FORMAT**

In [None]:
# Shaheen Shah Afridi's bowling pages (player id 1072470 in the URL), one per
# format. The `class` query value runs 1, 2, 3 alongside the labels below
# (the third label is spelled "T20i", and is kept exactly as is).
BEST_BOWLER = {
    f"{fmt_label}(SHAHEEN SHAH AFRIDI)": (
        "https://stats.espncricinfo.com/ci/engine/player/1072470.html"
        f"?class={match_class};template=results;type=bowling;view=match"
    )
    for match_class, fmt_label in enumerate(("TEST", "ODI", "T20i"), start=1)
}

In [None]:
# Scrape each of the bowler's format pages and save it as <name>.csv.
for name, url in BEST_BOWLER.items():
    # Build the file name up front: both the success and the "no data"
    # message need it (previously it was only set in the success branch,
    # so the "no data" branch raised NameError or named the wrong file).
    file_name = name + ".csv"
    print("Processing " + file_name)
    data_df = scrape_page_individual(url)  # player stats fit on a single page

    if not data_df.empty:
        data_df.to_csv(file_name, index=False)
        print(file_name + "  SAVED!!!!!")
    else:
        print("NO DATA FOUND FOR " + file_name)

print("Scraping completed.")

ProcessingTEST(SHAHEEN SHAH AFRIDI).csv
TEST(SHAHEEN SHAH AFRIDI).csv  SAVED!!!!!
ProcessingODI(SHAHEEN SHAH AFRIDI).csv
ODI(SHAHEEN SHAH AFRIDI).csv  SAVED!!!!!
ProcessingT20i(SHAHEEN SHAH AFRIDI).csv
T20i(SHAHEEN SHAH AFRIDI).csv  SAVED!!!!!
Scraping completed.
