# Scraping Data from Cricbuzz Website + Pandas

### ICC Cricket World Cup 2023

#### Most Runs

##### Code :-

In [55]:
# Importing necessary libraries
from bs4 import BeautifulSoup  # For parsing HTML content
import requests  # For making HTTP requests



In [56]:
# URL of the Cricbuzz stats page for ICC Cricket World Cup 2023
url = 'https://www.cricbuzz.com/cricket-series/6732/icc-cricket-world-cup-2023/stats'

# Fetch the page. Without a timeout, requests waits indefinitely on a stalled
# connection, which would hang the notebook on "Run All".
page = requests.get(url, timeout=30)

# Stop here on a 4xx/5xx response instead of parsing an error page and
# failing later with a confusing "table not found" style error.
page.raise_for_status()

# Parse the HTML body of the response with the built-in html.parser backend
soup = BeautifulSoup(page.text, 'html.parser')

In [57]:

# Preview only the beginning of the parsed HTML (for debugging purposes).
# Printing the whole document floods the notebook output and bloats the
# saved file; the first characters are enough to confirm the fetch worked.
PREVIEW_CHARS = 1000
print(str(soup)[:PREVIEW_CHARS])



<!DOCTYPE html>
<html itemscope="" itemtype="http://schema.org/WebPage" lang="en"><head><meta charset="utf-8"/><script>var is_mobile = /symbian|tizen|midp|uc(web|browser)|MSIE (5.0|6.0|7.0|8.0)|tablet/i.test(navigator.userAgent);	if(is_mobile && window.location.hostname != "www1.cricbuzz.com") window.location.hostname = "m.cricbuzz.com";</script><style>html{scroll-behavior: smooth;}	body{background:#E3E6E3; font-family: helvetica,"Segoe UI",Arial,sans-serif;color:#222;font-size:14px; line-height: 1.5; margin:0;}	body, .cb-comm-pg, .cb-hm-mid {min-height:1000px}	.container{width:980px;margin:0 auto;}	.page{max-width: 980px;margin: 0 auto;position: relative;}	.cb-col-8 {width:8%;}	.cb-col-10 {width:10%;}	.cb-col-14 {width:14%;}	.cb-col-16 {width:16%;}	.cb-col-20 {width:20%;}	.cb-col-25 {width:25%;}	.cb-col-27 {width:27%;}	.cb-col-30 {width:30%;}	.cb-col-33 {width:33%;}	.cb-col-40 {width:40%;}	.cb-col-46 {width:46%;}	.cb-col-47 {width:47%;}	.cb-col-50 {width:50%;}	.cb-col-60 {width:60%;}

In [58]:
# Grab the first <table> element that appears in the parsed document
table = soup.find(name='table')

# Dump the element so its markup can be inspected (debugging aid)
print(table)


<table class="table table-responsive cb-series-stats"> <thead class="cb-srs-gray-strip"> <tr class="cb-srs-stats-tr"> <th class="cb-srs-stats-th cb-srs-stats-player text-right"></th> <th class="cb-srs-stats-th cb-srs-stats-player text-left">Player</th> <th class="cb-srs-stats-th cb-srs-stats-player text-right">Matches</th> <th class="cb-srs-stats-th cb-srs-stats-player text-right">Inns</th> <th class="cb-srs-stats-th cb-srs-stats-player text-right">Runs</th> <th class="cb-srs-stats-th cb-srs-stats-player text-right">Avg</th> <th class="cb-srs-stats-th cb-srs-stats-player text-right">Sr</th> <th class="cb-srs-stats-th cb-srs-stats-player text-right">4s</th> <th class="cb-srs-stats-th cb-srs-stats-player text-right">6s</th> </tr> </thead> <tbody> <tr class="cb-srs-stats-tr"> <td class="cb-srs-stats-td text-right">1</td> <td class="cb-srs-stats-td text-left"><a class="cb-text-link" href="/profiles/1413/virat-kohli">Virat Kohli</a></td> <td class="cb-srs-stats-td text-right">11</td> <td cl

In [59]:
# Locate the stats table by the full value of its class attribute
table = soup.find('table', class_="table table-responsive cb-series-stats")

# find() returns None when nothing matches. Fail here with a clear message
# rather than letting a later cell raise AttributeError on None.
if table is None:
    raise ValueError(
        "Stats table not found on the page; the response may not contain "
        "the expected markup or the page layout may have changed"
    )

# Printing the extracted table to verify its content
print(table)


<table class="table table-responsive cb-series-stats"> <thead class="cb-srs-gray-strip"> <tr class="cb-srs-stats-tr"> <th class="cb-srs-stats-th cb-srs-stats-player text-right"></th> <th class="cb-srs-stats-th cb-srs-stats-player text-left">Player</th> <th class="cb-srs-stats-th cb-srs-stats-player text-right">Matches</th> <th class="cb-srs-stats-th cb-srs-stats-player text-right">Inns</th> <th class="cb-srs-stats-th cb-srs-stats-player text-right">Runs</th> <th class="cb-srs-stats-th cb-srs-stats-player text-right">Avg</th> <th class="cb-srs-stats-th cb-srs-stats-player text-right">Sr</th> <th class="cb-srs-stats-th cb-srs-stats-player text-right">4s</th> <th class="cb-srs-stats-th cb-srs-stats-player text-right">6s</th> </tr> </thead> <tbody> <tr class="cb-srs-stats-tr"> <td class="cb-srs-stats-td text-right">1</td> <td class="cb-srs-stats-td text-left"><a class="cb-text-link" href="/profiles/1413/virat-kohli">Virat Kohli</a></td> <td class="cb-srs-stats-td text-right">11</td> <td cl

In [60]:
# Collect every header cell (<th> tag) contained in the stats table
titles = table.find_all(name='th')

# Show the raw tag objects (debugging aid); text is extracted in the next cell
print(titles)


[<th class="cb-srs-stats-th cb-srs-stats-player text-right"></th>, <th class="cb-srs-stats-th cb-srs-stats-player text-left">Player</th>, <th class="cb-srs-stats-th cb-srs-stats-player text-right">Matches</th>, <th class="cb-srs-stats-th cb-srs-stats-player text-right">Inns</th>, <th class="cb-srs-stats-th cb-srs-stats-player text-right">Runs</th>, <th class="cb-srs-stats-th cb-srs-stats-player text-right">Avg</th>, <th class="cb-srs-stats-th cb-srs-stats-player text-right">Sr</th>, <th class="cb-srs-stats-th cb-srs-stats-player text-right">4s</th>, <th class="cb-srs-stats-th cb-srs-stats-player text-right">6s</th>]


In [61]:
# Reduce each header cell to its whitespace-stripped text.
# The table is queried again (rather than reusing the previous `titles`)
# so this cell gives the same result no matter how often it is re-run.
titles = [
    header_cell.get_text(strip=True)
    for header_cell in table.find_all('th')
]

# Show the column names. The first header cell has no text, so the list
# starts with an empty string:
# ['', 'Player', 'Matches', 'Inns', 'Runs', 'Avg', 'Sr', '4s', '6s']
print(titles)



['', 'Player', 'Matches', 'Inns', 'Runs', 'Avg', 'Sr', '4s', '6s']


In [62]:
# Import pandas for data manipulation
import pandas as pd

In [63]:
# Start from a frame with no rows whose columns are the scraped header names
df = pd.DataFrame(data=None, columns=titles)

# Show the (still empty) frame to confirm the column layout
df

Unnamed: 0,Unnamed: 1,Player,Matches,Inns,Runs,Avg,Sr,4s,6s


In [64]:
# All <tr> elements except the first one, which is the header row
column_data = table.find_all('tr')[1:]

# Collect the cell text of every well-formed row first and build the
# DataFrame once at the end; growing a frame row by row is slow and makes
# the cell depend on whatever `df` held before it ran.
records = []
skipped_rows = 0
for row in column_data:
    # Text of each <td> cell in this row, with surrounding whitespace removed
    cells = [cell.get_text(strip=True) for cell in row.find_all('td')]

    # A row whose cell count differs from the header count cannot be placed
    # into the frame; count it so the mismatch is reported, not hidden.
    if len(cells) != len(titles):
        skipped_rows += 1
        continue

    records.append(cells)

if skipped_rows:
    print(f"Skipped {skipped_rows} row(s) whose cell count did not match the {len(titles)} column headers")

# Build the frame in one step and drop exact duplicate rows.
# All values are kept as the strings scraped from the page.
df = pd.DataFrame(records, columns=titles).drop_duplicates()



In [65]:
# Display the final DataFrame with the extracted data.
# A bare expression on the last line of a cell is rendered as a rich table.
df

Unnamed: 0,Unnamed: 1,Player,Matches,Inns,Runs,Avg,Sr,4s,6s
0,1,Virat Kohli,11,11,765,95.62,90.32,68,9
1,2,Rohit Sharma,11,11,597,54.27,125.95,66,31
2,3,Quinton de Kock,10,10,594,59.4,107.03,57,21
3,4,Rachin Ravindra,10,10,578,64.22,106.45,55,17
4,5,Daryl Mitchell,10,9,552,69.0,111.07,48,22
5,6,David Warner,11,11,535,48.64,108.3,50,24
6,7,Shreyas Iyer,11,11,530,66.25,113.25,37,24
7,8,KL Rahul,11,10,452,75.33,90.76,38,9
8,9,Rassie van der Dussen,10,10,448,49.78,84.53,39,8
9,10,Mitchell Marsh,10,10,441,49.0,107.56,43,21


In [66]:
# Save the extracted data to a CSV file in the notebook's working directory.
# A relative path keeps the notebook runnable on other machines and avoids
# embedding a user-specific absolute path in the shared file.
output_csv = 'Highest Runs.csv'

# 'index=False' ensures that the DataFrame index is not written to the CSV file
df.to_csv(output_csv, index=False)
