In [7]:
import pandas as pd
from requests import get
from bs4 import BeautifulSoup

In [8]:
# Load the squad roster; the later cells read its player_id column to drive
# the scraping loops and join the scraped statistics back onto it.
player_data = pd.read_csv(filepath_or_buffer='player_{team_name_id_role}.csv')
player_data

Unnamed: 0,team_name,player_name,player_id,player_role
0,Afghanistan,Hashmatullah Shahidi,440970,Top order Batter
1,Afghanistan,Rahmanullah Gurbaz,974087,Wicketkeeper Batter
2,Afghanistan,Ibrahim Zadran,921509,Opening Batter
3,Afghanistan,Reyaz Hassan,1061090,Batter
4,Afghanistan,Rahmat Shah Zurmati,533956,Allrounder
...,...,...,...,...
145,Sri Lanka,Dunith Wellalage,1152427,Bowler
146,Sri Lanka,Kasun Rajitha,499594,Bowler
147,Sri Lanka,Matheesha Pathirana,1194795,Bowler
148,Sri Lanka,Lahiru Kumara,784375,Bowler


## Batting Data

In [9]:
# column_names = ["player_id", "dummy", "Span", "Mat", "Inns", "NO", "Runs", "HS", "Ave", "BF", "SR", "100", "50",	"0", "4s", "6s"]
# player_id = "253802"

def fetch_batting_data(player_id):
    """Scrape a player's career batting summary rows from stats.espncricinfo.com.

    Downloads the player's batting results page, finds the table whose caption
    is exactly "Career averages", and collects the cell texts of each of its
    ``tr`` rows with class ``data1``.

    Parameters
    ----------
    player_id
        Player identifier. It is interpolated into the page URL and stored as
        the first element of every returned row.

    Returns
    -------
    list of list
        One list per summary row: ``player_id`` followed by the text of every
        cell that is non-empty and contains no link. An empty list is returned
        when the page cannot be fetched or parsed, or when it has no
        "Career averages" table. If an error occurs part-way through the rows,
        the rows collected so far are returned.
    """
    url = f"https://stats.espncricinfo.com/ci/engine/player/{player_id}.html?class=2;template=results;type=batting;view=innings"

    # Created before the try block so the final return is valid even when
    # fetching or parsing fails early.
    raw_data = []

    try:
        # requests applies no timeout by default; without one a stalled
        # connection would block the whole scraping loop indefinitely.
        fetched_data = get(url, timeout=30)

        soup = BeautifulSoup(fetched_data.text, 'html.parser')

        # Find the table captioned "Career averages" (if several captions
        # match, the last one wins).
        table = None
        for caption in soup.find_all('caption'):
            if caption.text == "Career averages":
                table = caption.find_parent('table')

        if table is None:
            print(f"Skipping player {player_id}: no 'Career averages' table found")
            return raw_data

        # Select all rows with class data1
        for row in table.find_all('tr', class_='data1'):
            row_data = [player_id]
            for col in row.find_all('td'):
                # Cells that contain a link, and empty cells, are left out.
                # NOTE(review): dropping empty cells means a row with a blank
                # field comes out shorter and its values shift left relative
                # to the column labels - confirm whether blanks should be kept.
                if col.find('a') or col.text == '':
                    continue
                row_data.append(col.text)

            raw_data.append(row_data)
    except Exception as e:
        # Best-effort scrape: report why this player failed and carry on.
        print(f"Skipping player {player_id}: {e!r}")

    return raw_data

# raw_data = fetch_batting_data(player_id)
# len(raw_data)

# raw_data

[['253802',
  'overall',
  '2008-2023',
  '287',
  '275',
  '43',
  '13437',
  '183',
  '57.91',
  '14350',
  '93.63',
  '48',
  '69',
  '16',
  '1255',
  '148']]

In [10]:
# df = pd.DataFrame(raw_data, columns=column_names)
# df

Unnamed: 0,player_id,dummy,Span,Mat,Inns,NO,Runs,HS,Ave,BF,SR,100,50,0,4s,6s
0,253802,overall,2008-2023,287,275,43,13437,183,57.91,14350,93.63,48,69,16,1255,148


In [11]:
# Fetch the career batting summary of every player in the roster and pool
# all scraped rows into one list.

batting_raw_data = []
for player_id in player_data['player_id']:
    try:
        batting_raw_data.extend(fetch_batting_data(player_id))
    except Exception as e:
        # Exception (not a bare except) so that Ctrl-C / a kernel interrupt
        # can still stop this long-running loop.
        print(f"Skipping player {player_id}: {e!r}")

len(batting_raw_data)


150

In [12]:
# Labels for the scraped batting rows, in the order the values were collected:
# the player id first, then the fields of the summary row.
column_names = [
    "player_id", "dummy", "Span", "Mat",
    "Inns", "NO", "Runs", "HS",
    "Ave", "BF", "SR", "100",
    "50", "0", "4s", "6s",
]
df = pd.DataFrame(data=batting_raw_data, columns=column_names)
df

Unnamed: 0,player_id,dummy,Span,Mat,Inns,NO,Runs,HS,Ave,BF,SR,100,50,0,4s,6s
0,440970,overall,2013-2023,70,70,11,2001,97*,33.91,2962,67.55,0,18,5,173,12
1,974087,overall,2021-2023,32,32,1,1182,151,38.12,1349,87.62,5,4,1,107,44
2,921509,overall,2019-2023,25,25,2,1123,162,48.82,1366,82.21,4,5,2,119,15
3,1061090,overall,2022-2023,5,4,0,120,50,30.00,192,62.50,0,1,0,12,2
4,533956,overall,2013-2023,103,99,4,3481,114,36.64,4902,71.01,5,25,5,300,38
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,1152427,overall,18,15,3,207,42*,17.25,269,76.95,0,0,3,16,5,
146,499594,overall,2018-2023,32,17,7,103,33,10.30,165,62.42,0,0,3,12,1
147,1194795,overall,12,7,2,11,5,2.20,35,31.42,0,0,2,2,0,
148,784375,overall,2017-2023,28,16,6,55,10,5.50,122,45.08,0,0,1,7,1


In [13]:
# Sanity check: tally the distinct labels in the 'dummy' column before it is dropped.
df.loc[:, 'dummy'].value_counts()

overall    150
Name: dummy, dtype: int64

In [14]:
# Remove the 'dummy' label column. The result is rebound instead of dropping
# in place, and a missing column is ignored, so re-running this cell is
# harmless rather than raising KeyError.
df = df.drop(columns=['dummy'], errors='ignore')
df

Unnamed: 0,player_id,Span,Mat,Inns,NO,Runs,HS,Ave,BF,SR,100,50,0,4s,6s
0,440970,2013-2023,70,70,11,2001,97*,33.91,2962,67.55,0,18,5,173,12
1,974087,2021-2023,32,32,1,1182,151,38.12,1349,87.62,5,4,1,107,44
2,921509,2019-2023,25,25,2,1123,162,48.82,1366,82.21,4,5,2,119,15
3,1061090,2022-2023,5,4,0,120,50,30.00,192,62.50,0,1,0,12,2
4,533956,2013-2023,103,99,4,3481,114,36.64,4902,71.01,5,25,5,300,38
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,1152427,18,15,3,207,42*,17.25,269,76.95,0,0,3,16,5,
146,499594,2018-2023,32,17,7,103,33,10.30,165,62.42,0,0,3,12,1
147,1194795,12,7,2,11,5,2.20,35,31.42,0,0,2,2,0,
148,784375,2017-2023,28,16,6,55,10,5.50,122,45.08,0,0,1,7,1


In [15]:
# Join the batting statistics onto the roster by player_id. This is pandas'
# default inner join, so only players present in both frames are kept.
merged_df = player_data.merge(right=df, on='player_id')
merged_df

Unnamed: 0,team_name,player_name,player_id,player_role,Span,Mat,Inns,NO,Runs,HS,Ave,BF,SR,100,50,0,4s,6s
0,Afghanistan,Hashmatullah Shahidi,440970,Top order Batter,2013-2023,70,70,11,2001,97*,33.91,2962,67.55,0,18,5,173,12
1,Afghanistan,Rahmanullah Gurbaz,974087,Wicketkeeper Batter,2021-2023,32,32,1,1182,151,38.12,1349,87.62,5,4,1,107,44
2,Afghanistan,Ibrahim Zadran,921509,Opening Batter,2019-2023,25,25,2,1123,162,48.82,1366,82.21,4,5,2,119,15
3,Afghanistan,Reyaz Hassan,1061090,Batter,2022-2023,5,4,0,120,50,30.00,192,62.50,0,1,0,12,2
4,Afghanistan,Rahmat Shah Zurmati,533956,Allrounder,2013-2023,103,99,4,3481,114,36.64,4902,71.01,5,25,5,300,38
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,Sri Lanka,Dunith Wellalage,1152427,Bowler,18,15,3,207,42*,17.25,269,76.95,0,0,3,16,5,
146,Sri Lanka,Kasun Rajitha,499594,Bowler,2018-2023,32,17,7,103,33,10.30,165,62.42,0,0,3,12,1
147,Sri Lanka,Matheesha Pathirana,1194795,Bowler,12,7,2,11,5,2.20,35,31.42,0,0,2,2,0,
148,Sri Lanka,Lahiru Kumara,784375,Bowler,2017-2023,28,16,6,55,10,5.50,122,45.08,0,0,1,7,1


In [16]:
# Persist the roster + batting summary; index=False keeps the row index out of the file.
merged_df.to_csv(path_or_buf='player_{team_name_id_role}_with_career_summary.csv', index=False)


## Bowling Data

In [20]:
# player_id = "26421"

def fetch_bowling_data(player_id):
    """Scrape a player's career bowling summary rows from stats.espncricinfo.com.

    Downloads the player's bowling results page, finds the table whose caption
    is exactly "Career averages", and collects the cell texts of each of its
    ``tr`` rows with class ``data1``.

    Parameters
    ----------
    player_id
        Player identifier. It is interpolated into the page URL and stored as
        the first element of every returned row.

    Returns
    -------
    list of list
        One list per summary row: ``player_id`` followed by the text of every
        cell that is non-empty and contains no link. An empty list is returned
        when the page cannot be fetched or parsed, or when it has no
        "Career averages" table, so the result is always safe to pass to
        ``list.extend``.
    """
    url = f"https://stats.espncricinfo.com/ci/engine/player/{player_id}.html?class=2;template=results;type=bowling;view=innings"

    try:
        # requests applies no timeout by default; without one a stalled
        # connection would block the whole scraping loop indefinitely.
        fetched_data = get(url, timeout=30)

        soup = BeautifulSoup(fetched_data.text, 'html.parser')

        # Find the table captioned "Career averages" (if several captions
        # match, the last one wins).
        table = None
        for caption in soup.find_all('caption'):
            if caption.text == "Career averages":
                table = caption.find_parent('table')

        if table is None:
            print(f"Skipping player {player_id}: no 'Career averages' table found")
            return []

        # Select all rows with class data1
        raw_data = []
        for row in table.find_all('tr', class_='data1'):
            row_data = [player_id]
            for col in row.find_all('td'):
                # Cells that contain a link, and empty cells, are left out.
                # NOTE(review): dropping empty cells means a row with a blank
                # field comes out shorter and its values shift left relative
                # to the column labels - confirm whether blanks should be kept.
                if col.find('a') or col.text == '':
                    continue
                row_data.append(col.text)

            raw_data.append(row_data)
    except Exception as e:
        # Best-effort scrape: report why this player failed. The result must
        # stay a list of rows, so a failure yields no rows at all rather than
        # a flat run of None values that callers would add as bogus entries.
        print(f"Skipping player {player_id}: {e!r}")
        return []

    return raw_data

# raw_data = fetch_bowling_data(player_id)
# raw_data


[['26421',
  'overall',
  '2010-2023',
  '116',
  '114',
  '1050.3',
  '37',
  '5180',
  '156',
  '4/25',
  '33.20',
  '4.93',
  '40.4',
  '1',
  '0']]

In [22]:
# column_names = ["player_id", "dummy", "Span", "Mat", "Inns", "Overs", "Mdns", "Runs", "Wkts", "BBI", "Ave", "Econ", "SR", "4", "5"]
# df = pd.DataFrame(raw_data, columns=column_names)
# df

Unnamed: 0,player_id,dummy,Span,Mat,Inns,Overs,Mdns,Runs,Wkts,BBI,Ave,Econ,SR,4,5
0,26421,overall,2010-2023,116,114,1050.3,37,5180,156,4/25,33.2,4.93,40.4,1,0


In [23]:
# Fetch the career bowling summary of every player in the roster and pool
# all scraped rows into one list.

bowling_raw_data = []
for player_id in player_data['player_id']:
    try:
        # Defensive filter: keep only real rows and ignore any bare None
        # entries, so every element of the pooled list is a full row.
        bowling_raw_data.extend(
            row for row in fetch_bowling_data(player_id) if row is not None
        )
    except Exception as e:
        # Exception (not a bare except) so that Ctrl-C / a kernel interrupt
        # can still stop this long-running loop.
        print(f"Skipping player {player_id}: {e!r}")

len(bowling_raw_data)


150

In [24]:
# Bowling rows have their own 15-field layout, so the labels are defined here
# explicitly. Relying on whatever `column_names` currently holds would pick up
# the 16 batting labels on a fresh top-to-bottom run and fail to build the frame.
column_names = [
    "player_id", "dummy", "Span", "Mat", "Inns",
    "Overs", "Mdns", "Runs", "Wkts", "BBI",
    "Ave", "Econ", "SR", "4", "5",
]
df = pd.DataFrame(bowling_raw_data, columns=column_names)
df

Unnamed: 0,player_id,dummy,Span,Mat,Inns,Overs,Mdns,Runs,Wkts,BBI,Ave,Econ,SR,4,5
0,440970,overall,2013-2023,70,2,3.0,0,25,0,-,-,8.33,-,0,0
1,974087,overall,2021-2023,32,-,-,-,-,-,-,-,-,-,-,-
2,921509,overall,2019-2023,25,-,-,-,-,-,-,-,-,-,-,-
3,1061090,overall,2022-2023,5,-,-,-,-,-,-,-,-,-,-,-
4,533956,overall,2013-2023,103,28,89.3,2,520,15,5/32,34.66,5.81,35.8,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,1152427,overall,18,17,122.2,2,678,21,5/40,32.28,5.54,34.9,0,1,
146,499594,overall,2018-2023,32,32,241.3,5,1441,43,4/50,33.51,5.96,33.6,2,0
147,1194795,overall,12,12,84.4,2,616,17,4/32,36.23,7.27,29.8,1,0,
148,784375,overall,2017-2023,28,28,188.3,4,1223,37,3/22,33.05,6.48,30.5,0,0


In [25]:
# Sanity check: tally the distinct labels in the 'dummy' column before it is dropped.
df.loc[:, 'dummy'].value_counts()

overall    150
Name: dummy, dtype: int64

In [26]:
# Remove the 'dummy' label column. The result is rebound instead of dropping
# in place, and a missing column is ignored, so re-running this cell is
# harmless rather than raising KeyError.
df = df.drop(columns=['dummy'], errors='ignore')
df

Unnamed: 0,player_id,Span,Mat,Inns,Overs,Mdns,Runs,Wkts,BBI,Ave,Econ,SR,4,5
0,440970,2013-2023,70,2,3.0,0,25,0,-,-,8.33,-,0,0
1,974087,2021-2023,32,-,-,-,-,-,-,-,-,-,-,-
2,921509,2019-2023,25,-,-,-,-,-,-,-,-,-,-,-
3,1061090,2022-2023,5,-,-,-,-,-,-,-,-,-,-,-
4,533956,2013-2023,103,28,89.3,2,520,15,5/32,34.66,5.81,35.8,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,1152427,18,17,122.2,2,678,21,5/40,32.28,5.54,34.9,0,1,
146,499594,2018-2023,32,32,241.3,5,1441,43,4/50,33.51,5.96,33.6,2,0
147,1194795,12,12,84.4,2,616,17,4/32,36.23,7.27,29.8,1,0,
148,784375,2017-2023,28,28,188.3,4,1223,37,3/22,33.05,6.48,30.5,0,0


In [28]:
# Join the bowling statistics onto the roster by player_id. This is pandas'
# default inner join, so only players present in both frames are kept.
merged_df_bowling = player_data.merge(right=df, on='player_id')
merged_df_bowling

Unnamed: 0,team_name,player_name,player_id,player_role,Span,Mat,Inns,Overs,Mdns,Runs,Wkts,BBI,Ave,Econ,SR,4,5
0,Afghanistan,Hashmatullah Shahidi,440970,Top order Batter,2013-2023,70,2,3.0,0,25,0,-,-,8.33,-,0,0
1,Afghanistan,Rahmanullah Gurbaz,974087,Wicketkeeper Batter,2021-2023,32,-,-,-,-,-,-,-,-,-,-,-
2,Afghanistan,Ibrahim Zadran,921509,Opening Batter,2019-2023,25,-,-,-,-,-,-,-,-,-,-,-
3,Afghanistan,Reyaz Hassan,1061090,Batter,2022-2023,5,-,-,-,-,-,-,-,-,-,-,-
4,Afghanistan,Rahmat Shah Zurmati,533956,Allrounder,2013-2023,103,28,89.3,2,520,15,5/32,34.66,5.81,35.8,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,Sri Lanka,Dunith Wellalage,1152427,Bowler,18,17,122.2,2,678,21,5/40,32.28,5.54,34.9,0,1,
146,Sri Lanka,Kasun Rajitha,499594,Bowler,2018-2023,32,32,241.3,5,1441,43,4/50,33.51,5.96,33.6,2,0
147,Sri Lanka,Matheesha Pathirana,1194795,Bowler,12,12,84.4,2,616,17,4/32,36.23,7.27,29.8,1,0,
148,Sri Lanka,Lahiru Kumara,784375,Bowler,2017-2023,28,28,188.3,4,1223,37,3/22,33.05,6.48,30.5,0,0


In [None]:
# Persist the roster + bowling summary; index=False keeps the row index out of the file.
merged_df_bowling.to_csv(path_or_buf='player_{team_name_id_role}_with_career_summary_bowling.csv', index=False)